In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_csv('heart.csv')
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [3]:
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

In [4]:
param = {
    'max_depth': 5,
    'eta': 0.7,
    'objective': 'multi:softmax',
    'num_class': 4,
    'eval_metric': 'mlogloss'
}
epochs = 10

In [5]:
model = xgb.train(param, train, epochs)

In [6]:
predictions = model.predict(test)

In [7]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, predictions)
# hacked by mosh

0.8688524590163934

In [140]:
result = xgb.cv(params=param, dtrain=train, early_stopping_rounds=50, as_pandas=True, seed=23333)
print(result)

   train-mlogloss-mean  train-mlogloss-std  test-mlogloss-mean  \
0             0.598483            0.004575            0.803200   
1             0.348291            0.014916            0.624990   
2             0.223783            0.011538            0.550890   
3             0.152932            0.007978            0.519983   
4             0.111783            0.006925            0.498660   
5             0.085110            0.006280            0.495503   
6             0.068118            0.004159            0.495472   
7             0.057034            0.003167            0.487744   
8             0.049175            0.002818            0.495871   
9             0.043431            0.002363            0.504297   

   test-mlogloss-std  
0           0.080291  
1           0.065349  
2           0.063297  
3           0.073973  
4           0.068191  
5           0.062845  
6           0.061583  
7           0.059361  
8           0.062767  
9           0.067519  


In [28]:
from sklearn.datasets import load_breast_cancer, load_iris
import numpy as np

cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# iris = load_iris()
# X = iris.data
# y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [29]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

model = ExtraTreesClassifier(random_state=1)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        42
           1       0.94      1.00      0.97        72

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



In [16]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid = {
        'n_estimators': range(50,101,25),
        'criterion': ['gini', 'entropy'],
        #'max_depth' : range(25,101,25),
        'min_samples_leaf': range(1,11,1),
        'min_samples_split': range(1, 11, 1),
        'max_features': ['sqrt','log2'],
        'random_state':[1]
}

grid = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)

grid.fit(X_train, y_train)
store = grid.best_params_
print(grid.best_params_)
grid_pred = grid.predict(X_test)

print(classification_report(y_test, grid_pred))

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50, 'random_state': 1}
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        42
           1       0.94      1.00      0.97        72

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114





In [84]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier

optimum = ExtraTreesClassifier(criterion=store['criterion'],
                                max_features=store['max_features'],
                                min_samples_leaf=store['min_samples_leaf'],
                                min_samples_split=store['min_samples_split'],
                                n_estimators=store['n_estimators'], 
                                random_state=0
                              )
default = ExtraTreesClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto' , random_state=0)

optimum.fit(X_train, y_train)
default.fit(X_train, y_train)

optimum_pred = optimum.predict(X_test)
default_pred = default.predict(X_test)

optimum_score = accuracy_score(y_test, optimum_pred)
default_score = accuracy_score(y_test, default_pred)

print(optimum_score)
print(default_score)
grid.score(X_test, y_test)

KeyError: 'criterion'

In [55]:
store = grid.best_params_
print(store)

{'criterion': 'entropy', 'max_depth': 25, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 75, 'random_state': 0}


In [79]:
print(grid.score(X_test, y_test))
print(grid.best_score_)

0.9473684210526315
0.9670329670329672
