In [18]:
import numpy as np
from numpy import arange
from scipy import stats

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score

# Load models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier

from mlxtend.classifier import StackingClassifier

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the breast cancer dataset; as_frame=True returns pandas objects
data = load_breast_cancer(as_frame=True)
data
# data.data -> dataframe of data
# data.target -> target col
# data.feature_names -> column names / features
# data.target_names -> target names, ordered by label per the scikit-learn docs: malignant - 0 / benign - 1

{'data':      mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0          17.99         10.38          122.80     1001.0          0.11840   
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
      mean compactness  mean concavity  me

In [None]:
# Count how many samples fall into each target class
data.target.value_counts()

Here we can see that our dataset is somewhat imbalanced.

In [3]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
  data.data,
  data.target,
  test_size=0.3,
  random_state=42,
)

In [16]:
# Display the held-out feature rows produced by the split
X_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,12.470,18.60,81.09,481.9,0.09965,0.10580,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.23780,0.2671,0.10150,0.3014,0.08750
70,18.940,21.31,123.60,1130.0,0.09009,0.10290,0.10800,0.07951,0.1582,0.05461,...,24.86,26.58,165.90,1866.0,0.1193,0.23360,0.2687,0.17890,0.2551,0.06589
131,15.460,19.48,101.70,748.9,0.10920,0.12230,0.14660,0.08087,0.1931,0.05796,...,19.26,26.00,124.90,1156.0,0.1546,0.23940,0.3791,0.15140,0.2837,0.08019
431,12.400,17.68,81.47,467.8,0.10540,0.13160,0.07741,0.02799,0.1811,0.07102,...,12.88,22.91,89.61,515.8,0.1450,0.26290,0.2403,0.07370,0.2556,0.09359
540,11.540,14.44,74.65,402.9,0.09984,0.11200,0.06737,0.02594,0.1818,0.06782,...,12.26,19.68,78.78,457.8,0.1345,0.21180,0.1797,0.06918,0.2329,0.08134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,12.780,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.1590,0.05653,...,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.06410
542,14.740,25.42,94.70,668.6,0.08275,0.07214,0.04105,0.03027,0.1840,0.05680,...,16.51,32.29,107.40,826.4,0.1060,0.13760,0.1611,0.10950,0.2722,0.06956
176,9.904,18.06,64.60,302.4,0.09699,0.12940,0.13070,0.03716,0.1669,0.08116,...,11.26,24.39,73.07,390.2,0.1301,0.29500,0.3486,0.09910,0.2614,0.11620
501,13.820,24.49,92.33,595.9,0.11620,0.16810,0.13570,0.06759,0.2275,0.07237,...,16.01,32.94,106.00,788.0,0.1794,0.39660,0.3381,0.15210,0.3651,0.11830


In [4]:
# Configure the cross-validation procedure: 5 stratified, shuffled folds.
# A fixed random_state makes the fold assignment identical on every run, so the
# scores reported by the searches below are repeatable and comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# This function finds and returns the best params of RandomizedSearch based on mean_test_score
def find_best_params_rs(test_score, params, params_gs, fraction=0.1):
  """Collect the parameter values used by the top-scoring search candidates.

  Args:
    test_score: 1-D sequence of mean test scores, one per candidate (e.g.
      ``cv_results_['mean_test_score']``). NaN entries are ranked as 0.
      The input is left untouched.
    params: sequence of parameter dicts aligned with ``test_score`` (e.g.
      ``cv_results_['params']``).
    params_gs: dict mapping parameter name -> list. Distinct values taken
      from the selected candidates are appended in place, best candidate
      first. Names missing from the dict are added.
    fraction: share of the candidates to keep (default 0.1, the best 10%).
      At least one candidate is kept whenever any candidate exists.

  Returns:
    The same ``params_gs`` dict, updated.
  """
  # Work on a float copy so the caller's array (cv_results_) keeps its NaNs
  scores = np.nan_to_num(np.array(test_score, dtype=float), copy=False)

  n_candidates = len(scores)
  if n_candidates == 0:
    return params_gs

  # Never select zero candidates: an empty value list is not a usable grid
  n_best = min(n_candidates, max(1, int(n_candidates * fraction)))

  # Stable descending sort: equal scores keep their original order
  best_indices = np.argsort(-scores, kind='stable')[:n_best]

  for index in best_indices:
    for name, value in params[index].items():
      values = params_gs.setdefault(name, [])
      # keep each value only once per parameter
      if value not in values:
        values.append(value)

  return params_gs

### Linear Discriminant Analysis

In [65]:
# Randomized search over the LDA hyper-parameters
lda_clf = LinearDiscriminantAnalysis()

# NOTE(review): per the scikit-learn docs shrinkage only works with the 'lsqr'
# and 'eigen' solvers, so the 'svd' candidates are expected to fail to fit.
lda_params_rs = {
  'solver': ['svd', 'lsqr', 'eigen'],
  'shrinkage': arange(0, 1, 0.01),
  'tol': arange(0, 1, 0.01)
}

# random_state pins the sampled candidates so the search is repeatable
lda_rand_search = RandomizedSearchCV(lda_clf, lda_params_rs, scoring='f1', cv=cv, refit=True, n_iter=300, random_state=42)

# fit the model
results_rs_lda = lda_rand_search.fit(X_train, y_train)
# get the best performing model fit on the whole training set
lda_best_model = results_rs_lda.best_estimator_

# predict on the held-out test set
yhat = lda_best_model.predict(X_test)
# evaluate the model
f1_score_lda = f1_score(y_test, yhat)

print("F1 score is: " + str(f1_score_lda))
print('Mean f1 score: %.3f' % results_rs_lda.best_score_)
print('Config: %s' % results_rs_lda.best_params_)

F1 score is: 0.9636363636363636
Mean f1 score: 0.961
Config: {'tol': 0.59, 'solver': 'eigen', 'shrinkage': 0.0}


In [66]:
# Start from an empty value list per LDA parameter, then let the helper fill
# them from the best-scoring randomized-search candidates
lda_params_gs = {name: [] for name in ('solver', 'shrinkage', 'tol')}

lda_params_gs = find_best_params_rs(
  results_rs_lda.cv_results_['mean_test_score'],
  results_rs_lda.cv_results_['params'],
  lda_params_gs,
)

In [67]:
# Exhaustive search over the LDA values shortlisted from the randomized search
lda_grid_search = GridSearchCV(
  estimator=lda_clf,
  param_grid=lda_params_gs,
  scoring='f1',
  cv=cv,
  refit=True,
)

# Run the search on the training split
results_gs_lda = lda_grid_search.fit(X_train, y_train)

# Best configuration found by the search
lda_best_model = results_gs_lda.best_estimator_

# Score that model on the held-out test split
yhat = lda_best_model.predict(X_test)
f1_score_lda = f1_score(y_test, yhat)

print(
  "F1 score is: " + str(f1_score_lda),
  'Mean f1 score: %.3f' % results_gs_lda.best_score_,
  'Config: %s' % results_gs_lda.best_params_,
  sep='\n',
)

F1 score is: 0.9636363636363636
Mean f1 score: 0.961
Config: {'shrinkage': 0.0, 'solver': 'eigen', 'tol': 0.59}


### Logistic Regression

In [68]:
# Randomized search over the logistic-regression hyper-parameters
lr_clf = LogisticRegression()

# NOTE(review): not every solver supports every penalty; such combinations are
# expected to fail to fit. Newer scikit-learn releases also spell the
# no-penalty option as None rather than 'none' -- confirm against the
# installed version.
lr_params_rs = {
  'penalty': ['l1', 'l2', 'elasticnet', 'none'],
  'max_iter' : range(100, 500, 50),
  'warm_start' : [True, False],
  'fit_intercept' : [True, False],
  'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
  # C is an inverse regularisation strength and must be positive, so start above 0
  'C' : arange(0.01, 1, 0.01),
}

# random_state pins the sampled candidates so the search is repeatable
lr_rand_search = RandomizedSearchCV(lr_clf, lr_params_rs, scoring='f1', cv=cv, refit=True, n_iter=300, random_state=42)

# fit the model
results_rs_lr = lr_rand_search.fit(X_train, y_train)
# get the best performing model fit on the whole training set
lr_best_model = results_rs_lr.best_estimator_

# predict on the held-out test set
yhat = lr_best_model.predict(X_test)
# evaluate the model
f1_score_lr = f1_score(y_test, yhat)

print("F1 score is: " + str(f1_score_lr))
print('Mean f1 score: %.3f' % results_rs_lr.best_score_)
print('Config: %s' % results_rs_lr.best_params_)

F1 score is: 0.9765258215962442
Mean f1 score: 0.968
Config: {'warm_start': False, 'solver': 'newton-cg', 'penalty': 'none', 'max_iter': 100, 'fit_intercept': True, 'C': 0.5700000000000001}


In [71]:
# Start from an empty value list per logistic-regression parameter, then let the
# helper fill them from the best-scoring randomized-search candidates
lr_params_gs = {
  name: []
  for name in ('warm_start', 'solver', 'penalty', 'max_iter', 'fit_intercept', 'C')
}

lr_params_gs = find_best_params_rs(
  results_rs_lr.cv_results_['mean_test_score'],
  results_rs_lr.cv_results_['params'],
  lr_params_gs,
)

In [72]:
# Exhaustive search over the logistic-regression values shortlisted from the randomized search
lr_grid_search = GridSearchCV(
  estimator=lr_clf,
  param_grid=lr_params_gs,
  scoring='f1',
  cv=cv,
  refit=True,
)

# Run the search on the training split
results_gs_lr = lr_grid_search.fit(X_train, y_train)

# Best configuration found by the search
lr_best_model = results_gs_lr.best_estimator_

# Score that model on the held-out test split
yhat = lr_best_model.predict(X_test)
f1_score_lr = f1_score(y_test, yhat)

print(
  "F1 score is: " + str(f1_score_lr),
  'Mean f1 score: %.3f' % results_gs_lr.best_score_,
  'Config: %s' % results_gs_lr.best_params_,
  sep='\n',
)

F1 score is: 0.9767441860465117
Mean f1 score: 0.966
Config: {'C': 0.5700000000000001, 'fit_intercept': False, 'max_iter': 450, 'penalty': 'none', 'solver': 'lbfgs', 'warm_start': False}


### Support Vector Machines

In [74]:
# Randomized search over the SVM hyper-parameters
svm_clf = SVC()

svm_params_rs = {
  # 'precomputed' is left out: it expects X to be a square kernel matrix, not
  # the raw feature table used here, so every such candidate would fail to fit
  'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
  # C must be strictly positive, so the range starts at the first step above 0
  'C': arange(0.5, 10, 0.5),
  # gamma = 0 collapses the rbf/poly/sigmoid kernels to a constant
  'gamma': arange(0.01, 1, 0.01),
  'shrinking': [True, False],
}

# random_state pins the sampled candidates so the search is repeatable
svm_rs_search = RandomizedSearchCV(svm_clf, svm_params_rs, scoring='f1', cv=cv, refit=True, n_iter=300, n_jobs=5, random_state=42)

# fit the model
results_rs_svm = svm_rs_search.fit(X_train, y_train)
# get the best performing model fit on the whole training set
svm_best_model = results_rs_svm.best_estimator_

# predict on the held-out test set
yhat = svm_best_model.predict(X_test)
# evaluate the model
f1_score_svm = f1_score(y_test, yhat)

print("F1 score is: " + str(f1_score_svm))
print('Mean f1 score: %.3f' % results_rs_svm.best_score_)
print('Config: %s' % results_rs_svm.best_params_)

In [None]:
# find the best parameters from the randomized search
# Only parameters that the randomized search actually sampled are listed:
# a name that is never sampled would keep an empty list, and GridSearchCV
# rejects a grid containing an empty value list.
svm_params_gs = {
  'kernel': [],
  'C': [],
  'gamma': [],
  'shrinking': [],
}

svm_params_gs = find_best_params_rs(results_rs_svm.cv_results_['mean_test_score'], results_rs_svm.cv_results_['params'], svm_params_gs)

In [None]:
# Exhaustive search over the SVM values shortlisted from the randomized search
svm_grid_search = GridSearchCV(
  estimator=svm_clf,
  param_grid=svm_params_gs,
  scoring='f1',
  cv=cv,
  refit=True,
  n_jobs=5,
)

# Run the search on the training split
results_gs_svm = svm_grid_search.fit(X_train, y_train)

# Best configuration found by the search
svm_best_model = results_gs_svm.best_estimator_

# Score that model on the held-out test split
yhat = svm_best_model.predict(X_test)
f1_score_svm = f1_score(y_test, yhat)

print(
  "F1 score is: " + str(f1_score_svm),
  'Mean f1 score: %.3f' % results_gs_svm.best_score_,
  'Config: %s' % results_gs_svm.best_params_,
  sep='\n',
)

### Extremely Randomized Trees

In [None]:
# Randomized search over the Extra-Trees hyper-parameters
etc_clf = ExtraTreesClassifier()

etc_params_rs = {
  "n_estimators": range(10, 2000, 100),
  "criterion": ['gini', 'entropy', 'log_loss'],
  "max_depth": range(1, 20),
  "max_features": range(1, 21),
  "min_samples_split": range(2, 15)
}

# random_state pins the sampled candidates so the search is repeatable
etc_rand_search = RandomizedSearchCV(etc_clf, etc_params_rs, scoring='f1', cv=cv, refit=True, n_iter=300, n_jobs=5, random_state=42)

# fit the model
results_rs_etc = etc_rand_search.fit(X_train, y_train)
# get the best performing model fit on the whole training set
etc_best_model = results_rs_etc.best_estimator_

# predict on the held-out test set
yhat = etc_best_model.predict(X_test)
# evaluate the model
f1_score_etc = f1_score(y_test, yhat)

print("F1 score is: " + str(f1_score_etc))
print('Mean f1 score: %.3f' % results_rs_etc.best_score_)
print('Config: %s' % results_rs_etc.best_params_)

In [None]:
# Start from an empty value list per Extra-Trees parameter, then let the helper
# fill them from the best-scoring randomized-search candidates
etc_params_gs = {
  name: []
  for name in ('n_estimators', 'criterion', 'max_depth', 'max_features', 'min_samples_split')
}

etc_params_gs = find_best_params_rs(
  results_rs_etc.cv_results_['mean_test_score'],
  results_rs_etc.cv_results_['params'],
  etc_params_gs,
)

In [None]:
# Exhaustive search over the Extra-Trees values shortlisted from the randomized search
etc_grid_search = GridSearchCV(
  estimator=etc_clf,
  param_grid=etc_params_gs,
  scoring='f1',
  cv=cv,
  refit=True,
  n_jobs=5,
)

# Run the search on the training split
results_gs_etc = etc_grid_search.fit(X_train, y_train)

# Best configuration found by the search
etc_best_model = results_gs_etc.best_estimator_

# Score that model on the held-out test split
yhat = etc_best_model.predict(X_test)
f1_score_etc = f1_score(y_test, yhat)

print(
  "F1 score is: " + str(f1_score_etc),
  'Mean f1 score: %.3f' % results_gs_etc.best_score_,
  'Config: %s' % results_gs_etc.best_params_,
  sep='\n',
)

### XGBoost

In [None]:
# Randomized search over the XGBoost hyper-parameters
xgb_clf = XGBClassifier(objective="binary:logistic")

xgb_params_rs = {
  'learning_rate' : arange(0.01, 0.5, 0.05),
  'max_depth' : range(1, 20),
  'min_child_weight' : range(1, 10, 2),
  'gamma': arange(0.1, 0.5, 0.05),
  # built-in range() only accepts integers; numpy's arange handles the
  # fractional step (0.1 up to, but excluding, 1)
  'colsample_bytree' : arange(0.1, 1, 0.1),
  "n_estimators": range(10, 2000, 100),
}

# random_state pins the sampled candidates so the search is repeatable
xgb_rand_search = RandomizedSearchCV(xgb_clf, xgb_params_rs, scoring='f1', cv=cv, refit=True, n_iter=300, n_jobs=5, random_state=42)

# fit the model
results_rs_xgb = xgb_rand_search.fit(X_train, y_train)
# get the best performing model fit on the whole training set
xgb_best_model = results_rs_xgb.best_estimator_

# predict on the held-out test set
yhat = xgb_best_model.predict(X_test)
# evaluate the model
f1_score_xgb = f1_score(y_test, yhat)

print("F1 score is: " + str(f1_score_xgb))
print('Mean f1 score: %.3f' % results_rs_xgb.best_score_)
print('Config: %s' % results_rs_xgb.best_params_)

In [None]:
# Start from an empty value list per XGBoost parameter, then let the helper fill
# them from the best-scoring randomized-search candidates
xgb_params_gs = {
  name: []
  for name in ('learning_rate', 'max_depth', 'min_child_weight', 'gamma', 'colsample_bytree', 'n_estimators')
}

xgb_params_gs = find_best_params_rs(
  results_rs_xgb.cv_results_['mean_test_score'],
  results_rs_xgb.cv_results_['params'],
  xgb_params_gs,
)

In [None]:
# Exhaustive search over the XGBoost values shortlisted from the randomized search
xgb_grid_search = GridSearchCV(
  estimator=xgb_clf,
  param_grid=xgb_params_gs,
  scoring='f1',
  cv=cv,
  refit=True,
  n_jobs=5,
)

# Run the search on the training split
results_gs_xgb = xgb_grid_search.fit(X_train, y_train)

# Best configuration found by the search
xgb_best_model = results_gs_xgb.best_estimator_

# Score that model on the held-out test split
yhat = xgb_best_model.predict(X_test)
f1_score_xgb = f1_score(y_test, yhat)

print(
  "F1 score is: " + str(f1_score_xgb),
  'Mean f1 score: %.3f' % results_gs_xgb.best_score_,
  'Config: %s' % results_gs_xgb.best_params_,
  sep='\n',
)

### Final best models

In [None]:
# LDA and logistic regression: configurations hard-coded in this notebook,
# matching the grid-search "Config" lines printed above
final_lda_clf = LinearDiscriminantAnalysis(shrinkage=0.0, solver='eigen', tol=0.59).fit(X_train, y_train)
y_pred_lda = final_lda_clf.predict(X_test)

final_lr_clf = LogisticRegression(penalty='none', max_iter=450, warm_start=False, fit_intercept=False, solver='lbfgs', C=0.57).fit(X_train, y_train)
y_pred_lr = final_lr_clf.predict(X_test)

# SVM, Extra-Trees and XGBoost: no tuned values are recorded in this notebook,
# so the configuration is taken from the corresponding grid-search result
# rather than from hand-entered placeholders.
# probability=True is required for the predict_proba calls made on the SVM later.
final_svm_clf = SVC(probability=True, **results_gs_svm.best_params_).fit(X_train, y_train)
y_pred_svm = final_svm_clf.predict(X_test)

final_etc_clf = ExtraTreesClassifier(**results_gs_etc.best_params_).fit(X_train, y_train)
y_pred_etc = final_etc_clf.predict(X_test)

final_xgb_clf = XGBClassifier(objective="binary:logistic", **results_gs_xgb.best_params_).fit(X_train, y_train)
y_pred_xgb = final_xgb_clf.predict(X_test)


**Because our dataset is imbalanced, it is better to use balanced accuracy (`balanced_accuracy_score`) as the metric instead of plain accuracy (`accuracy_score`).**

In [None]:
# Results of Final Models
#
# Each entry is (display name, hard predictions, continuous score for ROC AUC).
# The SVM score comes from decision_function, which does not depend on the SVC
# having been constructed with probability=True.
final_model_reports = [
  ("Linear Discriminant Analysis", y_pred_lda, final_lda_clf.predict_proba(X_test)[:, 1]),
  ("Logistic Regression", y_pred_lr, final_lr_clf.predict_proba(X_test)[:, 1]),
  ("Support Vector Machine", y_pred_svm, final_svm_clf.decision_function(X_test)),
  ("Extremely Randomized Trees", y_pred_etc, final_etc_clf.predict_proba(X_test)[:, 1]),
  ("XGBoost Classifier", y_pred_xgb, final_xgb_clf.predict_proba(X_test)[:, 1]),
]

for model_name, model_pred, model_score in final_model_reports:
  print("-----%s------" % model_name)
  print("Balanced Accuracy score: %.3f" % balanced_accuracy_score(y_test, model_pred))
  print("F1 score is: %.3f" % f1_score(y_test, model_pred))
  print("ROC AUC is: %.3f" % roc_auc_score(y_test, model_score))
  print()

#### Voting Classifier

In [None]:
# Soft voting averages the members' predicted class probabilities. It is used
# here because the ROC AUC line below needs voting_clf.predict_proba, which
# VotingClassifier does not provide with voting='hard'.
# Soft voting calls predict_proba on every member, so the SVM member is rebuilt
# from the final SVM's parameters with probability=True forced on.
voting_svm_clf = SVC(**{**final_svm_clf.get_params(), 'probability': True})

voting_clf = VotingClassifier(estimators=[('lda', final_lda_clf), ('lr', final_lr_clf), ('svm', voting_svm_clf),
                                            ('etc', final_etc_clf), ('xgb', final_xgb_clf)], voting='soft')
voting_clf = voting_clf.fit(X_train, y_train)

y_pred_voting_clf = voting_clf.predict(X_test)

print("Balanced Accuracy score: %.3f" % balanced_accuracy_score(y_test, y_pred_voting_clf))
print("F1 score is: %.3f" % f1_score(y_test, y_pred_voting_clf))
print("ROC AUC is: %.3f" % roc_auc_score(y_test, voting_clf.predict_proba(X_test)[:, 1]))

#### Stacking Classifier

In [None]:
# Stack the five final models; a logistic regression combines their outputs
stacking_clf = StackingClassifier(
  classifiers=[
    final_lda_clf,
    final_lr_clf,
    final_svm_clf,
    final_etc_clf,
    final_xgb_clf,
  ],
  meta_classifier=LogisticRegression(),
)

# Train the stack on the training split
stacking_clf.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_stacking_clf = stacking_clf.predict(X_test)

print(
  "Balanced Accuracy score: %.3f" % balanced_accuracy_score(y_test, y_pred_stacking_clf),
  "F1 score is: %.3f" % f1_score(y_test, y_pred_stacking_clf),
  "ROC AUC is: %.3f" % roc_auc_score(y_test, stacking_clf.predict_proba(X_test)[:, 1]),
  sep='\n',
)

#### Single Classifiers