# Impotring

In [363]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [199]:
from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn import tree  
from sklearn import naive_bayes
from sklearn import ensemble

In [200]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import dummy, metrics
from sklearn import model_selection as ms

# Dataset loading

In [201]:
def getDataset(name):
    dataset = getattr(datasets, 'load_'+name)()

    dataset_frame = pd.DataFrame(dataset.data)
    dataset_frame['target'] = dataset.target
    target = dataset_frame['target'] 
    data = dataset_frame.drop(columns = ['target'])

    print ("Class distibution")
    print (target.value_counts())

    print ("Class distibution")
    dataset_frame.sample(10)
    return dataset, data, target

# Classifiers

In [379]:
def getSVM(train_data, test_data, train_labels, test_labels, folds, hyper_params_svc):
    
    f1_scorer = metrics.make_scorer(metrics.f1_score, average='weighted')
    
    classifier_svc = svm.SVC()
    
    svm_grid = ms.GridSearchCV(classifier_svc, 
                               hyper_params_svc,  # parameters to tune via cross validation
                               refit=True,        # fit using all available data at the end, on the best found param combination
                               scoring=f1_scorer,  
                               cv=ms.StratifiedKFold(n_splits=folds)
    )

    svm_grid_best_model = svm_grid.fit(train_data, train_labels)

    print("Best hyper-parameters:")
    print(svm_grid_best_model.best_params_)

    predictions_svm = svm_grid_best_model.predict(test_data)

    svm_grid_best_model_f1 = metrics.f1_score(test_labels, predictions_svm, average='weighted')
    print ("SVM: ")
    print ("f1 = {:.3f}".format(svm_grid_best_model_f1))

    print (metrics.classification_report(test_labels, predictions_svm))
    #metrics.plot_confusion_matrix(svm_grid_best_model, test_data, test_labels)
    


def getDecisionTreeClassifier(model, train_data, test_data, train_labels, test_labels):
    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)

    f1_score = metrics.f1_score(test_labels, predictions, average='weighted');
    print ("DecisionTreeClassifier: ")
    print ("f1 = {:.3f}".format(f1_score))
    
def getLogisticRegression(model, train_data, test_data, train_labels, test_labels):
    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)
    
    f1_score = metrics.f1_score(test_labels, predictions, average='weighted');
    print ("LogisticRegression: ")
    print ("f1 = {:.3f}".format(f1_score))
    
def getNaiveBayes(model, train_data, test_data, train_labels, test_labels):

    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)

    f1_score = metrics.f1_score(test_labels, predictions, average='weighted');
    print ("NaiveBayes: ")
    print ("f1 = {:.3f}".format(f1_score))
    
def getRandomForestClassifier(model, train_data, test_data, train_labels, test_labels):
    
    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)

    f1_score = metrics.f1_score(test_labels, predictions, average='weighted');
    print ("RandomForestClassifier: ")
    print ("f1 = {:.3f}".format(f1_score))
    
def getAdaBoostClassifier(model, train_data, test_data, train_labels, test_labels):
    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)

    f1_score = metrics.f1_score(test_labels, predictions, average='weighted', labels=np.unique(predictions));
    print ("AdaBoostClassifier: ")
    print ("f1 = {:.3f}".format(f1_score))

def getVotingClassifier(train_data, test_data, train_labels, test_labels):
    model.fit (train_data, train_labels)
    predictions = model.predict(test_data)
    f1_score = metrics.f1_score(test_labels, predictions, average='weighted');
    print ("VotingClassifier: ")
    print ("f1 = {:.3f}".format(f1_score))

# Baseline

In [202]:
def getBaseline(train_data, train_labels):
    baseline = dummy.DummyClassifier(strategy='most_frequent')
    baseline.fit(train_data, train_labels)
    base_predictions = baseline.predict(data)

    accuracy = metrics.accuracy_score(train_labels, base_predictions)
    recall = metrics.recall_score(train_labels, base_predictions, average='weighted')
    precision = metrics.precision_score(train_labels, base_predictions, average='weighted')
    f1 = metrics.f1_score(train_labels, base_predictions, average='weighted')

    print ("Accuracy = {:.3f}".format(accuracy))
    print ("recall = {:.3f}".format(recall))
    print ("precision = {:.3f}".format(precision))
    print ("f1 = {:.3f}".format(f1))

    #print(metrics.classification_report(target, base_predictions))
    #metrics.plot_confusion_matrix(baseline, data, target)

# Scoring 

In [203]:
acc_scorer    = metrics.make_scorer(metrics.accuracy_score)
recall_scorer = metrics.make_scorer(metrics.recall_score, average='weighted')
prec_scorer   = metrics.make_scorer(metrics.precision_score, average='weighted')
f1_scorer     = metrics.make_scorer(metrics.f1_score, average='weighted')
scoring = {'accuracy': acc_scorer, 
           'recall': recall_scorer, 
           'precision' : prec_scorer,
           'f1': f1_scorer}

In [267]:
def cv_scores(model, train_data, train_labels, scoring, folds):
    model_scores = ms.cross_validate (model,
                                      train_data, 
                                      train_labels,
                                      scoring=scoring,
                                      cv=folds,
                                      return_train_score=True)
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print('\n Evaluation results for {} folds:'.format(folds))
    for (k,v) in model_scores .items():
        print(('{}: {}').format(k,v))

# Get cross-validation predictions

In [268]:
def cross(model, test_data, test_labels):
    predictions = ms.cross_val_predict(model, test_data, test_labels)

    cv_accuracy = metrics.accuracy_score(test_labels, predictions)
    cv_recall = metrics.recall_score(test_labels, predictions, average='weighted')
    cv_precision = metrics.precision_score(test_labels, predictions, average='weighted')
    cv_f1 = metrics.f1_score(test_labels, predictions, average='weighted')
    print('\n Cross-validation predictions:')
    print ("accuracy = {:.3f}".format(cv_accuracy))
    print ("recall = {:.3f}".format(cv_recall))
    print ("precision = {:.3f}".format(cv_precision))
    print ("f1 = {:.3f}".format(cv_f1))

    print(metrics.classification_report(test_labels, predictions))
    metrics.confusion_matrix(test_labels, predictions)

# Digits dataset

### Get dataset and split dataset into train and test subsets

In [313]:
digits, data, target = getDataset('digits')
folds = 8

# разбиваем выборки
train_data, test_data, train_labels, test_labels = ms.train_test_split(digits.data, digits.target, test_size=0.3)

Class distibution
3    183
5    182
1    182
6    181
4    181
9    180
7    179
0    178
2    177
8    174
Name: target, dtype: int64
Class distibution


### Baseline

In [314]:
getBaseline(data, target)

Accuracy = 0.102
recall = 0.102
precision = 0.010
f1 = 0.019


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Finding the best F-score classifier

### DecisionTreeClassifier 

In [315]:
model = tree.DecisionTreeClassifier(random_state=1, criterion='entropy');
getDecisionTreeClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

DecisionTreeClassifier: 
f1 = 0.866

 Evaluation results for 8 folds:
fit_time: [ 0.118  0.106  0.083  0.120  0.157  0.146  0.101  0.087]
score_time: [ 0.024  0.015  0.013  0.027  0.045  0.042  0.012  0.012]
test_accuracy: [ 0.883  0.864  0.839  0.836  0.872  0.850  0.888  0.855]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.883  0.864  0.839  0.836  0.872  0.850  0.888  0.855]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.894  0.868  0.853  0.847  0.871  0.858  0.897  0.855]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.884  0.864  0.839  0.836  0.870  0.852  0.888  0.854]
train_f1: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.781
recall = 0.781
precision = 0.786
f1 = 0.782
              precision    recall  f1-score   support

           0       0.93      0.89      0.91        46
           1    

### LogisticRegression

In [374]:
model = linear_model.LogisticRegression(random_state = 1, max_iter = 1000)
getLogisticRegression(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)


LogisticRegression: 
f1 = 0.965

 Evaluation results for 8 folds:
fit_time: [ 0.547  0.496  0.480  0.497  0.507  0.639  0.543  0.620]
score_time: [ 0.011  0.015  0.010  0.013  0.011  0.021  0.010  0.018]
test_accuracy: [ 0.963  0.969  0.969  0.943  0.962  0.882  0.941  0.980]
train_accuracy: [ 0.994  0.996  0.996  0.995  0.995  0.996  0.997  0.994]
test_recall: [ 0.963  0.969  0.969  0.943  0.962  0.882  0.941  0.980]
train_recall: [ 0.994  0.996  0.996  0.995  0.995  0.996  0.997  0.994]
test_precision: [ 0.966  0.970  0.971  0.946  0.963  0.900  0.943  0.981]
train_precision: [ 0.994  0.996  0.996  0.996  0.995  0.996  0.997  0.994]
test_f1: [ 0.963  0.969  0.969  0.943  0.961  0.886  0.941  0.980]
train_f1: [ 0.994  0.996  0.996  0.995  0.995  0.996  0.997  0.994]

 Cross-validation predictions:
accuracy = 0.930
recall = 0.930
precision = 0.934
f1 = 0.931
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        46
           1       0

### RandomForestClassifier

In [352]:
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy', max_leaf_nodes=200) 
getRandomForestClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

RandomForestClassifier: 
f1 = 0.979

 Evaluation results for 8 folds:
fit_time: [ 2.110  2.229  2.204  2.389  2.377  1.891  2.187  2.145]
score_time: [ 0.393  0.246  0.293  0.307  0.483  0.234  0.306  0.383]
test_accuracy: [ 0.981  0.994  0.988  0.969  0.962  0.948  0.961  0.993]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.981  0.994  0.988  0.969  0.962  0.948  0.961  0.993]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.983  0.994  0.988  0.970  0.962  0.951  0.962  0.994]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.981  0.994  0.988  0.969  0.961  0.948  0.960  0.993]
train_f1: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.948
recall = 0.948
precision = 0.949
f1 = 0.948
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        46
           1    

### NaiveBayes

In [360]:
model = naive_bayes.MultinomialNB()
getNaiveBayes(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

NaiveBayes: 
f1 = 0.911

 Evaluation results for 8 folds:
fit_time: [ 0.006  0.006  0.004  0.006  0.005  0.005  0.007  0.007]
score_time: [ 0.015  0.017  0.015  0.022  0.025  0.013  0.017  0.009]
test_accuracy: [ 0.926  0.914  0.907  0.874  0.910  0.843  0.921  0.908]
train_accuracy: [ 0.908  0.906  0.910  0.917  0.910  0.915  0.910  0.910]
test_recall: [ 0.926  0.914  0.907  0.874  0.910  0.843  0.921  0.908]
train_recall: [ 0.908  0.906  0.910  0.917  0.910  0.915  0.910  0.910]
test_precision: [ 0.934  0.924  0.913  0.882  0.914  0.865  0.930  0.911]
train_precision: [ 0.914  0.911  0.915  0.922  0.916  0.919  0.916  0.916]
test_f1: [ 0.928  0.915  0.908  0.875  0.910  0.843  0.922  0.907]
train_f1: [ 0.909  0.907  0.911  0.918  0.911  0.916  0.911  0.911]

 Cross-validation predictions:
accuracy = 0.883
recall = 0.883
precision = 0.888
f1 = 0.884
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        46
           1       0.81     

### AdaBoostClassifier

In [378]:
model = ensemble.AdaBoostClassifier(random_state=1, base_estimator=ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy'))
getAdaBoostClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

AdaBoostClassifier: 
f1 = 0.974

 Evaluation results for 8 folds:
fit_time: [ 1.957  2.238  1.881  2.094  2.311  1.859  1.967  1.968]
score_time: [ 0.236  0.254  0.241  0.288  0.389  0.223  0.233  0.226]
test_accuracy: [ 0.988  0.994  0.988  0.969  0.962  0.961  0.987  0.993]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.988  0.994  0.988  0.969  0.962  0.961  0.987  0.993]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.989  0.994  0.988  0.972  0.963  0.962  0.988  0.994]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.988  0.994  0.988  0.968  0.962  0.961  0.987  0.993]
train_f1: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.946
recall = 0.946
precision = 0.946
f1 = 0.946
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        46
           1       0

### VotingClassifier

In [376]:
clf1 = ensemble.RandomForestClassifier(random_state=1, criterion='entropy', max_leaf_nodes=200) 
clf2 = linear_model.LogisticRegression(max_iter=1000);
clf3 = ensemble.AdaBoostClassifier()

model = ensemble.VotingClassifier(estimators=[('1', clf1), ('2', clf2), ('3', clf3)])

getVotingClassifier(train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

VotingClassifier: 
f1 = 0.942

 Evaluation results for 8 folds:
fit_time: [ 1.703  1.551  1.506  1.381  1.712  1.628  1.305  1.211]
score_time: [ 0.346  0.322  0.309  0.286  0.333  0.423  0.199  0.202]
test_accuracy: [ 0.963  0.981  0.963  0.925  0.949  0.902  0.954  0.941]
train_accuracy: [ 0.996  0.998  0.996  0.995  0.998  0.998  0.998  0.996]
test_recall: [ 0.963  0.981  0.963  0.925  0.949  0.902  0.954  0.941]
train_recall: [ 0.996  0.998  0.996  0.995  0.998  0.998  0.998  0.996]
test_precision: [ 0.964  0.982  0.970  0.945  0.950  0.918  0.957  0.943]
train_precision: [ 0.996  0.998  0.996  0.995  0.998  0.998  0.998  0.996]
test_f1: [ 0.962  0.981  0.964  0.927  0.949  0.906  0.954  0.941]
train_f1: [ 0.996  0.998  0.996  0.995  0.998  0.998  0.998  0.996]

 Cross-validation predictions:
accuracy = 0.900
recall = 0.900
precision = 0.904
f1 = 0.900
              precision    recall  f1-score   support

           0       0.88      0.98      0.93        46
           1       0.7

### SVM

In [380]:
hyper_params_svc = {'kernel': ['linear','poly','rbf'],
                        'gamma': [1e-3, 1e-4, 'scale'],
                        'C': [1, 10, 100, 1000]
                       }

getSVM(train_data, test_data, train_labels, test_labels, folds, hyper_params_svc)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)



Best hyper-parameters:
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM: 
f1 = 0.991
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        46
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        52
           3       1.00      0.96      0.98        51
           4       1.00      1.00      1.00        56
           5       0.95      0.98      0.96        56
           6       1.00      0.98      0.99        59
           7       1.00      1.00      1.00        62
           8       1.00      1.00      1.00        51
           9       0.97      0.98      0.97        57

    accuracy                           0.99       540
   macro avg       0.99      0.99      0.99       540
weighted avg       0.99      0.99      0.99       540


 Evaluation results for 8 folds:
fit_time: [ 1.969  2.686  2.924  2.282  2.202  2.003  2.273  2.005]
score_time: [ 0.233  0.337  0.307  0.306  0.238  0.281

# Breast_canсer dataset

### Get dataset and split dataset into train and test subsets

In [416]:
breast_cancer, data, target = getDataset('breast_cancer')
folds = 9
# разбиваем выборки
train_data, test_data, train_labels, test_labels = ms.train_test_split(breast_cancer.data, breast_cancer.target, test_size=0.3)

Class distibution
1    357
0    212
Name: target, dtype: int64
Class distibution


### Baseline

In [417]:
getBaseline(data, target)

Accuracy = 0.627
recall = 0.627
precision = 0.394
f1 = 0.484


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Finding the best F-score classifier

### DecisionTreeClassifier 

In [418]:
model = tree.DecisionTreeClassifier(random_state=1, criterion='entropy')
getDecisionTreeClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

DecisionTreeClassifier: 
f1 = 0.936

 Evaluation results for 9 folds:
fit_time: [ 0.039  0.029  0.036  0.041  0.050  0.040  0.023  0.025  0.050]
score_time: [ 0.011  0.014  0.017  0.013  0.040  0.011  0.010  0.008  0.022]
test_accuracy: [ 0.889  0.889  0.933  1.000  0.909  0.886  0.909  0.930  0.930]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.889  0.889  0.933  1.000  0.909  0.886  0.909  0.930  0.930]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.891  0.888  0.935  1.000  0.915  0.898  0.920  0.932  0.930]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.889  0.888  0.934  1.000  0.910  0.888  0.906  0.931  0.930]
train_f1: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.930
recall = 0.930
precision = 0.932
f1 = 0.930
              precision    recall  f1-score   support



### LogisticRegression

In [420]:
model = linear_model.LogisticRegression(random_state = 1,max_iter = 2000)
getLogisticRegression(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)


LogisticRegression: 
f1 = 0.942

 Evaluation results for 9 folds:
fit_time: [ 0.017  0.026  0.028  0.033  0.023  0.028  0.016  0.017  0.022]
score_time: [ 0.013  0.012  0.020  0.026  0.015  0.028  0.010  0.008  0.009]
test_accuracy: [ 0.911  0.933  1.000  0.978  0.864  0.977  0.955  0.977  1.000]
train_accuracy: [ 0.972  0.972  0.958  0.960  0.966  0.958  0.960  0.955  0.958]
test_recall: [ 0.911  0.933  1.000  0.978  0.864  0.977  0.955  0.977  1.000]
train_recall: [ 0.972  0.972  0.958  0.960  0.966  0.958  0.960  0.955  0.958]
test_precision: [ 0.916  0.933  1.000  0.979  0.864  0.978  0.958  0.978  1.000]
train_precision: [ 0.972  0.972  0.957  0.960  0.966  0.958  0.960  0.955  0.958]
test_f1: [ 0.912  0.933  1.000  0.978  0.864  0.977  0.954  0.977  1.000]
train_f1: [ 0.972  0.972  0.957  0.960  0.966  0.958  0.960  0.955  0.958]

 Cross-validation predictions:
accuracy = 0.924
recall = 0.924
precision = 0.924
f1 = 0.924
              precision    recall  f1-score   support

    

### RandomForestClassifier

In [421]:
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy', max_leaf_nodes=100) 
getRandomForestClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

RandomForestClassifier: 
f1 = 0.982

 Evaluation results for 9 folds:
fit_time: [ 0.882  0.760  0.854  0.949  0.816  0.921  1.006  0.895  0.971]
score_time: [ 0.170  0.163  0.196  0.185  0.162  0.214  0.229  0.237  0.203]
test_accuracy: [ 0.956  0.933  1.000  0.978  0.932  0.909  0.955  1.000  0.977]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.956  0.933  1.000  0.978  0.932  0.909  0.955  1.000  0.977]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.956  0.933  1.000  0.979  0.932  0.927  0.958  1.000  0.978]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.956  0.933  1.000  0.978  0.931  0.911  0.954  1.000  0.977]
train_f1: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.965
recall = 0.965
precision = 0.965
f1 = 0.965
              precision    recall  f1-score   support



### NaiveBayes

In [422]:
model = naive_bayes.GaussianNB()
getNaiveBayes(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

NaiveBayes: 
f1 = 0.936

 Evaluation results for 9 folds:
fit_time: [ 0.007  0.008  0.008  0.008  0.005  0.003  0.003  0.003  0.003]
score_time: [ 0.017  0.038  0.048  0.024  0.022  0.012  0.009  0.010  0.009]
test_accuracy: [ 0.867  0.911  0.978  0.956  0.864  0.977  0.932  0.977  0.977]
train_accuracy: [ 0.943  0.941  0.935  0.943  0.952  0.935  0.944  0.938  0.938]
test_recall: [ 0.867  0.911  0.978  0.956  0.864  0.977  0.932  0.977  0.977]
train_recall: [ 0.943  0.941  0.935  0.943  0.952  0.935  0.944  0.938  0.938]
test_precision: [ 0.874  0.916  0.979  0.959  0.864  0.979  0.938  0.978  0.978]
train_precision: [ 0.943  0.941  0.935  0.944  0.953  0.935  0.944  0.939  0.938]
test_f1: [ 0.862  0.912  0.978  0.955  0.864  0.977  0.930  0.977  0.977]
train_f1: [ 0.943  0.940  0.934  0.943  0.952  0.935  0.943  0.937  0.938]

 Cross-validation predictions:
accuracy = 0.936
recall = 0.936
precision = 0.936
f1 = 0.936
              precision    recall  f1-score   support

           0

### AdaBoostClassifier

In [423]:
model = ensemble.AdaBoostClassifier(random_state=1, base_estimator=linear_model.LogisticRegression(random_state = 1))
getAdaBoostClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

AdaBoostClassifier: 
f1 = 0.942

 Evaluation results for 9 folds:
fit_time: [ 0.742  0.617  0.622  0.637  0.718  0.843  0.573  0.451  0.564]
score_time: [ 0.154  0.145  0.143  0.195  0.183  0.145  0.085  0.095  0.090]
test_accuracy: [ 0.911  0.933  1.000  0.978  0.886  0.977  0.955  0.977  1.000]
train_accuracy: [ 0.975  0.972  0.960  0.963  0.966  0.955  0.963  0.961  0.958]
test_recall: [ 0.911  0.933  1.000  0.978  0.886  0.977  0.955  0.977  1.000]
train_recall: [ 0.975  0.972  0.960  0.963  0.966  0.955  0.963  0.961  0.958]
test_precision: [ 0.916  0.933  1.000  0.979  0.889  0.978  0.958  0.978  1.000]
train_precision: [ 0.974  0.972  0.960  0.963  0.966  0.955  0.963  0.961  0.958]
test_f1: [ 0.912  0.933  1.000  0.978  0.887  0.977  0.954  0.977  1.000]
train_f1: [ 0.974  0.972  0.960  0.963  0.966  0.955  0.963  0.961  0.958]

 Cross-validation predictions:
accuracy = 0.947
recall = 0.947
precision = 0.947
f1 = 0.947
              precision    recall  f1-score   support

    

### VotingClassifier

In [427]:
clf1 = ensemble.RandomForestClassifier(random_state=1, criterion='entropy', max_leaf_nodes=200) 
clf2 = linear_model.LogisticRegression(max_iter=1000);
clf3 = tree.DecisionTreeClassifier(random_state=1, criterion='entropy')

model = ensemble.VotingClassifier(estimators=[('1', clf1), ('2', clf2), ('3', clf3)])

getVotingClassifier(train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

VotingClassifier: 
f1 = 0.942

 Evaluation results for 9 folds:
fit_time: [ 0.119  0.156  0.125  0.140  0.114  0.108  0.115  0.134  0.112]
score_time: [ 0.031  0.040  0.047  0.051  0.041  0.033  0.037  0.055  0.032]
test_accuracy: [ 0.911  0.933  1.000  1.000  0.909  0.977  0.932  1.000  0.977]
train_accuracy: [ 0.997  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_recall: [ 0.911  0.933  1.000  1.000  0.909  0.977  0.932  1.000  0.977]
train_recall: [ 0.997  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_precision: [ 0.911  0.933  1.000  1.000  0.915  0.979  0.938  1.000  0.978]
train_precision: [ 0.997  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]
test_f1: [ 0.911  0.933  1.000  1.000  0.910  0.977  0.930  1.000  0.977]
train_f1: [ 0.997  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000]

 Cross-validation predictions:
accuracy = 0.959
recall = 0.959
precision = 0.959
f1 = 0.959
              precision    recall  f1-score   support

      

### SVM

In [456]:
hyper_params_svc = {'kernel': ['linear','poly'],
                        'gamma': ['scale'],
                        'C': [1,100]
                       }

getSVM(train_data, test_data, train_labels, test_labels, folds, hyper_params_svc)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best hyper-parameters:
{'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
SVM: 
f1 = 0.907
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       0.91      0.87      0.89        23
           2       0.89      0.89      0.89        18

    accuracy                           0.91        54
   macro avg       0.91      0.92      0.91        54
weighted avg       0.91      0.91      0.91        54


 Evaluation results for 14 folds:
fit_time: [ 0.092  0.058  0.079  0.049  0.127  0.054  0.059  0.059  0.055  0.104
  0.059  0.069  0.054  0.061]
score_time: [ 0.043  0.044  0.051  0.035  0.071  0.031  0.049  0.041  0.038  0.060
  0.036  0.051  0.034  0.030]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  0.875  1.000  0.875  1.000]
train_accuracy: [ 0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991
  1.000  0.991  1.000  0.991]
test_recall: [ 1.000  1.000  1.000  1.

# Wine dataset

### Get dataset and split dataset into train and test subsets

In [440]:
wine, data, target = getDataset('wine')
folds = 14
# разбиваем выборки
train_data, test_data, train_labels, test_labels = ms.train_test_split(wine.data, wine.target, test_size=0.3)

Class distibution
1    71
0    59
2    48
Name: target, dtype: int64
Class distibution


### Baseline

In [441]:
getBaseline(data, target)

Accuracy = 0.399
recall = 0.399
precision = 0.159
f1 = 0.227


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Finding the best F-score classifier

### DecisionTreeClassifier 

In [442]:
model = tree.DecisionTreeClassifier(random_state=1, criterion='entropy');
getDecisionTreeClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

DecisionTreeClassifier: 
f1 = 0.908

 Evaluation results for 14 folds:
fit_time: [ 0.005  0.004  0.005  0.003  0.003  0.006  0.005  0.010  0.004  0.007
  0.003  0.004  0.004  0.003]
score_time: [ 0.010  0.014  0.010  0.008  0.013  0.009  0.011  0.012  0.009  0.013
  0.009  0.008  0.009  0.009]
test_accuracy: [ 0.909  1.000  1.000  0.900  0.889  1.000  0.875  1.000  0.875  1.000
  0.750  0.875  0.875  1.000]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_recall: [ 0.909  1.000  1.000  0.900  0.889  1.000  0.875  1.000  0.875  1.000
  0.750  0.875  0.875  1.000]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_precision: [ 0.927  1.000  1.000  0.920  0.926  1.000  0.906  1.000  0.906  1.000
  0.781  0.917  0.906  1.000]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_f1:

### LogisticRegression

In [443]:
model = linear_model.LogisticRegression(random_state = 1, max_iter = 1000)
getLogisticRegression(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)


LogisticRegression: 
f1 = 0.926

 Evaluation results for 14 folds:
fit_time: [ 0.015  0.012  0.015  0.007  0.006  0.012  0.008  0.013  0.006  0.007
  0.007  0.007  0.007  0.008]
score_time: [ 0.019  0.011  0.017  0.012  0.011  0.013  0.009  0.014  0.009  0.007
  0.008  0.007  0.009  0.008]
test_accuracy: [ 1.000  1.000  0.800  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  1.000  1.000  0.875  1.000]
train_accuracy: [ 0.973  0.982  0.982  0.974  0.983  0.974  0.974  0.974  0.983  0.974
  0.974  0.974  0.991  0.983]
test_recall: [ 1.000  1.000  0.800  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  1.000  1.000  0.875  1.000]
train_recall: [ 0.973  0.982  0.982  0.974  0.983  0.974  0.974  0.974  0.983  0.974
  0.974  0.974  0.991  0.983]
test_precision: [ 1.000  1.000  0.800  1.000  0.911  1.000  1.000  1.000  0.906  1.000
  1.000  1.000  0.906  1.000]
train_precision: [ 0.974  0.983  0.982  0.974  0.983  0.974  0.974  0.974  0.983  0.974
  0.974  0.974  0.992  0.983]
test_f1: [ 1

### RandomForestClassifier

In [448]:
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=200, criterion='entropy')
getRandomForestClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

RandomForestClassifier: 
f1 = 0.982

 Evaluation results for 14 folds:
fit_time: [ 1.249  0.979  0.783  1.120  0.715  1.038  0.861  0.884  0.818  0.795
  1.233  1.160  1.706  0.911]
score_time: [ 0.342  0.316  0.425  0.278  0.336  0.379  0.353  0.320  0.260  0.356
  0.547  0.325  0.408  0.233]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  0.875  1.000]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_recall: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  0.875  1.000]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_precision: [ 1.000  1.000  1.000  1.000  0.926  1.000  1.000  1.000  1.000  1.000
  0.906  1.000  0.906  1.000]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_f1:

### NaiveBayes

In [450]:
model = naive_bayes.GaussianNB()
getNaiveBayes(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

NaiveBayes: 
f1 = 0.963

 Evaluation results for 14 folds:
fit_time: [ 0.004  0.005  0.002  0.005  0.005  0.003  0.003  0.004  0.005  0.002
  0.002  0.002  0.005  0.002]
score_time: [ 0.015  0.030  0.014  0.019  0.020  0.012  0.029  0.016  0.020  0.013
  0.015  0.013  0.011  0.012]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  0.750  1.000]
train_accuracy: [ 0.991  0.973  0.974  0.991  0.983  0.974  0.983  0.983  0.991  0.991
  0.991  0.974  1.000  0.983]
test_recall: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  0.750  1.000]
train_recall: [ 0.991  0.973  0.974  0.991  0.983  0.974  0.983  0.983  0.991  0.991
  0.991  0.974  1.000  0.983]
test_precision: [ 1.000  1.000  1.000  1.000  0.926  1.000  1.000  1.000  1.000  1.000
  0.906  1.000  0.850  1.000]
train_precision: [ 0.991  0.974  0.974  0.991  0.983  0.974  0.983  0.983  0.992  0.992
  0.992  0.974  1.000  0.983]
test_f1: [ 1.000  1.

### AdaBoostClassifier

In [452]:
model = ensemble.AdaBoostClassifier(random_state=1, base_estimator=ensemble.RandomForestClassifier(random_state=1, n_estimators=150, criterion='entropy'))
getAdaBoostClassifier(model, train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

AdaBoostClassifier: 
f1 = 0.982

 Evaluation results for 14 folds:
fit_time: [ 0.672  0.558  0.613  0.590  0.548  1.024  0.979  0.589  0.725  0.690
  1.022  1.010  0.990  0.691]
score_time: [ 0.169  0.172  0.169  0.171  0.168  0.177  0.211  0.178  0.224  0.215
  0.236  0.347  0.170  0.170]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  1.000  1.000]
train_accuracy: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_recall: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  1.000  1.000
  0.875  1.000  1.000  1.000]
train_recall: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_precision: [ 1.000  1.000  1.000  1.000  0.926  1.000  1.000  1.000  1.000  1.000
  0.906  1.000  1.000  1.000]
train_precision: [ 1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
  1.000  1.000  1.000  1.000]
test_f1: [ 1

### VotingClassifier

In [454]:
clf1 = ensemble.RandomForestClassifier(random_state=1) 
clf2 = linear_model.LogisticRegression(max_iter=1000);
clf3 = naive_bayes.GaussianNB()

model = ensemble.VotingClassifier(estimators=[('1', clf1), ('2', clf2), ('3', clf3)])

getVotingClassifier(train_data, test_data, train_labels, test_labels)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

VotingClassifier: 
f1 = 0.963

 Evaluation results for 14 folds:
fit_time: [ 0.069  0.060  0.044  0.106  0.048  0.052  0.048  0.061  0.076  0.044
  0.045  0.046  0.048  0.045]
score_time: [ 0.057  0.034  0.037  0.059  0.030  0.038  0.046  0.032  0.030  0.031
  0.029  0.031  0.031  0.033]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  0.875  1.000  0.875  1.000]
train_accuracy: [ 0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991
  1.000  0.991  1.000  0.991]
test_recall: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  0.875  1.000  0.875  1.000]
train_recall: [ 0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991
  1.000  0.991  1.000  0.991]
test_precision: [ 1.000  1.000  1.000  1.000  0.926  1.000  1.000  1.000  0.906  1.000
  0.906  1.000  0.906  1.000]
train_precision: [ 0.991  0.991  0.991  0.991  0.991  0.991  0.992  0.992  0.992  0.992
  1.000  0.992  1.000  0.992]
test_f1: [ 1.0

### SVM

In [455]:
hyper_params_svc = {'kernel': ['linear','poly','rbf'],
                        'gamma': [1e-3, 1e-4, 'scale'],
                        'C': [1, 10, 100, 1000]
                       }

getSVM(train_data, test_data, train_labels, test_labels, folds, hyper_params_svc)
cv_scores(model, train_data, train_labels, scoring, folds)
cross(model, test_data, test_labels)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best hyper-parameters:
{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
SVM: 
f1 = 0.907
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       0.91      0.87      0.89        23
           2       0.89      0.89      0.89        18

    accuracy                           0.91        54
   macro avg       0.91      0.92      0.91        54
weighted avg       0.91      0.91      0.91        54


 Evaluation results for 14 folds:
fit_time: [ 0.088  0.148  0.058  0.066  0.050  0.052  0.048  0.055  0.051  0.050
  0.047  0.047  0.044  0.046]
score_time: [ 0.048  0.055  0.036  0.030  0.031  0.033  0.032  0.030  0.029  0.031
  0.031  0.029  0.031  0.027]
test_accuracy: [ 1.000  1.000  1.000  1.000  0.889  1.000  1.000  1.000  0.875  1.000
  0.875  1.000  0.875  1.000]
train_accuracy: [ 0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991  0.991
  1.000  0.991  1.000  0.991]
test_recall: [ 1.000  1.000  1.000  1.000

# Finally

digits: RandomForestClassifier
breast_cancer: RandomForestClassifier, SVM
wine: LogisticRegression, SVM