## Find the best binary classifier for a given dataset
### Imports

In [1]:
# Libraries
import pandas as pd
import numpy as np
import sklearn.datasets as data
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from MachineLearning.SupervisedLearning.LogisticRegression.Classification.functions import *
from MachineLearning.SupervisedLearning.NaiveBayes.Classification.functions import *
from MachineLearning.SupervisedLearning.NearestNeighbors.Classification.functions import *
from MachineLearning.SupervisedLearning.SVM.Classification.functions import *
from MachineLearning.SupervisedLearning.DecisionTree.Classification.functions import *

# Silence convergence warnings
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.simplefilter("ignore", category=ConvergenceWarning)



### Load dataset

In [2]:
X, y = data.load_breast_cancer(return_X_y=True)

# Create train/test splits (train_test_split() automatically uses a stratified split based on class labels)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20, random_state=42)

### Specify metrics
Specify which metrics to judge the algorithm performance on. Options:
- **Accuracy:**
- **Precision:**
- **Recall:**
- **F1:**

In [3]:
scoring = "accuracy"  # TODO: Add possibility of selecting multiple scoring functions

### Set hyperparameter ranges

In [4]:
logistic_regression_parameters = {
    "C_range" : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

naive_bayes_parameters = {
    "var_smoothing" : [0.000000001, 0.00000001]
}

knn_parameters = {
    "k_range": [i for i in range(1, 10)]
}

SVM_parameters = {
    "kernels" : ["linear", "poly", "rbf", "sigmoid"],
    "C_range": [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

decision_tree_parameters = {
    "criterion" : ["gini", "entropy", "log_loss"],
}


### Run algorithms /w 10-fold cross-validation

In [5]:
# Run logistic regression
logistic_regression_results, logistic_regression_best_estimator, logistic_regression_best_score \
    = logistic_regression_gridsearch(X_train, y_train, logistic_regression_parameters, scoring=scoring)

# Run Naive Bayes
naive_bayes_results, naive_bayes_best_estimator, naive_bayes_best_score = naive_bayes_gridsearch(
    X_train, y_train, naive_bayes_parameters, scoring=scoring)

# Run k-NN
knn_results, knn_best_estimator, knn_best_score = knn_gridsearch(
    X_train, y_train, knn_parameters, scoring=scoring)

# # Run SVM
# SVM_results, SVM_best_estimator, SVM_best_score = SVM_gridsearch(
#     X_train, y_train, SVM_parameters, scoring=scoring)

# Run decision tree
decision_tree_results, decision_tree_best_estimator, decision_tree_best_score = decision_tree_gridsearch(
    X_train, y_train, decision_tree_parameters, scoring=scoring)



### Assemble results

In [6]:
all_results = {
    "logistic_regression": pd.DataFrame(logistic_regression_results),
    "naive_bayes": pd.DataFrame(naive_bayes_results),
    "nearest_neighbors": pd.DataFrame(knn_results),
    # "SVM": pd.DataFrame(SVM_results),
    "decision_tree": pd.DataFrame(decision_tree_results)
}

best_results = [
    [logistic_regression_best_estimator, logistic_regression_best_score],
    [naive_bayes_best_estimator, naive_bayes_best_score],
    [knn_best_estimator,knn_best_score],
    # [SVM_best_estimator,SVM_best_score],
    [decision_tree_best_estimator,decision_tree_best_score],
]



In [7]:
split_columns = [f"split{i}_train_score" for i in range(10)]
split_columns.extend([f"split{i}_test_score" for i in range(10)])
all_results["decision_tree"].drop(split_columns, axis=1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,0.0051,0.000538,0.0005,0.0005,gini,{'criterion': 'gini'},0.929758,0.04359,1,1.0,0.0
1,0.007,0.002144,0.0005,0.0005,entropy,{'criterion': 'entropy'},0.920821,0.042059,2,1.0,0.0
2,0.0066,0.000664,0.0005,0.0005,log_loss,{'criterion': 'log_loss'},0.920821,0.042059,2,1.0,0.0


### Obtain best performance per algorithm

In [8]:
df_best_results = pd.DataFrame(best_results, columns = ["Estimator", "Score"])
# df.columns = ["Estimator", "Score"]

In [9]:
df_best_results


Unnamed: 0,Estimator,Score
0,LogisticRegression(C=10),0.951643
1,GaussianNB(),0.93401
2,KNeighborsClassifier(n_neighbors=3),0.927391
3,DecisionTreeClassifier(random_state=42),0.929758


### Compare performance