## Compare different classification models, see which one beats another under which case

In [2]:
# Naive Bayes family
from sklearn.naive_bayes import (
    GaussianNB,
    MultinomialNB,
    BernoulliNB,
    CategoricalNB
)

# Discriminant Analysis
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis
)

# Neighbors
from sklearn.neighbors import (
    KNeighborsClassifier,
    NearestCentroid
)

# Linear Models
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier
)

# Support Vector Machines
from sklearn.svm import (
    SVC,
    LinearSVC
)

# Decision Trees
from sklearn.tree import (
    DecisionTreeClassifier,
    ExtraTreeClassifier
)

# Ensembles
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier
)

# Gaussian Processes
from sklearn.gaussian_process import (
    GaussianProcessClassifier
)

In [105]:
def initialise_nb_family():
    return {
        "gaussianNB": GaussianNB(),
        "multinomialNB": MultinomialNB(),
        "bernoulliNB": BernoulliNB(),
        "categoricalNB": CategoricalNB()
    }

def initialise_discriminant_analysis():
    return {
        "linearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
        "quadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    }

def initialise_svm_family():
    return {
        "svc": SVC(),
        "linearSVC": LinearSVC(),
    }

def initialise_neighbours_family():
    return {
        "knn": KNeighborsClassifier(),
        "nce": NearestCentroid(),
    }

def initialise_linear_models():
    return {
        "logisticRegression": LogisticRegression(),
        "SGDClassifier": SGDClassifier(),
        "passiveAggressiveClassifier": PassiveAggressiveClassifier(),
        "perceptron": Perceptron(),
    }

def initialise_tree_models():
    return {
        "decisionTree": DecisionTreeClassifier(),
        "extraTree": ExtraTreeClassifier(),
    }

def initialise_ensemble_models():
    return {
        'randomForestClassifier': RandomForestClassifier(),
        'extraTreeClassifier': ExtraTreesClassifier(),
        'baggingClassifier': BaggingClassifier(),
        'adaBoostClassifier': AdaBoostClassifier(),
        'gradientBoostingClassifier': GradientBoostingClassifier(),
        # 'votingClassifier': VotingClassifier(),
        # 'stackingClassifier': StackingClassifier(),
    }

def initialise_gaussian_model():
    return {
        "gaussianProcess": GaussianProcessClassifier(),
    }

In [106]:
from typing import List
def initialise_models(names=None):
    models = initialise_nb_family() | initialise_discriminant_analysis() | initialise_svm_family() | initialise_neighbours_family() | initialise_linear_models() | initialise_tree_models() | initialise_ensemble_models() | initialise_gaussian_model()
    if names is None:
        return models
    return {k: models[k] for k in names}

In [112]:
from typing import Dict
from sklearn.base import ClassifierMixin
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time
def generate_report_for_models(models: Dict[str, ClassifierMixin], x, y, labels):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
    # No need for scaling, we just have 1 feature anyway...
    columns = ['training_time', 'predict_time', 'accuracy', 'precision', 'recall', 'f1'] + [str(l)+'_precision' for l in labels] + [str(l)+'_recall' for l in labels]
    all_performance = {}
    for model_name, model in models.items():
        train_start = time.time()
        model.fit(x_train, y_train)
        train_end = time.time()
        test_start = time.time()
        y_pred = model.predict(x_test)
        test_end = time.time()
        report = classification_report(y_test, y_pred, output_dict=True)
        all_performance[model_name] = [train_end-train_start, test_end-test_start, report['accuracy'], report['macro avg']['precision'], report['macro avg']['recall'], report['macro avg']['f1-score']]
        for l in labels:
            all_performance[model_name].append(report[str(l)]['precision'])
            all_performance[model_name].append(report[str(l)]['recall'])
    return pd.DataFrame.from_dict(all_performance, columns=columns, orient='index')


In [108]:

time.time()

1758803915.854551

In [116]:
# perfect x,y
import numpy as np

x_perfect = np.random.normal(0, 1, 10000)+10
x_perfect = np.clip(x_perfect, a_min=1e-8, a_max=None)
y_perfect = np.array([x > 10 for x in x_perfect]).astype(int)
x_perfect = x_perfect.reshape(-1, 1)

In [119]:
m = initialise_models()
generate_report_for_models(m, x_perfect, y_perfect, list(set(y_perfect)))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,training_time,predict_time,accuracy,precision,recall,f1,0_precision,1_precision,0_recall,1_recall
gaussianNB,0.001473,0.000176,0.9985,0.998509,0.998495,0.9985,0.997018,1.0,1.0,0.996991
multinomialNB,0.001343,7.4e-05,0.4985,0.24925,0.5,0.332666,0.0,0.0,0.4985,1.0
bernoulliNB,0.001632,0.000385,0.4985,0.24925,0.5,0.332666,0.0,0.0,0.4985,1.0
categoricalNB,0.00163,0.00014,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
linearDiscriminantAnalysis,0.001258,8.7e-05,0.9995,0.999499,0.999501,0.9995,1.0,0.999003,0.998998,1.0
quadraticDiscriminantAnalysis,0.000645,0.000105,0.9985,0.998509,0.998495,0.9985,0.997018,1.0,1.0,0.996991
svc,0.035198,0.016818,0.9985,0.998509,0.998495,0.9985,0.997018,1.0,1.0,0.996991
linearSVC,0.073214,0.000143,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
knn,0.001502,0.028257,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
nce,0.000553,0.001375,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
