In [None]:
# Standard library
from os import path

# Data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Gradient-boosting libraries
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# scikit-learn estimators
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# scikit-learn evaluation utilities
from sklearn.inspection import permutation_importance

# accuracy_score is imported from sklearn.metrics, its public location;
# sklearn.base does not export it, so importing it from there fails.
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    mean_squared_log_error,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

In [None]:
# Name of the CSV inside ../cleaned_data that the next cell loads.
# Empty as committed — presumably filled in before running; TODO confirm.
data_file = ""

# Flag derived from the file name. `True in data_file` raised TypeError because
# `in` on a str needs a str on its left-hand side, so a substring test is used.
# NOTE(review): the "one_hot" marker is an assumption about how the cleaned
# files are named — confirm against the actual file names in ../cleaned_data.
one_hot_encoded = "one_hot" in data_file

In [None]:
# Read the selected CSV from the cleaned_data directory one level up.
df = pd.read_csv(path.join("..", "cleaned_data", data_file))

# Split the frame into the feature matrix and the "label" target column.
X = df.drop("label", axis=1)
y = df["label"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
def rmsle(y_true, y_pred):
    """Return the square root of the MSLE of the expm1-transformed inputs.

    Both arguments are passed through ``np.expm1`` and then handed to
    ``mean_squared_log_error``; the square root of that value is returned.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)
    squared_log_error = mean_squared_log_error(actual, predicted)
    return np.sqrt(squared_log_error)

In [None]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Estimators whose training is randomised are seeded with the same value as the
# train/test split (42) so that repeated runs produce the same scores.
models = [
    ("XGBoost", XGBClassifier(enable_categorical=True)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Ridge Classifier", RidgeClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    # SVC is a classifier, so it is labelled as one rather than as a regressor.
    ("Support Vector Classifier", SVC()),
    ("LightGBM", LGBMClassifier()),
    ("KNN", KNeighborsClassifier(5, weights="uniform")),
    ("Naive Bayes", GaussianNB()),
    ("Neural Network", MLPClassifier(random_state=42)),
    ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis()),
]

In [None]:
# Fit every candidate and keep the one with the lowest RMSLE on the test split.
min_error = float("inf")
# Reset on every run: without this, a fresh kernel would hit a NameError and a
# re-run could report a stale winner whenever no score beats the initial bound.
best_model = None

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmsle_score = rmsle(y_test, y_pred)

    # Strict comparison: on a tie the earliest entry of `models` is kept.
    if rmsle_score < min_error:
        min_error = rmsle_score
        best_model = (name, model)

    print(f"{name}: RMSLE = {rmsle_score}")

if best_model is None:
    print("No model produced a finite RMSLE; nothing to select.")
else:
    print(best_model[0])

## Plot Model Performance

In [None]:
def _positive_class_scores(model, X, fallback):
    """Return continuous positive-class scores for X, or `fallback` if the model has none."""
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the second entry of the model's classes.
        return np.asarray(model.predict_proba(X))[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return fallback


def get_results_all_models(models):
    """Fit each model on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The label-based metrics are called with their default (binary) settings.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name and estimator pairs; every estimator is (re)fitted here.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``accuracy``, ``roc_auc``,
        ``precision``, ``recall``, ``f1`` and ``average_precision``.

    Notes
    -----
    ROC AUC and average precision are ranking metrics, so they are computed
    from ``predict_proba`` / ``decision_function`` output when the model offers
    one, and only fall back to the hard predictions otherwise.
    """
    results = {}
    print(y_test.value_counts())
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_score = _positive_class_scores(model, X_test, y_pred)

        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_score)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        average_precision = average_precision_score(y_test, y_score)
        results[name] = {
            "accuracy": accuracy,
            "roc_auc": roc_auc,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "average_precision": average_precision,
        }

        # Sanity check: how many test rows were predicted as class 0. Counting
        # directly (instead of value_counts()[0]) avoids a KeyError when a
        # model never predicts 0.
        predicted_zeros = int(np.sum(np.asarray(y_pred) == 0))
        print(
            f"{name}:\n\tSanity = {predicted_zeros};"
            f"\n\tAccuracy = {accuracy};"
            f"\n\tROC AUC = {roc_auc};"
            f"\n\tPrecision = {precision};"
            f"\n\tRecall = {recall};"
            f"\n\tF1 = {f1};"
            f"\n\tAverage Precision = {average_precision}"
        )
    return results

In [None]:
def plot_model_results(results):
    """Draw a grouped bar chart with one group per model and one bar per metric.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict holding the keys ``accuracy``, ``roc_auc``,
        ``precision``, ``recall``, ``f1`` and ``average_precision``.
    """
    # (key in each result dict, legend label, horizontal shift inside the group)
    layout = [
        ("accuracy", "Accuracies", -0.3),
        ("roc_auc", "ROC AUC", -0.2),
        ("precision", "Precision", -0.1),
        ("recall", "Recall", 0.0),
        ("f1", "F1", 0.1),
        ("average_precision", "Avg Precision", 0.2),
    ]
    bar_width = 0.1

    model_names = list(results)
    # Collect every series before drawing so a missing key fails before any bar is plotted.
    series = [
        (legend_label, shift, [results[model_name][key] for model_name in model_names])
        for key, legend_label, shift in layout
    ]

    X_axis = np.arange(len(model_names))
    for legend_label, shift, heights in series:
        plt.bar(X_axis + shift, heights, bar_width, label=legend_label)

    plt.xticks(X_axis, model_names, rotation=90)
    plt.xlabel("Models")
    plt.ylabel("Score")
    plt.title("Metrics of Different Models")
    plt.legend()
    plt.show()

## Feature importance of best performing algorithms

In [None]:
def plot_feature_importance(model, X, title="Decision Tree Feature Importances (MDI)"):
    """Plot the model's ``feature_importances_`` as a sorted horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``; accessing it on a model without
        that attribute raises ``AttributeError``.
    X : pandas.DataFrame
        Only ``X.columns`` is used, to label the bars.
    title : str, optional
        Chart title. The default keeps the previous hard-coded text; pass
        another value when plotting a model that is not a decision tree.
    """
    # Ascending sort places the most important feature at the top of a barh plot.
    mdi_importances = pd.Series(
        model.feature_importances_, index=X.columns
    ).sort_values(ascending=True)
    ax = mdi_importances.plot.barh()
    ax.set_title(title)
    ax.figure.tight_layout()

In [None]:
def plot_permutation_importance(model, X, y):
    """Box-plot the permutation importance of each column of ``X``.

    Calls ``permutation_importance`` with 10 repeats, a fixed seed of 42 and
    two parallel jobs, then draws one horizontal box per feature, ordered by
    mean importance.

    Parameters
    ----------
    model : fitted estimator
        Passed to ``permutation_importance`` unchanged.
    X : pandas.DataFrame
        Feature matrix; its columns label the boxes.
    y : array-like
        Targets matching the rows of ``X``.
    """
    perm = permutation_importance(
        model, X, y, n_repeats=10, random_state=42, n_jobs=2
    )

    # Feature positions ordered by ascending mean importance.
    order = perm.importances_mean.argsort()
    ordered_columns = X.columns[order]
    # Transposed so that each feature becomes one column of the frame.
    per_repeat_scores = perm.importances[order].T
    importances = pd.DataFrame(per_repeat_scores, columns=ordered_columns)

    ax = importances.plot.box(vert=False, whis=10)
    ax.set_title("Permutation Importances")
    # Dashed vertical line at zero importance for reference.
    ax.axvline(x=0, color="k", linestyle="--")
    ax.set_xlabel("Decrease in accuracy score")
    ax.figure.tight_layout()