# Listeria Model Development and Optimization

# Introduction
## Goal: 
Test 7 models (KNN, SVM, GBM, RandomForest, Neural Network, Logistic Regression, Decision Tree) with different data inputs (standardized, original, and with/without cluster and logarithmic columns). Therefore, a grid search method was designed to use hyperparameters to optimize the models, and test on different data inputs.

## How to Run
Here is a ipynb file that can be run in an IDE or Google Colab for model testing and results. Please input the file name you would like to test, and the predictor column.

In this notebook, the exact same models that are provided on the competition, official git repo are used. The code is adapted to perform a grid search (search through a list of hyper parameters) to find the best model results.

### To analyze the results, open Analyze_Results.ipynb


In [None]:
# Importing necessary packages to run the models
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)
from keras.models import Sequential
from keras.layers import Dense, Input
from pathlib import Path
import json
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# defining global variables
TEST_SIZE = .22  # for validation
RANDOM_STATE = 42  # for repeatability
# developing results table to plot

DATA_PATH = None
# !!!!!! IF YOU ARE RUNNING IN VISUAL STUDIO, UNCOMMENT THE CODE BELOW
# ROOT = Path.cwd()
# if ROOT.name == "preparation":
#     ROOT = ROOT.parent
# DATA = ROOT / "data"

# getting in file path
try:
    file_info = Path(DATA_PATH / "ListeriaSoil_clean_log.csv")
except:
    try:
        file_info = Path("ListeriaSoil_clean_log.csv")
    except Exception as e:
        raise e

Y_COL = "binary_listeria_presense"

# changing strings/catagorical data to be encoded in 1-hot vectors
# (aka want to transform arbitrary strings into integer values)
ENCODE_STR = False


# Data Preparation

In [3]:
def data_prep(file_info):
    """
    ----- inputs -----
    file_info: Path object
        file wanting to process
    ----- outputs ----
    df: pandas df
        processed anonymzied data (string columns representing intervals split into min and max, then put as minimum and maximum values for those columns)
    """

    df = pd.read_csv(Path(file_info.name))

    # drop 'index' column if it exists, as it's typically an artifact and not a feature
    if 'index' in df.columns:
        df = df.drop(columns=['index'])

    # converting output column to binary
    if Y_COL == "binary_listeria_presense":
        original_listeria_col = 'Number of Listeria isolates obtained'
        df['binary_listeria_presense'] = [row_val if row_val == 0 else 1 for row_val in df[original_listeria_col]]
        # Drop the original column to prevent data leakage
        if original_listeria_col in df.columns:
            df = df.drop(columns=[original_listeria_col])

    # switching missing values and weird failures in writing to np.inf bc pandas didnt handle properly
    df = df.replace("#NAME?", -np.inf)
    df = df.fillna(-np.inf)

    # replacing inf with max number that is not max number + 100 in dict (FOR NOT JUST 99999999)
    df = df.replace(np.inf, 99999)
    # replacing -inf with min number (not -inf) - 100 in dict (FOR NOT JUST -99999999)
    df = df.replace(-np.inf, -99999)

    # removing any colums that are comletely empty
    df = df.dropna(axis=1, how="all")

    # applying one-hot encdoing to categorical variables
    if ENCODE_STR:
        df = pd.get_dummies(df)
    return df

## Splitting into Train and Test sets

In [4]:

def get_train_test(
    df, y_col=Y_COL, scaling_used=True, test_size=TEST_SIZE,
):
    """
    ----- inputs -----
    df: pandas dict
        processed data (all numerics)
    y_col: str
        string of y labels
    test_size: int
        % want test set to be of full data
    scaling_used: boolean
        whether to test scaled data and original data (True) or only original data (False)
    ----- outputs ----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
        "
    """

    # indexes for test set
    X = df.drop(columns=Y_COL)
    y = df[Y_COL]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=RANDOM_STATE)

    # data columns
    data_columns = X.columns

    if scaling_used:  # if want to run on scaled and original data
        # testing all with and without scaled data
        scaler = StandardScaler()

        X_train_scaled = scaler.fit_transform(X_train.values)
        X_test_scaled = scaler.transform(X_test.values)

        data_testing = {
            "columns": data_columns,
            "standard_scalar": {
                "X_train": X_train_scaled,
                "X_test": X_test_scaled,
                "y_train": y_train,  # using unscaled y
                "y_test": y_test,  # using unscaled y
            },
            "orig": {
                "X_train": X_train,
                "X_test": X_test,
                "y_train": y_train,
                "y_test": y_test,
            },
        }

        return data_testing

    else:  # if only want to run on original data
        data_testing = {
            "columns": data_columns,
            "orig": {
                "X_train": X_train,
                "X_test": X_test,
                "y_train": y_train,
                "y_test": y_test,
            }
        }
        return data_testing


# Model Development

In [5]:
def test_svm(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """

    # results table
    svm_results = []

    # defining hyperparameters for svm variables
    c_vals = [1, 4]
    svm_kernels = ['linear', 'rbf']

    # grid searching model results for svm on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="svm scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        # going through possible svm combos
        for c_val in c_vals:
            for svm_kernel in svm_kernels:
                # modeling portion
                model = SVC(C=c_val, kernel=svm_kernel, max_iter=20000, probability=True)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                # validation
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                conf_matrix = confusion_matrix(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                pr_auc = average_precision_score(y_test, y_pred_proba)

                # getting feature importance
                coefficients = np.nan
                feature_imp = np.nan
                feature_imp_json = np.nan
                if svm_kernel == 'linear':
                    coefficients = model.coef_.ravel()
                    feature_imp = dict(zip(feature_names, coefficients))
                    feature_imp_json = json.dumps({k: float(v) for k, v in feature_imp.items()})

                # getting permutation importance
                perm = permutation_importance(
                    model, X_test, y_test,
                    n_repeats=3,
                    random_state=RANDOM_STATE,
                    scoring="f1"  # or "accuracy"
                )

                perm_imp = dict(zip(feature_names, perm.importances_mean))
                perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                # adding hyperparameters to each of these results/outputs: saving results to dict
                svm_results.append(
                    {
                        "file name": file_info.name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                        "roc_auc": roc_auc,
                        "pr_auc": pr_auc,
                        "confusion matrix": conf_matrix,
                        "test size": TEST_SIZE,
                        "random state": RANDOM_STATE,
                        "scalar_status": scalar_type,
                        "y variable used": Y_COL,
                        "model used": "svm",
                        "logistic_reg_c": np.nan,
                        "lr_ratios": np.nan,
                        "nn_layers": np.nan,
                        "nn_neurons": np.nan,
                        "nn_batch_size": np.nan,
                        "nn_epochs": np.nan,
                        "dt_max_depth": np.nan,
                        "dt_min_samples_split": np.nan,
                        "svm_c_val": c_val,
                        "svm_kernel": svm_kernel,
                        "knn_weights": np.nan,
                        "gbm_learning_rate": np.nan,
                        "gbm_n_estimator": np.nan,
                        "rf_n_estimators": np.nan,
                        "rf_max_depth": np.nan,
                        "rf_min_samples_leaf": np.nan,
                        "coefficient_importance": feature_imp_json,
                        "permutation_importance": perm_imp_json,
                        "y_test": y_test.tolist(),
                        "y_pred_proba": y_pred_proba.tolist(),
                    }
                )

                # store y_test and y_pred_proba for PR/AUC plotting
                model_predictions_data.append({
                    "file name": file_info.name,
                    "model used": "svm",
                    "scalar_status": scalar_type,
                    "svm_c_val": c_val,
                    "svm_kernel": svm_kernel,
                    "y_test": y_test.tolist(),
                    "y_pred_proba": y_pred_proba.tolist()
                })

    return svm_results

In [6]:
def test_logistic_reg(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """

    log_reg_results = []

    # defining hyperparameters for logistic regression variables
    c_vals = [0.01, 0.1, 1, 4, 8]
    lr_ratios = [
        0,
        1,
    ]  # 0 = l2 penalty, 1 = l1 penalty

    # grid searching model results for log reg on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="logistic regression scaled vs original"):

        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        # going through possible logistic regression combos
        for c_val in c_vals:
            for lr_rat in lr_ratios:
                # modeling portion
                model = LogisticRegression(C=c_val, l1_ratio=lr_rat)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                # validation
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                conf_matrix = confusion_matrix(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                pr_auc = average_precision_score(y_test, y_pred_proba)

                # getting feature importance
                coefficients = model.coef_.ravel()
                feature_imp = dict(zip(feature_names, coefficients))
                feature_imp_json = json.dumps({k: float(v) for k, v in feature_imp.items()})

                # getting permutation importance
                perm = permutation_importance(
                    model, X_test, y_test,
                    n_repeats=10,
                    random_state=RANDOM_STATE,
                    scoring="f1"  # or "accuracy"
                )

                perm_imp = dict(zip(feature_names, perm.importances_mean))
                perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                # adding hyperparameters to each of these results/outputs: saving results to dictfile_path: str
                log_reg_results.append(
                    {
                        "file name": file_info.name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                        "roc_auc": roc_auc,
                        "pr_auc": pr_auc,
                        "confusion matrix": conf_matrix,
                        "test size": TEST_SIZE,
                        "random state": RANDOM_STATE,
                        "scalar_status": scalar_type,
                        "y variable used": Y_COL,
                        "model used": "logistic regression",
                        "logistic_reg_c": c_val,
                        "lr_ratios": lr_rat,
                        "nn_layers": np.nan,
                        "nn_neurons": np.nan,
                        "nn_batch_size": np.nan,
                        "nn_epochs": np.nan,
                        "dt_max_depth": np.nan,
                        "dt_min_samples_split": np.nan,
                        "svm_c_val": np.nan,
                        "svm_kernel": np.nan,
                        "knn_n_neighbor": np.nan,
                        "knn_weights": np.nan,
                        "gbm_learning_rate": np.nan,
                        "gbm_n_estimator": np.nan,
                        "rf_n_estimators": np.nan,
                        "rf_max_depth": np.nan,
                        "rf_min_samples_leaf": np.nan,
                        "coefficient_importance": feature_imp_json,
                        "permutation_importance": perm_imp_json,
                        "y_test": y_test.tolist(), # Added y_test to results
                        "y_pred_proba": y_pred_proba.tolist(),
                    }
                )

                # Store y_test and y_pred_proba for PR/AUC plotting
                model_predictions_data.append({
                    "file name": file_info.name,
                    "model used": "logistic regression",
                    "scalar_status": scalar_type,
                    "logistic_reg_c": c_val,
                    "lr_ratios": lr_rat,
                    "y_test": y_test.tolist(),
                    "y_pred_proba": y_pred_proba.tolist()
                })

    return log_reg_results

In [7]:
def test_knn(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """

    # results table
    knn_results = []

    # defining hyperparameters for knn variables
    knn_n_neighbors = [2, 5, 10, 15, 20]
    weights = ['uniform', 'distance']

    # grid searching model results for knn on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="knn scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        # going through possible KNN combos
        for knn_n_neighbor in knn_n_neighbors:
            for weight in weights:
                # modeling portion
                model = KNeighborsClassifier(
                    n_neighbors=knn_n_neighbor, weights=weight
                )
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                # validation
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                conf_matrix = confusion_matrix(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                pr_auc = average_precision_score(y_test, y_pred_proba)

                # getting permutation importance
                perm = permutation_importance(
                    model, X_test, y_test,
                    n_repeats=3, # reduced number to speed up model runtime
                    random_state=RANDOM_STATE,
                    scoring="f1"  # or "accuracy"
                )

                perm_imp = dict(zip(feature_names, perm.importances_mean))
                perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                # adding hyperparameters to each of these results/outputs: saving results to dict
                knn_results.append(
                    {
                        "file name": file_info.name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                        "roc_auc": roc_auc,
                        "pr_auc": pr_auc,
                        "confusion matrix": conf_matrix,
                        "test size": TEST_SIZE,
                        "random state": RANDOM_STATE,
                        "scalar_status": scalar_type,
                        "y variable used": Y_COL,
                        "model used": "knn",
                        "logistic_reg_c": np.nan,
                        "lr_ratios": np.nan,
                        "nn_layers": np.nan,
                        "nn_neurons": np.nan,
                        "nn_batch_size": np.nan,
                        "nn_epochs": np.nan,
                        "dt_max_depth": np.nan,
                        "dt_min_samples_split": np.nan,
                        "svm_c_val": np.nan,
                        "svm_kernel": np.nan,
                        "knn_n_neighbor": knn_n_neighbor,
                        "knn_weights": weight,
                        "gbm_learning_rate": np.nan,
                        "gbm_n_estimator": np.nan,
                        "rf_n_estimators": np.nan,
                        "rf_max_depth": np.nan,
                        "rf_min_samples_leaf": np.nan,
                        "coefficient_importance": np.nan,
                        "permutation_importance": perm_imp_json,
                        "y_test": y_test.tolist(), # Added y_test to results
                        "y_pred_proba": y_pred_proba.tolist(),
                    }
                )

                # Store y_test and y_pred_proba for PR/AUC plotting
                model_predictions_data.append({
                    "file name": file_info.name,
                    "model used": "knn",
                    "scalar_status": scalar_type,
                    "knn_n_neighbor": knn_n_neighbor,
                    "knn_weights": weight,
                    "y_test": y_test.tolist(),
                    "y_pred_proba": y_pred_proba.tolist()
                })

    return knn_results

In [8]:
def test_neural_net(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """
    # results table
    neur_net_results = []

    # editing hyperparameters for neural network variables
    nn_layers_list = [1, 2, 3, 4]
    nn_neurons_list = [16, 32, 64, 128, 256]
    nn_batch_size_list = [32, 64, 128, 256]
    nn_epochs_list = [5, 10, 20]

    # grid searching model results for neural net on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="neural net scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]

        # going through possible neural net combos
        for nn_layers in nn_layers_list:
            for nn_neurons in nn_neurons_list:
                for nn_batch_size in nn_batch_size_list:
                    for nn_epochs in nn_epochs_list:
                        # modeling portion
                        model = Sequential()
                        model.add(Input(shape=(X_train.shape[1],)))
                        for _ in range(nn_layers):
                            model.add(Dense(nn_neurons, activation="relu"))
                        model.add(Dense(1, activation="sigmoid"))
                        model.compile(
                            optimizer="adam",
                            loss="binary_crossentropy",
                            metrics=["accuracy"],
                        )
                        model.fit(
                            X_train,
                            y_train,
                            epochs=nn_epochs,
                            batch_size=nn_batch_size,
                            verbose=0,
                        )
                        y_pred_proba = model.predict(X_test).flatten()
                        y_pred = (y_pred_proba > 0.5).astype(int)

                        # validation
                        accuracy = accuracy_score(y_test, y_pred)
                        precision = precision_score(y_test, y_pred, zero_division=0)
                        recall = recall_score(y_test, y_pred, zero_division=0)
                        f1 = f1_score(y_test, y_pred, zero_division=0)
                        conf_matrix = confusion_matrix(y_test, y_pred)
                        roc_auc = roc_auc_score(y_test, y_pred_proba)
                        pr_auc = average_precision_score(y_test, y_pred_proba)

                        # adding hyperparameters to each of these results/outputs: saving results to dict
                        neur_net_results.append(
                            {
                                "file name": file_info.name,
                                "accuracy": accuracy,
                                "precision": precision,
                                "recall": recall,
                                "f1": f1,
                                "roc_auc": roc_auc,
                                "pr_auc": pr_auc,
                                "confusion matrix": conf_matrix,
                                "test size": TEST_SIZE,
                                "random state": RANDOM_STATE,
                                "scalar_status": scalar_type,
                                "y variable used": Y_COL,
                                "model used": "neural net",
                                "logistic_reg_c": np.nan,
                                "lr_ratios": np.nan,
                                "nn_layers": nn_layers,
                                "nn_neurons": nn_neurons,
                                "nn_batch_size": nn_batch_size,
                                "nn_epochs": nn_epochs,
                                "dt_max_depth": np.nan,
                                "dt_min_samples_split": np.nan,
                                "svm_c_val": np.nan,
                                "svm_kernel": np.nan,
                                "knn_n_neighbor": np.nan,
                                "knn_weights": np.nan,
                                "gbm_learning_rate": np.nan,
                                "gbm_n_estimator": np.nan,
                                "rf_n_estimators": np.nan,
                                "rf_max_depth": np.nan,
                                "rf_min_samples_leaf": np.nan,
                                "coefficient_importance": np.nan,
                                "permutation_importance": np.nan,
                                "y_test": y_test.tolist(), # Added y_test to results
                                "y_pred_proba": y_pred_proba.tolist(),
                            }
                        )

                        # Store y_test and y_pred_proba for PR/AUC plotting
                        model_predictions_data.append({
                            "file name": file_info.name,
                            "model used": "neural net",
                            "scalar_status": scalar_type,
                            "nn_layers": nn_layers,
                            "nn_neurons": nn_neurons,
                            "nn_batch_size": nn_batch_size,
                            "nn_epochs": nn_epochs,
                            "y_test": y_test.tolist(),
                            "y_pred_proba": y_pred_proba.tolist()
                        })

    return neur_net_results

In [9]:
def test_gbm(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """
    # results table
    gbm_results = []

    #  editing hyperparameters for gbm variables
    gbm_learning_rates = [0.01, 0.05, 0.1, 0.2]
    gbm_n_estimators = [100, 200, 400, 800]

    # grid searching model results for gbm on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="gbm scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        # going through possible gbm combos
        for gbm_learning_rate in gbm_learning_rates:
            for gbm_n_estimator in gbm_n_estimators:
                # modeling portion
                model = GradientBoostingClassifier(
                    learning_rate=gbm_learning_rate, n_estimators=gbm_n_estimator
                )
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                # validation
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                conf_matrix = confusion_matrix(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                pr_auc = average_precision_score(y_test, y_pred_proba)

                # getting feature importance
                gbm_imp = model.feature_importances_
                feature_imp = dict(zip(feature_names, gbm_imp))
                feature_imp_json = json.dumps({k: float(v) for k, v in feature_imp.items()})

                # getting permutation importance
                perm = permutation_importance(
                    model, X_test, y_test,
                    n_repeats=10,
                    random_state=RANDOM_STATE,
                    scoring="f1"  # or "accuracy"
                )

                perm_imp = dict(zip(feature_names, perm.importances_mean))
                perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                # adding hyperparameters to each of these results/outputs: saving results to dict
                gbm_results.append(
                    {
                        "file name": file_info.name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                        "roc_auc": roc_auc,
                        "pr_auc": pr_auc,
                        "confusion matrix": conf_matrix,
                        "test size": TEST_SIZE,
                        "random state": RANDOM_STATE,
                        "scalar_status": scalar_type,
                        "y variable used": Y_COL,
                        "model used": "gbm",
                        "logistic_reg_c": np.nan,
                        "lr_ratios": np.nan,
                        "nn_layers": np.nan,
                        "nn_neurons": np.nan,
                        "nn_batch_size": np.nan,
                        "nn_epochs": np.nan,
                        "dt_max_depth": np.nan,
                        "dt_min_samples_split": np.nan,
                        "svm_c_val": np.nan,
                        "svm_kernel": np.nan,
                        "knn_n_neighbor": np.nan,
                        "knn_weights": np.nan,
                        "gbm_learning_rate": gbm_learning_rate,
                        "gbm_n_estimator": gbm_n_estimator,
                        "rf_n_estimators": np.nan,
                        "rf_max_depth": np.nan,
                        "rf_min_samples_leaf": np.nan,
                        "coefficient_importance": feature_imp_json,
                        "permutation_importance": perm_imp_json,
                        "y_test": y_test.tolist(), # Added y_test to results
                        "y_pred_proba": y_pred_proba.tolist(),
                    }
                )

                # Store y_test and y_pred_proba for PR/AUC plotting
                model_predictions_data.append({
                    "file name": file_info.name,
                    "model used": "gbm",
                    "scalar_status": scalar_type,
                    "gbm_learning_rate": gbm_learning_rate,
                    "gbm_n_estimator": gbm_n_estimator,
                    "y_test": y_test.tolist(),
                    "y_pred_proba": y_pred_proba.tolist()
                })
    return gbm_results

In [10]:
def test_decision_tree(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame]
        dictionary contianing
            * string of scaling type (standard scalar, orig)
                * string of what dataset grabbing (X_train, X_test, y_train, y_test)
                    * corresponding data in a pandas dataframe
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs ----

    """
    # results table
    dec_tree_results = []

    #  editing hyperparameters for decision tree variables
    dt_max_depths = [50, 100, 200, 400, None]
    dt_min_samples_splits = [2, 10, 20, 50]

    # grid searching model results for decision tree on all types of data with all types of inputs
    for scalar_type in tqdm(data_testing.keys(), desc="decision tree scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        # going through possible decision tree combos
        for dt_min_samples_split in dt_min_samples_splits:
            for dt_max_depth in dt_max_depths:
                # modeling portion
                model = DecisionTreeClassifier(
                    max_depth=dt_max_depth, min_samples_split=dt_min_samples_split
                )
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                # validation
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, zero_division=0)
                recall = recall_score(y_test, y_pred, zero_division=0)
                f1 = f1_score(y_test, y_pred, zero_division=0)
                conf_matrix = confusion_matrix(y_test, y_pred)
                roc_auc = roc_auc_score(y_test, y_pred_proba)
                pr_auc = average_precision_score(y_test, y_pred_proba)

                # getting feature importance
                tree_imp = model.feature_importances_
                feature_imp = dict(zip(feature_names, tree_imp))
                feature_imp_json = json.dumps({k: float(v) for k, v in feature_imp.items()})

                # getting permutation importance
                perm = permutation_importance(
                    model, X_test, y_test,
                    n_repeats=10,
                    random_state=RANDOM_STATE,
                    scoring="f1"  # or "accuracy"
                )

                perm_imp = dict(zip(feature_names, perm.importances_mean))
                perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                # adding hyperparameters to each of these results/outputs
                dec_tree_results.append(
                    {
                        "file name": file_info.name,
                        "accuracy": accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                        "roc_auc": roc_auc,
                        "pr_auc": pr_auc,
                        "confusion matrix": conf_matrix,
                        "test size": TEST_SIZE,
                        "random state": RANDOM_STATE,
                        "scalar_status": scalar_type,
                        "y variable used": Y_COL,
                        "model used": "decision_tree",
                        "logistic_reg_c": np.nan,
                        "lr_ratios": np.nan,
                        "nn_layers": np.nan,
                        "nn_neurons": np.nan,
                        "nn_batch_size": np.nan,
                        "nn_epochs": np.nan,
                        "dt_max_depth": dt_max_depth,
                        "dt_min_samples_split": dt_min_samples_split,
                        "svm_c_val": np.nan,
                        "svm_kernel": np.nan,
                        "knn_n_neighbor": np.nan,
                        "knn_weights": np.nan,
                        "gbm_learning_rate": np.nan,
                        "gbm_n_estimator": np.nan,
                        "rf_n_estimators": np.nan,
                        "rf_max_depth": np.nan,
                        "rf_min_samples_leaf": np.nan,
                        "coefficient_importance": feature_imp_json,
                        "permutation_importance": perm_imp_json,
                        "y_test": y_test.tolist(), # Added y_test to results
                        "y_pred_proba": y_pred_proba.tolist(),
                    }
                )

                # Store y_test and y_pred_proba for PR/AUC plotting
                model_predictions_data.append({
                    "file name": file_info.name,
                    "model used": "decision_tree",
                    "scalar_status": scalar_type,
                    "dt_max_depth": dt_max_depth,
                    "dt_min_samples_split": dt_min_samples_split,
                    "y_test": y_test.tolist(),
                    "y_pred_proba": y_pred_proba.tolist()
                })

    return dec_tree_results

In [11]:
def test_random_forest(data_testing, file_info, model_predictions_data):
    """
    ----- inputs -----
    data_testing: dict[str=scalingType][str=y/X train/test label][pd.DataFrame],
        dictionary contianing,
            * string of scaling type (standard scalar, orig),
                * string of what dataset grabbing (X_train, X_test, y_train, y_test),
                    * corresponding data in a pandas dataframe,
    model_predictions_data: list
        A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.
    ----- outputs -----
    """
    # results table
    rf_results = []

    # editing hyperparameters for random forest variables
    rf_n_estimators = [100, 300, 500]
    rf_max_depths = [None, 10, 50]
    rf_min_samples_leaf = [1, 2, 4]

    for scalar_type in tqdm(data_testing.keys(), desc="random forest scaled vs original"):
        if scalar_type == 'columns':
            continue
        X_test = data_testing[scalar_type]["X_test"]
        X_train = data_testing[scalar_type]["X_train"]
        y_train = data_testing[scalar_type]["y_train"]
        y_test = data_testing[scalar_type]["y_test"]
        feature_names = data_testing["columns"].tolist()

        for rf_n_estimator in rf_n_estimators:
            for rf_max_depth in rf_max_depths:
                for rf_min_leaf in rf_min_samples_leaf:
                        model = RandomForestClassifier(
                                n_estimators=rf_n_estimator,
                                max_depth=rf_max_depth,
                                min_samples_leaf=rf_min_leaf,
                                random_state=RANDOM_STATE,
                                n_jobs=-1
                        )
                        model.fit(X_train, y_train)
                        y_pred = model.predict(X_test)
                        y_pred_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class

                        # validation
                        accuracy = accuracy_score(y_test, y_pred)
                        precision = precision_score(y_test, y_pred, zero_division=0)
                        recall = recall_score(y_test, y_pred, zero_division=0)
                        f1 = f1_score(y_test, y_pred, zero_division=0)
                        conf_matrix = confusion_matrix(y_test, y_pred)
                        roc_auc = roc_auc_score(y_test, y_pred_proba)
                        pr_auc = average_precision_score(y_test, y_pred_proba)

                        # getting feature importance
                        rf_imp = model.feature_importances_
                        feature_imp = dict(zip(feature_names, rf_imp))
                        feature_imp_json = json.dumps({k: float(v) for k, v in feature_imp.items()})

                        # getting permutation importance
                        perm = permutation_importance(
                          model, X_test, y_test,
                          n_repeats=3,
                          random_state=RANDOM_STATE,
                          scoring="f1"  # or "accuracy"
                        )

                        perm_imp = dict(zip(feature_names, perm.importances_mean))
                        perm_imp_json = json.dumps({k: float(v) for k, v in perm_imp.items()})

                        # addinghyperparameters to each of these results/outputs
                        rf_results.append(
                            {
                                "file name": file_info.name,
                                "accuracy": accuracy,
                                "precision": precision,
                                "recall": recall,
                                "f1": f1,
                                "roc_auc": roc_auc,
                                "pr_auc": pr_auc,
                                "confusion matrix": conf_matrix,
                                "test size": TEST_SIZE,
                                "random state": RANDOM_STATE,
                                "scalar_status": scalar_type,
                                "y variable used": Y_COL,
                                "model used": "random_forest",
                                "logistic_reg_c": np.nan,
                                "lr_ratios": np.nan,
                                "nn_layers": np.nan,
                                "nn_neurons": np.nan,
                                "nn_batch_size": np.nan,
                                "nn_epochs": np.nan,
                                "dt_max_depth": np.nan,
                                "dt_min_samples_split": np.nan,
                                "svm_c_val": np.nan,
                                "svm_kernel": np.nan,
                                "knn_n_neighbor": np.nan,
                                "knn_weights": np.nan,
                                "gbm_learning_rate": np.nan,
                                "gbm_n_estimator": np.nan,
                                "rf_n_estimators": rf_n_estimator,
                                "rf_max_depth": rf_max_depth,
                                "rf_min_samples_leaf": rf_min_leaf,
                                "coefficient_importance": feature_imp_json,
                                "permutation_importance": perm_imp_json,
                                "y_test": y_test.tolist(), # Added y_test to results
                                "y_pred_proba": y_pred_proba.tolist(),
                            }
                        )

                        # Store y_test and y_pred_proba for PR/AUC plotting
                        model_predictions_data.append({
                            "file name": file_info.name,
                            "model used": "random_forest",
                            "scalar_status": scalar_type,
                            "rf_n_estimators": rf_n_estimator,
                            "rf_max_depth": rf_max_depth,
                            "rf_min_samples_leaf": rf_min_leaf,
                            "y_test": y_test.tolist(),
                            "y_pred_proba": y_pred_proba.tolist()
                        })

    return rf_results

## Preparing models to run

In [12]:
def run_models_for_file(file_info, model_predictions_data) -> list:
    """
    Goal: return model results for file

    Paramaters:
        file: str
            the name of the file want to model
        file_info: Path object
            parsed information about the file
        model_predictions_data: list
            A list to store dictionaries of y_true and y_pred_proba for PR/AUC curves.

    Outputs:
        all_rows: list
            list of the dictionary model results
    """

    df = data_prep(file_info)
    if df.empty:
        return []

    data_testing = get_train_test(df, y_col=Y_COL, scaling_used=True)

    model_fns = [
          test_logistic_reg,
          test_neural_net,
          test_knn,
          test_decision_tree,
          test_random_forest,
          test_svm,
          test_gbm,
    ]

    # running each model in the model funcs list to return the results
    all_rows = []
    for fn in model_fns:
        rows = fn(data_testing, file_info, model_predictions_data) # running each function
        if rows:
            all_rows.extend(rows)

    return all_rows

## Running all Models

In [13]:
# initializing a list to store y_test and y_pred_proba for PR/AUC plotting
model_predictions_data = []

# running each model to get results
rows_results = run_models_for_file(file_info, model_predictions_data) # calling function to run models
dataframe_rows_results = pd.DataFrame(rows_results) # converting into a dataframe, so that we can save it
dataframe_rows_results.to_csv(f'results_for_{file_info.name}') # saving it into the files section as a CSV
print("\n\nCOMPLETED: k\n\n")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step




[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 42ms/step



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11

neural net scaled vs original:  67%|██████▋   | 2/3 [09:46<04:53, 293.11s/it]

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12

neural net scaled vs original: 100%|██████████| 3/3 [19:57<00:00, 399.08s/it]
knn scaled vs original: 100%|██████████| 3/3 [00:13<00:00,  4.48s/it]
decision tree scaled vs original: 100%|██████████| 3/3 [00:59<00:00, 19.91s/it]
random forest scaled vs original: 100%|██████████| 3/3 [09:37<00:00, 192.51s/it]
svm scaled vs original: 100%|██████████| 3/3 [00:06<00:00,  2.25s/it]
gbm scaled vs original: 100%|██████████| 3/3 [02:55<00:00, 58.47s/it]




COMPLETED: k


