In [1]:
# -- scikit-learn classifiers, uncomment the one of interest----

# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier

import os
import sys
import joblib
import numpy as np
import pandas as pd
from joblib import dump
import subprocess as sp
import dask.array as da
from pprint import pprint
import matplotlib.pyplot as plt
from odc.io.cgroups import get_cpu_quota
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, ShuffleSplit, KFold
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, f1_score

In [2]:
# --- Analysis configuration -------------------------------------------------

# Text file holding the training samples; it is read further down with
# np.loadtxt and its first line is parsed for the column names.
training_data: str = "results/test_training_data.txt"

# Estimator *class* (not an instance) that is instantiated for every search
# and for the final model. It must match the import left uncommented above.
Classifier = RandomForestClassifier

# Scoring string passed to GridSearchCV when ranking parameter combinations.
metric: str = "balanced_accuracy"

In [3]:
# --- Cross-validation configuration -----------------------------------------

# Number of KFold splits used inside each outer fold for the grid search.
inner_cv_splits: int = 5

# Number of outer KFold splits for the nested CV; also reused as the number
# of ShuffleSplit iterations in the final hyperparameter search.
outer_cv_splits: int = 5

# Fraction of samples held out in each ShuffleSplit iteration.
test_size: float = 0.2

In [4]:
# Number of parallel workers, passed as n_jobs to GridSearchCV and to the
# final classifier.
ncpus: int = 12

In [5]:
# Load the training samples. ndmin=2 keeps the array two-dimensional even if
# the file holds a single sample, so the column slicing below always works.
model_input = np.loadtxt(training_data, ndmin=2)

# Read the column names from the first line of the same file.
with open(training_data, 'r') as file:
    header = file.readline()

# The first whitespace-separated token of the header is dropped
# (presumably the '#' comment marker -- verify against the file writer).
column_names = header.split()[1:]

# Every column after the first is a feature. The indices are built
# positionally rather than by looking each name up with list.index(), which
# was quadratic and returned the first match for duplicated column names.
model_col_indices = list(range(1, len(column_names)))

# scikit-learn nomenclature: X is the feature matrix, y the label vector
# (labels are taken from column 0).
X = model_input[:, model_col_indices]
y = model_input[:, 0]

In [6]:
# Create the parameter grid based on the results of random search.
# Each key is a constructor argument of the classifier; GridSearchCV tries
# every combination (2 * 3 * 3 * 2 = 36 candidates).
#
# NOTE: 'sqrt' replaces the former 'auto' option for max_features. For
# classifiers 'auto' meant sqrt(n_features); it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it now raises an error.
param_grid = {
    'class_weight': ['balanced', None],
    'max_features': ['sqrt', 'log2', None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy']
}

In [7]:
# Nested cross-validation: the outer folds give a performance estimate on
# data never seen by the hyperparameter search, while the inner folds (run on
# each outer training partition) choose the hyperparameters.
outer_cv = KFold(n_splits=outer_cv_splits, shuffle=True,
                        random_state=0)

# Splitter for the inner search. With an integer random_state it yields the
# same shuffling on every call, so it is created once outside the loop.
inner_cv = KFold(n_splits=inner_cv_splits,
                 shuffle=True,
                 random_state=0)

# lists to store results of CV testing (one entry per outer fold)
acc = []
f1 = []
roc_auc = []

for i, (train_index, test_index) in enumerate(outer_cv.split(X, y), start=1):
    # Report progress against the configured number of outer folds
    print(f"Working on {i}/{outer_cv_splits} outer CV split", end='\r')
    model = Classifier(random_state=1)

    # Index training and testing data for this outer fold
    X_tr, X_tt = X[train_index, :], X[test_index, :]
    y_tr, y_tt = y[train_index], y[test_index]

    # Grid search restricted to the outer-training partition; refit=True
    # retrains the best parameter combination on all of X_tr
    clf = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=metric,
        n_jobs=ncpus,
        refit=True,
        cv=inner_cv.split(X_tr, y_tr),
    )

    clf.fit(X_tr, y_tr)

    # Predict on the held-out outer-test partition using the best model
    best_model = clf.best_estimator_
    pred = best_model.predict(X_tt)

    # Evaluate model w/ multiple metrics
    # ROC AUC, computed from the predicted probability in column 1
    # (assumes a binary target whose positive class is the second entry of
    # best_model.classes_ -- TODO confirm for other label encodings)
    probs = best_model.predict_proba(X_tt)
    probs = probs[:, 1]
    fpr, tpr, thresholds = roc_curve(y_tt, probs)
    auc_ = auc(fpr, tpr)
    roc_auc.append(auc_)

    # Balanced accuracy
    ac = balanced_accuracy_score(y_tt, pred)
    acc.append(ac)

    # F1 score
    f1_ = f1_score(y_tt, pred)
    f1.append(f1_)



Working on 5/5 outer CV split

In [8]:
# Report the mean and standard deviation of every metric collected over the
# outer folds. Each row is (mean label, std label, per-fold scores, number of
# decimals for the mean); the standard deviation is always rounded to 2.
print("=== Nested K-Fold Cross-Validation Scores ===")
summary_rows = (
    ("Mean balanced accuracy: ", "Std balanced accuracy: ", acc, 2),
    ("Mean F1: ", "Std F1: ", f1, 2),
    ("Mean roc_auc: ", "Std roc_auc: ", roc_auc, 3),
)
for position, (mean_label, std_label, fold_scores, mean_digits) in enumerate(summary_rows):
    if position > 0:
        # blank separator between metric groups
        print('\n')
    print(mean_label + str(round(np.mean(fold_scores), mean_digits)))
    print(std_label + str(round(np.std(fold_scores), 2)))
print('=============================================')

=== Nested K-Fold Cross-Validation Scores ===
Mean balanced accuracy: 0.64
Std balanced accuracy: 0.01


Mean F1: 0.92
Std F1: 0.0


Mean roc_auc: 0.75
Std roc_auc: 0.01


In [9]:
# Splitter for the final hyperparameter search: `outer_cv_splits` random
# train/test partitions, each holding out a `test_size` fraction of the
# samples. The fixed seed makes the partitions repeatable.
rs = ShuffleSplit(
    n_splits=outer_cv_splits,
    test_size=test_size,
    random_state=0,
)

In [10]:
# Instantiate a GridSearchCV over the whole dataset to pick the final
# hyperparameters, scoring every candidate on the ShuffleSplit partitions.
# The classifier is seeded with the same random_state used in the nested CV
# loop and for the final model, so the selected parameters are reproducible
# from run to run.
clf = GridSearchCV(Classifier(random_state=1),
                   param_grid,
                   scoring=metric,
                   verbose=1,
                   cv=rs.split(X, y),
                   n_jobs=ncpus)

clf.fit(X, y)

# Report the winning parameter combination and its mean CV score
print('\n')
print("The most accurate combination of tested parameters is: ")
pprint(clf.best_params_)
print('\n')
print(f"The {metric} score using these parameters is: ")
print(round(clf.best_score_, 2))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


The most accurate combination of tested parameters is: 
{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_features': None,
 'n_estimators': 100}


The balanced_accuracy score using these parameters is: 
0.64


In [11]:
# Build the final model from the best hyperparameters found by the grid
# search, with a fixed seed and parallel workers, and train it on all of the
# available samples.
new_model = Classifier(
    n_jobs=ncpus,
    random_state=1,
    **clf.best_params_,
)
new_model.fit(X, y)

In [12]:
# Persist the fitted model to disk; the call returns the list of file names
# that were written, which the notebook displays as the cell output.
joblib.dump(new_model, 'results/ml_model.joblib')

['results/ml_model.joblib']