In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
import copy
import pickle

In [None]:
# Read the pre-split white-wine train and test sets (train first, then test).
train_full, test_full = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("./data/base/train-white.csv", "./data/base/test-white.csv")
)

In [None]:
# Label column of the data set and short identifier of the estimator;
# model_name is used below to build pickle file names and the grid-search
# parameter prefix.
target, model_name = "quality", "rf"

In [None]:
# Restore the prepared estimator for `model_name` from disk.
# NOTE(review): pickle executes arbitrary code on load; only open files that
# were produced by this project.
prepared_path = './model/prepared-{}.pickle'.format(model_name)
with open(prepared_path, 'rb') as pickled_model:
    model = pickle.load(pickled_model)

In [None]:
# Split each frame into a feature matrix (independent deep copy without the
# label column) and the label series.
X_train = train_full.drop(columns=target).copy(deep=True)
X_test = test_full.drop(columns=target).copy(deep=True)
y_train, y_test = train_full[target], test_full[target]

In [None]:
# Preview the first rows of the test features as a sanity check.
X_test.head()

In [None]:
# List the feature columns that remain after dropping the label column.
X_test.columns

In [None]:
def train_run(X, y, X_valid, y_valid, model):
    """Fit ``model`` on ``(X, y)`` and score it on both data sets.

    The estimator is fitted in place (the object passed in is the one that
    gets trained). Weighted-average F1, precision and recall are computed on
    the training data and on the validation data.

    Parameters
    ----------
    X, y : training features and labels.
    X_valid, y_valid : validation features and labels.
    model : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Keys ``f1_train``, ``precision_train``, ``recall_train`` (training
        scores) followed by ``f1``, ``precision``, ``recall`` (validation
        scores).
    """
    model.fit(X, y)
    valid_pred = model.predict(X_valid)
    train_pred = model.predict(X)

    metric_table = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )

    scores = {}
    # Training scores first, then validation scores, to keep the key order.
    for metric_name, metric_fn in metric_table:
        scores[metric_name + "_train"] = metric_fn(y, train_pred, average="weighted")
    for metric_name, metric_fn in metric_table:
        scores[metric_name] = metric_fn(y_valid, valid_pred, average="weighted")

    return scores

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
def train_runs(random_state=None):
    """Grid-search tree depth and feature-selection strategy for the prepared model.

    A "selector" step is inserted just before the final step of a private copy
    of the module-level ``model`` pipeline, so the global object is left
    untouched and the cell can be re-run without stacking duplicate
    "selector" steps. The search then tries every combination of
    ``<model_name>__max_depth`` and selector (no selection, univariate
    ``SelectKBest`` with k = 3, 6, 9, or an ExtraTrees-based
    ``SelectFromModel``), scored with weighted F1 on ``X_train`` / ``y_train``.

    Assumes ``model`` is a pipeline exposing ``steps`` whose final step is
    named ``model_name`` -- confirm against the notebook that pickles it.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the ExtraTrees estimator behind the tree-based selector.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_clf, results, param_grid)``: the best estimator found, the
        search's ``cv_results_`` and the grid that was searched.
    """
    tree_selector = SelectFromModel(
        ExtraTreesClassifier(n_estimators=50, random_state=random_state)
    )
    univariate_selectors = [SelectKBest(f_classif, k=k) for k in (3, 6, 9)]

    # Copy first: inserting into model.steps directly would mutate the shared
    # pipeline and make this function non-idempotent.
    base_estimator = copy.deepcopy(model)
    base_estimator.steps.insert(-1, ("selector", tree_selector))

    param_grid = {
        "{}__max_depth".format(model_name): [10, 15, 20, 30],
        "selector": ["passthrough", *univariate_selectors, tree_selector],
    }

    search = GridSearchCV(base_estimator, param_grid, scoring="f1_weighted", verbose=2)
    search.fit(X_train, y_train)

    return search.best_estimator_, search.cv_results_, param_grid

In [None]:
# Run the grid search: returns the best estimator, the raw cv_results_ and
# the parameter grid that was searched.
best_clf, results, param_grid = train_runs()

In [None]:
# Score the selected estimator on train and test data. Note that train_run
# calls fit() again, so best_clf is re-fitted on the training split here.
scores = train_run(X_train, y_train, X_test, y_test, best_clf)

In [None]:
# Show the selected estimator and its weighted F1 / precision / recall scores.
print(best_clf)
print(scores)

In [None]:
# Predictions on both splits; used below to build the confusion matrices.
y_pred_train = best_clf.predict(X_train)
y_pred_test = best_clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
def compute_rates(confusion_matrix):
    """Derive per-class one-vs-rest rates from a square confusion matrix.

    False positives are taken from column sums and false negatives from row
    sums, i.e. rows are treated as true labels and columns as predicted
    labels.

    Parameters
    ----------
    confusion_matrix : array-like of shape (n_classes, n_classes)
        Any input accepted by ``np.asarray`` (ndarray, nested lists,
        DataFrame).

    Returns
    -------
    dict
        Keys ``tpr``, ``tnr``, ``ppv``, ``npv``, ``fpr``, ``fnr``, ``fdr``;
        each value is a float array with one entry per class. A rate whose
        denominator is zero (for example precision of a class that was never
        predicted) is ``nan``, and no RuntimeWarning is emitted for it.
    """
    cm = np.asarray(confusion_matrix)

    TP = np.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (FP + FN + TP)

    def _ratio(numerator, denominator):
        # Element-wise division that leaves nan where the denominator is 0
        # instead of triggering an invalid-value warning.
        denominator = np.asarray(denominator, dtype=float)
        result = np.full(denominator.shape, np.nan)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    return {
        # Sensitivity, hit rate, recall, or true positive rate
        "tpr": _ratio(TP, TP + FN),
        # Specificity or true negative rate
        "tnr": _ratio(TN, TN + FP),
        # Precision or positive predictive value
        "ppv": _ratio(TP, TP + FP),
        # Negative predictive value
        "npv": _ratio(TN, TN + FN),
        # Fall out or false positive rate
        "fpr": _ratio(FP, FP + TN),
        # False negative rate
        "fnr": _ratio(FN, TP + FN),
        # False discovery rate
        "fdr": _ratio(FP, TP + FP),
    }

In [None]:
# Label sets for each split: every class that occurs either as a true or as a
# predicted label. Passing them explicitly keeps the matrix shape and the
# DataFrame index in agreement even when the model predicts a class that is
# absent from the true labels of that split.
classes = np.union1d(y_train.unique(), np.unique(y_pred_train))
classes_val = np.union1d(y_test.unique(), np.unique(y_pred_test))

# create confusion matrix
train_conf_mat = confusion_matrix(y_train, y_pred_train, labels=classes)
test_conf_mat = confusion_matrix(y_test, y_pred_test, labels=classes_val)

train_rates = compute_rates(train_conf_mat)
val_rates = compute_rates(test_conf_mat)

# create dataframes from confusion matrices
train_df = pd.DataFrame(train_conf_mat, index=classes, columns=classes)
val_df = pd.DataFrame(test_conf_mat, index=classes_val, columns=classes_val)

fig, axes = plt.subplots(1, 2, figsize=(16, 7))

# plot heatmap for training data
sns.heatmap(train_df, annot=True, fmt='d', ax=axes[0], cmap='YlGnBu')
axes[0].set_title('Train Confusion Matrix')
axes[0].set_xlabel('Predicted label')
axes[0].set_ylabel('True label')

# plot heatmap for test data
sns.heatmap(val_df, annot=True, fmt='d', ax=axes[1], cmap='YlGnBu')
axes[1].set_title('Test Confusion Matrix')
axes[1].set_xlabel('Predicted label')
axes[1].set_ylabel('True label')

# Per-class rates of the test confusion matrix. Each class is looked up by
# its position in classes_val (the label order of test_conf_mat) rather than
# by a fixed "class - 3" offset.
false_positives = "".join(
    "{}: {}, ".format(val_class, val_rates["fpr"][index])
    for index, val_class in enumerate(classes_val)
)
print("false positives: {}".format(false_positives))
true_negatives = "".join(
    "{}: {}, ".format(val_class, val_rates["tnr"][index])
    for index, val_class in enumerate(classes_val)
)
print("true negatives: {}".format(true_negatives))

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# Cross-validated learning curve of the selected estimator, scored with
# weighted F1. No train_sizes or cv are passed, so scikit-learn's defaults
# decide the training-set sizes and the fold count.
train_size_abs, train_scores, test_scores = learning_curve(
    best_clf, X_train, y_train, scoring="f1_weighted"
)

In [None]:
# One iteration per training-set size; the score arrays are averaged with
# .mean() before printing.
for train_size, cv_train_scores, cv_test_scores in zip(
    train_size_abs, train_scores, test_scores
):
    print(f"{train_size} samples were used to train the model")
    print(f"The average train f1_weighted is {cv_train_scores.mean():.2f}")
    print(f"The average test f1_weighted is {cv_test_scores.mean():.2f}")

In [None]:
# Condense the learning curve into one record per training-set size, holding
# the sample count and the mean train / test score for that size.
my_lc = [
    {
        "n": int(n_samples),
        "train_score": size_train_scores.mean(),
        "test_score": size_test_scores.mean(),
    }
    for n_samples, size_train_scores, size_test_scores in zip(
        train_size_abs, train_scores, test_scores
    )
]

my_lc

In [None]:
import sys; sys.path.insert(0, '..')
from utils.mongo.MongoModelHandler import MongoModelHandler

In [None]:
# Handler from the project-local utils.mongo package.
# NOTE(review): "selection" presumably names this pipeline stage -- confirm
# against MongoModelHandler.
mongo_model_handler = MongoModelHandler("selection", model_name=model_name)

In [None]:
# Hand the selected estimator to the handler (presumably persisted to
# MongoDB -- confirm in MongoModelHandler.store_model).
mongo_model_handler.store_model(best_clf)

In [None]:
# Pass on the weighted scores, both confusion-matrix DataFrames and the
# condensed learning curve.
mongo_model_handler.store_model_scores(general=scores, conf_matrix_train=train_df, conf_matrix_test=val_df, learning_curve=my_lc)

In [None]:
# Record how the search was run: the searched grid, the search class and the
# scoring metric.
# NOTE(review): param_grid contains estimator objects (the selectors); verify
# that store_train_process can serialize them.
mongo_model_handler.store_train_process(strategies=param_grid, approach="sklearn.model_selection.GridSearchCV", metric="f1_weighted")

In [None]:
# Pass the full cv_results_ of the grid search to the handler.
mongo_model_handler.store_train_configs(results=results)

## Custom Reset of Mongo Collections

In [None]:
# from bson.objectid import ObjectId

In [None]:

# train_config_collection = mongo_model_handler.train_config_collection

# train_config_collection.delete_many({"parameters": {"rf__max_depth": "5"}})
# train_config_collection.delete_many({"parameters": {"rf__max_depth": "10"}})
# train_config_collection.delete_many({"parameters": {"rf__max_depth": "20"}})

# Save Model

In [None]:
import pickle
# Serialize the selected estimator next to the prepared one.
selected_path = './model/selected-{}.pickle'.format(model_name)
with open(selected_path, 'wb') as out_file:
    pickle.dump(best_clf, out_file)