# import packages

In [None]:
from itertools import permutations, product

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, auc, confusion_matrix, balanced_accuracy_score, precision_recall_curve, auc, roc_curve, roc_auc_score, f1_score, recall_score, precision_score, brier_score_loss, average_precision_score, classification_report, log_loss
from sklearn.inspection import permutation_importance
from sklearn import preprocessing

from sklearn.neural_network import MLPClassifier
from collections import Counter
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier

import numpy as np
from numpy import mean,std
from sklearn.model_selection import GridSearchCV

import pickle

from ctgan import CTGANSynthesizer
from mlxtend.classifier import StackingCVClassifier

from os import path
import tqdm
import matplotlib.pyplot as plt

from warnings import simplefilter
from collections import OrderedDict
from sklearn.svm import SVC

from torchviz import make_dot

if (os.path.abspath('').split('/')[-1] == 'project'):
    %cd utils
elif (os.path.abspath('').split('/')[-1] == 'train_and_vis'):
    %cd ../utils

import query_utils
import model_utils
import validation_utils
import data_utils

if (os.path.abspath('').split('/')[-1] == 'utils'):
    %cd ..

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

### Synthetic Data creation
Performed with CTGANSynthesizer

In [None]:
# Fit one CTGAN per class on the feature columns only (the 'isZoonotic' label
# column is dropped before fitting).

# Positive class: first 1200 rows labelled isZoonotic == 1.
isZoonotic = df.loc[df['isZoonotic'] == 1][:1200]
isZoonotic = isZoonotic.loc[:, isZoonotic.columns != 'isZoonotic']
print(isZoonotic)

posGanModel = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
posGanModel.fit(isZoonotic)

# TODO: check if current model is better than pickled model before overwriting
posGanModel.save('models/curr_models/posGanModel.pkl')

# Negative class: first 3000 rows labelled isZoonotic == 0.
# The label column must be dropped from *this* frame; the previous code
# re-sliced `isZoonotic`, which trained the negative GAN on positive samples.
notZoonotic = df.loc[df['isZoonotic'] == 0][:3000]
notZoonotic = notZoonotic.loc[:, notZoonotic.columns != 'isZoonotic']
print(notZoonotic)

negGanModel = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
negGanModel.fit(notZoonotic)
negGanModel.save('models/curr_models/negGanModel.pkl')

# Dataset retrieval
The workings of the retrieval function are packaged into data_utils (for readability). The data is generated in "process_data.ipynb".

In [None]:
# Load the merged dataset from data/ through the project helper.
# Later cells index the result with '<feature>-<kmer>' keys (e.g. 'f2-4') and
# read the 'X' and 'y' entries of each item.
dataset = data_utils.retrieveMerged(dir='data/')
# datasets = data_utils.retrieveAllDatasets()

In [None]:
# Inspect the dataset entry stored under the 'f1-3' key.
print(dataset['f1-3'])

In [None]:
# Number of rows in the 'f2-4' feature matrix.
print(len(dataset['f2-4']['X']))
# Number of labels minus the sum of labels, i.e. the count of label-0 samples
# (assumes y is encoded as 0/1 -- TODO confirm).
print(len(dataset['f2-4']['y'])-sum(dataset['f2-4']['y']))

## Keep track of scores of each model

In [None]:
# modelScores = {}
import json

# Read the saved score table once, closing the file afterwards, and derive the
# per-model dict view from it (previously the pickle was opened twice and
# neither handle was closed).
with open('score_df.pkl', 'rb') as score_file:
    score_df = pickle.load(score_file)

# {model name: {metric: value}} -- one entry per row of score_df.
modelScores = score_df.T.to_dict()

# TODO

### Generate & validate performance of KNN (baseline) on dataset

In [None]:
# List the available dataset keys.
print(dataset.keys())

In [None]:
# Start from an empty score table; the training cells below fill it per model.
modelScores = {}

In [None]:
# KNN baseline: 5-fold cross-validation of a balanced-bagged 1-nearest-neighbour
# classifier on every feature-set / k-mer dataset. Fold means are stored in
# modelScores under 'knn_<feature>_<kmer>'.
features = ['f1', 'f2', 'f3']
for kmer in range(3, 7):
    for feature in features:
        ds = dataset[f'{feature}-{kmer}']

        X, y = ds['X'], ds['y']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        knntest = BalancedBaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors = 1, n_jobs = -1), n_estimators = 1, n_jobs = -1)

        # knntest = KNeighborsClassifier(n_neighbors = 1, n_jobs = -1)
        knntest.fit(X_train, y_train)
        # print(knntest.score(X_test, y_test))

        # The scoring list previously ended with an empty string '', which is
        # not a valid scorer name; it is removed so the list matches the other
        # model cells.
        x = cross_validate(knntest, X, y, cv=5, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'])

        name = f'knn_{feature}_{kmer}'

        if (name not in modelScores):
            modelScores[name] = {}
            for k, v in x.items():
                print(k, v.mean())
                modelScores[name][k]=v.mean()
        else:
            # Existing entries are only reported, not overwritten.
            for k, v in x.items():
                print(k, v.mean())
            print('already in modelScores')

In [None]:
# Display the collected scores as a table with one row per model entry.
pd.DataFrame(modelScores).T

### Evaluate & validate performance of random forest (baseline) on dataset

In [None]:
# Random-forest baseline: fit a balanced random forest on the training split
# and run 5-fold cross-validation on every feature-set / k-mer dataset. Fold
# means are written to modelScores under 'rf_<feature>_<kmer>'.
features = ['f1', 'f2', 'f3']
for kmer, feature in product(range(3, 7), features):
    ds = dataset[f'{feature}-{kmer}']
    X, y = ds['X'], ds['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    randforest = BalancedRandomForestClassifier(max_features="sqrt", n_jobs=-1)
    randforest.fit(X_train, y_train)

    x = cross_validate(
        randforest, X, y, cv=5,
        scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'],
    )

    name = f'rf_{feature}_{kmer}'
    seen_before = name in modelScores
    if not seen_before:
        modelScores[name] = {}

    # Scores are (re)written whether or not the entry already existed.
    for k, v in x.items():
        print(k, v.mean())
        modelScores[name][k] = v.mean()

    if seen_before:
        print('already in modelScores')


### XGB classifier

In [None]:
# XGBoost: fit a balanced-bagged XGBClassifier on the training split and run
# 5-fold cross-validation on every feature-set / k-mer dataset. Fold means are
# written to modelScores under 'xgb_<feature>_<kmer>'.
features = ['f1', 'f2', 'f3']
for kmer, feature in product(range(3, 7), features):
    ds = dataset[f'{feature}-{kmer}']
    X, y = ds['X'], ds['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    xgb_base = XGBClassifier(
        learning_rate=0.1,
        n_estimators=300,
        max_depth=9,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=42,
        n_jobs=-1,
        scale_pos_weight=6,
    )
    xgb1 = BalancedBaggingClassifier(base_estimator=xgb_base, n_estimators=1, n_jobs=-1)
    xgb1.fit(X_train, y_train)

    x = cross_validate(
        xgb1, X, y, cv=5,
        scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'],
    )

    name = f'xgb_{feature}_{kmer}'
    seen_before = name in modelScores
    if not seen_before:
        modelScores[name] = {}

    # Scores are (re)written whether or not the entry already existed.
    for k, v in x.items():
        print(k, v.mean())
        modelScores[name][k] = v.mean()

    if seen_before:
        print('already in modelScores')

In [None]:
# List the model entries scored so far.
print(modelScores.keys())

In [None]:
# MLP: fit a balanced-bagged multi-layer perceptron on the training split and
# run 5-fold cross-validation on every feature-set / k-mer dataset. Fold means
# are written to modelScores under 'mlp_<feature>_<kmer>'.
# ds = datasets['merged']['lengthdivdataset-4']
features = ['f1', 'f2', 'f3']
for kmer in range(3, 7):
    for feature in features:
        ds = dataset[f'{feature}-{kmer}']

        X, y = ds['X'], ds['y']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        mlp = BalancedBaggingClassifier(base_estimator=MLPClassifier(alpha=0.6, hidden_layer_sizes=(100, 180, 180, 200, 200),
              max_iter=550, random_state=42, solver='adam', activation='relu'), n_estimators=5, n_jobs=-1)
        mlp.fit(X_train, y_train)

        # A stray extra ']' after the scoring list made this cell a SyntaxError.
        x = cross_validate(mlp, X, y, cv=5, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'])
        name = f'mlp_{feature}_{kmer}'
        if (name not in modelScores):
            modelScores[name] = {}
            for k, v in x.items():
                print(k, v.mean())
                modelScores[name][k]=v.mean()
        else:
            # Existing entries are overwritten with the fresh fold means.
            for k, v in x.items():
                print(k, v.mean())
                modelScores[name][k]=v.mean()
            print('already in modelScores')

In [None]:
fit_time 71.89677076339721
score_time 0.10850300788879394
test_recall 0.8584950773558369
test_f1 0.6911281484361098
test_accuracy 0.8864734363076601
test_precision 0.5796959595204415
test_roc_auc 0.9540192015137766
test_neg_brier_score -0.07964541647825815

In [None]:
# Number of model entries scored so far.
print(len(modelScores.keys()))
# print(modelScores.keys())
# print(modelScores['mlp_balanced_normalized_4'])
# print(modelScores['mlp_balanced_normalized_4'])

In [None]:
# SVM: fit a balanced-bagged RBF-kernel SVC on the training split and run
# 5-fold cross-validation on every feature-set / k-mer dataset. Fold means are
# written to modelScores under 'svm_<feature>_<kmer>'.
features = ['f1', 'f2', 'f3']
for kmer, feature in product(range(3, 7), features):
    ds = dataset[f'{feature}-{kmer}']
    X, y = ds['X'], ds['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    svc_base = SVC(kernel='rbf', C=2, gamma=0.6, probability=True, random_state=42, max_iter=500)
    temp_svm = BalancedBaggingClassifier(base_estimator=svc_base, n_estimators=1, n_jobs=-1)
    temp_svm.fit(X_train, y_train)

    x = cross_validate(
        temp_svm, X, y, cv=5,
        scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score'],
    )

    name = f'svm_{feature}_{kmer}'
    seen_before = name in modelScores
    if not seen_before:
        modelScores[name] = {}

    # Scores are (re)written whether or not the entry already existed.
    for k, v in x.items():
        print(k, v.mean())
        modelScores[name][k] = v.mean()

    if seen_before:
        print('already in modelScores')

In [None]:
# Train a standalone XGBoost model on the f2-4 dataset with early stopping.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_test_params = {
    'learning_rate': 0.1,
    'n_estimators': 200,
    'max_depth': 9,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'seed': 42,
    'n_jobs': -1,
    'scale_pos_weight': 6,
}
xgb1_test = XGBClassifier(**xgb_test_params)

# Carve a validation split out of the training data for early stopping.
# Note that X_train / y_train are rebound to the smaller remainder here.
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

xgb1_test.fit(
    X_train,
    y_train,
    eval_metric='aucpr',
    eval_set=[(X_validation, y_validation)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
# 10-fold cross-validation of whatever estimator `xgb1` currently refers to,
# on the X / y of the most recently loaded dataset; prints the raw result dict.
x = cross_validate(xgb1, X, y, cv=10, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'])
print(x)

In [None]:
# Fit the comparison models on the f2-4 training split.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Single balanced-bagged decision tree.
randforest = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_features="sqrt"),
    n_estimators=1,
    n_jobs=-1,
)
randforest.fit(X_train, y_train)

# Balanced-bagged RBF SVM with probability estimates enabled.
temp_svm = BalancedBaggingClassifier(
    base_estimator=SVC(kernel='rbf', C=2, gamma=0.6, probability=True, random_state=42),
    n_estimators=1,
    n_jobs=-1,
)
temp_svm.fit(X_train, y_train)

# Balanced-bagged 1-nearest-neighbour classifier.
knn = BalancedBaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1),
    n_estimators=1,
    n_jobs=-1,
)
knn.fit(X_train, y_train)


In [None]:
# Persist the early-stopped XGBoost model; the context manager makes sure the
# file is flushed and closed once the dump finishes.
with open('models/curr_models/xgBoost-f2-4-2.pkl', 'wb') as model_file:
    pickle.dump(xgb1_test, model_file)

In [None]:
# Assess each model on the held-out test split.
# The saved XGBoost model is loaded through a context manager so the file
# handle is closed.
with open('models/curr_models/xgBoost-f2-4-2.pkl', 'rb') as model_file:
    xgb1 = pickle.load(model_file)

models = [randforest, temp_svm, knn, xgb1]
modelNames = ['randforest', 'svm', 'knn', 'xgboost']
for model, modelName in zip(models, modelNames):
    # Hard predictions feed the threshold metrics; positive-class
    # probabilities feed the ranking / calibration metrics.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    print(f'{modelName} recall: \n{recall_score(y_test, y_pred)}')
    print(f'{modelName} f1: \n{f1_score(y_test, y_pred)}')
    print(f'{modelName} accuracy: \n{accuracy_score(y_test, y_pred)}')
    print(f'{modelName} precision: \n{precision_score(y_test, y_pred)}')
    print(f'{modelName} roc_auc: \n{roc_auc_score(y_test, y_pred_proba)}')
    print(f'{modelName} brier_score: \n{brier_score_loss(y_test, y_pred_proba)}')
    print(f'{modelName} confusion matrix: \n{confusion_matrix(y_test, y_pred).ravel()}')
    print(f'{modelName} classification report: \n{classification_report(y_test, y_pred)}')
    print(f'{modelName} roc_curve: \n{roc_curve(y_test, y_pred_proba)}')
    print(f'{modelName} precision_recall_curve: \n{precision_recall_curve(y_test, y_pred_proba)}')
    print(f'{modelName} average_precision_score: \n{average_precision_score(y_test, y_pred_proba)}')

In [None]:
# Fit the balanced-bagged MLP on the f2-4 training split, then cross-validate
# it (5 folds) on the full f2-4 data.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlp_base = MLPClassifier(
    alpha=0.6,
    hidden_layer_sizes=(100, 180, 180, 200, 200),
    max_iter=550,
    random_state=42,
    solver='adam',
    activation='relu',
)
mlp = BalancedBaggingClassifier(base_estimator=mlp_base, n_estimators=5, n_jobs=-1)
mlp.fit(X_train, y_train)

x = cross_validate(
    mlp, X, y, cv=5,
    scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'],
)

In [None]:
# Report the mean of every cross-validation metric, then persist the fitted
# MLP. The file is opened in a context manager so it is closed after the dump.
for k, v in x.items():
    print(k, v.mean())
    # modelScores[name][k]=v.mean()
with open('models/curr_models/mlp-f2-4.pkl', 'wb') as model_file:
    pickle.dump(mlp, model_file)

In [None]:
# Compare the saved XGBoost and random-forest models on the current test split
# and plot their precision-recall curves.
# Both pickles are read through context managers so the handles are closed.
with open('models/curr_models/xgBoost-f2-4-2.pkl', 'rb') as model_file:
    xgb1 = pickle.load(model_file)
with open('models/curr_models/randforest-test.pkl', 'rb') as model_file:
    randforest = pickle.load(model_file)

# Predict once and reuse the results for every metric below.
xgb_pred = xgb1.predict(X_test)
xgb_proba = xgb1.predict_proba(X_test)[:, 1]

# F1 and Brier score
print(f1_score(y_test, xgb_pred))
print(brier_score_loss(y_test, xgb_proba))

# ROC AUC
print(roc_auc_score(y_test, xgb_proba))

print(recall_score(y_test, xgb_pred))

# Precision-recall curve and the area under it
precision, recall, thresholds = precision_recall_curve(y_test, xgb_proba)
print(auc(recall, precision))

print(average_precision_score(y_test, xgb_proba))

# Baseline: precision of a classifier that always predicts the positive class.
no_skill = len(y_test[y_test==1]) / len(y_test)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='XGBoost')

# Precision-recall curve for the random forest
precision, recall, thresholds = precision_recall_curve(y_test, randforest.predict_proba(X_test)[:, 1])
print(auc(recall, precision))
plt.plot(recall, precision, marker='.', label='rand forest')

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()


In [None]:
# Show the class name of the object currently bound to xgb1.
print(type(xgb1).__name__)

In [None]:
# Mean of the average-precision score(s) stored under the 'knn' entry.
# NOTE(review): the training cells in this notebook store entries under
# '<model>_<feature>_<kmer>' keys (e.g. 'knn_f1_3') -- confirm that a plain
# 'knn' key exists in the scores loaded here.
mean(modelScores['knn']['test_average_precision'])

In [None]:
# 10-fold cross-validation of xgb1 on the full f2-4 dataset, reporting the
# mean of each metric across folds.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv_metrics = ['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision']
x = cross_validate(xgb1, X, y, cv=10, scoring=cv_metrics)

for k, v in x.items():
    print(k, v.mean())

In [None]:
# Evaluate the balanced-bagged 1-NN classifier on the f3-4 dataset: PR-AUC on
# the held-out split, then 10-fold cross-validation with means kept in testdi.
testdi = {}
ds = dataset['f3-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = BalancedBaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1),
    n_estimators=1,
    n_jobs=-1,
)
knn.fit(X_train, y_train)

# Area under the precision-recall curve on the test split.
precision, recall, thresholds = precision_recall_curve(y_test, knn.predict_proba(X_test)[:, 1])
area = auc(recall, precision)
print('Area Under Curve: %.2f' % area)

x = cross_validate(
    knn, X, y, cv=10,
    scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'],
)

for k, v in x.items():
    print(k, v.mean())
    testdi[k] = v.mean()

## Order models by performance

In [None]:
# Load the saved scores and plot ROC AUC / accuracy per model and k-mer length.
with open('model_scores.pkl', 'rb') as score_file:
    modelScores = pickle.load(score_file)
score_df = pd.DataFrame(modelScores).T

# Row labels follow '<model>_<feature>_<kmer>'; split out the model name and
# the k-mer length (kept as a string, hence the str(kmer) comparison below).
score_df['name'] = [label.split("_")[0] for label in score_df.index]
score_df['kmer'] = [label.split("_")[2] for label in score_df.index]

# Best F1 first.
score_df = score_df.sort_values(by='test_f1', ascending=False)

# retrieve all model names
modelNames = score_df['name'].unique()

for kmer in range(3, 7):
    for modelName in modelNames:
        # Rows for the current model and k-mer length. A dedicated name is used
        # so the notebook-level `df` data frame is not overwritten.
        model_df = score_df[(score_df['name'] == modelName) & (score_df['kmer'] == str(kmer))]
        print(model_df)
        if model_df.empty:
            # Nothing recorded for this combination -- there is nothing to plot.
            continue

        # rename all indices to the name of the model
        model_df = model_df.copy()
        model_df.index = [label.split("_")[0] for label in model_df.index]

        model_df.plot.bar(y=['test_roc_auc', 'test_accuracy'], figsize=(20, 10), rot=0)
        # The title previously used `feature`, a loop variable left over from
        # the training cells that is unrelated to the rows plotted here.
        plt.title(f'kmer = {kmer}, model = {modelName}')
    plt.show()

# pickle.dump(score_df, open('score_df.pkl', 'wb'))

In [None]:
# Plot one metric against k-mer length, with one subplot per KNN feature set.
# Scores are looked up under '<prefix>_<kmer>' keys, the naming used by the KNN
# training cell ('knn_<feature>_<kmer>').
# The previous version plotted the scalar k-mer length itself instead of the
# scores and allocated four axes for three feature sets.
metric = 'test_f1'
kmer_lengths = list(range(3, 7))
feature_prefixes = ['knn_f1', 'knn_f2', 'knn_f3']

# One subplot per feature set.
fig, axs = plt.subplots(1, len(feature_prefixes), figsize=(15, 5))

for ax, prefix in zip(axs, feature_prefixes):
    # Metric value for this feature set at each k-mer length.
    data = [modelScores[f'{prefix}_{length}'][metric] for length in kmer_lengths]
    ax.plot(kmer_lengths, data, 'o-')
    ax.set_title(prefix)
    ax.set_xlabel('k-mer length')
    ax.set_ylabel(metric)

# Show the plot
plt.show()

## Dump models into pickle - TODO

### Grid-Searched version of the Gradient Boosting classifier

In [None]:
# Grid-search a GradientBoostingClassifier over param_test1, scored by ROC AUC
# with 5-fold cross-validation.
# NOTE(review): `datasets` is only assigned in a commented-out line in the
# visible part of this notebook, and `kmer` carries over from an earlier
# cell -- confirm both are defined before running.
ds = datasets['merged'][f'normalized-{kmer}']

X_train, y_train, X_test, y_test = ds['X_train'], ds['y_train'], ds['X_test'], ds['y_test']

"""
{'n_estimators': 120, 'max_features': 2, 'max_depth': 6, 'random_state': 0, 'min_sample_split': 50, 'subsample': 0.8, 'learning_rate': 0.3}
"""

# Only n_estimators, max_features and random_state are read from this dict
# below; the remaining keys are not passed to the estimator.
parameters={
   'n_estimators': 120, 'max_features': 2, 'max_depth': 6, 'random_state': 42, 'min_sample_split': 50, 'subsample': 0.8, 'learning_rate': 0.3
}

# Search grid handed to GridSearchCV.
param_test1 = {'n_estimators':range(100,140,10), 'learning_rate':[0.1,0.15,0.2], 'subsample':[0.8,0.85,0.9], 'max_depth':range(6,9,1), 'min_samples_split':range(10,40,10), 'max_features':range(2, 5)}

gradBoost = GridSearchCV(estimator = GradientBoostingClassifier(
    n_estimators=parameters['n_estimators'], max_features=parameters['max_features'], random_state=parameters['random_state']), 
param_grid = param_test1, scoring='roc_auc',n_jobs=-1, cv=5, verbose=10)

# parameters['learning_rate']=learning_rate
gradBoost.fit(X_train, y_train)

### Load available models

In [None]:
from itertools import combinations_with_replacement
def hidden_layers_generator(hidden_layers, max_neurons):
  """Enumerate candidate hidden-layer size tuples for an MLP grid search.

  Layer widths are drawn from ``np.arange(100, max_neurons + 10, 20)`` and
  combined with replacement into tuples of length ``hidden_layers``.

  Returns a one-element list whose single item is the list of all such
  tuples (callers index it with ``[0]``).
  """
  layer_widths = np.arange(100, max_neurons + 10, 20)
  all_combinations = list(combinations_with_replacement(layer_widths, hidden_layers))
  return [all_combinations]


# Grid-search an MLPClassifier over hidden-layer layouts and alpha values,
# scored by recall with 5-fold cross-validation.
# ds = datasets['merged'][f'normalized-{kmer}']

# X_train, y_train, X_test, y_test = ds['X_train'], ds['y_train'], ds['X_test'], ds['y_test']

# Re-join the existing train/test pieces and re-split them 80/20.
# NOTE(review): X_train / X_test and `ds` come from whichever cell ran last --
# confirm they refer to the same dataset and that `ds` has 'y_train'/'y_test'.
X = pd.concat([X_train, X_test])
y = np.concatenate([ds['y_train'], ds['y_test']], axis=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print(l)
# All 5-layer width combinations; the generator wraps them in a one-element
# list, hence hlg[0] below.
hlg = hidden_layers_generator(hidden_layers=5, max_neurons=200)
print(hlg)

mlp_gs = MLPClassifier(max_iter=350, random_state=42, solver='adam')

parameter_space = {
    'hidden_layer_sizes': hlg[0],
    'activation': ['relu'],
    'alpha': [0.05, 0.1, 0.2],
}
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, verbose=10, scoring='recall')
clf.fit(X_train, y_train) # X is train samples and y is the corresponding labels

### Creation of the ensemble model

### Select the best current model

In [None]:
# Load the saved score table (closing the file afterwards) and list the
# distinct model-name prefixes found in its index.
with open('score_df.pkl', 'rb') as score_file:
    score_df = pickle.load(score_file)

# score_df = score_df.sort_values(by=['test'], ascending=False)
# The model name is the part of each index label before the first underscore.
a = {label.split("_")[0] for label in score_df.index.to_list()}
print(a)

score_df

In [None]:
# Display the raw contents of model_scores.pkl.
pickle.load(open('model_scores.pkl', 'rb'))

In [None]:
# pretrained models
# Fit the base models (MLP, KNN, random forest, XGBoost) on the f2-4 training
# split. The estimators are fitted on NumPy arrays via `.values`.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLP: f2_4
mlp = BalancedBaggingClassifier(base_estimator=MLPClassifier(alpha=0.6, hidden_layer_sizes=(100, 180, 180, 200, 200),
              max_iter=550, random_state=42, solver='adam', activation='relu'), n_estimators=5, n_jobs=-1)

mlp.fit(X_train.values, y_train)

# knn
knn = BalancedBaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1), n_estimators=1, n_jobs=-1)
knn.fit(X_train.values, y_train)

# SVM: f2_4
# temp_svm = BalancedBaggingClassifier(base_estimator=SVC(kernel='rbf', C=2, gamma=0.6, probability=True, random_state=42), n_estimators=10, n_jobs=-1)

# temp_svm.fit(X_train, y_train)

# RF: f2_4
randforest = BalancedRandomForestClassifier(max_features="sqrt", n_jobs=-1)

randforest.fit(X_train.values, y_train)

# XGBoost: f2_4
xgb1 = XGBClassifier(
        learning_rate =0.1,
        n_estimators=200,
        max_depth=9,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        #  scale_pos_weight=1,
        seed=42,
        n_jobs=-1,
        scale_pos_weight=6,
)
# Hold out part of the training split as the early-stopping validation set;
# separate *_xg names keep X_train / y_train intact for the other models.
X_train_xg, X_validation, y_train_xg, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

xgb1.fit(X_train_xg.values, y_train_xg, eval_metric='aucpr', eval_set=[(X_validation.values, y_validation)], early_stopping_rounds=20, verbose=10)
# print("cross validating stacking classifier")
# print(em.cross_validate(X, y, cv=5))
# xgb1 = pickle.load(open('models/curr_models/xgb1-test.pkl', 'rb'))


# xgb1.fit(X_train, y_train)

# Parked experiment: mlxtend StackingCVClassifier variant of the ensemble.
# em = StackingCVClassifier(classifiers = [mlp, randforest, xgb1],
#                             # shuffle = True,
#                             use_probas = True,
#                             cv = 5,
#                             use_features_in_secondary=True,
#                             meta_classifier = LogisticRegression(C = 1, random_state=42, solver='saga'), n_jobs=-1, random_state=42, verbose=1, store_train_meta_features=True)
# x = cross_validate(em, X, y, cv=5, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score'], verbose=1, n_jobs=-1)
# name = 'ensemble_lengthdiv_4'
# if (name not in modelScores):
#     modelScores[name] = {}
#     for k, v in x.items():
#         print(k, v.mean())
#         modelScores[name][k]=v.mean()
# else:
#     print('already in modelScores')
# em.fit(X_train, y_train)

In [None]:
# 5-fold cross-validation of the MLP and the random forest on the training
# split, printing the raw result dict for each.
base_cv_scoring = ['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'neg_log_loss']
for base_estimator in (mlp, randforest):
    cv_result = cross_validate(
        base_estimator, X_train, y_train,
        cv=5, scoring=base_cv_scoring, verbose=1, n_jobs=-1,
    )
    print(cv_result)


In [None]:
class StackingClassifier():
    """Two-level stacking ensemble: base classifiers feed a meta-classifier.

    Every base classifier contributes one meta-feature column: its
    positive-class probability (``predict_proba(X)[:, 1]``) when ``use_probas``
    is true, otherwise its hard prediction (``predict(X)``). The meta-classifier
    is trained on, and later queried with, the same kind of column.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing ``fit``, ``predict`` and ``predict_proba``.
    meta_classifier : estimator
        Estimator fitted on the stacked base outputs.
    n_folds : int
        Stored on the instance; the fitting methods take their own ``cv``.
    use_probas : bool
        Whether meta-features are probabilities (True) or hard labels (False).
    """

    # Metrics evaluated on positive-class probabilities when use_probas is on;
    # every other metric is evaluated on hard predictions.
    _PROBA_METRICS = frozenset(
        {'reg_prec', 'average_precision', 'neg_brier_score', 'log_loss', 'roc_auc'}
    )

    def __init__(self, classifiers, meta_classifier, n_folds=5, use_probas=True):
        self.classifiers = classifiers # assume pretrained
        self.meta_classifier = meta_classifier # logistic regression
        self.n_folds = n_folds
        self.X_train_new=None
        self.X_test_new=None
        self.y_train_new=None
        self.use_probas = use_probas
        self.feature_names_in = None

    def _base_output(self, clf, X):
        """Return one meta-feature column produced by ``clf`` for ``X``."""
        if self.use_probas:
            return clf.predict_proba(X)[:, 1]
        return clf.predict(X)

    def _meta_features(self, X):
        """Stack the base classifiers' outputs column-wise for ``X``."""
        return np.column_stack([self._base_output(clf, X) for clf in self.classifiers])

    def fit_pretrained(self, X_train, y_train):
        """Fit only the meta-classifier, using already-fitted base classifiers.

        The meta-features are the base classifiers' outputs on ``X_train``
        itself; they are kept in ``self.X_train_new`` alongside
        ``self.y_train_new``.
        """
        self.X_train_new = np.zeros((X_train.shape[0], len(self.classifiers)))
        self.y_train_new = y_train
        print(X_train.shape[0], len(y_train))

        for i, clf in enumerate(self.classifiers):
            # Use the loop's own classifier (the previous code referenced an
            # unrelated, possibly undefined, global named `model`).
            self.X_train_new[:, i] = self._base_output(clf, X_train)

        print(len(self.X_train_new))

        self.meta_classifier = self.meta_classifier.fit(self.X_train_new, self.y_train_new)

    def fit_not_pretrained(self, X_train, y_train, cv = 10, verbose=False): # assume NOT pretrained
        """Fit base classifiers fold by fold and train the meta-classifier.

        For each base classifier, a shuffled stratified ``cv``-fold split is
        used to produce out-of-fold predictions, which become the
        meta-classifier's training features. The out-of-fold matrix and the
        labels are also pickled to ``base_predictions.pkl`` / ``y_truth.pkl``.

        ``verbose`` is accepted for interface compatibility and is unused.

        NOTE(review): after this method returns, each base classifier is left
        fitted on the training part of the last fold only; consider a final
        refit on all of ``X_train`` if the ensemble is used for prediction.
        """
        # Rows are selected with positional fold indices, which requires
        # array semantics; pandas inputs would be indexed by label instead.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        print(X_train.shape[0], len(y_train))
        kfold = StratifiedKFold(n_splits=cv, random_state=42, shuffle=True)

        out_of_fold_predictions = np.zeros((X_train.shape[0], len(self.classifiers)))

        for i, clf in enumerate(self.classifiers):
            print("fitting classifier", i)
            for it, (train_index, holdout_index) in enumerate(kfold.split(X_train, y_train)):
                print("fitting fold", it)
                X_fold, y_fold = X_train[train_index], y_train[train_index]

                # Checked by class name so this class does not require xgboost
                # to be importable.
                if type(clf).__name__ == 'XGBClassifier':
                    print("xgboost detected")
                    # XGBoost gets its own validation split for early stopping.
                    X_train_xg, X_val, y_train_xg, y_val = train_test_split(X_fold, y_fold, test_size=0.15, random_state=1)
                    self.classifiers[i] = clf.fit(X_train_xg, y_train_xg, eval_metric='aucpr', eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=10)
                else:
                    self.classifiers[i] = clf.fit(X_fold, y_fold)

                # Predictions for rows the classifier did not see in this fold.
                out_of_fold_predictions[holdout_index, i] = self._base_output(clf, X_train[holdout_index])

        self.meta_classifier.fit(out_of_fold_predictions, y_train)
        with open("base_predictions.pkl", "wb") as predictions_file:
            pickle.dump(out_of_fold_predictions, predictions_file)
        with open("y_truth.pkl", "wb") as truth_file:
            pickle.dump(y_train, truth_file)

    def predict(self, X):
        """Predict class labels for ``X`` with the meta-classifier.

        Meta-features are built the same way as during fitting (probabilities
        when ``use_probas`` is true); previously hard labels were always used,
        which did not match a meta-classifier trained on probabilities.
        """
        return self.meta_classifier.predict(self._meta_features(X))

    def predict_proba(self, X):
        """Return the meta-classifier's class probabilities for ``X``."""
        return self.meta_classifier.predict_proba(self._meta_features(X))

    def cross_validate(self, X, y, scoring=('precision', 'recall', 'f1', 'average_precision', 'reg_prec', 'log_loss', 'neg_brier_score', 'roc_auc', 'accuracy'), cv=5):
        """Cross-validate the whole ensemble and return per-fold scores.

        For every shuffled stratified fold the ensemble is refitted with
        ``fit_not_pretrained`` and scored on the held-out part.

        Parameters
        ----------
        X, y : array-like
            Features and labels; converted to NumPy arrays for positional
            indexing.
        scoring : iterable of str
            Metric names; unknown names raise ``KeyError``.
        cv : int
            Number of folds.

        Returns
        -------
        dict
            ``{metric name: list of per-fold values}``. ``'reg_prec'`` stores
            ``(recall, precision)`` curve arrays when ``use_probas`` is true.
            ``'neg_brier_score'`` is the negated Brier loss (higher is better),
            matching the scikit-learn scorer of the same name.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        kfold = StratifiedKFold(n_splits=cv, random_state=42, shuffle=True)
        scores = {s: [] for s in scoring}
        metrics = {
            'recall': recall_score,
            'f1': f1_score,
            'accuracy': accuracy_score,
            'precision': precision_score,
            'roc_auc': roc_auc_score,
            # Negated so the value agrees with the 'neg_' prefix of the key.
            'neg_brier_score': lambda y_true, y_score: -brier_score_loss(y_true, y_score),
            'average_precision': average_precision_score,
            'reg_prec': precision_recall_curve,
            'log_loss': log_loss
        }
        needs_proba = self.use_probas and any(s in self._PROBA_METRICS for s in scoring)

        for train_index, test_index in kfold.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
            self.fit_not_pretrained(X_train, y_train)

            # Predict once per fold and reuse the results for every metric.
            y_pred = self.predict(X_test)
            y_prob = self.predict_proba(X_test)[:, 1] if needs_proba else None

            for s in scoring:
                if self.use_probas and s in self._PROBA_METRICS:
                    if s == 'reg_prec':
                        precision, recall, _ = metrics[s](y_test, y_prob)
                        print("auc: ", auc(recall, precision))
                        scores[s].append((recall, precision))
                    else:
                        scores[s].append(metrics[s](y_test, y_prob))
                else:
                    scores[s].append(metrics[s](y_test, y_pred))

        return scores
    

In [None]:
# Build the stacking ensemble from `mlp`, `randforest` and `xgb1`, combined by
# a logistic-regression meta-classifier.
# Commented-out alternative meta-classifier, kept for reference:
#   BalancedBaggingClassifier(base_estimator=SVC(kernel='rbf', C=2, gamma=0.6, probability=True, random_state=42), n_jobs=-1)
em = StackingClassifier(
    classifiers=[mlp, randforest, xgb1],
    use_probas=True,
    meta_classifier=LogisticRegression(C=1, random_state=42, solver='saga'),
)


In [None]:
# Fit the stacking ensemble on the current training split. `.values` hands the
# features over as a plain array (X_train is presumably a pandas DataFrame).
em.fit_not_pretrained(X_train.values, y_train)

In [None]:
# Compare the stacking ensemble with `xgb1`, `randforest` and `mlp` on the
# held-out 20% of the 'f2-4' dataset, then fit and score a 1-nearest-neighbour
# bagging baseline on the 'f3-4' dataset.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run inference once per model and reuse the outputs for every metric instead
# of re-predicting inside each print statement.
_models = (em, xgb1, randforest, mlp)
_hard_preds = [m.predict(X_test.values) for m in _models]
_pos_probas = [m.predict_proba(X_test.values)[:, 1] for m in _models]

# Within every metric the rows are printed in the order: em, xgb1, randforest, mlp.
for _pred in _hard_preds:
    print(recall_score(y_test, _pred))

for _proba in _pos_probas:
    print(log_loss(y_test, _proba))

for _proba in _pos_probas:
    print(average_precision_score(y_test, _proba))

for _pred in _hard_preds:
    print(accuracy_score(y_test, _pred))

print("precision")
for _pred in _hard_preds:
    print(precision_score(y_test, _pred))

# From here on X / y and the train/test split refer to the 'f3-4' dataset.
ds = dataset['f3-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


knn = BalancedBaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1), n_estimators=1, n_jobs=-1)
knn.fit(X_train, y_train)

_knn_pred = knn.predict(X_test.values)
_knn_proba = knn.predict_proba(X_test.values)[:, 1]
print(average_precision_score(y_test, _knn_proba))
print(accuracy_score(y_test, _knn_pred))
print(recall_score(y_test, _knn_pred))
print(log_loss(y_test, _knn_proba))

In [None]:
# Persist the current `knn` model; the other dumps are disabled.
# pickle.dump(mlp, open('models/curr_models/mlp-f2-4.pkl', 'wb'))
# pickle.dump(em, open('models/curr_models/em-f2-4-test.pkl', 'wb'))
# pickle.dump(randforest, open('models/curr_models/randforest-f2-4.pkl', 'wb'))
# NOTE(review): in file order `knn` was last fitted on the 'f3-4' dataset, yet
# it is saved under an 'f2-4' filename -- confirm which dataset this file
# should correspond to.
# The context manager guarantees the file is flushed and closed.
with open('models/curr_models/knn-f2-4.pkl', 'wb') as _fh:
    pickle.dump(knn, _fh)

# pickle.dump(xgb1, open('models/curr_models/xgBoost-f2-4.pkl', 'wb'))


In [None]:
# 10-fold cross-validation of `mlp`; print the mean of every returned array
# (the score arrays plus the timing arrays cross_validate adds).
l = cross_validate(
    mlp,
    X,
    y,
    cv=10,
    scoring=['precision', 'recall', 'f1', 'average_precision'],
)
print([(metric, np.mean(values)) for metric, values in l.items()])

In [None]:
# Run the ensemble's own cross-validation routine (cv=10) on the 'f2-4'
# dataset. The per-metric score lists it returns are kept in `cv` for later
# cells. Features are passed as a plain array via `.values`.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
cv = em.cross_validate(X.values, y, cv=10)

In [None]:
# Persist the ensemble; the context manager guarantees the file is flushed and
# closed even if pickling fails part-way.
with open("models/curr_models/em_one.pkl", "wb") as _fh:
    pickle.dump(em, _fh)

In [None]:
# Persist the current `knn` model; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("models/curr_models/knn-f3-4.pkl", "wb") as _fh:
    pickle.dump(knn, _fh)

## Load models

In [None]:
# Accumulator keyed by model name; each value is a dict of cross-validation
# results for that model. Re-running this cell discards everything collected.
model_scores = {}

In [None]:
# Average precision on the current hold-out split, printed in the order:
# em, mlp, knn.
for _clf in (em, mlp, knn):
    print(average_precision_score(y_test, _clf.predict_proba(X_test)[:,1]))

In [None]:
# Fit a 1-nearest-neighbour bagging baseline on the 'f2-4' training split,
# cross-validate it on the full 'f2-4' data and record the mean of every
# returned array in `model_scores['knn']`.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
knn = BalancedBaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1), n_estimators=1, n_jobs=-1)
knn.fit(X_train.values, y_train)

print("validating knn")
name = 'knn'
x = cross_validate(knn, X, y, cv=10, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision', 'neg_log_loss'])

# The means are always printed; they are stored only the first time so an
# existing entry is never overwritten.
for k, v in x.items():
    print(k, v.mean())
if name not in model_scores:
    model_scores[name] = {k: v.mean() for k, v in x.items()}
else:
    print('already in model_scores')

# The context manager guarantees the file is flushed and closed.
with open("models/curr_models/knn-f2-4.pkl", "wb") as _fh:
    pickle.dump(knn, _fh)

In [None]:
# Show the stored scores for the knn baseline (raises KeyError if the knn
# validation cell has not populated `model_scores` yet).
# print(model_scores['xgb'])
print(model_scores['knn'])

In [None]:
# 10-fold cross-validation of `temp_svm`: print the mean of each returned
# array and store the means under 'svm' unless an entry already exists.
print("validating svm")
name = 'svm'
x = cross_validate(
    temp_svm,
    X,
    y,
    cv=10,
    scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision', 'neg_log_loss'],
)
_is_new_entry = name not in model_scores
if _is_new_entry:
    model_scores[name] = {}
for k, v in x.items():
    print(k, v.mean())
    if _is_new_entry:
        model_scores[name][k] = v.mean()
if not _is_new_entry:
    print('already in model_scores')

In [None]:
# Collect cross-validation scores for the ensemble (from the precomputed `cv`
# dict) and for the random forest, MLP and XGBoost models (via sklearn's
# cross_validate) into `model_scores`.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']

# --- ensemble ---------------------------------------------------------------
print(cv.keys())
name = 'ensemble'
if name not in model_scores:
    model_scores[name] = {}
    for k, v in cv.items():
        if k == 'reg_prec':
            # Per-fold (recall, precision) curve points are not scalars, so
            # they are kept raw instead of being averaged.
            model_scores[name][k] = v
            continue
        _fold_mean = mean(v)
        if k == 'neg_brier_score':
            # The ensemble's cross_validate fills this key with
            # brier_score_loss (a positive loss). Negate it so the value
            # follows the same "neg_" sign convention as the sklearn scores
            # stored for the other models.
            _fold_mean = -_fold_mean
        print(k, _fold_mean)
        model_scores[name]["test_" + k] = _fold_mean
        if k == 'log_loss':
            # Also expose the negated loss under the sklearn scorer name.
            model_scores[name]["test_neg_" + k] = -_fold_mean
# print(model_scores)

# pickle.dump(cv, open("cv.pkl", "wb"))
#     # print(x, np.array(cv[x]).mean())
# print(len(cv['reg_prec'][0]))
# for x in cv['reg_prec']:
#     # for each model
#     print(np.mean(x))
#     # print(np.mean(x[0]), np.mean(x[1]))

print(model_scores)

# --- individual models --------------------------------------------------------
_cv_scoring = ['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision', 'neg_log_loss']
for _label, name, _estimator in (
    ('rf', 'random forest', randforest),
    ('mlp', 'mlp', mlp),
    ('xgb', 'xgb', xgb1),
):
    print("validating " + _label)
    x = cross_validate(_estimator, X, y, cv=10, scoring=_cv_scoring)
    # The means are always printed; they are stored only the first time so an
    # existing entry is never overwritten.
    for k, v in x.items():
        print(k, v.mean())
    if name not in model_scores:
        model_scores[name] = {k: v.mean() for k, v in x.items()}
    else:
        print('already in model_scores')

In [None]:
# Save the collected scores; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("model_scores.pkl", "wb") as _fh:
    pickle.dump(model_scores, _fh)

In [None]:
def test(model, X, y):
    """Plot the mean 10-fold cross-validated precision-recall curve of ``model``.

    For every fold of a shuffled, stratified 10-fold split a copy of ``model``
    is fitted on the training part and scored on the test part. Each fold's
    precision is interpolated onto a shared recall grid so that folds with a
    different number of thresholds can be averaged, and the mean curve is
    drawn on the current matplotlib axes. The legend label reports the mean
    and standard deviation of the per-fold PR-AUC.

    Args:
        model: Classifier with ``fit`` and ``predict``. If it has
            ``predict_proba``, the positive-class probability is used as the
            score. If it has ``fit_not_pretrained`` (the stacking ensemble),
            that method is used for fitting.
        X: Feature matrix (pandas object or array-like).
        y: Binary labels aligned with ``X``.

    Returns:
        None. ``plt.show()`` is left to the caller.
    """
    import copy

    def _rows(data, idx):
        """Select rows by position from a pandas object or an array-like."""
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Precision is resampled on this grid because precision_recall_curve
    # returns a different number of points for every fold.
    recall_grid = np.linspace(0, 1, 101)
    interp_precisions = []
    auc_list = []

    print("beginning cv")
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = _rows(X, train_index), _rows(X, test_index)
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        # Fit a copy so the caller's (possibly already fitted) model is not
        # overwritten by a model trained on a single fold.
        fold_model = copy.deepcopy(model)
        fit = getattr(fold_model, "fit_not_pretrained", None) or fold_model.fit
        fit(X_train, y_train)

        # A PR curve needs continuous scores; hard labels collapse it to a
        # single operating point, so they are only a fallback.
        if hasattr(fold_model, "predict_proba"):
            y_score = fold_model.predict_proba(X_test)[:, 1]
        else:
            y_score = fold_model.predict(X_test)

        precision, recall, _ = precision_recall_curve(y_test, y_score)
        auc_list.append(auc(recall, precision))

        # recall is returned in decreasing order; np.interp needs increasing x.
        interp_precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))

    mean_auc = np.mean(auc_list)
    std_auc = np.std(auc_list)
    mean_precision = np.mean(interp_precisions, axis=0)

    # Raw string keeps the LaTeX "\pm" without an invalid-escape warning.
    plt.plot(recall_grid, mean_precision, color='b', label=r'Mean AUC = %0.2f $\pm$ %0.2f' % (mean_auc, std_auc))
    # plt.show()
    

In [None]:
# Sanity check: list the models that have scores and the metric keys stored
# for the ensemble.
print(model_scores.keys())
print(model_scores['ensemble'].keys())

In [None]:
# Reload the saved scores, print the ensemble's metrics and, for every other
# model, the relative difference (ensemble - model) / model per metric, then
# draw a bar chart of PR-AUC and F1 for all models.
with open("model_scores.pkl", "rb") as _fh:
    model_scores = pickle.load(_fh)
print(model_scores.keys())

_ens = model_scores['ensemble']
for k, v in model_scores.items():
    print(_ens['test_average_precision'], v['test_average_precision'])
    if k == 'ensemble':
        # Absolute values for the ensemble itself.
        for _label, _key in (
            ('brier', 'test_neg_brier_score'),
            ('logloss', 'test_neg_log_loss'),
            ('ap', 'test_average_precision'),
            ('roc', 'test_roc_auc'),
            ('f1', 'test_f1'),
            ('acc', 'test_accuracy'),
            ('prec', 'test_precision'),
        ):
            print(k, _label, _ens[_key])
        continue
    # Relative differences; the sign is flipped for the two "neg_" metrics.
    for _label, _key, _sign in (
        ('test_average_precision', 'test_average_precision', 1),
        ('roc', 'test_roc_auc', 1),
        ('f1', 'test_f1', 1),
        ('acc', 'test_accuracy', 1),
        ('prec', 'test_precision', 1),
        ('brier', 'test_neg_brier_score', -1),
        ('neg loss', 'test_neg_log_loss', -1),
        ('recall', 'test_recall', 1),
    ):
        print(k, _label, _sign * (_ens[_key] - v[_key]) / v[_key])


# One row per model, one column per stored metric.
# NOTE(review): this rebinds the notebook-level name `df`, which earlier cells
# use for the input data -- confirm nothing later relies on the old frame.
df = pd.DataFrame.from_dict(model_scores).T
for column in df.columns:
    try:
        df[column] = df[column].astype(float)
        print("success")
    except (ValueError, TypeError):
        # Columns with non-scalar entries (e.g. raw per-fold curve points)
        # cannot be cast and are left as object dtype.
        continue

df['PR_AUC'] = df['test_average_precision']
df.drop(columns=['test_average_precision'], inplace=True)



df[['PR_AUC', 'test_f1']].plot.bar(figsize=(8, 8), ylim=(0.4, 1), alpha=0.5, rot=30, fontsize=20)
plt.legend(fontsize = 20)

# df[['test_average_precision', 'test_roc_auc', 'test_f1']].plot.bar(y=['test_average_precision', 'test_roc_auc', 'test_f1'], figsize=(8, 8), ylim=(0.4, 1), alpha=0.5)

# increase alpha for the second bar plot
# df.loc['ensemble'].plot.bar(y=['test_average_precision', 'test_roc_auc', 'test_f1'], figsize=(8,8),ylim=(0.4, 1),alpha=1.0)

In [None]:
# Plot the ensemble's stored precision-recall curve together with the
# fold-averaged curves of the random forest and knn models.
# ds = dataset['f2-4']
# X, y = ds['X'], ds['y']
# for k, v in model_scores.items():
#     # graph each model's precision recall curve
#     print(k, v['test_average_precision'])
#     print(k, v['test_recall'])
#     print(k, v['test_f1'])
#     print(k, v['test_accuracy'])
#     print(k, v['test_precision'])

# print(model_scores['ensemble']['reg_prec'][0][0]) # 0 is recall, 1 is precision

plt.ylim(0.49, 1.01)
# NOTE(review): 'cv.pkl' is only written by a commented-out dump in this
# notebook; confirm the file on disk matches the current `cv` results.
with open('cv.pkl', 'rb') as _fh:
    aa = pickle.load(_fh)['reg_prec']
# Each 'reg_prec' entry is a (recall, precision) pair for one fold; only the
# fold at index 1 is drawn.
plt.plot(aa[1][0], aa[1][1], marker='.', label='ensemble', color='red', linewidth=1)
# print(auc(model_scores['ensemble']['reg_prec'][1][0], model_scores['ensemble']['reg_prec'][1][1]))

# test(em, X, y)
# test(knn, X.values, y)
# test(randforest, X.values, y)
# test(mlp, X.values, y)
# test(xgb1, X.values, y)

# Despite the helper's name, each call draws an averaged precision-recall curve.
draw_avg_roc_curve(randforest, "random forest", X, y)
draw_avg_roc_curve(knn, "knn", X, y)

# draw_avg_roc_curve(mlp, "mlp", X, y)
# draw_avg_roc_curve(xgb1, "xgb", X, y)

# no_skill = len(y_test[y_test==1]) / len(y_test)
# plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill', linewidth=2)
# draw_avg_roc_curve(xgb1, "xgb", X, y)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Cross-Validated Precision-Recall AUC')
plt.legend(loc="lower left")
plt.show()

In [None]:
def draw_avg_roc_curve(model, name, X, y, multiple=False):
    """Plot the fold-averaged precision-recall curve of ``model``.

    Despite the name, this draws a precision-recall curve, not a ROC curve.
    A copy of ``model`` is fitted on each training fold of a shuffled,
    stratified 5-fold split and scored with ``predict_proba`` on the matching
    test fold. Each fold's precision is interpolated onto a shared recall
    grid, because the curves have a different number of points per fold, and
    the mean curve is drawn on the current matplotlib axes.

    Averaging approach follows
    https://stats.stackexchange.com/questions/186337/average-roc-for-repeated-10-fold-cross-validation-with-probability-estimates

    Args:
        model: Classifier exposing ``fit`` and ``predict_proba``. It is copied
            for every fold, so the object passed in is left untouched.
        name: Legend label. The curve is drawn in red when the name is
            "ensemble" (case-insensitive).
        X: Feature matrix (pandas object or array-like).
        y: Binary labels aligned with ``X``.
        multiple: Currently unused; kept so existing calls keep working.

    Returns:
        The mean PR-AUC over the folds, rounded to 3 decimals.
    """
    import copy

    def _rows(data, idx):
        """Select rows by position from a pandas object or an array-like."""
        return data.iloc[idx] if hasattr(data, "iloc") else np.asarray(data)[idx]

    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

    recall_grid = np.linspace(0, 1, 101)
    interp_precisions = []
    fold_aucs = []

    for train, test in kf.split(X, y):
        # Fit a copy so a model the caller already trained is not replaced by
        # one trained on a single fold.
        fold_model = copy.deepcopy(model)
        fold_model.fit(_rows(X, train), _rows(y, train))

        y_score = fold_model.predict_proba(_rows(X, test))
        precision, recall, _ = precision_recall_curve(_rows(y, test), y_score[:, 1])

        fold_auc = auc(recall, precision)
        print("auc split: ", fold_auc)
        fold_aucs.append(fold_auc)

        # recall is returned in decreasing order; np.interp needs increasing x.
        interp_precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))

    avgauc = sum(fold_aucs) / splits
    mean_precision = np.mean(interp_precisions, axis=0)

    plot_kwargs = {"label": f"{name}"}
    if name.lower() == "ensemble":
        plot_kwargs["color"] = "red"
    plt.plot(recall_grid, mean_precision, **plot_kwargs)

    return round(avgauc, 3)


In [None]:
# Draw the fold-averaged precision-recall curve of the random forest on the
# current X, y (the helper draws a PR curve despite its name).
draw_avg_roc_curve(randforest, "random forest", X, y)

In [None]:
# Print the stored 'test_neg_log_loss' value of every model in `model_scores`.
for k, v in model_scores.items():
    # print(v)
    # print(v.keys())
    print(k, v['test_neg_log_loss'])
    # print(k, v['test_neg_brier_score'])


In [None]:
# print(em.meta_classifier)
# Save the ensemble's cross-validation results; the context manager
# guarantees the file is flushed and closed even if pickling fails part-way.
with open('em-score.pkl', 'wb') as _fh:
    pickle.dump(cv, _fh)

In [None]:
# Persist the ensemble; the context manager guarantees the file is flushed and
# closed even if pickling fails part-way.
with open('models/curr_models/em-f2-4.pkl', 'wb') as _fh:
    pickle.dump(em, _fh)

In [None]:
# knn baseline ("researchers" model) trained on the 'f3-4' dataset.
# After this cell X / y and the train/test split refer to 'f3-4'.
ds = dataset['f3-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Single balanced bag around a 1-nearest-neighbour classifier.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` -- confirm against the installed version.
knn = BalancedBaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1), n_estimators=1, n_jobs=-1)
knn.fit(X_train, y_train)

In [None]:
# Hold-out precision-recall curves: the ensemble and the random forest are
# scored on the 'f2-4' test split, knn on the 'f3-4' test split. All curves
# share one figure.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensemble: PR curve from the positive-class probability.
precision, recall, thresholds = precision_recall_curve(y_test, em.predict_proba(X_test)[:,1])
area = auc(recall, precision)

print('Area Under Curve: %.2f' % area)

print(accuracy_score(y_test, em.predict(X_test)))

# print(average_precision_score(y_test, em.predict_proba(X_test)[:,1]))

# plot the ensemble's precision-recall curve
plt.plot(recall, precision, marker='.', label='Stacking', linewidth=2)

# same area as above, printed at full precision
print(auc(recall, precision))

# Positive-class share of the test split. Only the commented-out "No Skill"
# baseline below uses it.
no_skill = len(y_test[y_test==1]) / len(y_test)
# plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill', linewidth=2)

# precision, recall, thresholds = precision_recall_curve(y_test, xgb1.predict_proba(X_test)[:, 1])
# plt.plot(recall, precision, marker='.', label='XGBoost')


# precision-recall curve for the random forest (still on the 'f2-4' split)
precision, recall, thresholds = precision_recall_curve(y_test, randforest.predict_proba(X_test)[:, 1])
plt.plot(recall, precision, marker='.', label='Random Forest', linewidth=2)
print(auc(recall, precision))

# precision, recall, thresholds = precision_recall_curve(y_test, mlp.predict_proba(X_test)[:, 1])
# plt.plot(recall, precision, marker='.', label='MLP', linewidth=2)
# print(auc(recall, precision))

# Switch to the 'f3-4' dataset for knn; X_test / y_test now refer to 'f3-4'.
ds = dataset['f3-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

precision, recall, thresholds = precision_recall_curve(y_test, knn.predict_proba(X_test)[:, 1])
# Zoom both axes to [0.49, 1.01]; this applies to every curve on the figure.
plt.xlim([0.49, 1.01])
plt.ylim([0.49, 1.01])
plt.plot(recall, precision, marker='.', label='KNN', linewidth=2)
print(auc(recall, precision))

# raw curve points for knn
print(precision)
print(recall)

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Candidate models: knn, random forest, xgboost, mlp, svm, gradient boosting classifier, logistic regression
# Hold-out precision-recall curves: the ensemble and the random forest are
# scored on the 'f2-4' test split, knn on the 'f3-4' test split. All curves
# share one figure.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensemble: PR curve from the positive-class probability.
precision, recall, thresholds = precision_recall_curve(y_test, em.predict_proba(X_test)[:,1])
area = auc(recall, precision)

print('Area Under Curve: %.2f' % area)

print(accuracy_score(y_test, em.predict(X_test)))

# pickle.dump(em, open('models/curr_models/custom-ensemble-f2-4.pkl', 'wb'))
# print(average_precision_score(y_test, em.predict_proba(X_test)[:,1]))

# plot the ensemble's precision-recall curve
plt.plot(recall, precision, marker='.', label='Stacking', linewidth=2)

# same area as above, printed at full precision
print(auc(recall, precision))

# no_skill = len(y_test[y_test==1]) / len(y_test)

# plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill', linewidth=2)

# precision, recall, thresholds = precision_recall_curve(y_test, xgb1.predict_proba(X_test)[:, 1])
# plt.plot(recall, precision, marker='.', label='XGBoost')


# precision-recall curve for the random forest (still on the 'f2-4' split)
precision, recall, thresholds = precision_recall_curve(y_test, randforest.predict_proba(X_test)[:, 1])
plt.plot(recall, precision, marker='.', label='Random Forest', linewidth=2)
print(auc(recall, precision))

# precision, recall, thresholds = precision_recall_curve(y_test, mlp.predict_proba(X_test)[:, 1])
# plt.plot(recall, precision, marker='.', label='MLP', linewidth=2)
# print(auc(recall, precision))

# Switch to the 'f3-4' dataset for knn; X_test / y_test now refer to 'f3-4'.
ds = dataset['f3-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

precision, recall, thresholds = precision_recall_curve(y_test, knn.predict_proba(X_test)[:, 1])
# Zoom both axes to [0.49, 1.01]; this applies to every curve on the figure.
plt.xlim([0.49, 1.01])
plt.ylim([0.49, 1.01])
plt.plot(recall, precision, marker='.', label='KNN', linewidth=2)
print(auc(recall, precision))

# raw curve points for knn
print(precision)
print(recall)

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Hold-out ROC curves: the ensemble and the random forest are scored on the
# 'f2-4' test split, knn on the 'f3-4' test split. All curves share one figure.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Score each model once and reuse the probabilities for both the curve and the
# AUC. Add ('MLP', mlp) or ('XGBoost', xgb1) to the tuple to include them.
for _label, _clf in (('Stacking', em), ('Random Forest', randforest)):
    _scores = _clf.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, _scores)
    auc_thing = roc_auc_score(y_test, _scores)
    print("roc: " + str(auc_thing))
    plt.plot(fpr,tpr, marker='.', label=_label)

# Switch to the 'f3-4' ("researchers") dataset for knn; X_test / y_test now
# refer to 'f3-4'.
ds = dataset['f3-4'] # researchers
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

_scores = knn.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, _scores)
auc_thing = roc_auc_score(y_test, _scores)
print("roc: " + str(auc_thing))
plt.plot(fpr,tpr, marker='.', label='KNN')

# Without a legend the curve labels are never shown.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Brier score (em, randforest, knn) followed by log loss (em, randforest, knn,
# mlp, xgb1) on whatever X_test / y_test the last executed cell left behind.
# NOTE(review): in file order the preceding cell leaves the 'f3-4' split in
# X_test / y_test, while em and the other base models are evaluated on 'f2-4'
# elsewhere -- confirm the intended split before reading these numbers.
for _clf in (em, randforest, knn):
    print(brier_score_loss(y_test, _clf.predict_proba(X_test)[:,1]))

for _clf in (em, randforest, knn, mlp, xgb1):
    print(log_loss(y_test, _clf.predict_proba(X_test)[:,1]))


In [None]:
# 5-fold cross-validation of the ensemble's meta-classifier alone, run in
# parallel across all cores.
# NOTE(review): this scores `em.meta_clf_` directly on X rather than on the
# base models' outputs -- confirm that is the intended experiment.
x = cross_validate(em.meta_clf_, X, y, cv=5, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score', 'average_precision'], verbose=1, n_jobs=-1)

In [None]:
# Print the mean of every array in the latest cross_validate result `x`.
for k, v in x.items():
    print(k, v.mean())
    # modelScores[name][k]=v.mean()

In [None]:
# 5-fold cross-validated average precision of the full ensemble, run in
# parallel across all cores.
# NOTE(review): cross_val_score trains through `em.fit`, while other cells use
# `em.fit_not_pretrained` -- confirm both train the ensemble the same way.
z = cross_val_score(em, X, y, cv=5, scoring='average_precision', verbose=1, n_jobs=-1)

In [None]:
# Mean of the per-fold scores stored in `z`.
print(z.mean())

In [None]:
# Accuracy, recall and F1 of the ensemble on the current hold-out split.
# Predict once and reuse the labels for all three metrics.
_em_pred = em.predict(X_test)
print(accuracy_score(y_test, _em_pred))
print(recall_score(y_test, _em_pred))
print(f1_score(y_test, _em_pred))
# pickle.dump(em, open('models/curr_models/ensemble.pkl', 'wb'))

# Load a previously saved model for comparison. Only unpickle files you
# created yourself: pickle can execute arbitrary code while loading.
with open('models/curr_models/xgb1-test.pkl', 'rb') as _fh:
    asdf = pickle.load(_fh)


# print(em.)


In [None]:
# Accuracy, recall and F1 on the current hold-out split, first for the model
# loaded into `asdf`, then for `temp_svm`.
for _clf in (asdf, temp_svm):
    for _metric in (accuracy_score, recall_score, f1_score):
        print(_metric(y_test, _clf.predict(X_test)))

In [None]:
# 2-fold cross-validated recall of the full ensemble, run in parallel across
# all cores. The commented block below is a disabled variant that stored
# several metrics under a 'modelScores' dict.
# print(em.clfs_)
x = cross_val_score(em, X, y, cv=2, scoring='recall', verbose=1, n_jobs=-1)
# x = cross_validate(em.clfs_, X, y, cv=2, scoring=['recall', 'f1', 'accuracy', 'precision', 'roc_auc', 'neg_brier_score'], verbose=1, n_jobs=-1)
# name = 'ensemble_lengthdiv_4'
# if (name not in modelScores):
#     modelScores[name] = {}
#     for k, v in x.items():
#         print(k, v.mean())
#         modelScores[name][k]=v.mean()
# else:
#     print('already in modelScores')

In [None]:
# Show the most recent cross-validation result held in `x`.
print(x)

In [None]:
# dump em; the context manager guarantees the file is flushed and closed even
# if pickling fails part-way.
with open('models/curr_models/em.pkl', 'wb') as _fh:
    pickle.dump(em, _fh)