In [None]:
from itertools import permutations, product

import pandas as pd
import numpy as np
from numpy import mean, std
import os

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, auc, confusion_matrix, balanced_accuracy_score, precision_recall_curve, roc_curve, roc_auc_score, f1_score, recall_score, precision_score, brier_score_loss, average_precision_score, classification_report, log_loss
from sklearn.inspection import permutation_importance
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from collections import Counter, OrderedDict
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier

import pickle
import json

from ctgan import CTGANSynthesizer
from mlxtend.classifier import StackingCVClassifier

from os import path
import tqdm
import matplotlib.pyplot as plt

from warnings import simplefilter
from torchviz import make_dot

# Temporarily change into the `utils` directory so the project-local helper
# modules below can be imported by bare name. Only two launch directories are
# recognised ('project' and 'train_and_vis'); from anywhere else no `%cd` runs.
# NOTE: splitting on '/' assumes POSIX-style path separators.
if (os.path.abspath('').split('/')[-1] == 'project'):
    %cd utils
elif (os.path.abspath('').split('/')[-1] == 'train_and_vis'):
    %cd ../utils

# Project-local helper modules (resolved relative to the current directory).
import query_utils
import model_utils
import validation_utils
import data_utils

# If the working directory is now `utils`, step back up one level so that the
# relative paths used later ('data/', 'models/...') resolve from the parent.
if (os.path.abspath('').split('/')[-1] == 'utils'):
    %cd ..

# Silence pandas PerformanceWarning and every FutureWarning, and turn off the
# pandas chained-assignment warning for the whole notebook.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [None]:
class StackingClassifier():
    """Two-level stacking ensemble for binary classification.

    Each base classifier produces one meta-feature column (either the
    positive-class probability or the hard label, depending on ``use_probas``);
    ``meta_classifier`` is trained on those columns.

    Parameters
    ----------
    classifiers : list
        Base estimators exposing ``fit``, ``predict`` and ``predict_proba``.
    meta_classifier : estimator
        Second-level estimator exposing ``fit``, ``predict`` and
        ``predict_proba``.
    n_folds : int, default=5
        Number of CV folds used by :meth:`fit`.
    use_probas : bool, default=True
        If True, meta-features are ``predict_proba(X)[:, 1]``; otherwise they
        are ``predict(X)``.
    """

    def __init__(self, classifiers, meta_classifier, n_folds=5, use_probas=True):
        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        self.n_folds = n_folds
        self.use_probas = use_probas

    @staticmethod
    def _fit_base(clf, X, y):
        """Fit a single base model on (X, y) and return the fitted model."""
        if type(clf).__name__ == 'XGBClassifier':
            # XGBoost gets a 15% validation slice for early stopping.
            X_fit, X_val, y_fit, y_val = train_test_split(
                X, y, test_size=0.15, random_state=1)
            # Early-stopping options are estimator parameters: passing them to
            # fit() is deprecated since XGBoost 1.6 and rejected from 2.0.
            # NOTE: this leaves both parameters set on the estimator.
            clf.set_params(eval_metric='aucpr', early_stopping_rounds=20)
            return clf.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
        return clf.fit(X, y)

    def _meta_features(self, X):
        """Build the meta-feature matrix, one column per base classifier."""
        if self.use_probas:
            columns = [clf.predict_proba(X)[:, 1] for clf in self.classifiers]
        else:
            columns = [clf.predict(X) for clf in self.classifiers]
        return np.column_stack(columns)

    def fit_not_pretrained(self, X_train, y_train, cv=10, verbose=False):
        """Train the base models and the meta-classifier from scratch.

        Out-of-fold predictions from a stratified ``cv``-fold split become the
        meta-classifier's training features. Each base model is then refit on
        the full training set, so the models used at prediction time have seen
        all training rows rather than only the last fold's subset.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)
        cv : int, default=10
            Number of stratified folds.
        verbose : bool, default=False
            If True, also print one progress line per fold.
        """
        # Fold indices are positional, so work on plain arrays.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        print(f"Training stacking with {len(self.classifiers)} base models")
        kfold = StratifiedKFold(n_splits=cv, random_state=42, shuffle=True)
        out_of_fold_predictions = np.zeros((X_train.shape[0], len(self.classifiers)))

        for i, clf in enumerate(self.classifiers):
            print(f"Training base {i+1}/{len(self.classifiers)}")
            for fold, (train_index, holdout_index) in enumerate(kfold.split(X_train, y_train)):
                if verbose:
                    print(f"  fold {fold+1}/{cv}")
                fitted = self._fit_base(clf, X_train[train_index], y_train[train_index])

                if self.use_probas:
                    y_pred = fitted.predict_proba(X_train[holdout_index])[:, 1]
                else:
                    y_pred = fitted.predict(X_train[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

            # Final base model for inference: trained on every training row.
            self.classifiers[i] = self._fit_base(clf, X_train, y_train)

        self.meta_classifier.fit(out_of_fold_predictions, y_train)
        print("stacking training complete")

    def fit(self, X, y):
        """Train from scratch using ``self.n_folds`` folds; returns ``self``."""
        self.fit_not_pretrained(X, y, cv=self.n_folds)
        return self

    def predict(self, X):
        """Return the meta-classifier's class predictions for ``X``."""
        return self.meta_classifier.predict(self._meta_features(X))

    def predict_proba(self, X):
        """Return the meta-classifier's class probabilities for ``X``.

        Meta-features are built the same way as during training (honouring
        ``use_probas``), so the meta-classifier always sees inputs of the kind
        it was fitted on.
        """
        return self.meta_classifier.predict_proba(self._meta_features(X))

### Synthetic Data testing
Fit one `CTGANSynthesizer` per class (zoonotic and non-zoonotic) and save each fitted model.

In [None]:
def _fit_class_gan(frame, label, max_rows, save_path):
    """Fit and save a CTGAN on the feature columns of one class.

    Takes the first ``max_rows`` rows of ``frame`` whose 'isZoonotic' value
    equals ``label``, drops the 'isZoonotic' column, prints the result, fits a
    CTGANSynthesizer on it and saves the model to ``save_path``.
    Returns ``(feature_rows, fitted_model)``.
    """
    class_rows = frame.loc[frame['isZoonotic'] == label][:max_rows]
    keep_columns = class_rows.columns != 'isZoonotic'
    class_rows = class_rows.loc[:, keep_columns]
    print(class_rows)

    gan = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
    gan.fit(class_rows)
    gan.save(save_path)
    return class_rows, gan


# NOTE(review): `df` is not defined in the visible cells; it is assumed to be
# a DataFrame with an 'isZoonotic' label column — confirm where it is loaded.
isZoonotic, posGanModel = _fit_class_gan(
    df, 1, 1200, 'models/curr_models/posGanModel.pkl')

notZoonotic, negGanModel = _fit_class_gan(
    df, 0, 3000, 'models/curr_models/negGanModel.pkl')

In [None]:
# Load the merged datasets; entries are looked up by keys such as 'f2-4'.
dataset = data_utils.retrieveMerged(dir='data/')
print(f"Available datasets: {list(dataset.keys())}")

# Quick sanity summary of the 'f2-4' entry: sizes and class balance.
_f24 = dataset['f2-4']
_n_positive = sum(_f24['y'])
print(f"Dataset f2-4 shape: X={len(_f24['X'])}, y={len(_f24['y'])}")
print(f"Positive samples: {_n_positive}, Negative samples: {len(_f24['y'])-_n_positive}")

In [None]:
# Mean cross-validation scores, keyed by '<model>_<feature>_<kmer>'.
modelScores = dict()

# Feature-set prefixes used to build dataset keys.
features = [
    'f1',
    'f2',
    'f3',
]

# Scorer names passed to cross_validate.
scoring_metrics = [
    'recall',
    'f1',
    'accuracy',
    'precision',
    'roc_auc',
    'neg_brier_score',
    'average_precision',
]

In [None]:
# Cross-validate every model on every (k-mer length, feature set) dataset.
# Results already present in `modelScores` are skipped, so the cell can be
# re-run to fill in only the missing combinations.
model_configs = {
    'knn': KNeighborsClassifier(n_neighbors=1, n_jobs=-1),
    'rf': BalancedRandomForestClassifier(max_features="sqrt", n_jobs=-1),
    'xgb': XGBClassifier(
        learning_rate=0.1,
        n_estimators=300,
        max_depth=9,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=42,
        n_jobs=-1,
        scale_pos_weight=6,
    ),
    'mlp': MLPClassifier(
        alpha=0.6,
        hidden_layer_sizes=(100, 180, 180, 200, 200),
        max_iter=550,
        random_state=42,
        solver='adam',
        activation='relu',
    ),
    'svm': SVC(
        kernel='rbf',
        C=2,
        gamma=0.6,
        probability=True,
        random_state=42,
        max_iter=500,
    ),
}

for kmer, feature in product(range(3, 7), features):
    print(f"Training models for {feature}-{kmer}...")
    ds = dataset[f'{feature}-{kmer}']
    X, y = ds['X'], ds['y']

    for model_name, model in model_configs.items():
        name = f'{model_name}_{feature}_{kmer}'
        if name in modelScores:
            continue

        print(f"  Training {model_name}...")
        # The SVM is scored on every metric except the last one in the list.
        if model_name == 'svm':
            current_scoring = scoring_metrics[:-1]
        else:
            current_scoring = scoring_metrics

        try:
            cv_results = cross_validate(model, X, y, cv=5, scoring=current_scoring)
            modelScores[name] = {
                metric: values.mean() for metric, values in cv_results.items()
            }
            print(f"    Completed {model_name}")
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"    Error: {e}")

In [None]:
# One row per trained model, one column per cross_validate output key.
score_df = pd.DataFrame(modelScores).transpose()
print(f"Total models trained: {score_df.shape[0]}")
score_df.head()

In [None]:
# Hold out 20% of the 'f2-4' dataset; the ensemble is trained on the rest.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learner 1: balanced bagging over five MLPs.
_mlp_base = MLPClassifier(
    alpha=0.6,
    hidden_layer_sizes=(100, 180, 180, 200, 200),
    max_iter=550,
    random_state=42,
    solver='adam',
    activation='relu',
)
mlp = BalancedBaggingClassifier(estimator=_mlp_base, n_estimators=5, n_jobs=-1)

# Base learner 2: balanced random forest.
rf = BalancedRandomForestClassifier(max_features="sqrt", n_jobs=-1)

# Base learner 3: gradient-boosted trees.
# NOTE(review): binding this to `xgb` rebinds the `import xgboost as xgb`
# module alias for the rest of the notebook.
_xgb_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    seed=42,
    n_jobs=-1,
    scale_pos_weight=6,
)
xgb = XGBClassifier(**_xgb_params)

# Logistic-regression meta-learner on the base models' positive-class probabilities.
_meta = LogisticRegression(C=1, random_state=42, solver='saga')
em = StackingClassifier(
    classifiers=[mlp, rf, xgb],
    meta_classifier=_meta,
    use_probas=True,
)

# The stacker indexes rows positionally, so pass the raw feature array.
em.fit_not_pretrained(X_train.values, y_train)

In [None]:
def draw_avg_roc_curve(model, name, X, y, multiple=False):
    """Plot the fold-averaged precision-recall curve of ``model``; return mean PR-AUC.

    Despite its name, this function draws a precision-recall curve, not a ROC
    curve. ``model`` is refit in place on each of 5 stratified, shuffled folds.
    Each fold's curve is interpolated onto a shared recall grid before
    averaging, because ``precision_recall_curve`` returns arrays whose length
    varies from fold to fold and so cannot be averaged element-wise.

    Approach based on
    https://stats.stackexchange.com/questions/186337/average-roc-for-repeated-10-fold-cross-validation-with-probability-estimates

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``
    name : str
        Legend label; the curve is drawn in red when it equals "ensemble"
        (case-insensitive).
    X : DataFrame or array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
    multiple : bool, default=False
        Currently unused; kept for call compatibility.

    Returns
    -------
    float
        Mean of the per-fold PR-AUC values, rounded to 3 decimals.
    """
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

    # Fold indices are positional, so make the labels a plain array.
    y = np.asarray(y)
    is_frame = hasattr(X, "iloc")

    # Shared x-axis on which every fold's precision is sampled.
    recall_grid = np.linspace(0.0, 1.0, 101)
    fold_precisions = []
    avgauc = 0

    for train, test in kf.split(X, y):
        X_fit = X.iloc[train] if is_frame else X[train]
        X_eval = X.iloc[test] if is_frame else X[test]

        model = model.fit(X_fit, y[train])
        y_score = model.predict_proba(X_eval)
        precision, recall, _ = precision_recall_curve(y[test], y_score[:, 1])

        auc_thing = auc(recall, precision)
        avgauc += auc_thing
        print("auc split: ", auc_thing)

        # precision_recall_curve returns recall in decreasing order, while
        # np.interp needs increasing x-coordinates, hence the reversal.
        fold_precisions.append(np.interp(recall_grid, recall[::-1], precision[::-1]))

    avgauc /= splits
    mean_precision = np.mean(fold_precisions, axis=0)

    plot_kwargs = {"color": "red"} if name.lower() == "ensemble" else {}
    plt.plot(recall_grid, mean_precision, label=f"{name}", **plot_kwargs)

    return round(avgauc, 3)


In [None]:
# Hold-out metrics for the stacking ensemble. Predict once and reuse the
# result: every em.predict call re-runs all base models.
y_pred_em = em.predict(X_test)
print(accuracy_score(y_test, y_pred_em))
print(recall_score(y_test, y_pred_em))
print(f1_score(y_test, y_pred_em))
# pickle.dump(em, open('models/curr_models/ensemble.pkl', 'wb'))

# Load a previously saved model; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('models/curr_models/xgb1-test.pkl', 'rb') as model_file:
    asdf = pickle.load(model_file)

In [None]:
# 2-fold cross-validated recall: balanced random forest vs. stacking ensemble.
# NOTE(review): cross_val_score clones its estimator, which needs the
# scikit-learn estimator API (`get_params`, `fit`); confirm `em` provides it.
_cv_options = dict(cv=2, scoring='recall', verbose=1, n_jobs=-1)
x1 = cross_val_score(model_configs["rf"], X, y, **_cv_options)
x2 = cross_val_score(em, X, y, **_cv_options)
print("rf: ", x1)
print("em: ", x2)

## Precision-Recall Curves

In [None]:
# Same 'f2-4' split (same random_state) as used to train the ensemble, so
# X_test is data the ensemble has not seen.
ds = dataset['f2-4']
X, y = ds['X'], ds['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _plot_pr_curve(y_true, y_score, label, **plot_kwargs):
    """Draw one precision-recall curve on the current figure; return its AUC."""
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    area = auc(recall, precision)
    plt.plot(recall, precision, marker='.', label=f'{label} (AUC={area:.3f})',
             linewidth=2, **plot_kwargs)
    return area


plt.figure(figsize=(10, 8))

# Ensemble
_plot_pr_curve(y_test, em.predict_proba(X_test.values)[:, 1], 'Ensemble', color='red')

# Random Forest. cross_validate / cross_val_score fit clones, so the estimator
# stored in model_configs is never fitted itself; fit a fresh copy with the
# same hyperparameters on the training split before scoring the hold-out set.
rf_f2 = BalancedRandomForestClassifier(**model_configs["rf"].get_params(deep=False))
rf_f2.fit(X_train, y_train)
_plot_pr_curve(y_test, rf_f2.predict_proba(X_test)[:, 1], 'Random Forest')

# KNN (on f3-4 dataset as in original)
ds_knn = dataset['f3-4']
X_knn, y_knn = ds_knn['X'], ds_knn['y']
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_knn, y_knn, test_size=0.2, random_state=42)
knn_f3 = BalancedBaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=1, n_jobs=-1), n_estimators=1, n_jobs=-1)
knn_f3.fit(X_train_knn, y_train_knn)
_plot_pr_curve(y_test_knn, knn_f3.predict_proba(X_test_knn)[:, 1], 'KNN')

# No-skill line: precision of a classifier that labels everything positive,
# i.e. the positive-class share of the f2-4 test labels.
no_skill = len(y_test[y_test==1]) / len(y_test)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', color='gray', label='No Skill')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()