<a href="https://colab.research.google.com/github/CleoHabets/MAI3004_RevealingResistance/blob/main/PersonalCrossVal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading
Run all of these cells first.

In [None]:
# Basic libraries
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

import seaborn as sns

# Machine learning
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.dummy import DummyClassifier # For baseline/ comparison model
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# Still need to import SVM etc..
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import random
from tensorflow import keras

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix
)



In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz

# Feature matrices, one train/test pair per sequence encoding.
# Label encoding: CSV files read into DataFrames (first CSV column is the index).
train_X_label, test_X_label = (
    pd.read_csv(path, index_col=0)
    for path in ("train_X_label.csv", "test_X_label.csv")
)

# One-hot encoding: SciPy sparse matrices stored as .npz.
train_X_OneHot, test_X_OneHot = map(
    load_npz, ("train_X_OneHot.npz", "test_X_OneHot.npz")
)

# FCGR encoding: NumPy arrays stored as .npy.
train_X_fcgr, test_X_fcgr = map(np.load, ("train_X_fcgr.npy", "test_X_fcgr.npy"))

# Targets, one train/test pair per antibiotic. Each CSV is read with its first
# column as the index, and `.iloc[:, 0]` keeps the first remaining column as a
# Series (the files are expected to hold a single data column).
train_y_CIP, test_y_CIP = (
    pd.read_csv(path, index_col=0).iloc[:, 0]
    for path in ("train_y_CIP.csv", "test_y_CIP.csv")
)
train_y_CTX, test_y_CTX = (
    pd.read_csv(path, index_col=0).iloc[:, 0]
    for path in ("train_y_CTX.csv", "test_y_CTX.csv")
)
train_y_CTZ, test_y_CTZ = (
    pd.read_csv(path, index_col=0).iloc[:, 0]
    for path in ("train_y_CTZ.csv", "test_y_CTZ.csv")
)
train_y_GEN, test_y_GEN = (
    pd.read_csv(path, index_col=0).iloc[:, 0]
    for path in ("train_y_GEN.csv", "test_y_GEN.csv")
)

In [None]:
# Flatten every FCGR sample into a single feature row: the first axis (samples)
# is kept and all remaining axes are merged into one.
train_X_fcgr_flat = np.reshape(train_X_fcgr, (train_X_fcgr.shape[0], -1))
test_X_fcgr_flat = np.reshape(test_X_fcgr, (test_X_fcgr.shape[0], -1))

In [None]:
from scipy.sparse import vstack

# Pool the train and test portions back into one dataset per encoding
# (train rows first, test rows after).
X_label = pd.concat((train_X_label, test_X_label))
X_OneHot = vstack((train_X_OneHot, test_X_OneHot))
X_fcgr = np.concatenate((train_X_fcgr_flat, test_X_fcgr_flat), axis=0)

# Same pooling for the per-antibiotic targets; pd.concat keeps the original
# index labels of both parts.
y_CIP = pd.concat((train_y_CIP, test_y_CIP))
y_CTX = pd.concat((train_y_CTX, test_y_CTX))
y_CTZ = pd.concat((train_y_CTZ, test_y_CTZ))
y_GEN = pd.concat((train_y_GEN, test_y_GEN))

In [None]:
def runmodel(train_X, train_y, test_X, test_y, model_chosen, X_name, Y_name, balance_method):
    """Fit one classifier on a training split and score it on a test split.

    Parameters
    ----------
    train_X, test_X
        Feature matrices passed straight to the scikit-learn estimator.
    train_y, test_y
        Target labels. Precision/recall/F1 use scikit-learn's default binary
        averaging and the ROC score is column 1 of ``predict_proba``, so the
        targets are assumed to be binary.
    model_chosen : str
        One of ``"logistic_regression"``, ``"random_forrest"`` (sic, the
        spelling is part of the existing interface) or ``"SVM"``.
    X_name, Y_name : str
        Labels for the feature encoding and the antibiotic; they are only
        copied into the result dict.
    balance_method : str or None
        ``"class_weight"`` passes ``class_weight="balanced"`` to the model,
        ``"over_sample"`` randomly over-samples the training split with
        imbalanced-learn, and ``"none"`` (or ``None``) applies no balancing.

    Returns
    -------
    dict
        Keys ``model``, ``X_encoding``, ``antibiotic``, the scalar metrics
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``, the
        row-normalised ``confusion_matrix`` and the ROC arrays ``fpr``,
        ``tpr``, ``thresholds``.

    Raises
    ------
    ValueError
        If ``balance_method`` or ``model_chosen`` is not recognised.
    """
    known_balancing = ("over_sample", "class_weight", "none", None)
    known_models = ("logistic_regression", "random_forrest", "SVM")

    # Validate both switches before doing any work: a misspelled balance method
    # used to fall through silently as "no balancing", and an unknown model was
    # only detected after the (potentially slow) over-sampling step.
    if balance_method not in known_balancing:
        raise ValueError("Unknown balance method")
    if model_chosen not in known_models:
        raise ValueError("Unknown model")

    cw = "balanced" if balance_method == "class_weight" else None

    if balance_method == "over_sample":
        # Imported lazily so imbalanced-learn is only required when used.
        from imblearn.over_sampling import RandomOverSampler

        # Only the training split is resampled; the test split stays untouched.
        ros = RandomOverSampler(random_state=42)
        train_X, train_y = ros.fit_resample(train_X, train_y)

    if model_chosen == "logistic_regression":
        model = LogisticRegression(max_iter=2000, class_weight=cw)
    elif model_chosen == "random_forrest":
        model = RandomForestClassifier(
            n_estimators=200,
            max_depth=None,
            min_samples_leaf=2,
            max_features="sqrt",
            n_jobs=-1,
            random_state=1,
            class_weight=cw,
        )
    else:  # "SVM": the only remaining name after the validation above
        model = SVC(kernel="linear", probability=True, random_state=1, class_weight=cw)

    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)

    # Continuous scores for the ROC metrics: prefer probabilities of the second
    # class in model.classes_, otherwise fall back to the decision function.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(test_X)[:, 1]
    else:
        y_score = model.decision_function(test_X)

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, zero_division=0)
    rec = recall_score(test_y, y_pred, zero_division=0)
    f1 = f1_score(test_y, y_pred, zero_division=0)
    auc = roc_auc_score(test_y, y_score)

    fpr, tpr, thresholds = roc_curve(test_y, y_score)
    # normalize="true" divides each row by the number of true samples of that class.
    cm = confusion_matrix(test_y, y_pred, normalize="true")

    return {
        "model": model_chosen,
        "X_encoding": X_name,
        "antibiotic": Y_name,
        "accuracy": float(acc),
        "precision": float(prec),
        "recall": float(rec),
        "f1": float(f1),
        "roc_auc": float(auc),
        "confusion_matrix": cm,
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
    }

In [None]:
def cross_validate_model(X, y, model_chosen, X_name, Y_name, balance_method,
                         n_repeats=5, test_size=0.2):
    """Evaluate a model over repeated stratified hold-out splits and average the results.

    Despite the name this is not k-fold cross-validation: each repetition draws
    a fresh stratified train/test split with ``train_test_split`` (seeded
    ``0 .. n_repeats - 1``), so test sets of different repetitions may overlap.

    Parameters
    ----------
    X, y
        Full feature matrix and target labels; split internally.
    model_chosen, X_name, Y_name, balance_method
        Forwarded unchanged to ``runmodel`` for every repetition.
        ``balance_method`` must be ``"over_sample"``, ``"class_weight"`` or
        ``"none"``.
    n_repeats : int, default 5
        Number of random splits to evaluate.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the held-out fraction.

    Returns
    -------
    dict
        ``model``, ``X_encoding``, ``antibiotic``, ``balance_method``; for each
        of accuracy/precision/recall/f1/roc_auc a ``<metric>_mean`` and
        ``<metric>_std`` (NumPy's default population standard deviation); the
        element-wise mean ``confusion_matrix``; and an averaged ROC curve as
        ``fpr`` (200-point grid on [0, 1]) and ``tpr``.

    Raises
    ------
    ValueError
        If ``balance_method`` is not recognised or ``n_repeats`` is below 1.
    """
    if balance_method not in ("over_sample", "class_weight", "none"):
        raise ValueError("Unknown balance method")
    if n_repeats < 1:
        # With zero repetitions there would be nothing to average.
        raise ValueError("n_repeats must be at least 1")

    results = []
    for seed in range(n_repeats):
        train_X, test_X, train_y, test_y = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=seed
        )
        results.append(
            runmodel(train_X, train_y, test_X, test_y,
                     model_chosen, X_name, Y_name, balance_method)
        )

    # Each split yields its own ROC thresholds, so the curves are interpolated
    # onto one shared FPR grid before averaging.
    mean_fpr = np.linspace(0, 1, 200)
    tprs = []
    for r in results:
        tpr_interp = np.interp(mean_fpr, r["fpr"], r["tpr"])
        tpr_interp[0] = 0.0  # pin the curve to start at (0, 0)
        tprs.append(tpr_interp)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the averaged curve to end at (1, 1)

    final_result = {
        "model": results[0]["model"],
        "X_encoding": results[0]["X_encoding"],
        "antibiotic": results[0]["antibiotic"],
        "balance_method": balance_method,
    }
    # Insertion order (mean, then std, per metric) matches the established
    # column order of tables built from these dicts.
    for metric in ("accuracy", "precision", "recall", "f1", "roc_auc"):
        values = [r[metric] for r in results]
        final_result[f"{metric}_mean"] = float(np.mean(values))
        final_result[f"{metric}_std"] = float(np.std(values))

    final_result["confusion_matrix"] = np.mean(
        [r["confusion_matrix"] for r in results], axis=0
    )
    final_result["fpr"] = mean_fpr
    final_result["tpr"] = mean_tpr

    return final_result

In [None]:
# Collects one summary dict per cross_validate_model() call.
# Re-running this cell discards everything gathered so far.
results_cv = []

In [None]:
# Evaluate a random forest on the label-encoded features for GEN without any
# class balancing, and store the averaged summary.
# NOTE(review): the output saved further down reports
# 'balance_method': 'over_sample', so it comes from an earlier run with a
# different argument; re-run this cell to refresh it.
cv_res = cross_validate_model(X_label, y_GEN, "random_forrest", "label", "GEN", "none")
results_cv.append(cv_res)

In [None]:
# Display every summary accumulated so far (notebook cell output).
results_cv

[{'model': 'random_forrest',
  'X_encoding': 'label',
  'antibiotic': 'GEN',
  'balance_method': 'over_sample',
  'accuracy_mean': 0.7679012345679013,
  'accuracy_std': 0.04215647179991081,
  'precision_mean': 0.5082204995693368,
  'precision_std': 0.06574810196976172,
  'recall_mean': 0.6684210526315789,
  'recall_std': 0.06137844099837156,
  'f1_mean': 0.5763832811518645,
  'f1_std': 0.06197487992976893,
  'roc_auc_mean': 0.803713921901528,
  'roc_auc_std': 0.030975953477651117,
  'confusion_matrix': array([[0.7983871 , 0.2016129 ],
         [0.33157895, 0.66842105]]),
  'fpr': array([0.        , 0.00502513, 0.01005025, 0.01507538, 0.0201005 ,
         0.02512563, 0.03015075, 0.03517588, 0.04020101, 0.04522613,
         0.05025126, 0.05527638, 0.06030151, 0.06532663, 0.07035176,
         0.07537688, 0.08040201, 0.08542714, 0.09045226, 0.09547739,
         0.10050251, 0.10552764, 0.11055276, 0.11557789, 0.12060302,
         0.12562814, 0.13065327, 0.13567839, 0.14070352, 0.14572864,
 

In [None]:
# Model identifiers accepted by runmodel().
models = ["logistic_regression", "random_forrest", "SVM"]

# Feature matrices in the order label / one-hot / FCGR; the train and test
# lists are index-aligned.
# NOTE(review): these lists hold train_X_fcgr / test_X_fcgr as loaded, not the
# *_flat versions created in the reshape cell. If the loaded arrays have more
# than two dimensions, scikit-learn estimators will reject them. Confirm
# which one is intended.
train_X_sets = [train_X_label, train_X_OneHot, train_X_fcgr]
test_X_sets = [test_X_label, test_X_OneHot, test_X_fcgr]

# Targets in the order CIP / CTX / CTZ / GEN; train and test lists are
# index-aligned as well.
train_y_sets = [train_y_CIP, train_y_CTX, train_y_CTZ, train_y_GEN]
test_y_sets = [test_y_CIP, test_y_CTX, test_y_CTZ, test_y_GEN]


# Workspace

In [None]:
import pickle

# Reload the results of previously completed runs from a Colab upload.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open result files that were produced by this project.
with open("/content/genomic_ml_results_partial_balanced (8).pkl", "rb") as f:
    all_results_balanced = pickle.load(f)

print("Loaded", len(all_results_balanced), "completed runs")

Loaded 30 completed runs


In [None]:
import pickle
# Persist the balanced-run results collected so far to the working directory
# so they can be reloaded in a later session.
with open("genomic_ml_results_partial_balanced.pkl", "wb") as f:
    pickle.dump(all_results_balanced, f)

In [None]:
# Colab-only: hand the saved results file to the browser for download.
# This import is unavailable outside Google Colab.
from google.colab import files
files.download("genomic_ml_results_partial_balanced.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Results checking

In [None]:
# One row per entry of all_results_balanced (the reloaded list of result dicts).
df_results_balanced = pd.DataFrame(all_results_balanced)

In [None]:
# Compact view of the balanced runs: identifying columns plus the scalar
# metrics, leaving out the array-valued entries.
df_table_balanced = df_results_balanced.loc[
    :,
    ["model", "X_encoding", "antibiotic", "accuracy", "precision", "recall", "f1", "roc_auc"],
]

In [None]:
# Same tabulation for the runs stored in all_results.
# NOTE(review): `all_results` is not assigned in any cell visible in this
# notebook. It is presumably produced or loaded elsewhere; confirm before running.
df_results = pd.DataFrame(all_results)

In [None]:
# Compact view of df_results: identifying columns plus the scalar metrics.
df_table = df_results.loc[
    :,
    ["model", "X_encoding", "antibiotic", "accuracy", "precision", "recall", "f1", "roc_auc"],
]