In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (roc_auc_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score, average_precision_score)
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV


def testModel(y_true, y_pred, y_prob):
    """Print binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions; scored with precision, recall, F1 and
        balanced accuracy.
    y_prob : array-like
        Scores for the positive class; scored with ROC AUC and average
        precision.

    Returns
    -------
    None
        The metrics are only printed, one per line.
    """
    # Threshold-dependent metrics. zero_division=0 is passed to all three
    # ratio metrics (not just precision) so an undefined ratio is reported
    # as 0 in the same way everywhere instead of triggering a warning for
    # some metrics only.
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    balAcc = balanced_accuracy_score(y_true, y_pred)

    # Ranking metrics, computed from the scores rather than the hard labels.
    roc = roc_auc_score(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)

    print("Precision (positive class):", prec)
    print("Recall (positive class):", rec)
    print("F1-score (positive class):", f1)
    print("Balanced Accuracy:", balAcc)
    print("ROC AUC:", roc)
    print("Average Precision (PR AUC):", ap)


        
# Load the raw table. Missing entries are written as "?" in the file, so they
# are turned into NaN first and every column is then coerced to a numeric
# dtype (anything still unparseable also becomes NaN).
df = pd.read_csv("risk_factors_cervical_cancer.csv")
df = df.replace("?", np.nan)
df = df.apply(pd.to_numeric, errors="coerce")

# NOTE(review): both imputers below are fit on the full table, i.e. before the
# train/test split further down, so test rows influence the imputed values.

# Binary (0/1) columns: fill gaps with the most frequent value.
binaryCols = [c for c in df.columns if df[c].dropna().isin([0, 1]).all()]
binaryImputer = SimpleImputer(strategy="most_frequent")
df[binaryCols] = binaryImputer.fit_transform(df[binaryCols])

# Continuous or discrete non-binary columns: fill gaps with the median.
# The list is built in df.columns order (not via a set difference) so that it
# is identical on every run instead of depending on string-hash ordering.
continuousCols = [c for c in df.columns if c not in binaryCols]
numericalImputer = SimpleImputer(strategy="median")
df[continuousCols] = numericalImputer.fit_transform(df[continuousCols])


# Target: the "Dx:Cancer" column.
y = df["Dx:Cancer"].values

# Features: everything except the columns listed here.
X = df.drop(columns=[
    # the target itself and the other Dx* columns
    "Dx:Cancer", "Dx:CIN", "Dx:HPV", "Dx",
    # presumably screening / diagnostic test outcomes -- confirm
    "Hinselmann", "Schiller", "Citology", "Biopsy",

    # per-disease STD columns
    "STDs:condylomatosis",
    "STDs:cervical condylomatosis",
    "STDs:vaginal condylomatosis",
    "STDs:vulvo-perineal condylomatosis",
    "STDs:syphilis",
    "STDs:pelvic inflammatory disease",
    "STDs:genital herpes",
    "STDs:molluscum contagiosum",
    "STDs:AIDS",
    "STDs:HIV",
    "STDs:Hepatitis B",
    "STDs:HPV",

    # STD diagnosis count / timing columns
    "STDs: Number of diagnosis",
    "STDs: Time since first diagnosis",
    "STDs: Time since last diagnosis",
])

# Stratified 75/25 hold-out split on plain arrays, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42
)

def get_oof_predictions(model, X, y, n_splits=5, random_state=0):
    """Return out-of-fold probabilities for every sample in ``X``.

    For each stratified fold, a fresh clone of ``model`` is fit on the other
    folds and used to score the held-out fold, so each sample is scored by a
    model that never saw it. ``model`` itself is not fitted.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict_proba``.
    X : array or pandas.DataFrame
        Feature matrix; a DataFrame is converted to its underlying array.
    y : array-like
        Labels, one per row of ``X``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 0
        Seed for the fold shuffle.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(y)`` holding the second column of
        ``predict_proba`` (the positive class for binary 0/1 labels).
    """
    # The fold indices are positional. A DataFrame would interpret
    # X[train_idx] as column labels and a Series would look indices up by
    # label, so pandas inputs are converted to plain arrays first.
    if isinstance(X, pd.DataFrame):
        X = X.values
    y = np.asarray(y)

    skf = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    oof_pred = np.zeros(len(y))

    for train_idx, val_idx in skf.split(X, y):
        # Clone so every fold starts from an unfitted copy and the caller's
        # estimator is left untouched.
        model_clone = clone(model)
        model_clone.fit(X[train_idx], y[train_idx])
        oof_pred[val_idx] = model_clone.predict_proba(X[val_idx])[:, 1]

    return oof_pred
# Base learner 1: logistic regression with balanced class weights
# (liblinear solver).
logreg = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    max_iter=1000
)

# Base learner 2: gradient-boosted trees with a fixed seed.
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=0
)
# Out-of-fold probabilities on the training split. get_oof_predictions fits
# clones, so logreg and gb themselves are still unfitted after these calls.
p_logreg_oof = get_oof_predictions(logreg, X_train, y_train)
p_gb_oof = get_oof_predictions(gb, X_train, y_train)

# Meta-features for training: one column per base learner.
X_stack_train = np.column_stack([p_logreg_oof, p_gb_oof])
meta_model = LogisticRegression(class_weight='balanced')

# Meta-model wrapped in isotonic calibration with 5-fold CV.
# NOTE(review): isotonic calibration is generally regarded as prone to
# overfitting when the calibration folds are small -- confirm the training
# split is large enough, or compare against method='sigmoid'.
meta_calibrated = CalibratedClassifierCV(
    meta_model,
    method='isotonic',
    cv=5
)

meta_calibrated.fit(X_stack_train, y_train)
# Fit the base learners on the whole training split; these fitted models
# (not the per-fold clones) produce the test-time meta-features.
logreg.fit(X_train, y_train)
gb.fit(X_train, y_train)
p_logreg_test = logreg.predict_proba(X_test)[:, 1]
p_gb_test = gb.predict_proba(X_test)[:, 1]

# Same column order as X_stack_train: logistic regression, then boosting.
X_stack_test = np.column_stack([p_logreg_test, p_gb_test])
y_prob = meta_calibrated.predict_proba(X_stack_test)[:, 1]
# Hard labels at a fixed 0.5 cutoff on the calibrated probability.
y_pred = (y_prob >= 0.5).astype(int)
testModel(y_test, y_pred, y_prob)



Precision (positive class): 0.0
Recall (positive class): 0.0
F1-score (positive class): 0.0
Balanced Accuracy: 0.5
ROC AUC: 0.2557142857142857
Average Precision (PR AUC): 0.017136777191496755
