In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (roc_auc_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score, average_precision_score)

def testModel(y_true, y_pred, y_prob):
    """Print a set of evaluation metrics for one classifier's predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels; used for the confusion matrix, precision,
            recall, F1 and balanced accuracy.
        y_prob: Scores for the positive class; used for ROC AUC and
            average precision.

    Each metric is computed and printed immediately, in the order listed
    below. Nothing is returned.
    """
    # (printed label, metric function, predictions the metric is fed)
    report = (
        # Shows the kinds of errors made, useful on imbalanced datasets.
        ("Confusion Matrix:\n", confusion_matrix, y_pred),
        # High precision = few false positives.
        ("Precision (positive class):", precision_score, y_pred),
        # High recall = few false negatives.
        ("Recall (positive class):", recall_score, y_pred),
        # Harmonic mean of precision and recall.
        ("F1-score (positive class):", f1_score, y_pred),
        # Average of the recall obtained on each class.
        ("Balanced Accuracy:", balanced_accuracy_score, y_pred),
        ("ROC AUC:", roc_auc_score, y_prob),
        # PR AUC: focuses on performance for the positive class.
        ("Average Precision (PR AUC):", average_precision_score, y_prob),
    )

    for label, metric, predictions in report:
        print(label, metric(y_true, predictions))


# End-to-end baseline: load the data, clean it, split, impute, scale,
# rebalance the training set, fit a logistic regression and report metrics.
df = pd.read_csv("risk_factors_cervical_cancer.csv")
print(df)

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Ensure numeric columns are float
numericCols = df.select_dtypes(include=["float64", "int64", "object"]).columns
for col in numericCols:
    df[col] = pd.to_numeric(df[col], errors="coerce")  # converts strings to NaN if needed

# 1. Split data
X = df.drop("Dx:Cancer", axis=1)
y = df["Dx:Cancer"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
# Work on explicit copies so the column assignments below modify independent
# frames (and do not raise SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# 2. Impute
# Column types are decided from the training split only, and in the original
# column order, so the grouping is deterministic and does not look at test rows.
binaryCols = [c for c in X_train.columns if X_train[c].dropna().isin([0, 1]).all()]
binaryColSet = set(binaryCols)
continuousCols = [c for c in X_train.columns if c not in binaryColSet]

# Each imputer is fitted on the training split and then reused on the test
# split: the test rows must be filled with values learned from training data,
# otherwise test-set statistics leak into preprocessing and the two splits are
# imputed inconsistently.
for cols, imputer in (
    (binaryCols, SimpleImputer(strategy="most_frequent")),
    (continuousCols, KNNImputer(n_neighbors=5)),
):
    if not cols:
        # Nothing to impute for this group; the imputers reject empty input.
        continue
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# 3. Scale (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Handle imbalance (optional: SMOTE) -- applied to the training data only
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# 5. Train model
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(X_train_bal, y_train_bal)

# 6. Predict
y_pred = lr.predict(X_test_scaled)
y_prob = lr.predict_proba(X_test_scaled)[:, 1]  # second column of predict_proba

# 7. Evaluate (use your testModel function)
testModel(y_test, y_pred, y_prob)


     Age Number of sexual partners First sexual intercourse  \
0     18                       4.0                     15.0   
1     15                       1.0                     14.0   
2     34                       1.0                        ?   
3     52                       5.0                     16.0   
4     46                       3.0                     21.0   
..   ...                       ...                      ...   
853   34                       3.0                     18.0   
854   32                       2.0                     19.0   
855   25                       2.0                     17.0   
856   33                       2.0                     24.0   
857   29                       2.0                     20.0   

    Num of pregnancies Smokes Smokes (years) Smokes (packs/year)  \
0                  1.0    0.0            0.0                 0.0   
1                  1.0    0.0            0.0                 0.0   
2                  1.0    0.0          