In [8]:
# optimal_pipeline.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (roc_auc_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score, average_precision_score)


def testModel(y_true, y_pred, y_prob):
    """Print a set of binary-classification metrics for one model.

    Parameters:
        y_true: ground-truth labels.
        y_pred: hard class predictions; compared with ``y_true`` by the
            confusion matrix, precision, recall, F1 and balanced accuracy.
        y_prob: scores handed to ``roc_auc_score`` and
            ``average_precision_score`` (callers in this file pass
            ``predict_proba(...)[:, 1]``).

    Returns:
        None. Results are only printed, one metric per ``print`` call.
    """
    # One row per metric: (label to print, metric function, estimate to score).
    # Rows run top to bottom, so the printed order is fixed by this table.
    report_rows = (
        # Shows where the errors fall, which matters on imbalanced data
        ("Confusion Matrix:\n", confusion_matrix, y_pred),
        # High precision = few false positives
        ("Precision (positive class):", precision_score, y_pred),
        # High recall = few false negatives
        ("Recall (positive class):", recall_score, y_pred),
        # Harmonic mean of precision and recall
        ("F1-score (positive class):", f1_score, y_pred),
        # Average of the per-class recalls
        ("Balanced Accuracy:", balanced_accuracy_score, y_pred),
        ("ROC AUC:", roc_auc_score, y_prob),
        # Focuses on performance for the positive class
        ("Average Precision (PR AUC):", average_precision_score, y_prob),
    )
    for label, metric, estimate in report_rows:
        print(label, metric(y_true, estimate))

    
# Load data: read the CSV, turn the "?" placeholders into NaN, coerce every
# column to a numeric dtype (unparseable values become NaN), and drop the two
# "time since diagnosis" columns, which have too many missing values.
df = (
    pd.read_csv("risk_factors_cervical_cancer.csv")
    .replace("?", np.nan)
    .apply(pd.to_numeric, errors="coerce")
    .drop(
        columns=[
            "STDs: Time since first diagnosis",
            "STDs: Time since last diagnosis",
        ]
    )
)

# Separate binary and continuous columns.
# A column counts as binary when all of its non-missing values are 0 or 1.
# This is a schema decision, not a fitted statistic, so it is made on the
# full frame. continuous_cols is built in column order so it is deterministic.
binary_cols = [c for c in df.columns if df[c].dropna().isin([0, 1]).all()]
_binary_lookup = set(binary_cols)
continuous_cols = [
    c for c in df.columns if c not in _binary_lookup and c != "Biopsy"
]
# The target must never go through a feature imputer, so keep a feature-only list.
binary_feature_cols = [c for c in binary_cols if c != "Biopsy"]

# Define features and target.
# Rows without a label are dropped rather than imputed: filling in a missing
# target would fabricate ground truth.
labelled = df.dropna(subset=["Biopsy"])
X = labelled.drop("Biopsy", axis=1)
y = labelled["Biopsy"]

# Train/test split comes FIRST. Every fitted preprocessing step below
# (imputers, feature selector, scaler, SMOTE) learns from the training
# partition only, so no test-set information leaks into the model.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
# Explicit copies so the column assignments below do not write into views of X.
X_train_raw = X_train_raw.copy()
X_test_raw = X_test_raw.copy()

# Impute missing values: fit on train, apply the fitted imputer to test.
binary_imputer = SimpleImputer(strategy="most_frequent")
X_train_raw[binary_feature_cols] = binary_imputer.fit_transform(
    X_train_raw[binary_feature_cols]
)
X_test_raw[binary_feature_cols] = binary_imputer.transform(
    X_test_raw[binary_feature_cols]
)

knn_imputer = KNNImputer(n_neighbors=5)
X_train_raw[continuous_cols] = knn_imputer.fit_transform(
    X_train_raw[continuous_cols]
)
X_test_raw[continuous_cols] = knn_imputer.transform(X_test_raw[continuous_cols])

# Feature selection: top 10 features via ANOVA F-test.
# The F-test uses the labels, so it is fitted on the training rows only.
selector = SelectKBest(score_func=f_classif, k=10)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)
selected_features = X.columns[selector.get_support()]
print("Selected features:", selected_features)

# Scale features (statistics learned from the training set only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance: oversample the training set only; the test set
# keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
print("Class distribution after SMOTE:", pd.Series(y_train_res).value_counts())

# Model 1: Logistic Regression
# Trained on the resampled training set, evaluated on the scaled test set.
lr = LogisticRegression(max_iter=1000, class_weight="balanced").fit(
    X_train_res, y_train_res
)
# Second predict_proba column is used as the score (presumably the positive
# class, i.e. label 1 - confirm against lr.classes_).
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]
y_pred_lr = lr.predict(X_test_scaled)

print("\nLogistic Regression Results:")
testModel(y_test, y_pred_lr, y_prob_lr)


# Model 2: Gradient Boosting
# Same train/test data as the logistic regression, so results are comparable.
gb = GradientBoostingClassifier(n_estimators=300, random_state=42).fit(
    X_train_res, y_train_res
)
y_prob_gb = gb.predict_proba(X_test_scaled)[:, 1]
y_pred_gb = gb.predict(X_test_scaled)

print("\nGradient Boosting Results:")
testModel(y_test, y_pred_gb, y_prob_gb)



Risk Prediction RMSE (with sign-transfer): 1.4020472557052739
Quality Assessment Accuracy (with sign-transfer): 0.55


ValueError: operands could not be broadcast together with shapes (33,) (32,) 