In [1]:
# optimal_pipeline.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

# Load data
df = pd.read_csv("risk_factors_cervical_cancer.csv")
df = df.replace("?", np.nan)
df = df.apply(pd.to_numeric, errors="coerce")

# Drop columns with too many missing values
df = df.drop(columns=["STDs: Time since first diagnosis", "STDs: Time since last diagnosis"])

# Separate binary and continuous features
binary_cols = [c for c in df.columns if df[c].dropna().isin([0,1]).all()]
continuous_cols = list(set(df.columns) - set(binary_cols) - {"Biopsy"})

# Impute missing values
binary_imputer = SimpleImputer(strategy="most_frequent")
df[binary_cols] = binary_imputer.fit_transform(df[binary_cols])

knn_imputer = KNNImputer(n_neighbors=5)
df[continuous_cols] = knn_imputer.fit_transform(df[continuous_cols])

# Define features and target
X = df.drop("Biopsy", axis=1)
y = df["Biopsy"]

# Feature selection: top 10 features via ANOVA F-test
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("Selected features:", selected_features)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.25, stratify=y, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
print("Class distribution after SMOTE:", pd.Series(y_train_res).value_counts())

# Model 1: Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(X_train_res, y_train_res)
y_pred_lr = lr.predict(X_test_scaled)
y_prob_lr = lr.predict_proba(X_test_scaled)[:,1]

print("\nLogistic Regression Results:")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_prob_lr))

# Model 2: Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=300, random_state=42)
gb.fit(X_train_res, y_train_res)
y_pred_gb = gb.predict(X_test_scaled)
y_prob_gb = gb.predict_proba(X_test_scaled)[:,1]

print("\nGradient Boosting Results:")
print(classification_report(y_test, y_pred_gb))
print("ROC AUC:", roc_auc_score(y_test, y_prob_gb))


  f = msb / msw


Selected features: Index(['STDs', 'STDs:genital herpes', 'STDs:HIV', 'Dx:Cancer', 'Dx:CIN',
       'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology'],
      dtype='object')
Class distribution after SMOTE: Biopsy
0.0    602
1.0    602
Name: count, dtype: int64

Logistic Regression Results:
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97       201
         1.0       0.60      0.86      0.71        14

    accuracy                           0.95       215
   macro avg       0.79      0.91      0.84       215
weighted avg       0.96      0.95      0.96       215

ROC AUC: 0.9077825159914713

Gradient Boosting Results:
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97       201
         1.0       0.58      0.79      0.67        14

    accuracy                           0.95       215
   macro avg       0.78      0.87      0.82       215
weighted avg       0.96      0.95      0.95       21