In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import numpy as np
import pickle

# Read the synthetic dataset from disk
df = pd.read_csv("synthetic.csv")

# Separate the 'cancer' target column from the predictor columns
y = df['cancer']
X = df.drop(columns=['cancer'])

# Explicitly listed categorical features; every other column of X is
# treated as numerical, in the order it appears in X.columns
categorical_cols = ['menopausal_status', 'family_history', 'smoking_status', 'alcohol']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Categorical features are one-hot encoded; with handle_unknown='ignore' a
# category not seen during fit is encoded as all zeros instead of raising
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Numerical features are standardized
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Route each column group through its own transformer; the numerical
# outputs come first in the combined matrix, followed by the one-hot columns
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions approximately equal in both partitions, and the fixed seed
# makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the preprocessor on the training rows only, then apply the already
# fitted transformer to the test rows so no test statistics leak into it
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Ratio of negative (label 0) to positive (label 1) training examples;
# passed to XGBoost as the weight applied to the positive class
negative_count = y_train.eq(0).sum()
positive_count = y_train.eq(1).sum()
scale_pos_weight = negative_count / positive_count
print(f"\nUsing scale_pos_weight: {scale_pos_weight:.2f}")

# Train a gradient-boosted tree classifier on the preprocessed training data.
# scale_pos_weight (the negative/positive ratio of the training labels)
# re-weights positive examples to offset the class imbalance.
# NOTE: `use_label_encoder` is intentionally not passed. The installed
# XGBoost does not use it and emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning when it is set.
model = xgb.XGBClassifier(
    max_depth=3,            # shallow trees
    learning_rate=0.1,
    n_estimators=100,       # number of boosting rounds
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42         # fixed seed for repeatable training
)
model.fit(X_train_preprocessed, y_train)

# Decision threshold for the positive class. Raised from 0.35 to 0.65 so
# that fewer cases are flagged as cancer, i.e. fewer false positives.
CONSERVATIVE_THRESHOLD = 0.65

# Column 1 of predict_proba is the predicted probability of the positive class
class_probabilities = model.predict_proba(X_test_preprocessed)
y_probs = class_probabilities[:, 1]

# Label a row positive only when its probability reaches the threshold
meets_threshold = y_probs >= CONSERVATIVE_THRESHOLD
y_pred = meets_threshold.astype(int)

# Report per-class metrics and the confusion matrix on the held-out test rows
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, target_names=['No Cancer', 'Cancer'])
print(report_text)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Persist what is needed to reproduce predictions later: the trained model,
# the fitted preprocessor, and the decision threshold, each in its own
# pickle file in the current working directory
artifacts = {
    'xgboost_model.pkl': model,
    'preprocessor.pkl': preprocessor,
    'threshold.pkl': CONSERVATIVE_THRESHOLD,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

# Summarize what was written
summary_lines = (
    "\nModel artifacts saved:",
    "- xgboost_model.pkl (trained model)",
    "- preprocessor.pkl (feature preprocessor)",
    f"- threshold.pkl (decision threshold: {CONSERVATIVE_THRESHOLD})",
)
print("\n".join(summary_lines))


Using scale_pos_weight: 2.33

Classification Report:
              precision    recall  f1-score   support

   No Cancer       1.00      1.00      1.00       140
      Cancer       1.00      1.00      1.00        60

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[140   0]
 [  0  60]]

Model artifacts saved:
- xgboost_model.pkl (trained model)
- preprocessor.pkl (feature preprocessor)
- threshold.pkl (decision threshold: 0.65)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
