In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cancer risk-factor table from a CSV path given relative to the
# current working directory.
dataframe=pd.read_csv(r"../notebook/data/cancer-risk-factors.csv")

In [3]:
# Remove the identifier and the precomputed risk score before modelling.
# NOTE(review): presumably Overall_Risk_Score is dropped because it would leak
# the Risk_Level target - confirm against the dataset description.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution finds the columns already gone rather
# than raising KeyError.
dataframe = dataframe.drop(
    columns=['Patient_ID', 'Overall_Risk_Score'],
    errors='ignore',
)

In [4]:
# Display the frame as it stands after the column drop above.
dataframe

Unnamed: 0,Cancer_Type,Age,Gender,Smoking,Alcohol_Use,Obesity,Family_History,Diet_Red_Meat,Diet_Salted_Processed,Fruit_Veg_Intake,Physical_Activity,Air_Pollution,Occupational_Hazards,BRCA_Mutation,H_Pylori_Infection,Calcium_Intake,BMI,Physical_Activity_Level,Risk_Level
0,Breast,68,0,7,2,8,0,5,3,7,4,6,3,1,0,0,28.0,5,Medium
1,Prostate,74,1,8,9,8,0,0,3,7,1,3,3,0,0,5,25.4,9,Medium
2,Skin,55,1,7,10,7,0,3,3,4,1,8,10,0,0,6,28.6,2,Medium
3,Colon,61,0,6,2,2,0,6,2,4,6,4,8,0,0,8,32.1,7,Low
4,Lung,67,1,10,7,4,0,6,3,10,9,10,9,0,0,5,25.1,2,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Colon,60,1,4,6,4,0,10,6,4,4,5,3,1,0,4,30.3,3,Medium
1996,Prostate,84,1,5,7,8,0,10,0,1,2,1,3,0,0,2,25.9,4,Medium
1997,Lung,65,0,7,2,10,0,4,2,2,3,6,0,0,1,0,22.5,3,Low
1998,Lung,64,1,10,2,10,0,2,10,7,5,4,2,0,0,10,25.3,3,Medium


In [5]:
# Target: the categorical risk label. Features: every remaining column.
y = dataframe['Risk_Level']
X = dataframe.drop(columns=['Risk_Level'])

In [6]:
from sklearn.preprocessing import LabelEncoder

# Convert the string risk labels into integer codes. The fitted encoder is
# kept so `le.classes_` can translate codes back to label names later.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)


In [7]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
numerical_col = [name for name, kind in X.dtypes.items() if kind != 'O']
categorical_col = [name for name, kind in X.dtypes.items() if kind == 'O']


In [8]:
# Show the names of the columns whose dtype is not object.
numerical_col

['Age',
 'Gender',
 'Smoking',
 'Alcohol_Use',
 'Obesity',
 'Family_History',
 'Diet_Red_Meat',
 'Diet_Salted_Processed',
 'Fruit_Veg_Intake',
 'Physical_Activity',
 'Air_Pollution',
 'Occupational_Hazards',
 'BRCA_Mutation',
 'H_Pylori_Infection',
 'Calcium_Intake',
 'BMI',
 'Physical_Activity_Level']

In [9]:
# Show the names of the object-typed columns.
categorical_col

['Cancer_Type']

In [10]:
!pip install xgboost



In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier


In [12]:
# Hold out 30% of the rows for evaluation, stratified on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time); all other columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col),
    ],
    remainder='passthrough',
)

# Preprocess -> oversample with SMOTE -> gradient-boosted classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric='mlogloss',
        random_state=42,
    )),
])


In [15]:
# Train the pipeline on the training split.
model_pipeline.fit(X_train, y_train)

# Class probabilities for the held-out split. The code below assumes that
# column i of y_proba corresponds to encoded label i (the position of the
# class name in le.classes_).
y_proba = model_pipeline.predict_proba(X_test)
high_idx = list(le.classes_).index('High')
print("High class index:", high_idx)
y_pred_default = model_pipeline.predict(X_test)

# Pin the label order explicitly so the confusion matrix and report always
# have one row per class and stay aligned with target_names, even if some
# class is absent from the evaluated arrays.
class_labels = np.arange(len(le.classes_))

print("===== DEFAULT MODEL =====")
print(confusion_matrix(y_test, y_pred_default, labels=class_labels))
print(classification_report(
    y_test,
    y_pred_default,
    labels=class_labels,
    target_names=le.classes_
))

# NOTE(review): this cut-off is chosen while looking at test-set metrics, so
# the tuned numbers below are optimistic. Select the threshold on a validation
# split (or via cross-validation) and report the test set only once.
threshold = 0.25   # try 0.20, 0.25, 0.30

# Start with default argmax prediction
y_pred_threshold = np.argmax(y_proba, axis=1)

# Override prediction if High probability crosses threshold, even when another
# class has a larger probability (trades High precision for High recall).
y_pred_threshold[y_proba[:, high_idx] >= threshold] = high_idx
print(f"\n===== THRESHOLD TUNED (High >= {threshold}) =====")
print(confusion_matrix(y_test, y_pred_threshold, labels=class_labels))
print(classification_report(
    y_test,
    y_pred_threshold,
    labels=class_labels,
    target_names=le.classes_
))


High class index: 0
===== DEFAULT MODEL =====
[[  9   0  22]
 [  0  56  41]
 [  4  19 449]]
              precision    recall  f1-score   support

        High       0.69      0.29      0.41        31
         Low       0.75      0.58      0.65        97
      Medium       0.88      0.95      0.91       472

    accuracy                           0.86       600
   macro avg       0.77      0.61      0.66       600
weighted avg       0.85      0.86      0.84       600


===== THRESHOLD TUNED (High >= 0.25) =====
[[ 12   0  19]
 [  0  56  41]
 [ 16  19 437]]
              precision    recall  f1-score   support

        High       0.43      0.39      0.41        31
         Low       0.75      0.58      0.65        97
      Medium       0.88      0.93      0.90       472

    accuracy                           0.84       600
   macro avg       0.68      0.63      0.65       600
weighted avg       0.83      0.84      0.84       600



In [16]:
# Sweep candidate cut-offs for the 'High' class and print a report for each.
for threshold in (0.15, 0.20, 0.25, 0.30, 0.35):
    # Take the most probable class by default, but force 'High' wherever its
    # probability reaches the current cut-off.
    crosses_cutoff = y_proba[:, high_idx] >= threshold
    y_pred_tuned = np.where(crosses_cutoff, high_idx, np.argmax(y_proba, axis=1))

    print(f"\n===== Threshold = {threshold} =====")
    tuned_report = classification_report(
        y_test, y_pred_tuned, target_names=le.classes_
    )
    print(tuned_report)



===== Threshold = 0.15 =====
              precision    recall  f1-score   support

        High       0.39      0.45      0.42        31
         Low       0.75      0.58      0.65        97
      Medium       0.88      0.91      0.90       472

    accuracy                           0.83       600
   macro avg       0.67      0.65      0.66       600
weighted avg       0.83      0.83      0.83       600


===== Threshold = 0.2 =====
              precision    recall  f1-score   support

        High       0.40      0.39      0.39        31
         Low       0.75      0.58      0.65        97
      Medium       0.88      0.92      0.90       472

    accuracy                           0.84       600
   macro avg       0.68      0.63      0.65       600
weighted avg       0.83      0.84      0.83       600


===== Threshold = 0.25 =====
              precision    recall  f1-score   support

        High       0.43      0.39      0.41        31
         Low       0.75      0.58      0