In [75]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [76]:
# Step 0: Loading Data
file_path = "./TrainDataset2024.xls"
df = pd.read_excel(file_path)

In [77]:
# Step 1: Data Preprocessing
# replace 999 with NaN
df.replace(999, np.nan, inplace=True)

# Imputation of Gene
print(f"Ratio of missing value in Gene: {df['Gene'].isnull().sum() / df['Gene'].count()}")
df['Gene'] = df['Gene'].fillna(-1)

# Make sure no missing in pCR
df = df[df['pCR (outcome)'].isin([0, 1])]

# Check imbalance 
print(f"Ratio of Class 1: {(df['pCR (outcome)']==1).sum() / df['pCR (outcome)'].count()}")

# Drop RFS
df.drop(columns="RelapseFreeSurvival (outcome)",inplace=True)

# Categorical imputation
categorical_features = ['ER', 'PgR','HER2', 'TrippleNegative', 'ChemoGrade','Proliferation','HistologyType','LNStatus','TumourStage', 'Gene']
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_features] = imputer_cat.fit_transform(df[categorical_features])

# Numerical Imputation
numerical_features = [col for col in df.columns if col not in categorical_features + ['ID', 'pCR (outcome)']]
imputer_num = KNNImputer(n_neighbors=5)
df[numerical_features] = imputer_num.fit_transform(df[numerical_features])

Ratio of missing value in Gene: 0.28205128205128205
Ratio of Class 1: 0.21265822784810126


In [78]:
# Outliers
for col in numerical_features:
    q1 = df[col].quantile(0.20)
    q3 = df[col].quantile(0.80)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
    
# Data Standardization
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [79]:
# Splitting Data
X = df.drop(columns=['ID', 'pCR (outcome)']) 
y = df['pCR (outcome)']

# 8:2 splitting with stratified strategy
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [80]:
# Oversampling(SMOTE)
from imblearn.over_sampling import SMOTENC

smote = SMOTENC(categorical_features = categorical_features, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Class distribution after SMOTE:", np.bincount(y_train_resampled))

Class distribution after SMOTE: [249 249]


  print("Class distribution after SMOTE:", np.bincount(y_train_resampled))


In [81]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report

# Performance evaluation
def evaluate_model(model,y_val,y_pred):
    print(model)
    print(classification_report(y_val, y_pred))
    # Metrics
    b_accuracy = balanced_accuracy_score(y_val,y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred)

    # Display Metrics
    print(f"Blanced Accuracy: {b_accuracy:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")     

In [82]:
# Applying PCA to deduct demensions
pca = PCA(n_components = 18)
X_train_pca,X_val_pca = pca.fit_transform(X_train_resampled),pca.transform(X_val)
X_train_pca,X_val_pca = pd.DataFrame(X_train_pca),pd.DataFrame(X_val_pca)

In [83]:
# Model 1: LogisticRegression
lr = LogisticRegression(C=1.2,max_iter=500,solver='liblinear',random_state=42)
lr.fit(X_train_pca, y_train_resampled)
y_probs = lr.predict_proba(X_val_pca)[:, 1]
y_pred = (y_probs >= 0.52).astype(int)
evaluate_model(lr,y_val,y_pred)

LogisticRegression(C=1.2, max_iter=500, random_state=42, solver='liblinear')
              precision    recall  f1-score   support

         0.0       0.89      0.76      0.82        62
         1.0       0.42      0.65      0.51        17

    accuracy                           0.73        79
   macro avg       0.65      0.70      0.66        79
weighted avg       0.79      0.73      0.75        79

Blanced Accuracy: 0.7026
Accuracy: 0.7342
Precision: 0.4231
Recall: 0.6471
F1-Score: 0.5116
ROC-AUC: 0.7026


In [84]:
# Model 2: RandomForestClassifier
rf = RandomForestClassifier(n_estimators=75,random_state=42)
rf.fit(X_train_pca, y_train_resampled)
y_probs = rf.predict_proba(X_val_pca)[:, 1]
y_pred = (y_probs >= 0.48).astype(int)
evaluate_model(rf,y_val,y_pred)

RandomForestClassifier(n_estimators=75, random_state=42)
              precision    recall  f1-score   support

         0.0       0.87      0.77      0.82        62
         1.0       0.42      0.59      0.49        17

    accuracy                           0.73        79
   macro avg       0.64      0.68      0.65        79
weighted avg       0.77      0.73      0.75        79

Blanced Accuracy: 0.6812
Accuracy: 0.7342
Precision: 0.4167
Recall: 0.5882
F1-Score: 0.4878
ROC-AUC: 0.6812


In [85]:
# Model 3: AdaBoostClassifier
clf = AdaBoostClassifier(algorithm='SAMME',learning_rate= 0.47,n_estimators=170, random_state=42)
clf.fit(X_train_pca, y_train_resampled)
y_probs = clf.predict_proba(X_val_pca)[:, 1]
y_pred = (y_probs >= 0.51).astype(int)
evaluate_model(clf,y_val,y_pred)

AdaBoostClassifier(algorithm='SAMME', learning_rate=0.47, n_estimators=170,
                   random_state=42)
              precision    recall  f1-score   support

         0.0       0.92      0.74      0.82        62
         1.0       0.45      0.76      0.57        17

    accuracy                           0.75        79
   macro avg       0.68      0.75      0.69        79
weighted avg       0.82      0.75      0.77        79

Blanced Accuracy: 0.7533
Accuracy: 0.7468
Precision: 0.4483
Recall: 0.7647
F1-Score: 0.5652
ROC-AUC: 0.7533
