In [13]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from imblearn.over_sampling import KMeansSMOTE
from xgboost import XGBRegressor

In [14]:
# Step 0: Load Data
file_path = "./TrainDataset2024.xls"
df = pd.read_excel(file_path)

# Step 1: Data Preprocessing
df.replace(999, np.nan, inplace=True)

# Drop pCR (classification target)
df.drop(columns="pCR (outcome)", inplace=True)

# Ensure no missing in RFS
df = df[df['RelapseFreeSurvival (outcome)'].notnull()]

In [15]:
# Categorical Imputation
categorical_features = ['ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 
                        'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage', 'Gene']
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_features] = imputer_cat.fit_transform(df[categorical_features])

# Numerical Imputation
numerical_features = [col for col in df.columns if col not in categorical_features + ['ID', 'RelapseFreeSurvival (outcome)']]
imputer_num = KNNImputer(n_neighbors=5)
df[numerical_features] = imputer_num.fit_transform(df[numerical_features])

# Outlier Handling
for col in numerical_features:
    q1 = df[col].quantile(0.20)
    q3 = df[col].quantile(0.80)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

# Data Standardization
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [17]:
# Step 2: Splitting Data
X = df.drop(columns=['ID', 'RelapseFreeSurvival (outcome)'])
y = df['RelapseFreeSurvival (outcome)']

# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Custom Oversampling for Regression
def oversample_with_noise(X, y, ratio=0.5, noise_level=0.01):
    from sklearn.utils import resample
    minority_idx = np.where(y < np.percentile(y, 25))[0]
    X_minority = X[minority_idx]
    y_minority = y[minority_idx]
    X_resampled = resample(X_minority, replace=True, n_samples=int(len(y) * ratio), random_state=42)
    y_resampled = resample(y_minority, replace=True, n_samples=int(len(y) * ratio), random_state=42)
    X_resampled += noise_level * np.random.randn(*X_resampled.shape)
    X_augmented = np.vstack([X, X_resampled])
    y_augmented = np.hstack([y, y_resampled])
    return X_augmented, y_augmented

X_train_resampled, y_train_resampled = oversample_with_noise(X_train.values, y_train.values)
print("Training set size after custom oversampling:", X_train_resampled.shape)

# Step 4: Dimensionality Reduction
pca = PCA(n_components=18)
X_train_pca = pca.fit_transform(X_train_resampled)
X_val_pca = pca.transform(X_val)

Training set size after custom oversampling: (480, 118)




In [21]:
# Step 4: Define Models and Optimized Parameter Grids
models_and_grids = {
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "param_grid": {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "param_grid": {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.05],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0],
            'loss': ['huber']
        }
    },
    "XGBoost": {
        "model": XGBRegressor(objective='reg:squarederror', random_state=42),
        "param_grid": {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.05],
            'max_depth': [3, 5],
            'reg_alpha': [0, 0.1, 1],
            'reg_lambda': [1, 10, 100]
        }
    }
}

In [19]:
# Step 6: Train and Evaluate Models
results = {}
for model_name, config in models_and_grids.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(
        estimator=config["model"],
        param_grid=config["param_grid"],
        scoring='neg_mean_absolute_error',
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X_train_pca, y_train_resampled)
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    
    # Evaluate on validation set
    y_pred = best_model.predict(X_val_pca)
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_pred)
    
    # Store results
    results[model_name] = {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2
    }

Training RandomForest...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Training GradientBoosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for GradientBoosting: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Training XGBoost...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}


In [22]:
# Step 7: Print Results
print("\nModel Performance Comparison:")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Model Performance Comparison:

RandomForest:
MAE: 22.5453
MSE: 922.0304
RMSE: 30.3650
R2: -0.1553

GradientBoosting:
MAE: 25.6579
MSE: 1069.7854
RMSE: 32.7076
R2: -0.3405

XGBoost:
MAE: 24.5146
MSE: 1015.2533
RMSE: 31.8630
R2: -0.2722
