##### **Imports**

In [81]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

##### **Parameters**

In [82]:
# Parameters
random= 101
np.random.seed(random)


parameter_grid = {
    'n_estimators': [300],
    'learning_rate': [0.01],
    'max_depth': [2, 3, 4],
    'min_samples_split': [4, 6, 8],
    'min_samples_leaf': [3, 4, 5]
}

##### **Pre-processing**

In [83]:
df_all = pd.read_excel("./TrainDataset2023.xls")

# Drop the first column from the DataFrame
df_all.drop(df_all.columns[:1], axis=1, inplace=True)

# Remove rows which have pCR (outcome) as 999
df_all = df_all[df_all['pCR (outcome)'] != 999]

#for column 12 and onwards
cols = df_all.columns[12:]
#clip outliers in the 99th percentile
df_all[cols] = df_all[cols].clip(upper=df_all[cols].quantile(0.99), axis=1)

#min max scaling for mri values only
cols = df_all.columns[12:]
df_all[cols] = (df_all[cols] - df_all[cols].min()) / (df_all[cols].max() - df_all[cols].min())

In [84]:
# Split data into training and testing sets
X = df_all.drop('pCR (outcome)', axis=1)
y = df_all['pCR (outcome)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random)

##### **Grid Search**

In [85]:
kf = KFold(n_splits=5, shuffle=True, random_state=random)
model = GradientBoostingClassifier()
grid_search = GridSearchCV(model, parameter_grid, cv=kf, scoring='balanced_accuracy', n_jobs=-1)


##### **Training**

In [86]:
grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
print(grid_search.best_score_)

0.4936643478995526


In [87]:
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)


Best parameters set found on development set:

{'learning_rate': 0.01, 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 300}


In [88]:
model2 = GradientBoostingClassifier(learning_rate=best_parameters['learning_rate'], max_depth=best_parameters['max_depth'], min_samples_leaf=best_parameters['min_samples_leaf'], min_samples_split=best_parameters['min_samples_split'], n_estimators=200)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
feature_importances = model2.feature_importances_
sorted_features = sorted(zip(feature_importances, X_train.columns), reverse=True)

print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(balanced_accuracy_score(y_test, y_pred))

top_n_features = [feature for importance, feature in sorted_features[:10]]
print(top_n_features)

0.15789473684210525
0.2608695652173913
0.5706140350877192
['original_glszm_SizeZoneNonUniformityNormalized', 'original_glszm_ZonePercentage', 'original_firstorder_InterquartileRange', 'original_firstorder_Median', 'original_firstorder_Range', 'HER2', 'original_glszm_ZoneEntropy', 'original_firstorder_90Percentile', 'original_firstorder_Maximum', 'LNStatus']
