In [74]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import optuna
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from scipy.stats import shapiro, normaltest, levene, probplot

In [76]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
from skopt.space import Integer, Real, Categorical

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [77]:
pcr_df= pd.read_excel("MLE 2/TrainDataset2024.xls")

In [78]:
pcr_df.head()

Unnamed: 0,ID,pCR,RelapseFreeSurvival,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1,144.0,41.0,0,0,0,1,3,3,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0,142.0,39.0,1,1,0,0,3,3,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1,135.0,31.0,0,0,0,1,2,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0,12.0,35.0,0,0,0,1,3,3,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0,109.0,61.0,1,0,0,0,2,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [79]:
pcr_df.shape

(400, 121)

In [80]:
pcr_df.replace(999, np.nan, inplace= True)
pcr_df.isnull().sum()

ID                           0
pCR                          5
RelapseFreeSurvival          0
Age                          0
ER                           0
                            ..
original_ngtdm_Busyness      0
original_ngtdm_Coarseness    0
original_ngtdm_Complexity    0
original_ngtdm_Contrast      0
original_ngtdm_Strength      0
Length: 121, dtype: int64

In [81]:
total_missing = pcr_df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {total_missing/pcr_df.size*100:.2f}%")

Total missing values: 105
Percentage of missing values: 0.22%


In [82]:
pcr_df.dropna(subset=['pCR'], inplace=True)

print(f"Dataset shape after dropping missing pCR: {pcr_df.shape}")
print(f"Missing values in pCR: {pcr_df['pCR'].isnull().sum()}")


Dataset shape after dropping missing pCR: (395, 121)
Missing values in pCR: 0


In [83]:
pcr_df = pcr_df.drop(columns=['ID', 'RelapseFreeSurvival'], axis=1)
pcr_df.head()

Unnamed: 0,pCR,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1.0,41.0,0,0.0,0.0,1.0,3.0,3.0,1.0,1.0,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0.0,39.0,1,1.0,0.0,0.0,3.0,3.0,1.0,1.0,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1.0,31.0,0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0.0,35.0,0,0.0,0.0,1.0,3.0,3.0,1.0,1.0,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0.0,61.0,1,0.0,0.0,0.0,2.0,1.0,1.0,0.0,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [84]:
total_missing = pcr_df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {total_missing/pcr_df.size*100:.2f}%")

Total missing values: 97
Percentage of missing values: 0.21%


In [85]:
X = pcr_df.drop(columns=['pCR'])
y = pcr_df['pCR']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}")


Train shape: (316, 118), (316,)
Test shape: (79, 118), (79,)


In [86]:
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors= 8)

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_scaled), columns=X_train.columns)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_scaled), columns=X_test.columns)

print(f"X_train shape after scaling and imputing: {X_train_imputed.shape}")
print(f"X_test shape after scaling and imputing: {X_test_imputed.shape}")


X_train shape after scaling and imputing: (316, 118)
X_test shape after scaling and imputing: (79, 118)


In [87]:
from scipy.stats import shapiro, levene
normality_pvalues = X_train_imputed.apply(lambda x: shapiro(x).pvalue)

def check_levene_test(column):
    half_index = len(column) // 2
    group1 = column.iloc[:half_index]
    group2 = column.iloc[half_index:]
    return levene(group1, group2).pvalue

variances_pvalues = X_train_imputed.apply(check_levene_test)
normality_summary = normality_pvalues > 0.05
variance_summary = variances_pvalues > 0.05

for col in X_train_imputed.columns:
    print(f"Feature: {col}")
    print(f"  - Normally distributed? {'Yes' if normality_summary[col] else 'No'}")
    print(f"  - Equal variances? {'Yes' if variance_summary[col] else 'No'}")
    print()


Feature: Age
  - Normally distributed? Yes
  - Equal variances? Yes

Feature: ER
  - Normally distributed? No
  - Equal variances? Yes

Feature: PgR
  - Normally distributed? No
  - Equal variances? Yes

Feature: HER2
  - Normally distributed? No
  - Equal variances? Yes

Feature: TrippleNegative
  - Normally distributed? No
  - Equal variances? Yes

Feature: ChemoGrade
  - Normally distributed? No
  - Equal variances? Yes

Feature: Proliferation
  - Normally distributed? No
  - Equal variances? Yes

Feature: HistologyType
  - Normally distributed? No
  - Equal variances? Yes

Feature: LNStatus
  - Normally distributed? No
  - Equal variances? Yes

Feature: TumourStage
  - Normally distributed? No
  - Equal variances? Yes

Feature: Gene
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_shape_Elongation
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_shape_Flatness
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_sh

In [88]:

mi_scores = mutual_info_classif(X_train_imputed, y_train, random_state=32)
mi_df = pd.DataFrame({
    'Feature': X_train_imputed.columns,
    'MI_Score': mi_scores
})

mi_df = mi_df.sort_values(by='MI_Score', ascending=False).reset_index(drop=True)

top_n = 14
top_features = mi_df.head(top_n)

important_features = ['HER2']
final_features = list(set(important_features + top_features['Feature'].tolist()))

final_features_df = pd.DataFrame({'Feature': final_features}).reset_index(drop=True)

print("\nTop N features with MI scores:")
print(top_features)

print("\nFinal features selected:")
print(final_features_df)



Top N features with MI scores:
                                            Feature  MI_Score
0                                              Gene  0.087606
1                                               PgR  0.074997
2                                                ER  0.052158
3               original_shape_Maximum2DDiameterRow  0.050315
4                  original_glszm_LargeAreaEmphasis  0.040856
5   original_glrlm_RunLengthNonUniformityNormalized  0.037803
6                           original_ngtdm_Strength  0.037607
7                                          LNStatus  0.033754
8   original_glszm_GrayLevelNonUniformityNormalized  0.030466
9             original_gldm_DependenceNonUniformity  0.030066
10                                    HistologyType  0.030062
11                  original_glrlm_ShortRunEmphasis  0.029235
12            original_glszm_GrayLevelNonUniformity  0.028979
13            original_glrlm_RunLengthNonUniformity  0.026563

Final features selected:
            

In [89]:

selected_features = final_features_df['Feature'].tolist()

X_train = X_train_imputed[selected_features]
X_test = X_test_imputed[selected_features]

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (316, 15)
X_test shape: (79, 15)
y_train shape: (316,)
y_test shape: (79,)


In [90]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score

svm_model = SVC(probability=True)
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-2, 0.1, 1, 10],
    'kernel': ['rbf'],  
    'class_weight': ['balanced']
}

random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=params,
    n_iter=25,  
    scoring='f1',  
    cv=8,  
    verbose=2,
    random_state=42,
    n_jobs=-1
)


random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_
print("Best hyperparameters found:", random_search.best_params_)

print("Trained model saved successfully!")
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1] 

print("\nClassification Report for svm:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("\nROC-AUC Score:", roc_auc)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("\nBalanced Accuracy Score:", balanced_acc)

Fitting 8 folds for each of 25 candidates, totalling 200 fits
Best hyperparameters found: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 10}
Trained model saved successfully!

Classification Report for svm:
               precision    recall  f1-score   support

         0.0       0.94      0.71      0.81        63
         1.0       0.42      0.81      0.55        16

    accuracy                           0.73        79
   macro avg       0.68      0.76      0.68        79
weighted avg       0.83      0.73      0.76        79


Confusion Matrix:
 [[45 18]
 [ 3 13]]

ROC-AUC Score: 0.8174603174603174

Balanced Accuracy Score: 0.7633928571428572


In [91]:
import joblib
joblib.dump(random_search.best_estimator_, 'best_svm_model.pkl')

print("Model saved successfully!")

Model saved successfully!


In [92]:
final_features_df.to_pickle('features.pkl')
print("Features saved successfully!")

Features saved successfully!


In [93]:
import joblib
model_to_save = {
    'model': random_search.best_estimator_,
    'parameters': random_search.best_params_,
    'selected_features': final_features_df  
}

joblib.dump(model_to_save, 'best_model.pkl')
print("Model and its parameters have been saved successfully!")


Model and its parameters have been saved successfully!


In [94]:
import joblib

preprocessing_tools = {
    'scaler': scaler,     
    'imputer': imputer   
}

joblib.dump(preprocessing_tools, 'preprocessing_tools.pkl')

print("Scaler and imputer saved successfully!")


Scaler and imputer saved successfully!


In [95]:
import joblib
import pandas as pd
import numpy as np

preprocessing_tools = joblib.load('preprocessing_tools.pkl')
scaler = preprocessing_tools['scaler']
imputer = preprocessing_tools['imputer']
model = joblib.load('best_svm_model.pkl')
print("Trained model loaded successfully!")

Trained model loaded successfully!


In [96]:
test_df= pd.read_excel('MLE 2/FinalTestDataset2024.xls')
test_df.head()

Unnamed: 0,ID,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002219,47.0,1,1,0,0,3,2,1,1,...,0.49835,0.49835,3.144594,0.003447,8257693.277,150.048587,0.001753,0.03711,0.001369,0.001513
1,TRG002222,41.0,1,1,0,0,3,2,1,0,...,0.622381,0.622381,2.061654,0.006535,1568441.643,26.484938,0.009649,0.019352,0.000321,0.008285
2,TRG002223,53.0,0,0,0,1,2,1,1,1,...,0.412482,0.412482,3.440353,0.005391,2656924.827,174.606929,0.001594,0.075152,0.005255,0.001444
3,TRG002235,46.0,1,1,0,0,2,1,1,1,...,0.378333,0.378333,3.531715,0.007102,1714787.173,96.787378,0.002772,0.053377,0.002666,0.002406
4,TRG002240,39.0,0,0,1,0,2,2,1,1,...,0.524767,0.524767,2.186214,0.007896,510479.346,12.789071,0.020072,0.02314,0.000463,0.017172


In [97]:
test_df.replace(999, np.nan, inplace= True)

In [98]:
total_missing = test_df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {total_missing/test_df.size*100:.2f}%")

Total missing values: 29
Percentage of missing values: 0.18%


In [99]:
preprocessing_tools = joblib.load('preprocessing_tools.pkl')
scaler = preprocessing_tools['scaler']
imputer = preprocessing_tools['imputer']
print("Preprocessing tools loaded successfully!")

Preprocessing tools loaded successfully!


In [100]:
test_dropped = test_df[['ID']]
test_df = test_df.drop(columns=['ID'], axis=1)
test_imputed = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)
test_scaled = pd.DataFrame(scaler.transform(test_imputed), columns=test_imputed.columns)

print(test_scaled)

          Age        ER       PgR      HER2  TrippleNegative  ChemoGrade  \
0   -0.404819  0.897616  1.169795 -0.677003        -0.692007    1.218417   
1   -0.949666  0.897616  1.169795 -0.677003        -0.692007    1.218417   
2    0.140027 -1.114062 -0.854850 -0.677003         1.445073   -0.799385   
3   -0.495627  0.897616  1.169795 -0.677003        -0.692007   -0.799385   
4   -1.131281 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   
..        ...       ...       ...       ...              ...         ...   
128  0.984539 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   
129 -0.486546 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
130 -0.650000 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
131 -1.303816 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
132  0.348885 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   

     Proliferation  HistologyType  LNStatus  TumourStage  ...  \
0         0.584788    

In [101]:
features_to_use = final_features_df['Feature'].tolist()
try:
    test_features = test_scaled[features_to_use]
except KeyError as e:
    print(f"Error extracting features: {e}")
    print("Available features in test_scaled:", test_scaled.columns)
    print("Expected features to use:", features_to_use)
    raise

print("Features extracted for testing:", test_features.columns)

predictions = model.predict(test_features)
predictions_proba = model.predict_proba(test_features)[:, 1]

output_df = pd.DataFrame({
    'ID': test_dropped['ID'].values,
    'Prediction': predictions
})

output_df.to_excel('final_test_output.xlsx', index=False)

print(output_df)

Features extracted for testing: Index(['HER2', 'original_glrlm_RunLengthNonUniformityNormalized',
       'original_glszm_GrayLevelNonUniformity',
       'original_shape_Maximum2DDiameterRow', 'original_ngtdm_Strength',
       'original_glszm_LargeAreaEmphasis', 'original_glrlm_ShortRunEmphasis',
       'original_glszm_GrayLevelNonUniformityNormalized', 'Gene', 'PgR', 'ER',
       'original_gldm_DependenceNonUniformity',
       'original_glrlm_RunLengthNonUniformity', 'LNStatus', 'HistologyType'],
      dtype='object')
            ID  Prediction
0    TRG002219         0.0
1    TRG002222         1.0
2    TRG002223         1.0
3    TRG002235         0.0
4    TRG002240         0.0
..         ...         ...
128  TRG002953         1.0
129  TRG002955         1.0
130  TRG002956         0.0
131  TRG002957         1.0
132  TRG002963         1.0

[133 rows x 2 columns]
