In [187]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import optuna
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

In [188]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from scipy.stats import shapiro, normaltest, levene, probplot

In [189]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
from skopt.space import Integer, Real, Categorical

from sklearn.svm import SVC

In [190]:
pcr_df= pd.read_excel("MLE 2/TrainDataset2024.xls")

In [191]:
pcr_df.head()

Unnamed: 0,ID,pCR,RelapseFreeSurvival,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1,144.0,41.0,0,0,0,1,3,3,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0,142.0,39.0,1,1,0,0,3,3,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1,135.0,31.0,0,0,0,1,2,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0,12.0,35.0,0,0,0,1,3,3,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0,109.0,61.0,1,0,0,0,2,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [192]:
pcr_df.shape

(400, 121)

In [193]:
pcr_df.replace(999, np.nan, inplace= True)
pcr_df.isnull().sum()

ID                           0
pCR                          5
RelapseFreeSurvival          0
Age                          0
ER                           0
                            ..
original_ngtdm_Busyness      0
original_ngtdm_Coarseness    0
original_ngtdm_Complexity    0
original_ngtdm_Contrast      0
original_ngtdm_Strength      0
Length: 121, dtype: int64

In [194]:
total_missing = pcr_df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {total_missing/pcr_df.size*100:.2f}%")

Total missing values: 105
Percentage of missing values: 0.22%


In [195]:
pcr_df.dropna(subset=['pCR'], inplace=True)

print(f"Dataset shape after dropping missing pCR: {pcr_df.shape}")
print(f"Missing values in pCR: {pcr_df['pCR'].isnull().sum()}")


Dataset shape after dropping missing pCR: (395, 121)
Missing values in pCR: 0


In [196]:
pcr_df = pcr_df.drop(columns=['ID', 'RelapseFreeSurvival'], axis=1)
pcr_df.head()

Unnamed: 0,pCR,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1.0,41.0,0,0.0,0.0,1.0,3.0,3.0,1.0,1.0,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0.0,39.0,1,1.0,0.0,0.0,3.0,3.0,1.0,1.0,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1.0,31.0,0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0.0,35.0,0,0.0,0.0,1.0,3.0,3.0,1.0,1.0,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0.0,61.0,1,0.0,0.0,0.0,2.0,1.0,1.0,0.0,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [197]:
total_missing = pcr_df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {total_missing/pcr_df.size*100:.2f}%")

Total missing values: 97
Percentage of missing values: 0.21%


In [198]:
X = pcr_df.drop(columns=['pCR'])
y = pcr_df['pCR']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}")


Train shape: (316, 118), (316,)
Test shape: (79, 118), (79,)


In [199]:
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors= 8)

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_scaled), columns=X_train.columns)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_scaled), columns=X_test.columns)

print(f"X_train shape after scaling and imputing: {X_train_imputed.shape}")
print(f"X_test shape after scaling and imputing: {X_test_imputed.shape}")


X_train shape after scaling and imputing: (316, 118)
X_test shape after scaling and imputing: (79, 118)


In [200]:
from scipy.stats import shapiro, levene
normality_pvalues = X_train_imputed.apply(lambda x: shapiro(x).pvalue)

def check_levene_test(column):
    half_index = len(column) // 2
    group1 = column.iloc[:half_index]
    group2 = column.iloc[half_index:]
    return levene(group1, group2).pvalue

variances_pvalues = X_train_imputed.apply(check_levene_test)
normality_summary = normality_pvalues > 0.05
variance_summary = variances_pvalues > 0.05

for col in X_train_imputed.columns:
    print(f"Feature: {col}")
    print(f"  - Normally distributed? {'Yes' if normality_summary[col] else 'No'}")
    print(f"  - Equal variances? {'Yes' if variance_summary[col] else 'No'}")
    print()


Feature: Age
  - Normally distributed? Yes
  - Equal variances? Yes

Feature: ER
  - Normally distributed? No
  - Equal variances? Yes

Feature: PgR
  - Normally distributed? No
  - Equal variances? Yes

Feature: HER2
  - Normally distributed? No
  - Equal variances? Yes

Feature: TrippleNegative
  - Normally distributed? No
  - Equal variances? Yes

Feature: ChemoGrade
  - Normally distributed? No
  - Equal variances? Yes

Feature: Proliferation
  - Normally distributed? No
  - Equal variances? Yes

Feature: HistologyType
  - Normally distributed? No
  - Equal variances? Yes

Feature: LNStatus
  - Normally distributed? No
  - Equal variances? Yes

Feature: TumourStage
  - Normally distributed? No
  - Equal variances? Yes

Feature: Gene
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_shape_Elongation
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_shape_Flatness
  - Normally distributed? No
  - Equal variances? Yes

Feature: original_sh

In [201]:

mi_scores = mutual_info_classif(X_train_imputed, y_train, random_state=32)
mi_df = pd.DataFrame({
    'Feature': X_train_imputed.columns,
    'MI_Score': mi_scores
})

mi_df = mi_df.sort_values(by='MI_Score', ascending=False).reset_index(drop=True)

top_n = 14
top_features = mi_df.head(top_n)

important_features = ['HER2']
final_features = list(set(important_features + top_features['Feature'].tolist()))

final_features_df = pd.DataFrame({'Feature': final_features}).reset_index(drop=True)

print("\nTop N features with MI scores:")
print(top_features)

print("\nFinal features selected:")
print(final_features_df)



Top N features with MI scores:
                                            Feature  MI_Score
0                                              Gene  0.087606
1                                               PgR  0.074997
2                                                ER  0.052158
3               original_shape_Maximum2DDiameterRow  0.050315
4                  original_glszm_LargeAreaEmphasis  0.040856
5   original_glrlm_RunLengthNonUniformityNormalized  0.037803
6                           original_ngtdm_Strength  0.037607
7                                          LNStatus  0.033754
8   original_glszm_GrayLevelNonUniformityNormalized  0.030466
9             original_gldm_DependenceNonUniformity  0.030066
10                                    HistologyType  0.030062
11                  original_glrlm_ShortRunEmphasis  0.029235
12            original_glszm_GrayLevelNonUniformity  0.028979
13            original_glrlm_RunLengthNonUniformity  0.026563

Final features selected:
            

In [202]:

selected_features = final_features_df['Feature'].tolist()

X_train = X_train_imputed[selected_features]
X_test = X_test_imputed[selected_features]

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (316, 15)
X_test shape: (79, 15)
y_train shape: (316,)
y_test shape: (79,)


In [203]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score

svm_model = SVC(probability=True)
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-2, 0.1, 1, 10],
    'kernel': ['rbf'],  
    'class_weight': ['balanced']
}

random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=params,
    n_iter=25,  
    scoring='f1',  
    cv=8,  
    verbose=2,
    random_state=42,
    n_jobs=-1
)


random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_
print("Best hyperparameters found:", random_search.best_params_)

print("Trained model saved successfully!")
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1] 

print("\nClassification Report for svm:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("\nROC-AUC Score:", roc_auc)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("\nBalanced Accuracy Score:", balanced_acc)

Fitting 8 folds for each of 25 candidates, totalling 200 fits
Best hyperparameters found: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 10}
Trained model saved successfully!

Classification Report for svm:
               precision    recall  f1-score   support

         0.0       0.94      0.71      0.81        63
         1.0       0.42      0.81      0.55        16

    accuracy                           0.73        79
   macro avg       0.68      0.76      0.68        79
weighted avg       0.83      0.73      0.76        79


Confusion Matrix:
 [[45 18]
 [ 3 13]]

ROC-AUC Score: 0.8179563492063492

Balanced Accuracy Score: 0.7633928571428572


In [242]:
import joblib
model_to_save = {
    'model': random_search.best_estimator_,
    'parameters': random_search.best_params_
}

joblib.dump(model_to_save, 'best_model.pkl')

print("The following content has been saved in 'best_model.pkl':")
print("Model:", model_to_save['model'])
print("Best Parameters:", model_to_save['parameters'])


The following content has been saved in 'best_model.pkl':
Model: SVC(C=10, class_weight='balanced', gamma=0.001, probability=True)
Best Parameters: {'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced', 'C': 10}


In [263]:
preprocessing_tools = {
    'scaler': scaler,     
    'imputer': imputer   
}

joblib.dump(preprocessing_tools, 'preprocessing_tools.pkl')

print("Scaler and imputer saved successfully!")


Scaler and imputer saved successfully!
