In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import optuna
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif


In [126]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from scipy.stats import shapiro, normaltest, levene, probplot

In [127]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, balanced_accuracy_score
from skopt.space import Integer, Real, Categorical

from sklearn.svm import SVC

In [128]:

# Restore the persisted preprocessing objects and the trained classifier.
# NOTE(review): joblib.load is pickle-based, so these files should only ever
# come from a trusted source -- confirm where the artifacts originate.
preprocessing_tools = joblib.load('preprocessing_tools.pkl')
scaler, imputer = preprocessing_tools['scaler'], preprocessing_tools['imputer']

loaded_data = joblib.load('best_model.pkl')
model = loaded_data['model']

# Report which estimator class was restored, then its hyper-parameters.
print(type(model), model.get_params(), sep='\n')


<class 'sklearn.svm._classes.SVC'>
{'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': 'balanced', 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.001, 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [129]:
# Read the test spreadsheet into a DataFrame and preview its first five rows.
test_df = pd.read_excel(io='MLE 2/FinalTestDataset2024.xls')
test_df.head(n=5)

Unnamed: 0,ID,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002219,47.0,1,1,0,0,3,2,1,1,...,0.49835,0.49835,3.144594,0.003447,8257693.277,150.048587,0.001753,0.03711,0.001369,0.001513
1,TRG002222,41.0,1,1,0,0,3,2,1,0,...,0.622381,0.622381,2.061654,0.006535,1568441.643,26.484938,0.009649,0.019352,0.000321,0.008285
2,TRG002223,53.0,0,0,0,1,2,1,1,1,...,0.412482,0.412482,3.440353,0.005391,2656924.827,174.606929,0.001594,0.075152,0.005255,0.001444
3,TRG002235,46.0,1,1,0,0,2,1,1,1,...,0.378333,0.378333,3.531715,0.007102,1714787.173,96.787378,0.002772,0.053377,0.002666,0.002406
4,TRG002240,39.0,0,0,1,0,2,2,1,1,...,0.524767,0.524767,2.186214,0.007896,510479.346,12.789071,0.020072,0.02314,0.000463,0.017172


In [130]:
# Turn every 999 entry into NaN so the next cell counts it as missing
# (presumably 999 is the dataset's missing-value placeholder -- TODO confirm).
test_df = test_df.replace(to_replace=999, value=np.nan)

In [131]:
# Count NaN cells over the whole frame and report them as a share of all cells.
total_missing = test_df.isna().to_numpy().sum()
print(
    f"Total missing values: {total_missing}",
    f"Percentage of missing values: {total_missing/test_df.size*100:.2f}%",
    sep="\n",
)

Total missing values: 29
Percentage of missing values: 0.18%


In [132]:
# Load the persisted preprocessing bundle and pull out the fitted scaler and imputer.
# NOTE(review): this reads the same file and keys as the earlier loading cell,
# so it looks redundant -- confirm before removing.
preprocessing_tools = joblib.load('preprocessing_tools.pkl')
scaler, imputer = (preprocessing_tools[key] for key in ('scaler', 'imputer'))


In [133]:
# Prepare the test features with the already-fitted imputer and scaler.
#
# The ID column is split off only while it is still present in `test_df`.
# Previously this cell always indexed `test_df[['ID']]` after overwriting
# `test_df` without that column, so running the cell a second time raised a
# KeyError. With the guard, a re-run keeps the `test_dropped` captured earlier.
if 'ID' in test_df.columns:
    test_dropped = test_df[['ID']]  # kept so predictions can be labelled with their IDs
    # `columns=` already names the axis, so the redundant `axis=1` is omitted.
    test_df = test_df.drop(columns=['ID'])

# Re-wrap each transformer's output in a DataFrame so the column labels are kept.
test_imputed = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)
test_scaled = pd.DataFrame(scaler.transform(test_imputed), columns=test_imputed.columns)

# Predictions are paired with IDs by position further down, so the number of
# rows must not have changed during preprocessing.
assert len(test_scaled) == len(test_dropped), "row count changed during preprocessing"

print(test_scaled)

          Age        ER       PgR      HER2  TrippleNegative  ChemoGrade  \
0   -0.404819  0.897616  1.169795 -0.677003        -0.692007    1.218417   
1   -0.949666  0.897616  1.169795 -0.677003        -0.692007    1.218417   
2    0.140027 -1.114062 -0.854850 -0.677003         1.445073   -0.799385   
3   -0.495627  0.897616  1.169795 -0.677003        -0.692007   -0.799385   
4   -1.131281 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   
..        ...       ...       ...       ...              ...         ...   
128  0.984539 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   
129 -0.486546 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
130 -0.650000 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
131 -1.303816 -1.114062 -0.854850 -0.677003         1.445073    1.218417   
132  0.348885 -1.114062 -0.854850  1.477098        -0.692007   -0.799385   

     Proliferation  HistologyType  LNStatus  TumourStage  ...  \
0         0.584788    

In [134]:
# Load the persisted feature selection and show it as a table.
features_selected = pd.DataFrame(data=joblib.load('features.pkl'))
print(features_selected)

                                            Feature
0                                          LNStatus
1                   original_glrlm_ShortRunEmphasis
2   original_glrlm_RunLengthNonUniformityNormalized
3             original_glrlm_RunLengthNonUniformity
4                                                ER
5                  original_glszm_LargeAreaEmphasis
6                                              HER2
7   original_glszm_GrayLevelNonUniformityNormalized
8             original_gldm_DependenceNonUniformity
9               original_shape_Maximum2DDiameterRow
10                          original_ngtdm_Strength
11                                             Gene
12                                    HistologyType
13                                              PgR
14            original_glszm_GrayLevelNonUniformity


In [135]:
# Column names listed in the stored feature selection, in their stored order.
features_to_use = list(features_selected['Feature'])

# Pick those columns out of the scaled test frame. If any are absent, print
# what was available versus what was requested, then let the error propagate.
try:
    test_features = test_scaled[features_to_use]
except KeyError as missing_err:
    print(f"Error extracting features: {missing_err}")
    print("Available features in test_scaled:", test_scaled.columns)
    print("Expected features to use:", features_to_use)
    raise

print("Features extracted for testing:", test_features.columns)

# Class labels, plus the second column of the probability matrix.
predictions = model.predict(test_features)
predictions_proba = model.predict_proba(test_features)[:, 1]

# Pair each ID with its predicted label by position, save the table as CSV,
# and echo it.
output_df = pd.DataFrame({'ID': test_dropped['ID'].values}).assign(Prediction=predictions)
output_df.to_csv('PCRPrediction.csv', index=False)

print(output_df)

Features extracted for testing: Index(['LNStatus', 'original_glrlm_ShortRunEmphasis',
       'original_glrlm_RunLengthNonUniformityNormalized',
       'original_glrlm_RunLengthNonUniformity', 'ER',
       'original_glszm_LargeAreaEmphasis', 'HER2',
       'original_glszm_GrayLevelNonUniformityNormalized',
       'original_gldm_DependenceNonUniformity',
       'original_shape_Maximum2DDiameterRow', 'original_ngtdm_Strength',
       'Gene', 'HistologyType', 'PgR',
       'original_glszm_GrayLevelNonUniformity'],
      dtype='object')
            ID  Prediction
0    TRG002219         0.0
1    TRG002222         1.0
2    TRG002223         1.0
3    TRG002235         0.0
4    TRG002240         0.0
..         ...         ...
128  TRG002953         1.0
129  TRG002955         1.0
130  TRG002956         0.0
131  TRG002957         1.0
132  TRG002963         1.0

[133 rows x 2 columns]
