# 12_interpretability.ipynb

In [2]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
# Project root, taken as two levels above the kernel's working directory.
path = Path.cwd().parent.parent

# Principal-component features per patient (per the file name). rename() returns
# a new frame instead of mutating in place, so the statement is a single
# load-and-normalise step that is safe to re-run.
pc = pd.read_csv(
    path / "data" / "interim" / "pc_by_feature_group_for_patients.csv"
).rename(columns={'Patient.ID': 'Patient ID'})
clin = pd.read_csv(path / "data" / "raw" / "clinicalData_clean.csv")
TARGET = 'Mol Subtype'

# Inner join: keep only patients present in both the PC table and the clinical table.
data = pc.merge(clin[['Patient ID', TARGET]], on='Patient ID', how='inner')
# Discard the stray index column if the CSV was saved with one.
data = data.drop('Unnamed: 0', axis=1, errors='ignore')
# Remove every row that has a missing value in any column (features or target).
data = data.dropna()

y = data[TARGET]
# Integer-encode any non-numeric target. Testing for "not numeric" covers object
# and category columns as before, and also dedicated string dtypes, which the
# previous `== 'object'` comparison did not match.
if not pd.api.types.is_numeric_dtype(y):
    le = LabelEncoder()
    y = le.fit_transform(y)

# Drop the target and patient ID columns to get the feature set
X = data.drop([TARGET, 'Patient ID'], axis=1, errors='ignore')

## Permutation importance by feature group
* Used to determine how much a model's performance relies on each feature
* Works by shuffling the values of a feature and observing the impact on the model's error rate
* A large increase in the error after shuffling indicates that the feature is important, while little to no change suggests that it is not

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters (the name suggests they come from an earlier
# tuning step — TODO confirm where they were selected).
best_rf_params = {"bootstrap": True, 
                "max_depth": 10,
                "min_samples_leaf": 4,
                "min_samples_split": 5,
                "n_estimators": 500}

# Seed the forest so bootstrap sampling and feature sub-sampling are repeatable;
# without it the fitted model, and every importance derived from it, changes on
# each run. 42 matches the seed passed to permutation_importance in this notebook.
rf = RandomForestClassifier(**best_rf_params, random_state=42)
rf.fit(X, y)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance: shuffle one feature column at a time and record how
# much the fitted model's score changes.
# NOTE(review): X and y are the same data `rf` was fit on, so these values
# describe what the model relies on for its training data; a held-out split
# would be needed to judge importance for generalisation.
permutation_settings = dict(
    n_repeats=10,     # shuffles per feature
    random_state=42,  # makes the shuffles repeatable
    n_jobs=-1,        # run on all available CPU cores
)
result = permutation_importance(rf, X, y, **permutation_settings)

In [None]:
# Column labels of the feature matrix, in the same order as the importance arrays
feature_names = X.columns

# One row per feature, ranked from most to least important.
importance_columns = {
    'feature': feature_names,
    # mean score drop across the shuffles of that feature (accuracy, assuming
    # the classifier's default scorer was used)
    'importance_mean': result.importances_mean,
    # spread of that drop across the repeated shuffles
    'importance_std': result.importances_std,
}
importances_df = (
    pd.DataFrame(importance_columns)
    .sort_values('importance_mean', ascending=False)
)

print(importances_df)

                                       feature  importance_mean  \
0      PC1_Combining_Tumor_and_FGT_Enhancement         0.039371   
8                  PC1_FGT_Enhancement_Texture         0.035033   
4              PC1_Tumor_Enhancement_Variation         0.034707   
2                PC1_Tumor_Enhancement_Texture         0.033948   
7                        PC1_Tumor_Enhancement         0.033731   
9                PC1_FGT_Enhancement_Variation         0.033406   
6                          PC1_FGT_Enhancement         0.031128   
1                PC1_Tumor_Size_and_Morphology         0.027223   
5           PC1_Breast_and_FGT_Volume_Features         0.027007   
3  PC1_Tumor_Enhancement_Spatial_Heterogeneity         0.026247   

   importance_std  
0        0.003070  
8        0.001882  
4        0.003757  
2        0.002992  
7        0.002805  
9        0.002158  
6        0.002912  
1        0.002763  
5        0.002398  
3        0.002603  
