In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif



In [2]:
features_df = pd.read_csv("PI-CAI_features")
labels_df = pd.read_csv("marksheet.csv")



In [3]:
labels_df.rename(columns={'patient_id': 'patient_ID', 'study_id': 'study_ID'}, inplace=True)
labels_df['case_csPCa'] = labels_df['case_csPCa'].map(lambda x: 1 if x == 'YES' else 0)


In [4]:
df_ai = features_df[
    (features_df['annotator'] == 'AI') &
    (features_df['ROI'] == 'lesion') &
    (features_df['sequence'].isin(['adc', 't2w', 'hbv']))
].copy()

exclude_cols = ['annotator', 'ROI', 'ROI_ID', 'img_path', 'seg_path', 'extraction_ID']
df_ai_clean = df_ai.drop(columns=exclude_cols).copy()
df_ai_clean['sequence'] = df_ai_clean['sequence'].astype(str)
df_ai_clean.set_index(['patient_ID', 'study_ID', 'sequence'], inplace=True)


In [5]:
df_wide = df_ai_clean.unstack(level=-1)
df_wide.columns = [f"{seq}_{col}" for col, seq in df_wide.columns]
df_wide.reset_index(inplace=True)

In [16]:
print(df_ai_clean.columns.size, df_wide.columns.size)

1015 3047


In [6]:
df_final = df_wide.merge(labels_df, on=['patient_ID', 'study_ID'], how='inner')
y = df_final['case_csPCa']
X = df_final.drop(columns=['patient_ID', 'study_ID', 'case_csPCa'])


In [20]:
X

Unnamed: 0,adc_original_shape_Elongation,hbv_original_shape_Elongation,t2w_original_shape_Elongation,adc_original_shape_Flatness,hbv_original_shape_Flatness,t2w_original_shape_Flatness,adc_original_shape_LeastAxisLength,hbv_original_shape_LeastAxisLength,t2w_original_shape_LeastAxisLength,adc_original_shape_MajorAxisLength,...,mri_date,patient_age,psa,psad,prostate_volume,histopath_type,lesion_GS,lesion_ISUP,case_ISUP,center
0,0.733925,0.733925,0.810849,0.556818,0.556818,0.536311,6.292914,6.292914,5.872887,11.301554,...,2012-07-18,64,12.10,0.24,51.0,MRBx,"4+3,0+0",30,3,RUMC
1,0.223764,0.223764,0.219432,0.174031,0.174031,0.187087,9.527915,9.527915,10.296406,54.748291,...,2020-12-06,81,11.10,0.20,56.0,SysBx+MRBx,"4+3,3+4",32,3,ZGT
2,0.569978,0.569978,0.561657,0.353260,0.353260,0.342182,7.127349,7.127349,6.991918,20.175907,...,2012-08-02,64,9.90,0.14,70.0,MRBx,3+4,2,2,RUMC
3,0.914667,0.914667,0.947138,0.804680,0.804680,0.858744,8.315141,8.315141,8.432565,10.333475,...,2018-11-21,64,4.50,0.10,52.0,RP,3+4,2,2,ZGT
4,0.714127,0.714127,1.000000,0.659643,0.659643,0.750000,5.020273,5.020273,5.196152,7.610592,...,2012-12-15,61,10.00,0.24,42.0,MRBx,"N/A,N/A,N/A,3+4",2,2,RUMC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,0.831726,0.831726,0.831726,0.622951,0.622951,0.622951,10.157247,10.157247,10.157247,16.305044,...,2016-10-28,65,6.40,,50.0,RP,3+4,2,2,PCNN
404,0.563701,0.563701,0.517514,0.542741,0.542741,0.473542,7.190927,7.190927,6.502643,13.249271,...,2020-11-07,64,4.30,0.20,23.0,SysBx+MRBx,4+3,3,3,ZGT
405,0.941727,0.941727,0.861388,0.791911,0.791911,0.818372,9.809743,9.809743,10.240434,12.387438,...,2012-08-25,71,12.50,0.21,62.0,MRBx,"3+4,N/A,3+3",21,2,RUMC
406,0.413306,0.413306,0.392137,0.294534,0.294534,0.303200,6.688317,6.688317,6.774319,22.708135,...,2019-06-28,81,5.28,0.12,44.0,SysBx+MRBx,3+4,2,2,RUMC


In [7]:
sequence_combinations = {
    "ADC": ['adc'],
    "T2W": ['t2w'],
    "HBV": ['hbv'],
    "ADC + T2W": ['adc', 't2w'],
    "ADC + HBV": ['adc', 'hbv'],
    "T2W + HBV": ['t2w', 'hbv'],
    "ADC + T2W + HBV": ['adc', 't2w', 'hbv']
}

In [8]:
pca_selected_features = {}
anova_selected_features = {}

In [9]:
for name, prefixes in sequence_combinations.items():
    cols = [col for col in X.columns if any(col.startswith(prefix) for prefix in prefixes)]
    if not cols:
        continue
    X_subset = X[cols].fillna(X[cols].median())
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_subset)
    pca = PCA(n_components=0.90)
    pca.fit(X_scaled)
    loadings = abs(pca.components_).sum(axis=0)
    feature_scores = dict(zip(X_subset.columns, loadings))
    sorted_features = sorted(feature_scores.items(), key=lambda x: -x[1])
    pca_selected_features[name] = [f for f, _ in sorted_features]


# PCA: selezione unsupervised

In [10]:

pca = PCA(n_components=0.90)
X_pca = pca.fit_transform(X_scaled)
loadings = np.abs(pca.components_).sum(axis=0)
top_indices = np.argsort(loadings)[::-1][:50]
pca_selected_features[name] = [X_subset.columns[i] for i in top_indices]

Seleziona features trovate con PCA

In [11]:
for name in sequence_combinations:
    print(f"\n🔹 Features selected for {name}:")
    feats = pca_selected_features.get(name, [])
    if feats:
        for feat in feats:
            print(f"  - {feat}")
    else:
        print("  (nessuna feature selezionata)")
    print("-" * 80)


🔹 Features selected for ADC:
  - adc_original_shape_Flatness
  - adc_original_shape_Elongation
  - adc_wavelet-HLH_firstorder_RootMeanSquared
  - adc_wavelet-HLH_firstorder_Mean
  - adc_wavelet-LHH_firstorder_Median
  - adc_original_shape_Sphericity
  - adc_wavelet-HHH_firstorder_Mean
  - adc_wavelet-LHL_ngtdm_Contrast
  - adc_wavelet-HHL_firstorder_Mean
  - adc_wavelet-HHH_firstorder_RootMeanSquared
  - adc_original_glszm_LargeAreaLowGrayLevelEmphasis
  - adc_wavelet-LHH_firstorder_Mean
  - adc_wavelet-LHH_firstorder_Skewness
  - adc_wavelet-HHL_firstorder_RootMeanSquared
  - adc_wavelet-LHH_firstorder_RootMeanSquared
  - adc_wavelet-LLH_glcm_ClusterShade
  - adc_wavelet-HLH_firstorder_Median
  - adc_wavelet-HHL_firstorder_Skewness
  - adc_wavelet-HHH_glszm_LargeAreaLowGrayLevelEmphasis
  - adc_wavelet-HLL_gldm_LargeDependenceLowGrayLevelEmphasis
  - adc_wavelet-HHL_glcm_ClusterShade
  - adc_wavelet-HHH_ngtdm_Contrast
  - adc_wavelet-HHH_glcm_ClusterShade
  - adc_wavelet-HHH_glszm_La

In [23]:
print(len(feats))

50
