In [3]:
import numpy as np
import pandas as pd
import os.path as op
from sklearn.feature_selection import SelectKBest, chi2
import pickle
from scipy.fft import fft, fftfreq
from tqdm import tqdm

In [2]:
def load_obj(name, path):
    with open(op.join(path, name), 'rb') as f:
        return pickle.load(f)

In [3]:
patients = ["25", "27", "38", "61", "83", "97", "18", "22", "54", "68", "69", "84", "105", "42", "20", "30", "11", "47", "56", "101", "117", "4", "8", "26", "66", "67", "79", "92", "111", "2", "24", "65", "73", "100", "118", "119", "23", "35","40", "62", "102", "107", "108", "109"]

In [4]:
patients_df = load_obj("preprocessed_data.pkl","/sps/crnl/ademasson/data/features_extraction")

In [5]:
patients_df.keys()

dict_keys(['X_25', 'y_25', 'X_27', 'y_27', 'X_38', 'y_38', 'X_61', 'y_61', 'X_83', 'y_83', 'X_97', 'y_97', 'X_18', 'y_18', 'X_22', 'y_22', 'X_54', 'y_54', 'X_68', 'y_68', 'X_69', 'y_69', 'X_84', 'y_84', 'X_105', 'y_105', 'X_42', 'y_42', 'X_20', 'y_20', 'X_30', 'y_30', 'X_11', 'y_11', 'X_47', 'y_47', 'X_56', 'y_56', 'X_101', 'y_101', 'X_117', 'y_117', 'X_4', 'y_4', 'X_8', 'y_8', 'X_26', 'y_26', 'X_66', 'y_66', 'X_67', 'y_67', 'X_79', 'y_79', 'X_92', 'y_92', 'X_111', 'y_111', 'X_2', 'y_2', 'X_24', 'y_24', 'X_65', 'y_65', 'X_73', 'y_73', 'X_100', 'y_100', 'X_118', 'y_118', 'X_119', 'y_119', 'X_23', 'y_23', 'X_35', 'y_35', 'X_40', 'y_40', 'X_62', 'y_62', 'X_102', 'y_102', 'X_107', 'y_107', 'X_108', 'y_108', 'X_109', 'y_109'])

In [29]:
list_of_features = ["ppa", "upslope", "downslope", "std", "amplitude_ratio", "sharpness", "average_slope", "main_frequency", "phase_congruency"]

In [30]:
cv = 5
patients_per_fold = len(patients) // cv

In [17]:
for i in tqdm(range(0,cv)):
    test_patients = patients[i*patients_per_fold:(i+1)*patients_per_fold]
    train_patients = patients[:i*patients_per_fold] + patients[(i+1)*patients_per_fold:]
    # Train sets
    X_train = None
    y_train = None
    for j, patient in enumerate(train_patients):
        X_patient_set = patients_df[f"X_{patient}"]
        y_patient_set = patients_df[f"y_{patient}"]
        if X_train is None:
            X_train = X_patient_set
        else:
            X_train = pd.concat([X_train, X_patient_set], axis=0, ignore_index=True)

        if y_train is None:
            y_train = y_patient_set
        else:
            y_train = np.hstack([y_train, y_patient_set])
            
    # Test sets
    X_test = None
    y_test = None
    for j, patient in enumerate(test_patients):
        X_patient_set = patients_df[f"X_{patient}"]
        y_patient_set = patients_df[f"y_{patient}"]             
        if X_test is None:
            X_test = X_patient_set
        else:
            X_test = pd.concat([X_test, X_patient_set], axis=0, ignore_index=True)

        if y_test is None:
            y_test = y_patient_set
        else:
            y_test = np.hstack([y_test, y_patient_set])
            
    
    # SelectKBest
    train_selector = SelectKBest(chi2, k=X_train.shape[1])
#     y_train[:] = y_train[:X_train.shape[1]]
    train_selector.fit_transform(X_train, y_train)
    train_feature_scores = {feature: train_selector.scores_[i] for i, feature in enumerate(X_train.columns)}
    train_feature_mean_scores = {}
    for feature in list_of_features:
        scores = []
        for col in X_train.columns:
            if col.startswith(feature):
                scores.append(train_feature_scores[col])
        train_feature_mean_scores[feature] = np.mean(scores)
    train_feature_mean_scores = {k: v for k, v in sorted(train_feature_mean_scores.items(), key=lambda x: x[1], reverse=True)}
    
    test_selector = SelectKBest(chi2, k=X_test.shape[1])
#     y_test[:] = y_test[:X_test.shape[1]]
    test_selector.fit_transform(X_test, y_test)
    test_feature_scores = {feature: test_selector.scores_[i] for i, feature in enumerate(X_test.columns)}
    test_feature_mean_scores = {}
    for feature in list_of_features:
        scores = []
        for col in X_test.columns:
            if col.startswith(feature):
                scores.append(test_feature_scores[col])
        test_feature_mean_scores[feature] = np.mean(scores)
    test_feature_mean_scores = {k: v for k, v in sorted(test_feature_mean_scores.items(), key=lambda x: x[1], reverse=True)}
    
    print("--------------------------")
    print(f"CV n° {i+1}")
    print("--------------------------")
    print("Feature scores on Train set : ")
    for k,v in train_feature_mean_scores.items():
        print(k + " : " + str(v/len(train_patients)))
    print("--------------------------")
    print("Feature scores on Test set : ")
    for k,v in test_feature_mean_scores.items():
        print(k + " : " + str(v/len(test_patients)))
    print("--------------------------")

 20%|██        | 1/5 [00:35<02:21, 35.28s/it]

--------------------------
CV n° 1
--------------------------
Feature scores on Train set : 
std : 3.3681694557346664
ppa : 3.232113074059955
average_slope : 1.3647151000114666
upslope : 0.8210867292151032
amplitude_ratio : 0.4713060574652315
sharpness : 0.3781794495017944
downslope : 0.26184123830906386
main_frequency : 0.21286297738463808
phase_congruency : 0.0020523560842613245
--------------------------
Feature scores on Test set : 
ppa : 2.2693674674026445
std : 2.248466917205477
amplitude_ratio : 1.2815445369252538
average_slope : 1.0211687122046789
upslope : 0.4922881798393459
sharpness : 0.27112143652462084
downslope : 0.2222036363232927
main_frequency : 0.21078242557042165
phase_congruency : 0.0034768899777414294
--------------------------


 40%|████      | 2/5 [01:11<01:46, 35.58s/it]

--------------------------
CV n° 2
--------------------------
Feature scores on Train set : 
std : 3.3193101294462006
ppa : 3.2045893971541877
average_slope : 1.2691743542937295
upslope : 0.7312402182856631
amplitude_ratio : 0.49921925992199134
sharpness : 0.33764961858521114
downslope : 0.24954010824991857
main_frequency : 0.15361416113335186
phase_congruency : 0.0018795697725526419
--------------------------
Feature scores on Test set : 
ppa : 2.564003056617864
std : 2.5233106754195394
average_slope : 1.6752904561973496
upslope : 1.056674382479084
amplitude_ratio : 0.6609594052268705
sharpness : 0.63769582940431
main_frequency : 0.40059128578476755
downslope : 0.35366699494897536
phase_congruency : 0.004077192844460162
--------------------------


 60%|██████    | 3/5 [01:46<01:11, 35.57s/it]

--------------------------
CV n° 3
--------------------------
Feature scores on Train set : 
std : 3.196156881041073
ppa : 3.0057363395806
average_slope : 1.057941739576991
upslope : 0.5594011896748392
amplitude_ratio : 0.5006490013062245
sharpness : 0.2190831412520605
main_frequency : 0.21783885542131343
downslope : 0.19461281134263825
phase_congruency : 0.00206702626240038
--------------------------
Feature scores on Test set : 
ppa : 3.348922897733546
average_slope : 3.107679237585826
std : 3.050060125340736
upslope : 2.280955671369411
sharpness : 1.880592862395085
amplitude_ratio : 1.094148682040836
downslope : 0.6770209338872232
main_frequency : 0.2858040590341538
phase_congruency : 0.004193896991734877
--------------------------


 80%|████████  | 4/5 [02:22<00:35, 35.68s/it]

--------------------------
CV n° 4
--------------------------
Feature scores on Train set : 
std : 3.444100176240592
ppa : 3.290310964710928
average_slope : 1.348808369382191
upslope : 0.7393460393391756
amplitude_ratio : 0.43201855149473867
sharpness : 0.3127824558002643
downslope : 0.2460166472051445
main_frequency : 0.13381435465581515
phase_congruency : 0.0017390400642625589
--------------------------
Feature scores on Test set : 
std : 2.752583780009679
ppa : 2.5019244421901026
amplitude_ratio : 1.6766717290876616
average_slope : 1.5846081514063792
upslope : 1.2031197774330606
sharpness : 0.9821175064871133
downslope : 0.4015792883767464
main_frequency : 0.27152401807199555
phase_congruency : 0.0046762271590830035
--------------------------


100%|██████████| 5/5 [02:58<00:00, 35.61s/it]

--------------------------
CV n° 5
--------------------------
Feature scores on Train set : 
ppa : 2.446936169396782
std : 2.4284765665230306
average_slope : 1.5995981955815812
upslope : 1.0726065965368
sharpness : 0.7049184094660838
downslope : 0.355725319000917
amplitude_ratio : 0.25256020334120954
main_frequency : 0.05669168604798975
phase_congruency : 0.0012685640654564515
--------------------------
Feature scores on Test set : 
std : 7.314594997571441
ppa : 6.857561930027476
amplitude_ratio : 2.4875005739126133
average_slope : 2.1711040234570667
upslope : 1.2808898429536895
main_frequency : 0.8370147326019971
sharpness : 0.6021423294341738
downslope : 0.38753877208121457
phase_congruency : 0.0073509884843020335
--------------------------





In [32]:
features_columns = {}

for feature in list_of_features:
    matching_columns = [col for col in patients_df['X_25'].columns if col.startswith(feature)]
    features_columns[feature] = matching_columns

In [25]:
list_of_features = ["std", "ppa", "average_slope", "upslope", "amplitude_ratio", "down_slope", "main_frequency", "phase_congruency"]

In [34]:
features_columns

{'ppa': ['ppa_Channel_1',
  'ppa_Channel_2',
  'ppa_Channel_3',
  'ppa_Channel_4',
  'ppa_Channel_5',
  'ppa_Channel_6',
  'ppa_Channel_7',
  'ppa_Channel_8',
  'ppa_Channel_9',
  'ppa_Channel_10',
  'ppa_Channel_11',
  'ppa_Channel_12',
  'ppa_Channel_13',
  'ppa_Channel_14',
  'ppa_Channel_15',
  'ppa_Channel_16',
  'ppa_Channel_17',
  'ppa_Channel_18',
  'ppa_Channel_19',
  'ppa_Channel_20',
  'ppa_Channel_21',
  'ppa_Channel_22',
  'ppa_Channel_23',
  'ppa_Channel_24',
  'ppa_Channel_25',
  'ppa_Channel_26',
  'ppa_Channel_27',
  'ppa_Channel_28',
  'ppa_Channel_29',
  'ppa_Channel_30',
  'ppa_Channel_31',
  'ppa_Channel_32',
  'ppa_Channel_33',
  'ppa_Channel_34',
  'ppa_Channel_35',
  'ppa_Channel_36',
  'ppa_Channel_37',
  'ppa_Channel_38',
  'ppa_Channel_39',
  'ppa_Channel_40',
  'ppa_Channel_41',
  'ppa_Channel_42',
  'ppa_Channel_43',
  'ppa_Channel_44',
  'ppa_Channel_45',
  'ppa_Channel_46',
  'ppa_Channel_47',
  'ppa_Channel_48',
  'ppa_Channel_49',
  'ppa_Channel_50',
  