In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path as op
import pickle
from scipy.fft import fft, fftfreq
from tqdm import tqdm

In [2]:
def load_obj(name, path):
    with open(op.join(path, name), 'rb') as f:
        return pickle.load(f)

In [3]:
data_path = "/sps/crnl/pmouches/data/MEG_PHRC_2006_Ap18"
file_prefix = "data_raw_"

In [4]:
patients = ["42", "83", "4", "118", "108"]

# Window Creation

In [5]:
def create_windows_dataframe(data, samples_per_window, nb_samples, nb_channels, sample_rate, overlap_samples):
    windows = []
    
    for i in range(len(data)):
        start = 0
        end = samples_per_window
        while end < nb_samples:
            window_data = {"is_annoted": False, "block_id":i}
            for k in range(nb_channels):
                channel_data = data[i][k, start:end]
                window_data[f"Channel_{k+1}"] = np.array(channel_data)
            window_data["Start_Time"] = start / sample_rate 
            window_data["End_Time"] = end / sample_rate  
            windows.append(window_data)
            start += samples_per_window - overlap_samples
            end = start + samples_per_window
    df = pd.DataFrame(windows)
    return df

In [6]:
def annotate_window(annotations, df):
    for i in range(len(annotations)):
        for event in annotations[i]:
            time_stamp = event/150
            for index, row in df.iterrows():
                if row["Start_Time"] <= time_stamp <= row["End_Time"] and row["block_id"] == i:
                    df.at[index, "is_annoted"] = True
                    
    return df

In [7]:
def window_creation(data):
    data = data[data['is_reannoted'] == True]
    data.reset_index(drop=True, inplace=True)
    nb_channels = data["meg"][0].shape[0]
    nb_samples = data["meg"][0].shape[1]
    sample_rate = 150 # Hz
    window_size = 0.2 # 200 ms
    overlap = 0.03 # 30 ms

    samples_per_window = int(window_size * sample_rate)
    overlap_samples = int (overlap * sample_rate)

    df = create_windows_dataframe(data["meg"], samples_per_window, nb_samples,nb_channels, sample_rate, overlap_samples)
    df = annotate_window(data["reannot"], df)
    return df

# Feature creation

In [8]:
def compute_window_ppa(window):
    return np.max(window) - np.min(window)

In [9]:
def compute_window_upslope(window):
    return np.max(np.diff(window))

In [10]:
def compute_window_downslope(window):
    return np.min(np.diff(window))

In [11]:
def compute_window_std(window):
    return np.std(window)

In [12]:
def compute_window_amplitude_ratio(window):
    return (np.max(window)-np.min(window))/np.mean(window)

In [13]:
def compute_window_sharpness(window):
    slopes = np.diff(window)
    return np.max(np.abs(slopes[1:]-slopes[:-1]))

In [14]:
def compute_window_average_slope(window):
    abs_slopes = np.abs(np.diff(window))
    return np.max((abs_slopes[:-1] + abs_slopes[1:]) / 2)

In [15]:
def compute_window_main_frequency(window):
    n = len(window)
    fft_result = fft(window)
    frequencies = fftfreq(n)
    amplitudes = np.abs(fft_result)
    peak_frequency_index = np.argmax(amplitudes)
    main_frequency = frequencies[peak_frequency_index]
    return main_frequency

In [16]:
list_of_features = ["ppa", "upslope", "downslope", "std", "amplitude_ratio", "sharpness", "average_slope", "main_frequency"]
compute_features = {"ppa" : compute_window_ppa, "upslope" : compute_window_upslope, "downslope": compute_window_downslope, "std": compute_window_std, 
                    "amplitude_ratio": compute_window_amplitude_ratio, "sharpness" : compute_window_sharpness, "average_slope": compute_window_average_slope, "main_frequency" : compute_window_main_frequency}

In [17]:
def create_features(df):
    channels = [col for col in df.columns if col.startswith("Channel_")]    
    data_to_df = {}
    for channel in channels:
        channel_features = {feat: [] for feat in list_of_features}
        for window in df[channel]:
            for feat in list_of_features:
                channel_features[feat].append(compute_features[feat](window))
        for feat in list_of_features:
            data_to_df[f'{feat}_{channel}'] = channel_features[feat]
    return pd.DataFrame(data_to_df)

# Pre-processing

In [18]:
from sklearn.preprocessing import  MinMaxScaler

In [19]:
def standardization(df):
    channels = [col for col in df.columns if col.startswith("Channel_")]    
    for channel in channels:
        channel_data = np.array(df[channel].tolist())
        samples_per_window = channel_data.shape[1]
        channel_data = channel_data.reshape(-1)
        mean = np.mean(channel_data)
        std = np.std(channel_data)
        standardized_data = (channel_data - mean) / std
        channel_data = standardized_data.reshape(-1, samples_per_window)
        df[channel] = channel_data.tolist()
    return df

In [20]:
def preprocessing(set):
    y = set['is_annoted']
    standard_scaler = StandardScaler()
    standardized_set = standardization(set)
    X = create_features(standardized_set)
    min_max_scaler = MinMaxScaler()
    X_scaled = min_max_scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled_df, y

# Models comparative

In [21]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
import xgboost as xgb
from tqdm import tqdm

In [22]:
models = {
    "SVM": SVC(),
    "SVM with Kernel": SVC(kernel='rbf'),
    "KNN": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(max_iter=1000),
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": xgb.XGBClassifier(),
    "RandomForest": RandomForestClassifier()
}


In [23]:
patient_score = {}
for patient in tqdm(patients): 
    print("Patient " + patient)
    data = pd.DataFrame(load_obj(f"{file_prefix}{patient}.pkl",data_path))
    patient_df = window_creation(data)
    train_set, test_set = train_test_split(patient_df, test_size=0.2)
    X_train, y_train = preprocessing(train_set)
    X_test, y_test = preprocessing(test_set)
    patient_score[patient] = {}
    for feature in list_of_features:
        print("Feature " +feature)
        columns = [col for col in X_train.columns if col.startswith(feature)]
        X_train_feature = X_train[columns]
        X_test_feature = X_test[columns]
        train_times = {}
        f1_scores = {}
        for name, model in models.items():
            start_time = time.time()
            model.fit(X_train_feature, y_train)
            train_times[name] = time.time() - start_time
            y_pred = model.predict(X_test_feature)
            f1_scores[name] = f1_score(y_test, y_pred, pos_label=1) 
        patient_score[patient][feature] = {"time": train_times, "f1_score" : f1_scores}

  0%|          | 0/5 [00:00<?, ?it/s]

Patient 42
Feature ppa
Feature upslope
Feature downslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature std
Feature amplitude_ratio
Feature sharpness
Feature average_slope
Feature main_frequency


 20%|██        | 1/5 [06:56<27:45, 416.35s/it]

Patient 83
Feature ppa
Feature upslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature downslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature std
Feature amplitude_ratio
Feature sharpness


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature average_slope
Feature main_frequency


 40%|████      | 2/5 [15:04<22:56, 458.86s/it]

Patient 4
Feature ppa
Feature upslope
Feature downslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature std
Feature amplitude_ratio
Feature sharpness
Feature average_slope
Feature main_frequency


 60%|██████    | 3/5 [20:53<13:37, 408.53s/it]

Patient 118
Feature ppa


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature upslope
Feature downslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature std
Feature amplitude_ratio
Feature sharpness
Feature average_slope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature main_frequency


 80%|████████  | 4/5 [29:11<07:23, 443.94s/it]

Patient 108
Feature ppa
Feature upslope
Feature downslope


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature std
Feature amplitude_ratio
Feature sharpness
Feature average_slope
Feature main_frequency


100%|██████████| 5/5 [35:30<00:00, 426.01s/it]


In [28]:
patient_score

{'42': {'ppa': {'time': {'SVM': 0.2385234832763672,
    'SVM with Kernel': 0.22693514823913574,
    'KNN': 0.0071239471435546875,
    'Neural Network': 4.058966636657715,
    'Logistic Regression': 0.059790611267089844,
    'AdaBoost': 6.329739332199097,
    'XGBoost': 1.35618257522583,
    'RandomForest': 4.079839706420898},
   'f1_score': {'SVM': 0.1875,
    'SVM with Kernel': 0.1875,
    'KNN': 0.46511627906976744,
    'Neural Network': 0.4615384615384615,
    'Logistic Regression': 0.39999999999999997,
    'AdaBoost': 0.3050847457627119,
    'XGBoost': 0.35555555555555557,
    'RandomForest': 0.3}},
  'upslope': {'time': {'SVM': 0.230987548828125,
    'SVM with Kernel': 0.23298931121826172,
    'KNN': 0.007934093475341797,
    'Neural Network': 22.043094158172607,
    'Logistic Regression': 0.06471538543701172,
    'AdaBoost': 6.283048152923584,
    'XGBoost': 1.7554266452789307,
    'RandomForest': 4.863475322723389},
   'f1_score': {'SVM': 0.35555555555555557,
    'SVM with Kerne

In [50]:
patient_df = {}
# Parcours du dictionnaire pour extraire les temps et f1_scores
for patient, features in patient_score.items():
    model_f1_scores = {}
    for feature, metrics in features.items():
        
        if feature != 'main_frequency':
            for model, f1_score in metrics['f1_score'].items():
                if model not in model_f1_scores:
                    model_f1_scores[model] = []
                model_f1_scores[model].append(f1_score)
    average_scores = {model: np.mean(f1_scores) for model, f1_scores in model_f1_scores.items()}
    patient_df[patient] = average_scores
    

# Calcul des moyennes

print(patient_df)

{'42': {'SVM': 0.16675685425685427, 'SVM with Kernel': 0.16675685425685427, 'KNN': 0.32361069973390405, 'Neural Network': 0.31475821475821475, 'Logistic Regression': 0.38740708331686374, 'AdaBoost': 0.3472386769326196, 'XGBoost': 0.3414502164502165, 'RandomForest': 0.34251677868997554}, '83': {'SVM': 0.0864139941690962, 'SVM with Kernel': 0.0864139941690962, 'KNN': 0.2257097677127531, 'Neural Network': 0.288361771910159, 'Logistic Regression': 0.2873567325051582, 'AdaBoost': 0.2798044217687075, 'XGBoost': 0.2252191476880838, 'RandomForest': 0.2978058199404073}, '4': {'SVM': 0.42515151515151517, 'SVM with Kernel': 0.42515151515151517, 'KNN': 0.4764957264957265, 'Neural Network': 0.604349842501103, 'Logistic Regression': 0.5132127132127132, 'AdaBoost': 0.5794895052037908, 'XGBoost': 0.5843940440897845, 'RandomForest': 0.5954370411167557}, '118': {'SVM': 0.4872102246644319, 'SVM with Kernel': 0.4872102246644319, 'KNN': 0.48035245436716195, 'Neural Network': 0.4442459898195264, 'Logistic R

In [51]:
average_data = {
    'Model': list(average_times.keys()),
    'Mean_F1 Score': list(average_f1_scores.values()),
    'Max_F1 Score': list(max_f1_scores.values()),
    'Min_F1 Score': list(min_f1_scores.values()),
    'Std_F1 Score': list(std_f1_scores.values())
    
}
df = pd.DataFrame(patient_df)

In [52]:
df

Unnamed: 0,42,83,4,118,108
SVM,0.166757,0.086414,0.425152,0.48721,0.040816
SVM with Kernel,0.166757,0.086414,0.425152,0.48721,0.040816
KNN,0.323611,0.22571,0.476496,0.480352,0.361955
Neural Network,0.314758,0.288362,0.60435,0.444246,0.351701
Logistic Regression,0.387407,0.287357,0.513213,0.542471,0.496041
AdaBoost,0.347239,0.279804,0.57949,0.499195,0.255885
XGBoost,0.34145,0.225219,0.584394,0.523746,0.23977
RandomForest,0.342517,0.297806,0.595437,0.55582,0.351631
