In [1]:
import os
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold


In [2]:
cd = os.getcwd()
path_raw_data = os.path.join(cd, r"data\TiMeS_Raw_Data2023.xlsx")
file_paths = [
    r".\data\Raw_MissingDataImputed\TiMeS_matrix_mdImputed_allT1.xlsx",
    r".\data\Raw_MissingDataImputed\TiMeS_matrix_mdImputed_allT2.xlsx",
    r".\data\Raw_MissingDataImputed\TiMeS_matrix_mdImputed_allT3.xlsx",
    r".\data\Raw_MissingDataImputed\TiMeS_matrix_mdImputed_allT4.xlsx",
]

In [3]:
dataframes = []
for path in file_paths:
    df = pd.read_excel(path) 
    dataframes.append(df) 


In [4]:
motor_tests = [
    "Fugl.Meyer_affected_TOTAL",
    "P.G_affected_FIST_mean",
    "B.B_blocks_affected_hand",
    "Purdue_affected_hand"
]

qol_tests = [
    "mRS",
    "Barthel"
]

attention_tests = [    
    "TAP_alert_without_warning_RT",
    "TAP_alert_with_warning_RT",
    "TAP_divided_attention_single_condition_Auditive_RT", 
    "TAP_divided_attention_single_condition_Visual_RT", 
    "TAP_divided_attention_both_condition_Auditive_RT",
    "TAP_divided_attention_both_condition_Visual_RT",
    "Bells_omissions_total.1",
    "CTM_A_time"
]

executive_tests = [
    "Bi.manual_coordination_corrected",
    "FAB_TOT", 
    "AST_unaffected_TOTAL", 
    "CERAD_copy_TOTAL",
    "Stroop_interference_time",
    "Digit_sequencing_TOTAL",
    "Digit_backward_TOTAL",
    "Corsi_backward_TOTAL",
    "CTM_B_time"
]

memory_tests = [
    "Corsi_forward_TOTAL",
    "Digit_forward_TOTAL"
]

sensory_test = [
    "RASP_TOTAL_unaffected"
]

Language_tests = [
    "Fluency_phon_final_score",
    "Fluency_sem_final_score",
    "LAST_TOTAL"
]

Neglect_tests = [
    "Line_bissec_20cm",
    "Line_bissec_.5cm",
    "Bells_omissions_L.R"
]

In [5]:
columns_for_labelisation = ["Patient"] + motor_tests + qol_tests + attention_tests + executive_tests + memory_tests + sensory_test + Language_tests + Neglect_tests
  
print(len(columns_for_labelisation))

33


In [6]:
filtered_dataframes = []
for df in dataframes:
    filtered_df = df[columns_for_labelisation] 
    filtered_dataframes.append(filtered_df) 

In [7]:
filtered_dataframes[3].head()

Unnamed: 0,Patient,Fugl.Meyer_affected_TOTAL,P.G_affected_FIST_mean,B.B_blocks_affected_hand,Purdue_affected_hand,mRS,Barthel,TAP_alert_without_warning_RT,TAP_alert_with_warning_RT,TAP_divided_attention_single_condition_Auditive_RT,...,CTM_B_time,Corsi_forward_TOTAL,Digit_forward_TOTAL,RASP_TOTAL_unaffected,Fluency_phon_final_score,Fluency_sem_final_score,LAST_TOTAL,Line_bissec_20cm,Line_bissec_.5cm,Bells_omissions_L.R
0,P001,59,50.0,52,11,0,100,274,221,506,...,78.85,9,9,179.0,13,17,15,0.5,1.25,0
1,P002,57,21.333333,46,12,1,100,534,586,726,...,74.71,7,11,175.0,13,31,15,5.0,-1.0,-3
2,P003,59,39.333333,57,11,0,100,227,245,397,...,80.56,8,12,178.0,15,20,15,-2.0,-1.5,1
3,P004,27,15.333333,36,4,1,100,300,276,668,...,202.6,9,9,173.0,16,16,15,1.5,0.0,0
4,P006,53,16.666667,45,3,1,100,277,460,507,...,164.06,5,8,165.0,19,16,15,-12.0,-2.75,-2


In [8]:
import numpy as np

# Exemple de seuils pour chaque colonne
thresholds = {
    "Fugl.Meyer_affected_TOTAL": 50,
    "P.G_affected_FIST_mean": 18,  # Normalement 20 pour hommes, 15 pour femmes
    "B.B_blocks_affected_hand": 40,
    "Purdue_affected_hand": 12,
    "mRS": 1,
    "Barthel": 90, 
    "TAP_alert_without_warning_RT": 400,
    "TAP_alert_with_warning_RT" : 300,
    "TAP_divided_attention_single_condition_Auditive_RT" : 450, 
    "TAP_divided_attention_single_condition_Visual_RT" : 400, 
    "TAP_divided_attention_both_condition_Auditive_RT" : 550,
    "TAP_divided_attention_both_condition_Visual_RT" : 500,
    "Bells_omissions_total.1" : 6,
    "CTM_A_time": 60,
    "Bi.manual_coordination_corrected": 85,
    "FAB_TOT" : 16, 
    "AST_unaffected_TOTAL" : 15, 
    "CERAD_copy_TOTAL" : 9,
    "Stroop_interference_time": 90,
    "Digit_sequencing_TOTAL" : 6,
    "Digit_backward_TOTAL" : 4,
    "Corsi_backward_TOTAL" : 4,
    "CTM_B_time" : 120,
    "Corsi_forward_TOTAL" : 5,
    "Digit_forward_TOTAL" : 6,
    "RASP_TOTAL_unaffected" : 60,
    "Fluency_phon_final_score" : 15,
    "Fluency_sem_final_score" : 20,
    "LAST_TOTAL" : 40,
    "Line_bissec_20cm" : 2,
    "Line_bissec_.5cm" : 1,
    "Bells_omissions_L.R" : 2
    
}

# Étape 1 : Créer un DataFrame pour les labels
labels = filtered_dataframes[3][["Patient"]].copy()  # Garder uniquement la colonne "Patient"


# Étape 2 : Calculer les votes
for col, threshold in thresholds.items():

    # Vérifier si la colonne existe dans T4
    if col in filtered_dataframes[3].columns:
        # Si une valeur est NaN dans T4, prendre celle de T3
        t4_values = filtered_dataframes[3][col].fillna(filtered_dataframes[2].get(col, np.nan))
    else:
        # Si la colonne n'existe pas dans T4, utiliser directement T3
        t4_values = filtered_dataframes[2].get(col, np.nan)

    # Vérifier les seuils (1 si supérieur ou égal, 0 sinon)
    labels[col] = (t4_values >= threshold).astype(int)

# Étape 3 : Appliquer le vote à la majorité
labels["Recovered"] = labels.iloc[:, 2:].sum(axis=1) >= (len(thresholds) / 2)

# Convertir les labels en 0 ou 1
labels["Recovered"] = labels["Recovered"].astype(int)

# Résultat final
labels.head()


Unnamed: 0,Patient,Fugl.Meyer_affected_TOTAL,P.G_affected_FIST_mean,B.B_blocks_affected_hand,Purdue_affected_hand,mRS,Barthel,TAP_alert_without_warning_RT,TAP_alert_with_warning_RT,TAP_divided_attention_single_condition_Auditive_RT,...,Corsi_forward_TOTAL,Digit_forward_TOTAL,RASP_TOTAL_unaffected,Fluency_phon_final_score,Fluency_sem_final_score,LAST_TOTAL,Line_bissec_20cm,Line_bissec_.5cm,Bells_omissions_L.R,Recovered
0,P001,1,1,1,0,0,1,0,0,1,...,1,1,1,0,0,0,0,1,0,0
1,P002,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,0,1,0,0,1
2,P003,1,1,1,0,0,1,0,0,0,...,1,1,1,1,1,0,0,0,0,1
3,P004,0,0,0,0,1,1,0,0,1,...,1,1,1,1,0,0,0,0,0,1
4,P006,1,0,1,0,1,1,0,1,1,...,1,1,1,1,0,0,0,0,0,1


In [9]:
labels = labels[["Patient", "Recovered"]]
print(len(labels[labels["Recovered"]==1]))
print(len(labels[labels["Recovered"]==0]))

40
6


In [10]:
labels.head(46)

Unnamed: 0,Patient,Recovered
0,P001,0
1,P002,1
2,P003,1
3,P004,1
4,P006,1
5,P008,1
6,P010,1
7,P011,1
8,P013,1
9,P014,1
