In [1]:
import os
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from imblearn.over_sampling import SMOTE



# Data extraction

In [2]:
# Build every location with os.path.join so the notebook also runs on systems
# whose path separator is not a backslash (the previous raw strings only
# resolved correctly on Windows).
cd = os.getcwd()
path_raw_data = os.path.join(cd, "data", "TiMeS_Raw_Data2023.xlsx")

# One imputed matrix per session, listed in the order T1, T2, T3, T4.
file_paths = [
    os.path.join(".", "data", "Raw_MissingDataImputed", f"TiMeS_matrix_mdImputed_all{session_name}.xlsx")
    for session_name in ("T1", "T2", "T3", "T4")
]

In [3]:
# Read one DataFrame per session file, keeping the order of file_paths.
dataframes = [pd.read_excel(excel_path) for excel_path in file_paths]


In [4]:
# Column names of the tests kept for labeling, grouped by domain.

# Motor
motor_tests = [
    "Fugl.Meyer_affected_TOTAL",
    "P.G_affected_FIST_mean",
    "B.B_blocks_affected_hand",
    "Purdue_affected_hand",
]

# Quality of life
qol_tests = ["mRS", "Barthel"]

# Attention
attention_tests = [
    "TAP_alert_without_warning_RT",
    "TAP_alert_with_warning_RT",
    "TAP_divided_attention_single_condition_Auditive_RT",
    "TAP_divided_attention_single_condition_Visual_RT",
    "TAP_divided_attention_both_condition_Auditive_RT",
    "TAP_divided_attention_both_condition_Visual_RT",
    "Bells_omissions_total.1",
    "CTM_A_time",
]

# Executive functions
executive_tests = [
    "Bi.manual_coordination_corrected",
    "FAB_TOT",
    "AST_unaffected_TOTAL",
    "CERAD_copy_TOTAL",
    "Stroop_interference_time",
    "Digit_sequencing_TOTAL",
    "Digit_backward_TOTAL",
    "Corsi_backward_TOTAL",
    "CTM_B_time",
]

# Memory
memory_tests = ["Corsi_forward_TOTAL", "Digit_forward_TOTAL"]

# Sensory
sensory_test = ["RASP_TOTAL_unaffected"]

# Language
Language_tests = [
    "Fluency_phon_final_score",
    "Fluency_sem_final_score",
    "LAST_TOTAL",
]

# Neglect
Neglect_tests = [
    "Line_bissec_20cm",
    "Line_bissec_.5cm",
    "Bells_omissions_L.R",
]

# Labeling according to T4 scores

In [5]:
# Patient identifier first, followed by every test column in group order.
columns_for_labeling = [
    "Patient",
    *motor_tests,
    *qol_tests,
    *attention_tests,
    *executive_tests,
    *memory_tests,
    *sensory_test,
    *Language_tests,
    *Neglect_tests,
]
print(len(columns_for_labeling))

33


In [6]:
# Restrict every session DataFrame to the patient id and the selected test columns.
filtered_dataframes = [session_df[columns_for_labeling] for session_df in dataframes]

In [7]:
# Threshold for each test
thresholds = {
    "Fugl.Meyer_affected_TOTAL": 50,
    "P.G_affected_FIST_mean": 18,
    "B.B_blocks_affected_hand": 40,
    "Purdue_affected_hand": 12,
    "mRS": 1,
    "Barthel": 90, 
    "TAP_alert_without_warning_RT": 400,
    "TAP_alert_with_warning_RT" : 300,
    "TAP_divided_attention_single_condition_Auditive_RT" : 450, 
    "TAP_divided_attention_single_condition_Visual_RT" : 400, 
    "TAP_divided_attention_both_condition_Auditive_RT" : 550,
    "TAP_divided_attention_both_condition_Visual_RT" : 500,
    "Bells_omissions_total.1" : 6,
    "CTM_A_time": 60,
    "Bi.manual_coordination_corrected": 85,
    "FAB_TOT" : 16, 
    "AST_unaffected_TOTAL" : 15, 
    "CERAD_copy_TOTAL" : 9,
    "Stroop_interference_time": 90,
    "Digit_sequencing_TOTAL" : 6,
    "Digit_backward_TOTAL" : 4,
    "Corsi_backward_TOTAL" : 4,
    "CTM_B_time" : 120,
    "Corsi_forward_TOTAL" : 5,
    "Digit_forward_TOTAL" : 6,
    "RASP_TOTAL_unaffected" : 60,
    "Fluency_phon_final_score" : 15,
    "Fluency_sem_final_score" : 20,
    "LAST_TOTAL" : 40,
    "Line_bissec_20cm" : 2,
    "Line_bissec_.5cm" : 1,
    "Bells_omissions_L.R" : 2
    
}

# Create a dataframe with the "Patient" column
labels = pd.merge(
    filtered_dataframes[3],  
    filtered_dataframes[2], 
    on="Patient",           
    how="outer", 
    suffixes=('_T4', '_T3')
)       

# Calculate the labeling for each test
for col, threshold in thresholds.items():
    # Generate column names for T4 and T3
    t4_col = f"{col}_T4"  # Column from T4 after merge
    t3_col = f"{col}_T3"  # Column from T3 after merge
    
    # Check if the column exists in the merged DataFrame
    if t4_col in labels.columns:
        # Take T4 values, fallback to T3 values if T4 is NaN
        values = labels[t4_col].fillna(labels[t3_col])
        labels[t4_col] = (values >= threshold).astype(int)
        if t3_col in labels.columns:
            labels.drop(t3_col, axis=1, inplace=True)

        #labels.drop(t3_col, axis=1, errors='ignore')
    else:
        # If T4 column doesn't exist, use T3 values directly
        values = labels[t3_col]
        labels[t3_col] = (values >= threshold).astype(int)
        labels.drop(t4_col)

    # verify if the value is higher or lower the threshold
    #labels[col] = (values >= threshold).astype(int)

# Apply a majority voting
labels["Recovered"] = labels.iloc[:, 2:].sum(axis=1) >= (len(thresholds) / 2)

# Convert the labels in 1 = Recovered, 0 = not recovered
labels["Recovered"] = labels["Recovered"].astype(int)


In [8]:
# Keep only the patient id and the final label, then report the class balance.
labels = labels.loc[:, ["Patient", "Recovered"]]
print("Number of recovered patients: ", int(labels["Recovered"].eq(1).sum()))
print("Number of unrecovered patients: ", int(labels["Recovered"].eq(0).sum()))


Number of recovered patients:  54
Number of unrecovered patients:  7


In [9]:
# Persist the patient labels without the DataFrame index.
labels.to_csv(path_or_buf="./data/labels.csv", index=False)


In [10]:
# Merge all sessions into one wide dataframe (one row per patient).
sessions = ["T1", "T2", "T3", "T4"]
merged_df = None

for session, df in zip(sessions, filtered_dataframes):
    # Tag every test column with its session; "Patient" stays untouched as the join key.
    df = df.rename(columns=lambda name: name if name == "Patient" else f"{name}_{session}")

    # The first session seeds the result; later sessions are outer-joined onto it.
    merged_df = df if merged_df is None else merged_df.merge(df, on="Patient", how="outer")

# Attach the "Recovered" label; the inner join keeps only patients present in both frames.
labeled_merged_df = labels.merge(merged_df, on="Patient", how="inner")

# Show the resulting dataframe
labeled_merged_df.head()


Unnamed: 0,Patient,Recovered,Fugl.Meyer_affected_TOTAL_T1,P.G_affected_FIST_mean_T1,B.B_blocks_affected_hand_T1,Purdue_affected_hand_T1,mRS_T1,Barthel_T1,TAP_alert_without_warning_RT_T1,TAP_alert_with_warning_RT_T1,...,CTM_B_time_T4,Corsi_forward_TOTAL_T4,Digit_forward_TOTAL_T4,RASP_TOTAL_unaffected_T4,Fluency_phon_final_score_T4,Fluency_sem_final_score_T4,LAST_TOTAL_T4,Line_bissec_20cm_T4,Line_bissec_.5cm_T4,Bells_omissions_L.R_T4
0,P001,0,58.0,45.333333,48.0,10.0,0.0,100.0,245.0,220.0,...,78.85,9.0,9.0,179.0,13.0,17.0,15.0,0.5,1.25,0.0
1,P002,1,57.0,15.0,39.0,5.0,1.0,100.0,524.0,408.0,...,74.71,7.0,11.0,175.0,13.0,31.0,15.0,5.0,-1.0,-3.0
2,P003,1,60.0,34.0,51.0,11.0,0.0,100.0,345.0,480.0,...,80.56,8.0,12.0,178.0,15.0,20.0,15.0,-2.0,-1.5,1.0
3,P004,1,47.0,17.666667,36.0,0.0,1.0,95.0,285.0,281.0,...,202.6,9.0,9.0,173.0,16.0,16.0,15.0,1.5,0.0,0.0
4,P006,1,54.0,26.333333,36.0,5.0,2.0,90.0,363.0,332.0,...,164.06,5.0,8.0,165.0,19.0,16.0,15.0,-12.0,-2.75,-2.0


# Dynamic Neural Networks

In [11]:
# Function to generate embeddings using Random Forest
def generate_random_forest_embeddings(features, labels, sessions, n_estimators=10):
    """Build one leaf-index embedding per session with a random forest.

    For each session, the columns whose name ends with ``_<session>`` are
    selected, a ``RandomForestRegressor`` is fitted on them against ``labels``,
    and ``rf.apply`` is used as the embedding of every row.

    Parameters
    ----------
    features : pandas.DataFrame
        Wide feature table whose column names end with ``_<session>``.
    labels : array-like
        Target passed to ``RandomForestRegressor.fit``; one value per row.
    sessions : iterable of str
        Session tags (for example ``["T1", "T2"]``). Duplicates are ignored.
    n_estimators : int, default 10
        Number of trees per forest.

    Returns
    -------
    numpy.ndarray
        The per-session ``rf.apply`` outputs stacked along axis 1.

    Raises
    ------
    ValueError
        If no column of ``features`` belongs to one of the sessions.

    Notes
    -----
    The forests are fitted on every row passed in, together with its label.
    If the caller later splits the embeddings into train and test sets, the
    test rows have already been seen (with their labels) by the forests.
    """
    embeddings = []

    # dict.fromkeys drops duplicated session tags while keeping their order.
    for session in dict.fromkeys(sessions):
        # Match the "_<session>" suffix exactly; a plain substring test could
        # assign a column to the wrong session when the tag also appears
        # elsewhere in the column name.
        suffix = f"_{session}"
        session_columns = [col for col in features.columns if col.endswith(suffix)]
        if not session_columns:
            raise ValueError(f"No feature columns found for session '{session}'")

        session_data = features[session_columns].values

        # Fixed seed so the forests are reproducible between runs.
        rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
        rf.fit(session_data, labels)
        embeddings.append(rf.apply(session_data))

    return np.stack(embeddings, axis=1)

In [12]:
# Function to train and evaluate an RNN model
def train_rnn_model(embeddings, labels, hidden_size=16, num_layers=2, num_epochs=50, test_size=0.2):
    """Train an LSTM binary classifier on the embeddings and print test metrics.

    The data is split once with ``train_test_split`` (``random_state=42``), the
    model is trained full-batch with Adam (lr 1e-3) and binary cross-entropy,
    and every 10th epoch the loss and the accuracy, F1, precision and recall on
    the test split are printed.

    Parameters
    ----------
    embeddings : array-like
        3-D input; the size of its last axis is used as the LSTM input size.
    labels : array-like
        One binary label per row of ``embeddings``. Pandas Series, NumPy arrays
        and plain sequences are all accepted.
    hidden_size : int, default 16
        Hidden size of the LSTM.
    num_layers : int, default 2
        Number of stacked LSTM layers.
    num_epochs : int, default 50
        Number of training epochs.
    test_size : float, default 0.2
        Value forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are only printed.
    """
    X = torch.tensor(embeddings, dtype=torch.float32)
    # np.asarray (instead of ``labels.values``) lets callers pass any array-like,
    # in line with train_Hybrid_lstm_model which also accepts plain arrays.
    y = torch.tensor(np.asarray(labels, dtype=np.float32)).unsqueeze(1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    class RNNClassifier(nn.Module):
        """LSTM followed by a linear layer and a sigmoid."""

        def __init__(self, input_size, hidden_size, num_layers, output_size=1):
            super(RNNClassifier, self).__init__()
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            # hn[-1] is the final hidden state of the last LSTM layer.
            _, (hn, _) = self.rnn(x)
            out = self.fc(hn[-1])
            return self.sigmoid(out)

    input_size = X.shape[2]
    model = RNNClassifier(input_size, hidden_size, num_layers)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        # One full-batch gradient step per epoch.
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            model.eval()
            with torch.no_grad():
                y_pred_proba = model(X_test)
                # Probabilities of at least 0.5 are predicted as class 1.
                y_pred = (y_pred_proba >= 0.5).float()

                accuracy = accuracy_score(y_test.numpy(), y_pred.numpy())
                f1 = f1_score(y_test.numpy(), y_pred.numpy(), zero_division=0)
                precision = precision_score(y_test.numpy(), y_pred.numpy(), zero_division=0)
                recall = recall_score(y_test.numpy(), y_pred.numpy(), zero_division=0)

            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")
            print(f"Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


In [13]:
def train_Hybrid_lstm_model(X, y, n_time_points, n_epochs=50, hidden_size=16, num_layers=1, test_size=0.2):
    """Train an LSTM binary classifier on ``X`` and print test metrics.

    The data is split once with ``train_test_split`` (``random_state=42``), the
    model is trained full-batch with Adam (lr 1e-3) and binary cross-entropy,
    and every 10th epoch the loss and the accuracy, F1, precision and recall on
    the test split are printed.

    Parameters
    ----------
    X : array-like
        3-D input; the size of its last axis is used as the LSTM input size.
    y : array-like
        One binary label per row of ``X``.
    n_time_points : int
        Kept for interface compatibility; it is not used by the function.
    n_epochs : int, default 50
        Number of training epochs.
    hidden_size : int, default 16
        Hidden size of the LSTM.
    num_layers : int, default 1
        Number of stacked LSTM layers.
    test_size : float, default 0.2
        Value forwarded to ``train_test_split``.

    Returns
    -------
    None
        Results are only printed.
    """

    # The model class is defined locally because it is only used here.
    class HybridLSTM(nn.Module):
        """LSTM followed by a linear layer and a sigmoid."""

        def __init__(self, input_size, hidden_size, num_layers):
            super(HybridLSTM, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
            self.fc = nn.Linear(hidden_size, 1)  # Binary output
            self.sigmoid = nn.Sigmoid()  # Activation function for binary classification

        def forward(self, x):
            lstm_out, _ = self.lstm(x)  # lstm_out: (batch_size, time_points, hidden_size)
            output = self.fc(lstm_out[:, -1, :])  # Last temporal output
            return self.sigmoid(output)

    # Unpacking three values also checks that X is 3-dimensional.
    _, _, n_features_per_time_point = X.shape

    # Convert data to PyTorch tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)  # Add dimension for binary output

    # Divide data into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=test_size, random_state=42)

    # Initialize the LSTM model
    model = HybridLSTM(n_features_per_time_point, hidden_size, num_layers)
    criterion = nn.BCELoss()  # Binary Cross Entropy Loss
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Training loop: one full-batch gradient step per epoch
    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        # Metrics are only reported every 10th epoch, so the test-set evaluation
        # is run only then instead of on every epoch.
        if (epoch + 1) % 10 == 0:
            model.eval()
            with torch.no_grad():
                y_pred_proba = model(X_test)
                # Probabilities of at least 0.5 are predicted as class 1.
                y_pred = (y_pred_proba >= 0.5).float()

                # Calculate metrics
                accuracy = accuracy_score(y_test.numpy(), y_pred.numpy())
                f1 = f1_score(y_test.numpy(), y_pred.numpy(), zero_division=0)
                precision = precision_score(y_test.numpy(), y_pred.numpy(), zero_division=0)
                recall = recall_score(y_test.numpy(), y_pred.numpy(), zero_division=0)

            print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")
            print(f"Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

In [14]:
# Preprocessing
# Keep only the test columns and replace any remaining NaN (for example from the
# outer merge of the sessions) with 0.
features = labeled_merged_df.drop(["Patient", "Recovered"], axis=1).fillna(0)

# NOTE(review): the scaler is fitted on every patient, before any train/test split.
scaler = StandardScaler()
scaler.fit(features)
standardized_features = pd.DataFrame(scaler.transform(features), columns=features.columns)

sessions = ["T1", "T2", "T3", "T4"]


In [15]:
# Baseline run on the original (imbalanced) data.
embeddings_no_smote = generate_random_forest_embeddings(
    features=standardized_features,
    labels=labeled_merged_df["Recovered"],
    sessions=sessions,
)
print("RNN model - Training and evaluation WITHOUT SMOTE:")
train_rnn_model(embeddings=embeddings_no_smote, labels=labeled_merged_df["Recovered"])

RNN model - Training and evaluation WITHOUT SMOTE:
Epoch [10/50], Loss: 0.6905
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch [20/50], Loss: 0.6452
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch [30/50], Loss: 0.5873
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch [40/50], Loss: 0.5111
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch [50/50], Loss: 0.4340
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000


In [16]:
# The embeddings are already shaped (n_patients, n_time_points, n_features_per_time_point).
X = embeddings_no_smote
y = labeled_merged_df["Recovered"].to_numpy()

# Train and evaluate the LSTM on the original (imbalanced) data.
print("Hybrid LSTM model - Training and evaluation WITHOUT SMOTE:")
train_Hybrid_lstm_model(
    X,
    y,
    n_time_points=4,
    n_epochs=50,
    hidden_size=16,
    num_layers=1,
    test_size=0.2,
)


Hybrid LSTM model - Training and evaluation WITHOUT SMOTE:
Epoch 10, Loss: 0.6835
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch 20, Loss: 0.6066
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch 30, Loss: 0.5219
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch 40, Loss: 0.4457
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000
Epoch 50, Loss: 0.3956
Accuracy: 0.9231, F1-Score: 0.9600, Precision: 0.9231, Recall: 1.0000


### Add SMOTE to balance our dataset

In [17]:
# Oversample the minority class on the standardized features.
# NOTE(review): SMOTE is applied to the whole dataset here, and train_rnn_model only
# splits into train/test afterwards, so synthetic rows can end up in the test split.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(standardized_features, labeled_merged_df["Recovered"])
X_resampled = pd.DataFrame(X_resampled, columns=standardized_features.columns)

embeddings_with_smote = generate_random_forest_embeddings(X_resampled, y_resampled, sessions)
# This run uses the resampled data, so the banner must say WITH SMOTE.
print("RNN model - Training and evaluation WITH SMOTE:")
train_rnn_model(embeddings_with_smote, y_resampled)

RNN model - Training and evaluation WITHOUT SMOTE:
Epoch [10/50], Loss: 0.6962
Accuracy: 0.5455, F1-Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch [20/50], Loss: 0.6878
Accuracy: 0.5455, F1-Score: 0.0000, Precision: 0.0000, Recall: 0.0000
Epoch [30/50], Loss: 0.6766
Accuracy: 0.9091, F1-Score: 0.8889, Precision: 1.0000, Recall: 0.8000
Epoch [40/50], Loss: 0.6584
Accuracy: 0.7727, F1-Score: 0.8000, Precision: 0.6667, Recall: 1.0000
Epoch [50/50], Loss: 0.6256
Accuracy: 1.0000, F1-Score: 1.0000, Precision: 1.0000, Recall: 1.0000


In [18]:
# Read the number of time points from the embeddings instead of hard-coding it.
n_time_points = embeddings_with_smote.shape[1]

X_flattened = embeddings_with_smote.reshape(embeddings_with_smote.shape[0], -1)  # Flatten to 2D for SMOTE
# SMOTE takes a plain 1-D target; train_Hybrid_lstm_model does its own tensor
# conversion, so no torch tensor (or extra dimension) is needed here.
y = np.asarray(y_resampled)

# Apply SMOTE
# NOTE(review): embeddings_with_smote was built from data that an earlier SMOTE pass
# already oversampled, so this second pass presumably adds no rows — confirm whether
# it is still needed.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_flattened, y)

# Reshape the resampled data back to (n_patients, n_time_points, n_features_per_time_point)
n_patients_resampled = X_resampled.shape[0]
n_features_per_time_point = X_resampled.shape[1] // n_time_points
X_resampled_reshaped = X_resampled.reshape(n_patients_resampled, n_time_points, n_features_per_time_point)

# This run uses the resampled data, so the banner must say WITH SMOTE.
print("Hybrid LSTM model - Training and evaluation WITH SMOTE:")
train_Hybrid_lstm_model(X_resampled_reshaped, y_resampled, n_time_points=n_time_points, n_epochs=50, hidden_size=16, num_layers=1, test_size=0.2)

Hybrid LSTM model - Training and evaluation WITHOUT SMOTE:
Epoch 10, Loss: 0.6434
Accuracy: 0.6818, F1-Score: 0.5333, Precision: 0.8000, Recall: 0.4000
Epoch 20, Loss: 0.6144
Accuracy: 0.7273, F1-Score: 0.6250, Precision: 0.8333, Recall: 0.5000
Epoch 30, Loss: 0.5816
Accuracy: 0.7727, F1-Score: 0.7059, Precision: 0.8571, Recall: 0.6000
Epoch 40, Loss: 0.5473
Accuracy: 0.8182, F1-Score: 0.7778, Precision: 0.8750, Recall: 0.7000
Epoch 50, Loss: 0.5030
Accuracy: 0.8636, F1-Score: 0.8235, Precision: 1.0000, Recall: 0.7000
