In [3]:
from google.colab import files

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import pandas as pd
import re
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.initializers import Constant
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.losses import mean_squared_error

In [5]:
%cd /content/drive/MyDrive/703_Project/LSTM/

/content/drive/MyDrive/703_Project/LSTM


**Create combined csv files**





In [None]:
def process_folder(main_folder_path, group_label):
    """ Traite tous les fichiers CSV dans un dossier et retourne un DataFrame fusionné. """
    frames = []  # Liste pour stocker les DataFrames de chaque fichier
    for subdir, dirs, files in os.walk(main_folder_path):
      for filename in files:
        if filename.endswith('.csv'):
            file_path = os.path.join(subdir, filename)
            df = pd.read_csv(file_path)
            df['conversation_id'] = filename.split('.')[0]  # Utilisez le nom du fichier comme ID
            df['group'] = group_label
            df = df[['text', 'conversation_id', 'group']]
            frames.append(df)
    return pd.concat(frames)

# Chemins des dossiers pour les fichiers CSV des groupes contrôle et patient
control_main_folder_path = "Control_csv_data_clean"
patient_main_folder_path = "Patient_csv_data_clean"

# Traitement des dossiers et fusion des DataFrames
control_df = process_folder(control_main_folder_path, 'control')
patient_df = process_folder(patient_main_folder_path, 'patient')
combined_df = pd.concat([control_df, patient_df])

# Enregistrement du DataFrame fusionné dans un nouveau fichier CSV
combined_df.to_csv('combined_control_patient_data.csv', index=False)

**Step 1 : data split and preprocessing**

In [86]:
# Charger les données
data = pd.read_csv('combined_control_patient_data.csv')
data_df = pd.DataFrame(data)

# Fonction de nettoyage des données textuelles
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9']", " ", text)  # Supprimer les caractères non alphanumériques
    text = re.sub(r"\s+", " ", text)  # Supprimer les espaces supplémentaires
    return text

# Nettoyage du texte
data_df['text'] = data_df['text'].apply(clean_text)
#print(data_df.head(10))

# Fonction pour calculer le score de sentiment
def sentiment_score(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Retourne un score entre -1 et 1

# Appliquer la fonction à chaque réponse
data_df['sentiment'] = data_df['text'].apply(sentiment_score)

# Ici, 'sentiment' est la colonne des labels que nous venons de créer
y = data_df['sentiment'].values

# Division des données tout en conservant les identifiants de conversation
X_train, X_temp, y_train, y_temp, ids_train, ids_temp = train_test_split(data_df.drop(columns=['sentiment']), data_df['sentiment'], data_df['conversation_id'], test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test, ids_val, ids_test = train_test_split(X_temp, y_temp, ids_temp, test_size=0.5, random_state=42)

# Préparation de la tokenisation
tokenizer = Tokenizer(num_words=12400)  # Limite à 12400 mots
tokenizer.fit_on_texts(data_df['text'])

sequences = tokenizer.texts_to_sequences(data_df['text'])


# Tokenization et Padding des ensembles
max_seq_length = 15  # Ajuster selon la longueur désirée
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train['text']), maxlen=max_seq_length)
X_val_padded = pad_sequences(tokenizer.texts_to_sequences(X_val['text']), maxlen=max_seq_length)
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test['text']), maxlen=max_seq_length)

In [None]:
"""
data_sample_df['text'] = data_sample_df['text'].apply(clean_text)
sample_sequences = tokenizer.texts_to_sequences(data_sample_df['text'])
sample_padded = pad_sequences(sample_sequences, maxlen=max_seq_length)
"""

In [None]:
"""
# examiner les séquences tokenisées et les longueurs de séquence
for i in range(5):
    print(f"Texte original: {data['text'].iloc[i]}")
    print(f"Séquence tokenisée: {sequences[i]}")
    print(f"Longueur de la séquence: {len(sequences[i])}\n")
"""

**How to get num_words and max_seq_length**

In [7]:
# Tokenizer sans limiter le nombre de mots
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_df['text'])

# Nombre total de mots uniques
word_count = len(tokenizer.word_index)
print(f"Nombre total de mots uniques dans le jeu de données : {word_count}")

Nombre total de mots uniques dans le jeu de données : 12385


In [8]:
text_lengths = [len(text.split()) for text in data_df['text']]
print(f"Moyenne: {np.mean(text_lengths)}")
print(f"Médiane: {np.median(text_lengths)}")
print(f"Percentile 90: {np.percentile(text_lengths, 90)}")

Moyenne: 7.460360360360361
Médiane: 6.0
Percentile 90: 15.0


**Verifications**

In [9]:
# Vérification des dimensions pour l'entraînement du modèle

print(f"Dimensions de X_train_padded: {X_train_padded.shape}")
print(f"Dimensions de y_train: {y_train.shape}")
print(f"Dimensions de X_test_padded: {X_test_padded.shape}")
print(f"Dimensions de y_test: {y_test.shape}")
print(f"Dimensions de X_val_padded: {X_val_padded.shape}")
print(f"Dimensions de y_val: {y_val.shape}")

Dimensions de X_train_padded: (63048, 15)
Dimensions de y_train: (63048,)
Dimensions de X_test_padded: (7881, 15)
Dimensions de y_test: (7881,)
Dimensions de X_val_padded: (7881, 15)
Dimensions de y_val: (7881,)


In [10]:
print(f"Dimensions de ids_train: {ids_train.shape}")
print(f"Dimensions de ids_val: {ids_val.shape}")
print(f"Dimensions de ids_test: {ids_test.shape}")

Dimensions de ids_train: (63048,)
Dimensions de ids_val: (7881,)
Dimensions de ids_test: (7881,)


**Step 2 : Word Embeddings preparation**

In [87]:
# Chargement des embeddings GloVe
EMBEDDING_DIM = 100  # Par exemple, pour GloVe 6B avec des vecteurs de 100 dimensions
embedding_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Préparation de la matrice d'embedding
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Création de la couche d'embedding en utilisant la matrice d'embedding
embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_seq_length,
                            trainable=False)

In [12]:
#contrôler la matrice d'embedding
print("Dimensions de la matrice d'embedding:", embedding_matrix.shape)

Dimensions de la matrice d'embedding: (12386, 100)


**Verifications**

In [None]:
# Tokenizer Vocabulary
for word, index in tokenizer.word_index.items():
    print(f"Mot : {word}, Index : {index}")
    if index == 100:  # Afficher les 10 premiers mots
        break


In [None]:
# Correspondence in the Embedding Matrix
for word, index in tokenizer.word_index.items():
    if word in embedding_index:
        print(f"Le mot '{word}' est dans GloVe.")
    else:
        print(f"Le mot '{word}' n'est pas dans GloVe.")
    if index == 100:  # Vérifier pour les 10 premiers mots
        break


In [None]:
# Coverage %
total_mots = len(tokenizer.word_index)
mots_dans_glove = sum(1 for word in tokenizer.word_index if word in embedding_index)
pourcentage_couverture = (mots_dans_glove / total_mots) * 100
print(f"Pourcentage de mots du tokeniseur couverts par GloVe : {pourcentage_couverture}%")


**Step 3 : LSTM model**

In [88]:
# Création du modèle LSTM
model = Sequential()
model.add(embedding_layer)  # Utilisez la couche d'embedding préparée précédemment
model.add(LSTM(64, return_sequences=False))  # return_sequences=False pour la dernière couche LSTM
model.add(Dropout(0.5))
model.add(Dense(1, activation='tanh'))  # Score de sentiment entre -1 et 1

model.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
"""
grouped = data.groupby('conversation_id')  # Groupez par ID de conversation
X = []

for _, group in grouped:
    # Tokenisez et paddez chaque réponse dans la conversation
    sequences = tokenizer.texts_to_sequences(group['text'])
    padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)
    X.append(padded_sequences)

y = []

for _, group in grouped:
    sentiment_scores = group['sentiment'].values  # Scores de sentiment pour la conversation
    y.append(sentiment_scores)

print(y)
"""

**Step 4 : Model training**

In [76]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Entraînement du modèle avec Early Stopping
model.fit(X_train_padded, y_train, epochs=3, batch_size=32, validation_data=(X_val_padded, y_val), callbacks=[early_stopping])


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x78fb66707790>

**Step 5 : Results analysis**

In [89]:
# Évaluation du Modèle sur l'Ensemble de Test
test_loss = model.evaluate(X_test_padded, y_test)
print(f"Perte sur l'ensemble de test: {test_loss}")


Perte sur l'ensemble de test: 0.09841075539588928


In [90]:
# Prédiction des Sentiments sur l'Ensemble de Test
pred_sentiment = model.predict(X_test_padded)



In [103]:
# Créer un DataFrame pour associer les prédictions aux conversations
test_df = pd.DataFrame({
    'text' : X_test['text'],
    'conversation_id': ids_test,
    'group' : X_test['group'],
    'predicted_sentiment': pred_sentiment.flatten()
})

# Grouper par conversation_id
conversations = test_df.groupby('conversation_id')
evolution_sentiments = {}

for conversation_id, group in conversations:
    sentiment_debut = group['predicted_sentiment'].iloc[0]
    sentiment_fin = group['predicted_sentiment'].iloc[-1]
    evolution_sentiments[conversation_id] = sentiment_fin - sentiment_debut

In [104]:
test_df.columns

Index(['text', 'conversation_id', 'group', 'predicted_sentiment'], dtype='object')

In [105]:
test_df.to_csv('test_df.csv', index="False")

In [93]:
evolutions_patient = [evolution_sentiments[cid] for cid in conversations.indices if data_df[data_df['conversation_id'] == cid]['group'].iloc[0] == 'patient']
evolutions_control = [evolution_sentiments[cid] for cid in conversations.indices if data_df[data_df['conversation_id'] == cid]['group'].iloc[0] == 'control']


In [96]:
data_df.to_csv('data_df_final.csv', index = 'False')

**Test sample**

In [None]:
data_sample = pd.read_csv('test.csv', sep=';')
data_sample_df = pd.DataFrame(data_sample)
#print(data_sample.head())

In [46]:
# Charger l'échantillon de données
# Appliquez les mêmes étapes de nettoyage et de tokenisation
data_sample_df['text'] = data_sample_df['text'].apply(clean_text)
sample_sequences = tokenizer.texts_to_sequences(data_sample_df['text'])
sample_padded = pad_sequences(sample_sequences, maxlen=max_seq_length)

# Prédire les sentiments
predicted_sentiments = model.predict(sample_padded)



In [47]:
data_sample['predicted_sentiment'] = predicted_sentiments.flatten()

In [48]:
conversations = data_sample.groupby('conversation_id')
evolution_sentiments = {}

for conversation_id, group in conversations:
    sentiment_debut = group['predicted_sentiment'].iloc[0]
    sentiment_fin = group['predicted_sentiment'].iloc[-1]
    evolution_sentiments[conversation_id] = sentiment_fin - sentiment_debut

In [49]:
evolutions_aphasique = [evolution_sentiments[cid] for cid in conversations.indices if data_sample[data_sample['conversation_id'] == cid]['group'].iloc[0] == 'patient']
evolutions_non_aphasique = [evolution_sentiments[cid] for cid in conversations.indices if data_sample[data_sample['conversation_id'] == cid]['group'].iloc[0] == 'control']


In [50]:
data_sample.to_csv('data_sample.csv', index = 'False')