In [1]:
from google.colab import files

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files can be read
# through the regular filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os
import pandas as pd
import re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.initializers import Constant

In [4]:
# Make the project's LSTM folder the working directory; the relative file
# paths used in the cells below are resolved against it.
%cd /content/drive/MyDrive/703_Project/LSTM/

/content/drive/MyDrive/703_Project/LSTM


**Create the combined CSV file**





In [None]:
def process_folder(main_folder_path, group_label):
    """Merge every CSV file found under a folder into one labelled DataFrame.

    The folder is walked recursively. From each ``.csv`` file only the
    ``text`` column is kept, and two columns are added: ``conversation_id``
    (the file name without its extension) and ``group`` (``group_label``).

    Args:
        main_folder_path: Root folder to search for CSV files.
        group_label: Value written to the ``group`` column of every row.

    Returns:
        A DataFrame with the columns ``text``, ``conversation_id`` and
        ``group``: the per-file frames concatenated in sorted path order.
        Index labels are those of the individual files (they are not reset).

    Raises:
        ValueError: If no CSV file is found under ``main_folder_path``.
        KeyError: If a CSV file has no ``text`` column.
    """
    frames = []  # one DataFrame per CSV file
    # The third item is named `filenames` (not `files`) so it does not shadow
    # the `files` name imported from google.colab at the top of the notebook.
    for subdir, dirs, filenames in os.walk(main_folder_path):
        # os.walk yields entries in arbitrary filesystem order. Sorting `dirs`
        # in place steers the traversal, so the row order is reproducible.
        dirs.sort()
        for filename in sorted(filenames):
            if not filename.endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(subdir, filename))
            # splitext removes only the final extension, so "a.b.csv" keeps
            # the ID "a.b" instead of being truncated to "a".
            df['conversation_id'] = os.path.splitext(filename)[0]
            df['group'] = group_label
            frames.append(df[['text', 'conversation_id', 'group']])
    if not frames:
        # pd.concat([]) fails with an opaque "No objects to concatenate".
        # A missing or mistyped folder also ends up here, because os.walk
        # silently yields nothing for a path that does not exist.
        raise ValueError(f"No CSV files found under {main_folder_path!r}")
    return pd.concat(frames)

# Folders holding the CSV files of the control and patient groups
# (relative to the current working directory).
control_main_folder_path = "Control_csv_data_clean"
patient_main_folder_path = "Patient_csv_data_clean"

# Build one labelled frame per group (control first), then stack the two.
control_df, patient_df = (
    process_folder(folder, label)
    for folder, label in (
        (control_main_folder_path, 'control'),
        (patient_main_folder_path, 'patient'),
    )
)
combined_df = pd.concat([control_df, patient_df])

# Save the merged frame to a new CSV file; the row index is not written.
combined_df.to_csv('combined_control_patient_data.csv', index=False)

**Step 1: Data preprocessing**

In [14]:
# Load the combined control/patient dataset from its CSV file
data = pd.read_csv('combined_control_patient_data.csv')

# Text-cleaning helper
def clean_text(text):
    """Normalise one piece of text before tokenisation.

    Lower-cases the text, replaces every character that is not an ASCII
    letter or digit with a space, then collapses each run of whitespace into
    a single space. A leading or trailing space is not stripped.

    Args:
        text: The raw text. ``None`` and NaN (e.g. an empty cell read by
            ``pd.read_csv``) are treated as missing; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself. Without this guard
    # a missing value would crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)  # drop non-alphanumeric characters
    text = re.sub(r"\s+", " ", text)  # collapse repeated whitespace
    return text

# Clean the text column
data['text'] = data['text'].apply(clean_text)
print(data.head(10))

# Tokenisation: fit the vocabulary, then map each text to word indices.
# The cap is 12100, just above the 12092 unique words counted in the
# "How to get num_words and max_seq_length" section, so no word is dropped.
tokenizer = Tokenizer(num_words=12100)
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])

# Pad (or truncate) every sequence to the same length
max_seq_length = 15  # the 90th percentile of text length in words
X = pad_sequences(sequences, maxlen=max_seq_length)


                                                text conversation_id    group
0        a family is preparing for a birthday party        wright16a  control
1  the guests come over and walk through the door...       wright16a  control
2                          the mother is very upset        wright16a  control
3    and she takes a broom and tries to hit the dog        wright16a  control
4  the little boy whose birthday it is is crying ...       wright16a  control
5                 a woman has fallen into the water        wright16a  control
6                       and there s a lot of rapids        wright16a  control
7  a man wearing a life vest is trying to help th...       wright16a  control
8  he has tied the branch to himself with the bel...       wright16a  control
9  the woman is thankful that he is there to help...       wright16a  control


**How to choose `num_words` and `max_seq_length`**

In [9]:
# Tokenizer with no cap on the number of words, used only to count the
# vocabulary. It has its own name so that it does not overwrite the
# `tokenizer` that produced `sequences` in the preprocessing step.
vocab_tokenizer = Tokenizer()
vocab_tokenizer.fit_on_texts(data['text'])

# Total number of unique words
word_count = len(vocab_tokenizer.word_index)
print(f"Nombre total de mots uniques dans le jeu de données : {word_count}")

Nombre total de mots uniques dans le jeu de données : 12092


In [8]:
# Number of whitespace-separated words in each text
text_lengths = list(map(lambda utterance: len(utterance.split()), data['text']))

# Mean, median and 90th percentile, one per output line
print(
    f"Moyenne: {np.mean(text_lengths)}",
    f"Médiane: {np.median(text_lengths)}",
    f"Percentile 90: {np.percentile(text_lengths, 90)}",
    sep="\n",
)

Moyenne: 7.701700291841137
Médiane: 6.0
Percentile 90: 15.0


**Step 2: Word embeddings preparation**

In [11]:
# Load the GloVe embeddings into a {word: vector} lookup
EMBEDDING_DIM = 100  # vector size, matching the 100-dimensional file read below
embedding_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as glove_file:
    for entry in glove_file:
        # Each line is the word followed by its coefficients.
        fields = entry.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Rows of words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    if token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

# Embedding layer initialised from the GloVe matrix; trainable=False keeps
# the pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_seq_length,
    trainable=False,
)

**Step 3: Data split**