##### Deep Learing ###

1) Data Cleaning

In [32]:
from logic.processing import load_data, preproc

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from gensim.models import Word2Vec

from imblearn.over_sampling import RandomOverSampler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
data = load_data('Combined Data.csv')

In [3]:
df = preproc(data,bi = False)

2) Split train/test

In [4]:
X = data['clean']
y = data['status']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

3) Embedding

In [9]:
word2vec = Word2Vec(sentences=[sentence.split() for sentence in X_train], 
                    vector_size=60, 
                    min_count=5, 
                    window=5)

In [21]:
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return np.array(embed)

In [23]:
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

KeyboardInterrupt: 

4) Balancing

In [15]:
"""ros = RandomOverSampler(random_state=101)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_embed, y_train)"""

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (42434,) + inhomogeneous part.

5) Label encoding

In [27]:
y_train_encoded = pd.get_dummies(y_train).values
y_test_encoded = pd.get_dummies(y_test).values

5) Padding

In [28]:
max_len = 500

In [29]:
X_train_pad = pad_sequences(X_train_embed, padding='post',maxlen=max_len)
X_test_pad = pad_sequences(X_test_embed, padding='post',maxlen=max_len)

6.1) Model A

In [34]:
model = Sequential([
    LSTM(64, return_sequences=False, input_shape=(X_train_pad.shape[1], X_test_pad.shape[2])),  # LSTM avec 64 unités
    Dropout(0.3),  # Régularisation
    Dense(32, activation='relu'),  # Couche entièrement connectée
    Dropout(0.3),  # Régularisation
    Dense(y_train_encoded.shape[1], activation='softmax')  # Sortie avec activation softmax pour la classification
])

In [35]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [36]:
model.summary()

In [37]:
history = model.fit(
    X_train_pad, y_train_encoded,
    epochs=10,
    batch_size=64,
    verbose=1
)

Epoch 1/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 212ms/step - accuracy: 0.3410 - loss: 1.6566
Epoch 2/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 201ms/step - accuracy: 0.3358 - loss: 1.6428
Epoch 3/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 246ms/step - accuracy: 0.3630 - loss: 1.6007
Epoch 4/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 232ms/step - accuracy: 0.3842 - loss: 1.5727
Epoch 5/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 217ms/step - accuracy: 0.5126 - loss: 1.3979
Epoch 6/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 252ms/step - accuracy: 0.5218 - loss: 1.3685
Epoch 7/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 227ms/step - accuracy: 0.5212 - loss: 1.3607
Epoch 8/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 218ms/step - accuracy: 0.5229 - loss: 1.3455
Epoch 9/

In [38]:
loss, accuracy = model.evaluate(X_test_pad, y_test_encoded, verbose=0)
print(f"Accuracy sur l'ensemble de test : {accuracy:.4f}")

Accuracy sur l'ensemble de test : 0.5394
