# Data processing and splitting

In [1]:
from logic.processing import load_data, preproc, balance_dataset, data_filter
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.pipeline import make_pipeline
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras import layers,Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Flotchi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Flotchi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Flotchi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
2024-12-04 09:50:29.864823: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the raw train/test review CSVs with the project's load_data helper.
df_train = load_data('drugsComTrain_raw.csv')
df_test = load_data('drugsComTest_raw.csv')

In [3]:
# Run both splits through the project's data_filter helper
# (imported from logic.processing; its exact rules are not visible here).
df_train_filter = data_filter(df_train)
df_test_filter = data_filter(df_test)

In [4]:
# Peek at the first rows of the filtered test split.
df_test_filter.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sentiment,review_length
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22,1,68
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17,1,48
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35,1,143
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4,1,149
7,169852,Amitriptyline,Migraine Prevention,"""This has been great for me. I&#039;ve been on...",9,21-Apr-09,32,1,64


In [5]:
# Apply the project's preproc helper to both splits; the resulting frames
# expose the 'clean' text column that is used as model input further down.
df_train_prep = preproc(df_train_filter)
df_test_prep = preproc(df_test_filter)

In [6]:
# Inspect the first rows of the preprocessed training frame.
df_train_prep.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,sentiment,review_length,clean
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1,141,my son be halfway through his fourth week of i...
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1,89,this be my first time use any form of birth co...
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1,124,suboxone have completely turn my life around i...
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,0,68,nd day on mg start to work with rock hard erec...
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,7-Mar-17,5,0,30,he pull out but he cummed a bite in me i take ...


In [7]:
# Training features are the cleaned review text; the target is the 'sentiment' column.
X_train = df_train_prep['clean']
y_train = df_train_prep['sentiment']

In [8]:
# Same feature/target split for the test frame.
X_test = df_test_prep['clean']
y_test = df_test_prep['sentiment']

In [9]:
# Balance the classes of both splits with the project's balance_dataset helper.
# NOTE(review): the test split is balanced as well, so the reported test
# metrics describe a balanced class distribution rather than the original
# one — confirm this is intended.
X_train_b,y_train_b = balance_dataset(X_train,y_train)
X_test_b,y_test_b = balance_dataset(X_test,y_test)

In [10]:
# Sanity check: the balanced training labels contain as many 0s as 1s.
len(y_train_b[y_train_b==0])==len(y_train_b[y_train_b==1])

True

# Tokenize

In [11]:
# Split every balanced training review into its list of word tokens.
X_train_tk = list(map(text_to_word_sequence, X_train_b))

In [12]:
# Split every balanced test review into its list of word tokens.
X_test_tk = list(map(text_to_word_sequence, X_test_b))

# Embedding

In [13]:
# Train Word2Vec embeddings on the tokenised *training* reviews only, so no
# vocabulary statistics come from the test split.
word2vec_model = Word2Vec(
    sentences=X_train_tk,
    vector_size=100,  # embedding dimension
    window=10,         # context window size
    min_count=5,      # minimum word frequency for a word to be kept
    sg=1,             # Skip-Gram
    epochs=10,        # number of training epochs
)

In [14]:
# Vocabulary size: number of words that received an embedding.
len(word2vec_model.wv.key_to_index)

9903

In [15]:
def embed_sentence(word2vec, sentence):
    """Convert a tokenised sentence into a matrix of word vectors.

    Words missing from the Word2Vec vocabulary are skipped.

    Args:
        word2vec: Trained model exposing its keyed vectors as ``.wv``
            (membership test, item lookup and ``vector_size``).
        sentence: Iterable of word tokens.

    Returns:
        ``np.ndarray`` of shape ``(n_known_words, vector_size)``. When no word
        is known the result is an empty ``(0, vector_size)`` float32 array, so
        every sentence yields a 2-D array regardless of its content.
    """
    keyed_vectors = word2vec.wv
    known_vectors = [keyed_vectors[word] for word in sentence if word in keyed_vectors]

    if not known_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the rank consistent.
        return np.empty((0, keyed_vectors.vector_size), dtype=np.float32)

    return np.array(known_vectors)

def embedding(word2vec, sentences):
    """Embed each tokenised sentence with ``embed_sentence``.

    Returns a list holding one array of word vectors per input sentence, in
    the same order as ``sentences``.
    """
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

In [16]:
# Turn every tokenised review into an array of word vectors; words without an
# embedding are dropped by embed_sentence. The test set reuses the model
# trained on the training reviews.
X_train_embed = embedding(word2vec_model, X_train_tk)
X_test_embed = embedding(word2vec_model, X_test_tk)

# Padding

In [17]:
X_train_pad = pad_sequences(
    X_train_embed,
    maxlen=200,        # Longueur maximale de mots
    padding='pre',   # Padding avant la séquence
    truncating='post' # Tronquage à la fin
)

In [18]:
X_test_pad = pad_sequences(
    X_test_embed,
    maxlen=200,        # Longueur maximale de mots
    padding='pre',   # Padding avant la séquence
    truncating='post' # Tronquage à la fin
)

In [19]:
X_train_pad.shape,X_test_pad.shape

((56314, 200, 100), (19092, 200, 100))

In [20]:
# pad_sequences already returns an ndarray; np.asarray passes it through
# without copying, whereas np.array would duplicate the whole multi-GB tensor.
X_train_pad = np.asarray(X_train_pad)

In [21]:
# pad_sequences already returns an ndarray; np.asarray passes it through
# without copying, whereas np.array would duplicate the whole tensor in memory.
X_test_pad = np.asarray(X_test_pad)

In [33]:
# Persist the padded training tensor. Uncompressed, this array is several GB
# on disk (an earlier run failed with "No space left on device"), so write it
# compressed; the zero padding compresses well and joblib.load reads
# compressed files transparently.
joblib.dump(X_train_pad, 'pad_array.pkl', compress=3)

OSError: [Errno 28] No space left on device

# Model building

In [22]:
def init_model(input_shape=(200, 100)):
    """Build the (uncompiled) LSTM binary sentiment classifier.

    Args:
        input_shape: ``(timesteps, embedding_dim)`` of one padded sample. The
            default matches the notebook's padding length (200) and Word2Vec
            vector size (100).

    Returns:
        A ``Sequential`` model: Masking -> LSTM(20) -> Dense(15, relu) ->
        Dense(1, sigmoid). It still has to be compiled before training.
    """
    model = Sequential()
    model.add(InputLayer(input_shape))
    # Timesteps whose features are all 0 (the padding) are masked out.
    model.add(layers.Masking(mask_value=0.))
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    # Single sigmoid unit: probability of the positive class.
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

In [23]:
# Instantiate a fresh, uncompiled model.
model = init_model()

In [24]:
# Binary classification setup: cross-entropy loss, RMSprop optimizer,
# accuracy tracked as the reported metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Print the layer-by-layer architecture of the model.
model.summary()

# Model training

In [26]:
# Stop once the monitored validation loss (EarlyStopping's default monitor)
# has not improved for 5 epochs, and roll back to the best weights seen.
es = EarlyStopping(patience=5, restore_best_weights=True)

# Keep the returned History so the learning curves can be inspected later
# (and so the bare History repr is not echoed as the cell output).
# NOTE(review): validation_split takes the *last* 30% of the arrays before
# shuffling — confirm balance_dataset does not return rows ordered by class.
history = model.fit(
    X_train_pad, y_train_b,
    batch_size=32,
    epochs=10,
    validation_split=0.3,
    callbacks=[es],
)

Epoch 1/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 82ms/step - accuracy: 0.5771 - loss: 0.6733 - val_accuracy: 0.6147 - val_loss: 0.6520
Epoch 2/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 75ms/step - accuracy: 0.6199 - loss: 0.6442 - val_accuracy: 0.6155 - val_loss: 0.6478
Epoch 3/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 75ms/step - accuracy: 0.6269 - loss: 0.6406 - val_accuracy: 0.6197 - val_loss: 0.6441
Epoch 4/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 78ms/step - accuracy: 0.6269 - loss: 0.6353 - val_accuracy: 0.6195 - val_loss: 0.6449
Epoch 5/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 80ms/step - accuracy: 0.6296 - loss: 0.6345 - val_accuracy: 0.6202 - val_loss: 0.6448
Epoch 6/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 77ms/step - accuracy: 0.6296 - loss: 0.6340 - val_accuracy: 0.6204 - val_loss: 0.6414
Epo

<keras.src.callbacks.history.History at 0x17c041f60>

# Evaluation

In [27]:
# Evaluate on the balanced test set; with metrics=['accuracy'] the result is
# [loss, accuracy]. verbose=0 silences the progress bar.
res = model.evaluate(X_test_pad, y_test_b, verbose=0)

In [28]:
# res[1] is the accuracy (res[0] is the loss).
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

The accuracy evaluated on the test set is of 62.440%


In [29]:
# Sigmoid outputs of the final layer: one score in [0, 1] per test sample.
y_pred = model.predict(X_test_pad)

[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step


In [30]:
# Round the predicted scores to the nearest integer to obtain hard 0/1 labels.
# Note: this overwrites the raw scores held in y_pred.
y_pred = np.round(y_pred)

In [31]:
# Per-class precision / recall / F1 of the rounded predictions on the balanced test labels.
print(classification_report(y_test_b, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.69      0.65      9546
           1       0.64      0.56      0.60      9546

    accuracy                           0.62     19092
   macro avg       0.63      0.62      0.62     19092
weighted avg       0.63      0.62      0.62     19092

