# Data Processing and splitting

In [14]:
from logic.processing import load_data, preproc, balance_dataset, data_filter
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.pipeline import make_pipeline
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras import layers,Sequential
from tensorflow.keras.layers import InputLayer,Dropout,BatchNormalization, Bidirectional,LSTM,Dense
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import pandas as pd
import os
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [None]:
# Read both raw CSV splits with the project loader (train first, then test).
df_train, df_test = (
    load_data(csv_name)
    for csv_name in ('drugsComTrain_raw.csv', 'drugsComTest_raw.csv')
)

In [None]:
# Preview the first rows of the raw training frame.
df_train.head()

In [None]:
# Apply the project's data_filter to each split, keeping train/test order.
df_train_filter, df_test_filter = map(data_filter, (df_train, df_test))

In [None]:
# Preview the filtered test frame to sanity-check what data_filter kept.
df_test_filter.head()

In [None]:
# Run the project's preproc step on the filtered train split, then the test split.
df_train_prep, df_test_prep = (
    preproc(filtered) for filtered in (df_train_filter, df_test_filter)
)

In [None]:
# Features are the 'clean' column, labels the 'sentiment' column.
X_train, y_train = df_train_prep['clean'], df_train_prep['sentiment']

In [None]:
# Same feature/label columns as the training split.
X_test, y_test = df_test_prep['clean'], df_test_prep['sentiment']

In [None]:
# Rebalance the classes of each split with the project helper.
# NOTE(review): the test split is rebalanced too, so the metrics reported
# further down describe a resampled test set rather than the original class
# distribution -- confirm this is intended.
X_train_b,y_train_b = balance_dataset(X_train,y_train)
X_test_b,y_test_b = balance_dataset(X_test,y_test)

In [None]:
# Sanity check: displays True when labels 0 and 1 occur equally often after balancing.
len(y_train_b[y_train_b==0])==len(y_train_b[y_train_b==1])

# Tokenize

In [None]:
# Split every balanced training review into a list of word tokens.
# (The original read `X_trai_b`, an undefined name that raises NameError.)
X_train_tk = [text_to_word_sequence(review) for review in X_train_b]

In [None]:
# Tokenize each balanced test review into its list of words.
X_test_tk = list(map(text_to_word_sequence, X_test_b))

# Embedding

In [None]:
# Train word embeddings on the training tokens only (the test set is not seen).
# vector_size=60 fixes the embedding width the LSTM input expects; min_count=10
# drops rare words from the vocabulary, so embed_sentence silently skips them.
word2vec = Word2Vec(sentences=X_train_tk, vector_size=60, min_count=10, window=10)

In [None]:
# Persist the trained embedding model next to the notebook so later cells can reload it.
joblib.dump(word2vec,'word2vec.pkl')

In [None]:
# Number of words that received an embedding vector.
len(word2vec.wv.key_to_index)

In [17]:
def embed_sentence(word2vec, sentence):
    """Look up the embedding of every known word in a tokenized sentence.

    Words missing from the Word2Vec vocabulary are skipped, so the result may
    be shorter than the input.

    Args:
        word2vec: Trained model exposing its vectors as ``word2vec.wv``.
        sentence: Iterable of word tokens.

    Returns:
        Array of shape ``(n_known_words, vector_size)``. When no word is known
        the shape is ``(0, vector_size)``, so the result is always 2-D and
        callers can rely on ``shape[1]``.
    """
    vectors = [word2vec.wv[word] for word in sentence if word in word2vec.wv]
    if not vectors:
        # np.array([]) would be 1-D with shape (0,); keep the embedding axis.
        return np.zeros((0, word2vec.wv.vector_size), dtype=np.float32)
    return np.array(vectors)

def embedding(word2vec, sentences):
    """Embed each tokenized sentence, returning one array per sentence in input order."""
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

In [None]:
# Turn both tokenized splits into lists of per-sentence embedding arrays.
X_train_embed, X_test_embed = (
    embedding(word2vec, tokenized) for tokenized in (X_train_tk, X_test_tk)
)

# Padding

In [None]:
# Bring every review to exactly 200 time steps: shorter ones are zero-padded at
# the end (padding='post'). `truncating` is left at its default ('pre'), so
# reviews longer than 200 tokens lose their beginning, not their end.
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=200)

In [None]:
# Show the padded tensor shapes of both splits.
X_train_pad.shape,X_test_pad.shape

# Model Building

In [None]:
def init_model(max_len=200, embedding_dim=60):
    """Build the (uncompiled) bidirectional-LSTM sentiment classifier.

    Args:
        max_len: Number of time steps per padded review. Must match the
            ``maxlen`` used when padding the inputs.
        embedding_dim: Width of each word vector. Must match the Word2Vec
            ``vector_size``.

    Returns:
        A Sequential model ending in a single sigmoid unit.
    """
    model = Sequential([
        InputLayer((max_len, embedding_dim)),
        # Time steps that are all zeros (the post-padding) are masked out.
        layers.Masking(mask_value=0.),
        Bidirectional(LSTM(64, activation='tanh', return_sequences=False)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ])

    return model

In [None]:
# Instantiate a fresh, untrained network.
model = init_model()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# the only extra metric, so evaluate() later returns [loss, accuracy].
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

# Model training

In [None]:
# Stop once the monitored quantity (EarlyStopping's default, 'val_loss') has
# not improved for 5 epochs, then roll back to the best weights seen.
es = EarlyStopping(patience=5, restore_best_weights=True)

# NOTE(review): validation_split takes the *last* 30% of the samples before any
# shuffling. If balance_dataset returns rows grouped by class, the validation
# set would be single-class -- verify the row order or shuffle beforehand.
model.fit(X_train_pad, y_train_b, 
          batch_size = 32,
          epochs=50,
          validation_split=0.3,
          callbacks=[es]
         )

In [None]:
# Persist the trained network.
# NOTE(review): this writes 'lstm2.pkl', but the SHAP section further down
# loads 'lstm.pkl', a file this notebook never creates -- confirm which file
# is meant. Keras' native model.save(...) may also be a more portable format
# than a pickle; verify before relying on this artifact.
joblib.dump(model,'lstm2.pkl')

# Evaluation

In [None]:
# Score the held-out (balanced) test set; verbose=0 hides the progress bar.
res = model.evaluate(X_test_pad, y_test_b, verbose=0)

In [None]:
# res is [loss, accuracy] (accuracy is the only metric passed to compile), hence res[1].
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

In [None]:
# Raw model outputs for the test set (sigmoid scores, one per review).
y_pred = model.predict(X_test_pad)

In [None]:
# Threshold the scores into hard 0/1 labels. This overwrites y_pred, so the raw
# probabilities are gone afterwards; re-run the predict cell to get them back.
y_pred = np.round(y_pred)

In [None]:
# Per-class precision, recall and F1 on the balanced test set.
print(classification_report(y_test_b, y_pred))

# Demo test

In [11]:
import string
from nltk.stem import WordNetLemmatizer

In [8]:
# Hand-written positive review used to exercise the whole inference pipeline.
sentence = 'This medication worked wonders for me! Within a week, I noticed a significant improvement in my symptoms. No side effects whatsoever, and I feel like I have my life back. Highly recommend!'

In [9]:
def process(st):
    """Normalize raw review text for the demo.

    Strips ASCII punctuation, casefolds, turns newlines into spaces, drops
    digit characters, then lemmatizes each whitespace-separated token first as
    a verb and then as a noun.

    Args:
        st: Raw text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Single pass over the text instead of one str.replace per punctuation mark.
    no_punct = st.translate(str.maketrans('', '', string.punctuation))
    lowered = no_punct.casefold().replace('\n', ' ')
    no_digits = ''.join(ch for ch in lowered if not ch.isdigit())
    # Build the lemmatizer once; the original created two instances per word.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos='v'), pos='n')
        for word in no_digits.split()
    ]
    return ' '.join(lemmas)

In [12]:
# Replace the raw text with its normalized form. The name is reused, so
# re-running this cell feeds already-processed text back into process().
sentence = process(sentence)

In [None]:
# Inspect the normalized text.
sentence

In [15]:
# Tokenize with the same function that was applied to the training reviews.
sentence_tk = text_to_word_sequence(sentence)

In [None]:
# Inspect the token list.
sentence_tk

In [18]:
# Look up the vector of every in-vocabulary token; unknown words are skipped.
sentence_emb = embed_sentence(word2vec, sentence_tk)

In [None]:
# Check the embedded sentence's shape before batching it.
sentence_emb.shape

In [19]:
# Add a leading batch axis so the single sentence looks like a batch of one.
# NOTE(review): the name is overwritten, so this cell is not safe to run twice
# (the second run sees the already-3-D array), and it needs a 2-D input --
# shape[1] does not exist if embed_sentence found no known word.
sentence_emb = sentence_emb.reshape(1,sentence_emb.shape[0],sentence_emb.shape[1])

In [20]:
# Pad the one-sentence batch with the same settings used for the training data.
sentence_pad = pad_sequences(sentence_emb, dtype='float32', padding='post', maxlen=200)

In [None]:
# Confirm the padded shape matches what the model was fed during training.
sentence_pad.shape

In [29]:
# Run the model on the demo sentence.
pred = model.predict(sentence_pad)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 593ms/step


In [30]:
# First (only) sample, first (only) output: the scalar score for the demo sentence.
pred[0][0]

0.9983726

# Shap

In [2]:
! pip install shap

Collecting shap
  Using cached shap-0.46.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Using cached slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba (from shap)
  Using cached numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (2.7 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->shap)
  Using cached llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl.metadata (4.8 kB)
Using cached shap-0.46.0-cp310-cp310-macosx_10_9_x86_64.whl (459 kB)
Using cached slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)
Using cached numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl (2.6 MB)
Using cached llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl (31.1 MB)
Installing collected packages: slicer, llvmlite, cloudpickle, numba, shap
Successfully installed cloudpickle-3.1.0 llvmlite-0.43.0 nu

In [22]:
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Reload the embedding model written earlier by joblib.dump(word2vec, 'word2vec.pkl').
# joblib.load unpickles its input, so only load files from a trusted source.
word2vec = joblib.load('word2vec.pkl')

In [6]:
# Reload a trained classifier from disk.
# NOTE(review): the training section saves the model as 'lstm2.pkl'; nothing in
# this notebook writes 'lstm.pkl', so on a clean checkout this either fails or
# loads a model from an earlier experiment -- confirm which file is intended.
# joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load('lstm.pkl')

In [37]:
def predict_sentiment(texts):
    """Score raw review strings with the same pipeline as the demo cells.

    SHAP perturbs *text*, while the network consumes padded embedding tensors,
    so this wrapper bridges the two: process -> tokenize -> embed -> pad ->
    predict.

    Args:
        texts: Sequence of raw (possibly partially masked) review strings.

    Returns:
        The model's predictions, one row per input string.
    """
    max_len, embed_dim = 200, 60  # must match the padding length and Word2Vec vector_size
    batch = np.zeros((len(texts), max_len, embed_dim), dtype='float32')
    for row, text in enumerate(texts):
        tokens = text_to_word_sequence(process(str(text)))
        vectors = embed_sentence(word2vec, tokens)
        if len(vectors):
            # Same convention as pad_sequences(padding='post', maxlen=200):
            # keep the last max_len vectors and leave zeros at the end.
            vectors = vectors[-max_len:]
            batch[row, :len(vectors)] = vectors
    return model.predict(batch, verbose=0)


# The second argument of shap.Explainer is a masker, not a tokenizer function:
# a Text masker splits each string into tokens SHAP can hide, and the wrapper
# above turns the masked strings back into model inputs.
explainer = shap.Explainer(predict_sentiment, shap.maskers.Text(tokenizer=r"\W+"))