# Data Processing and Splitting

In [16]:
from logic.processing import load_data, preproc, balance_dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.pipeline import make_pipeline
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras import layers,Sequential
from tensorflow.keras.layers import InputLayer,Dropout,BatchNormalization, Bidirectional,LSTM,Dense
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import pandas as pd
import kagglehub
import os

In [8]:
def data_filter(df,uselfCount=0,min_length=30):
    """Keep polarised, useful, sufficiently long reviews and label them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'rating', 'usefulCount' and 'review'.
    uselfCount : int or float, default 0
        Rows are kept only when 'usefulCount' is strictly greater than this
        value. (The spelling of the parameter is kept for backward
        compatibility with existing callers.)
    min_length : int, default 30
        Minimum number of whitespace-separated tokens a review must have.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the surviving rows
        plus two added columns: 'sentiment' (1 for ratings 8-10, 0 for
        ratings 1-3) and 'review_length' (token count of 'review').
    """
    # Rows with any missing value are discarded before anything else.
    rated = df.dropna()

    # Only clearly negative (1-3) or clearly positive (8-10) ratings are kept;
    # the middle of the scale is dropped.
    rated = rated[rated['rating'].isin([1,2,3,8,9,10])]

    # .assign builds a new frame instead of writing into a filtered slice,
    # which avoids pandas' chained-assignment pitfalls.
    rated = rated.assign(sentiment=rated['rating'].isin([8,9,10]).astype(int))

    rated = rated[rated['usefulCount']>uselfCount]

    # Token count uses a plain whitespace split of the stringified review.
    rated = rated.assign(
        review_length=rated['review'].apply(lambda x: len(str(x).split()))
    )
    return rated[rated['review_length']>=min_length]

In [15]:
# Fetch the Kaggle dataset through kagglehub. The returned value is stored in
# `path` and later joined with a CSV file name to locate the data on disk.
path = kagglehub.dataset_download("jessicali9530/kuc-hackathon-winter-2018")

Downloading from https://www.kaggle.com/api/v1/datasets/download/jessicali9530/kuc-hackathon-winter-2018?dataset_version_number=2...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40.7M/40.7M [00:05<00:00, 7.68MB/s]

Extracting files...





In [17]:
# Read drugsComTrain_raw.csv from the downloaded dataset location into a DataFrame.
df = pd.read_csv(os.path.join(path,'drugsComTrain_raw.csv'))

In [18]:
# Filter and label the raw frame using data_filter's defaults
# (uselfCount=0, min_length=30).
df1 = data_filter(df)

In [19]:
# Rows and columns that remain after filtering.
df1.shape

(109890, 9)

In [20]:
# Binary target produced by data_filter: 1 for ratings 8-10, 0 for ratings 1-3.
y = df1['sentiment']

In [21]:
# Number of labels; the feature matrix used for training must have the same row count.
y.shape

(109890,)

In [22]:
# Load the precomputed feature matrix from disk.
# NOTE(review): 'merged.npy' is not produced by any visible cell of this
# notebook, so its rows are assumed to be in the same order as the rows of
# df1 / y -- confirm against whatever generated the file.
X_pad = np.load('merged.npy')

# Fail fast if the stored features and the freshly filtered labels are out of
# sync (e.g. the filter thresholds changed since the file was written).
assert X_pad.shape[0] == len(y), (
    f"merged.npy has {X_pad.shape[0]} rows but there are {len(y)} labels"
)

In [23]:
# Shape of the loaded feature matrix (rows should match the number of labels in y).
X_pad.shape

(109890, 128)

# Model Building

In [36]:
def init_model(input_dim=128, hidden_units=(256, 128), dropout_rate=0.3):
    """Build an uncompiled feed-forward binary classifier.

    The network is a stack of Dense + Dropout blocks followed by a single
    sigmoid unit. It contains no embedding or recurrent layers; each input
    column is consumed directly as a numeric feature.
    NOTE(review): if the inputs are padded token ids rather than dense
    features, an Embedding layer is probably wanted -- confirm.

    Parameters
    ----------
    input_dim : int, default 128
        Number of features per sample.
    hidden_units : sequence of int, default (256, 128)
        Width of each hidden Dense layer, in order.
    dropout_rate : float, default 0.3
        Dropout rate applied after every hidden layer.

    Returns
    -------
    Sequential
        The assembled model; the caller is responsible for compiling it.
        Calling ``init_model()`` with no arguments yields the same
        architecture as before the parameters were introduced.
    """
    model = Sequential()
    model.add(InputLayer((input_dim,)))

    # One Dense(relu) + Dropout pair per requested hidden width.
    for units in hidden_units:
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))

    # Single sigmoid output: probability of the positive class.
    model.add(Dense(1, activation='sigmoid'))

    return model

In [37]:
# Instantiate a fresh, not-yet-compiled model.
model = init_model()

In [38]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# the only tracked metric, and the Adam optimizer is selected by name.
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [39]:
# Display the model's layers and parameter counts.
model.summary()

# Model Training

In [40]:
# Stop once 5 consecutive epochs bring no improvement and roll back to the best
# weights seen. No `monitor` is given, so the callback's default quantity is used.
es = EarlyStopping(patience=5, restore_best_weights=True)

# Train on 70% of the rows and validate on the remaining 30%.
# NOTE(review): Keras is documented to carve the validation split from the end
# of the arrays before shuffling -- confirm the rows are not ordered in a way
# that biases that tail.
# NOTE(review): no random seed is set in the visible cells, so results will
# differ between runs.
model.fit(X_pad, y, 
          batch_size = 32,
          epochs=50,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.7498 - loss: 0.5358 - val_accuracy: 0.7766 - val_loss: 0.4827
Epoch 2/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7701 - loss: 0.4965 - val_accuracy: 0.7818 - val_loss: 0.4841
Epoch 3/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7694 - loss: 0.4907 - val_accuracy: 0.7816 - val_loss: 0.4717
Epoch 4/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7749 - loss: 0.4827 - val_accuracy: 0.7834 - val_loss: 0.4670
Epoch 5/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7784 - loss: 0.4773 - val_accuracy: 0.7815 - val_loss: 0.4661
Epoch 6/50
[1m2404/2404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7800 - loss: 0.4734 - val_accuracy: 0.7855 - val_loss: 0.4614
Epoch 7/50
[1

<keras.src.callbacks.history.History at 0x133ad0760>

# Save Model

In [45]:
# Persist the trained model by pickling it with joblib.
# NOTE(review): pickling a Keras model ties the file to the exact library
# versions and loading a pickle executes arbitrary code; Keras' own
# `model.save(...)` format is the usual alternative -- consider switching.
# NOTE(review): the file is named 'lstm.pkl' although the model built in this
# notebook contains only Dense/Dropout layers; the name is left unchanged here
# because other code may load it by this path.
joblib.dump(model,'lstm.pkl')

['lstm.pkl']

# Evaluation

In [31]:
# NOTE(review): X_test_pad and y_test_b are not defined in any visible cell of
# this notebook, and this cell's execution count (31) is lower than the counts
# of the cells that build and train the model (36-40). The recorded accuracy
# therefore was not produced by that model. TODO: create an explicit held-out
# split in this notebook and re-run after a kernel restart.
res = model.evaluate(X_test_pad, y_test_b, verbose=0)

In [32]:
# res[1] is read as the accuracy (the single metric passed to compile) and shown as a percentage.
print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

The accuracy evaluated on the test set is of 90.305%


In [None]:
# Model outputs for the test inputs (values from the sigmoid output unit).
# NOTE(review): X_test_pad is not defined in the visible notebook.
y_pred = model.predict(X_test_pad)

In [None]:
# Turn probabilities into hard 0/1 labels by rounding (np.round sends exactly 0.5 to 0).
# This overwrites the raw probabilities, so re-running this cell alone is a no-op.
y_pred = np.round(y_pred)

In [None]:
# Text summary comparing the true test labels with the rounded predictions.
print(classification_report(y_test_b, y_pred))

# Demo Test

In [41]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [42]:
def process(st):
    """Normalise a raw review string for downstream use.

    Steps, in order:
      1. remove every character listed in ``string.punctuation``;
      2. case-fold and turn newlines into spaces;
      3. drop all digit characters;
      4. lemmatise each whitespace-separated word as a verb, then as a noun;
      5. re-join the words with single spaces.

    Only the ASCII punctuation in ``string.punctuation`` is stripped, so
    typographic characters such as the curly apostrophe in "didn’t" are left
    in place.

    Parameters
    ----------
    st : str
        The text to clean.

    Returns
    -------
    str
        The cleaned, lemmatised text.
    """
    # A single translate call deletes all punctuation characters at once.
    no_punct = st.translate(str.maketrans('', '', string.punctuation))
    lowered = no_punct.casefold().replace('\n', ' ')
    no_digits = ''.join(ch for ch in lowered if not ch.isdigit())

    # One lemmatizer instance is reused for every word and both passes.
    lemmatizer = WordNetLemmatizer()
    as_verbs = (lemmatizer.lemmatize(word, pos='v') for word in no_digits.split())
    return ' '.join(lemmatizer.lemmatize(word, pos='n') for word in as_verbs)

In [43]:
# Example negative review used to try out the text-cleaning function.
sentence = 'I had a terrible experience with this medication. After just a few days, I developed severe headaches and constant nausea. It didn’t improve my condition at all, and I had to stop taking it. I wouldn’t recommend it to anyone.'

In [44]:
# Show the cleaned text. In the recorded output the curly apostrophes remain
# ("didn’t", "wouldn’t") because they are not part of string.punctuation.
process(sentence)

'i have a terrible experience with this medication after just a few day i develop severe headache and constant nausea it didn’t improve my condition at all and i have to stop take it i wouldn’t recommend it to anyone'