In [2]:
import pandas as pd
import numpy as np
import warnings
# Suppress all Python warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [3]:
# Load the review dataset from a CSV file given by a path relative to the working directory.
data = pd.read_csv("amazon_reviews.csv")


In [4]:
# Preview the first rows to check the columns that were loaded.
data.head()


Unnamed: 0,Review,Label
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1


In [5]:
# (rows, columns) of the loaded frame.
data.shape



(400000, 2)

In [7]:
# Confirm that read_csv returned a DataFrame.
type(data)


pandas.core.frame.DataFrame

In [8]:
# Preview the last rows of the frame.
data.tail()


Unnamed: 0,Review,Label
399995,Unbelievable- In a Bad Way: We bought this Tho...,0
399996,"Almost Great, Until it Broke...: My son reciev...",0
399997,Disappointed !!!: I bought this toy for my son...,0
399998,Classic Jessica Mitford: This is a compilation...,1
399999,"Comedy Scene, and Not Heard: This DVD will be ...",0


In [9]:
# Class balance: number of reviews for each Label value.
data["Label"].value_counts()


Label
1    200000
0    200000
Name: count, dtype: int64

In [10]:
# Options for turning the sentiment labels into numbers:
# one-hot encoding or label encoding

In [11]:
# Encode the sentiment label numerically: "positive" -> 1, "negative" -> 0.
# The mapping is applied to the "Label" column (the sentiment target), not to the
# free-text "Review" column. Labels that are already 0/1 pass through unchanged,
# and the final cast fails loudly if any unmapped value is left behind.
# assign() returns a new frame, so re-running this cell is idempotent.
data = data.assign(
    Label=data["Label"].replace({"positive": 1, "negative": 0}).astype("int64")
)

In [12]:
# Re-inspect the first rows after the replace step.
data.head()


Unnamed: 0,Review,Label
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1


In [13]:
# Re-inspect the last rows after the replace step.
data.tail()


Unnamed: 0,Review,Label
399995,Unbelievable- In a Bad Way: We bought this Tho...,0
399996,"Almost Great, Until it Broke...: My son reciev...",0
399997,Disappointed !!!: I bought this toy for my son...,0
399998,Classic Jessica Mitford: This is a compilation...,1
399999,"Comedy Scene, and Not Heard: This DVD will be ...",0


In [14]:
# Re-check the class balance after the replace step.
data["Label"].value_counts()


Label
1    200000
0    200000
Name: count, dtype: int64

In [15]:

# LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN).
# RNNs are well suited to sequential data such as text.

In [16]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state=42)


In [18]:
# Size of the training split.
train_data.shape


(320000, 2)

In [26]:
# Size of the held-out test split.
test_data.shape


(80000, 2)

In [30]:
# Build the vocabulary from the training reviews only, so the test split does not influence it.
# num_words=5000 limits which word indices the tokenizer emits; the Embedding layer's
# input_dim further down uses the same value.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data["Review"])


In [33]:
# Convert each split's reviews to integer token sequences and pad/truncate them
# to a fixed length of 300 tokens. The training split is processed first, then
# the test split.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["Review"]), maxlen=300)
    for split in (train_data, test_data)
)

In [9]:
# Inspect the padded training matrix.
# NOTE(review): the recorded output is a NameError with execution count 9, i.e. this cell was
# last run in a session where X_train did not exist yet; re-run it after the padding cell.
X_train


NameError: name 'X_train' is not defined

In [35]:
# Inspect the padded test matrix; the recorded rows start with zeros, i.e. padding is added on the left.
X_test


array([[   0,    0,    0, ...,   10,    4,  156],
       [   0,    0,    0, ...,    1, 3045,    7],
       [   0,    0,    0, ...,  282,   55, 4587],
       ...,
       [   0,    0,    0, ..., 1374, 1215, 1468],
       [   0,    0,    0, ...,   40,    8,   19],
       [   0,    0,    0, ...,   15,  774,   11]])

In [36]:
# Target vectors: the "Label" column of the training and test splits.
Y_train, Y_test = train_data["Label"], test_data["Label"]

In [37]:
# Preview the training labels.
Y_train


242245    0
288918    1
105103    0
63504     0
239180    0
         ..
259178    0
365838    0
131932    0
146867    0
121958    1
Name: Label, Length: 320000, dtype: int64

In [38]:
# LSTM MODEL BUILDING


In [39]:
# Sentiment classifier: token ids -> 128-d embeddings -> LSTM -> sigmoid probability.
model = Sequential([
    # input_dim matches Tokenizer(num_words=5000). input_length matches the
    # maxlen=300 used when padding X_train/X_test, so the declared sequence
    # length agrees with the data the model is actually trained on.
    # NOTE(review): recent Keras releases treat input_length as deprecated —
    # confirm against the installed version whether it can simply be dropped.
    Embedding(input_dim=5000, output_dim=128, input_length=300),
    # Dropout is applied both to the layer inputs and to the recurrent state.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: probability that the review is positive (Label 1).
    Dense(1, activation="sigmoid"),
])

In [40]:
# Print the layer-by-layer architecture summary.
model.summary()


In [41]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is the reported metric.
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["accuracy"])


In [42]:
# Train for 5 epochs; validation_split=0.2 holds out 20% of the training arrays for per-epoch validation.
# NOTE(review): the recorded epochs took from about 23 minutes to over 2 hours each, so a full re-run is expensive.
model.fit(X_train, Y_train, epochs = 5, batch_size = 64, validation_split = 0.2)


Epoch 1/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2786s[0m 695ms/step - accuracy: 0.8472 - loss: 0.3544 - val_accuracy: 0.9192 - val_loss: 0.2046
Epoch 2/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4277s[0m 1s/step - accuracy: 0.9240 - loss: 0.1960 - val_accuracy: 0.9238 - val_loss: 0.1930
Epoch 3/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1379s[0m 345ms/step - accuracy: 0.9357 - loss: 0.1668 - val_accuracy: 0.9310 - val_loss: 0.1816
Epoch 4/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1415s[0m 354ms/step - accuracy: 0.9449 - loss: 0.1463 - val_accuracy: 0.9338 - val_loss: 0.1729
Epoch 5/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8095s[0m 2s/step - accuracy: 0.9517 - loss: 0.1291 - val_accuracy: 0.9326 - val_loss: 0.1796


<keras.src.callbacks.history.History at 0x1e3ec894920>

In [43]:
# Persist the trained network to disk.
# NOTE(review): the .h5 suffix selects Keras' legacy HDF5 format — confirm what downstream
# loaders expect before switching to the native .keras format.
model.save("model.h5")




In [44]:
# Persist the fitted tokenizer so inference code can reuse the same vocabulary.
# NOTE(review): joblib files are pickle-based; only load tokenizer.pkl from a trusted source.
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")


['tokenizer.pkl']

In [45]:
# Score the model on the held-out test split; evaluate returns the loss followed by the compiled metric.
loss, accuracy = model.evaluate(X_test, Y_test)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 112ms/step - accuracy: 0.9315 - loss: 0.1817


In [46]:
# Test-set loss.
print(loss)


0.18053610622882843


In [47]:
# Test-set accuracy.
print(accuracy)


0.9311375021934509


In [48]:
# Building Predictive System


In [49]:
def predictive_system(review):
  """Classify a single review string as "positive" or "negative".

  The text is tokenized with the notebook-level ``tokenizer``, padded to the
  same length as the training sequences (``maxlen=300`` in the cell that
  builds ``X_train``/``X_test``), and scored by the notebook-level ``model``.

  Args:
    review: Raw review text.

  Returns:
    "positive" when the predicted probability is above 0.5, otherwise
    "negative".
  """
  # Must equal the maxlen used to build X_train/X_test so that inference
  # inputs are padded and truncated exactly like the training inputs.
  max_review_length = 300
  sequences = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequences, maxlen=max_review_length)
  prediction = model.predict(padded_sequence)
  # One row for the single review; column 0 holds the sigmoid output.
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [81]:
# Spot check with a clearly favourable review.
predictive_system("The sound quality is amazing! The bass is deep, and the treble is clear. They fit snugly in my ears and are super comfortable even after hours of use. Battery life lasts for days, and the noise cancellation is impressive for the price. Highly recommend")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352ms/step


'positive'

In [83]:
# Spot check with a clearly unfavourable review.
predictive_system("Terrible experience! The earbuds stopped working after just one week. The sound quality was mediocre, and the battery barely lasted 2 hours. Worst of all, customer support was unhelpful and refused to provide a replacement. Do not buy these!")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'negative'

In [85]:
# Spot check with a review that opens with praise but is mostly complaints.
predictive_system("The kettle looks great, but that’s the only good thing about it. It takes too long to boil water, and the handle gets extremely hot, making it hard to use. After just a month, it started leaking water. Disappointed with the quality.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'negative'

In [87]:
# Spot check with a mixed review that is favourable overall.
predictive_system("The smartwatch is excellent for fitness tracking and notifications. The screen is bright and responsive, and the battery life is decent. However, the strap feels a bit cheap, and the app interface could use some improvement. Overall, it’s a great value for the price.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'positive'