<a href="https://colab.research.google.com/github/Eddiemtk1/1AI-Model/blob/main/Amazon_Sentiment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import the required libraries: pandas/numpy for data handling, TensorFlow/Keras to build and train the model, and scikit-learn for the train/test split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
import pickle
import re

In [None]:
#The dataset is read straight from Google Drive rather than being downloaded
#to a local machine first, which speeds up development in Colab.
from google.colab import drive

drive.mount('/content/drive')

#Location of the reviews CSV inside the mounted Drive
reviews_csv_path = '/content/drive/MyDrive/AI/Reviews.csv'
df = pd.read_csv(reviews_csv_path)

Mounted at /content/drive


In [None]:
#Preview the first 5 rows of the raw dataset
df.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
#The dataset has many columns that aren't needed for sentiment analysis, so keep only the relevant ones.
#'Text' is what the reviewer wrote and 'Score' is the star rating they left (1 to 5 stars, where 5 is the most positive).
#.copy() makes df an independent frame, so adding columns to it in later cells
#does not trigger pandas' SettingWithCopyWarning.
df = df[['Text','Score']].copy()

In [None]:
#Confirm that only the Text and Score columns remain
df.head(n=5)

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [None]:
#Convert text to lowercase so it's consistent
#Strip HTML tags, punctuation and numbers
#This was kept simple to keep preprocessing fast
def clean_text(text):
  """Normalise one review into lowercase, letters-and-whitespace-only text.

  Args:
    text: The raw review. Non-string values are converted with str();
      missing values (None or a float NaN) are treated as an empty review.

  Returns:
    The cleaned string. HTML tags are replaced by a space, and every
    character that is not a letter or whitespace is removed.
  """
  # A missing review should contribute no words at all, not the token "none"/"nan"
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = str(text).lower()
  # Swap tags for a space rather than deleting them outright, otherwise
  # "great<br />taste" would be glued together into the fake word "greattaste"
  text = re.sub('<.*?>', ' ', text)
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  return text

#Run clean_text over every review and store the result in a new Cleaned_Text column
df['Cleaned_Text'] = df['Text'].map(clean_text)

In [None]:
#Compare the new Cleaned_Text column with the original Text
df.head(n=5)

Unnamed: 0,Text,Score,Cleaned_Text
0,I have bought several of the Vitality canned d...,5,i have bought several of the vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...,1,product arrived labeled as jumbo salted peanut...
2,This is a confection that has been around a fe...,4,this is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...,2,if you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...,5,great taffy at a great price there was a wide...


In [None]:
#Sentiment classes derived from the star rating:
#1-2 stars = Negative (0), 3 stars = Neutral (1), 4-5 stars = Positive (2)
def get_sentiment(score):
  """Map a star rating to a sentiment class id.

  Returns 0 for a score of 2 or lower, 1 for a score of exactly 3,
  and 2 for any other score.
  """
  if score <= 2:
    return 0
  return 1 if score == 3 else 2


In [None]:
#Label every review with its sentiment class (0/1/2) in a new Sentiment column
df['Sentiment'] = df['Score'].map(get_sentiment)

In [None]:
#Check the new Sentiment column against the Score it was derived from
df.head(n=5)

Unnamed: 0,Text,Score,Cleaned_Text,Sentiment
0,I have bought several of the Vitality canned d...,5,i have bought several of the vitality canned d...,2
1,Product arrived labeled as Jumbo Salted Peanut...,1,product arrived labeled as jumbo salted peanut...,0
2,This is a confection that has been around a fe...,4,this is a confection that has been around a fe...,2
3,If you are looking for the secret ingredient i...,2,if you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,5,great taffy at a great price there was a wide...,2


In [None]:
#Count how many reviews there are per star rating, most frequent rating first
score_counts = df['Score'].value_counts(sort=True, ascending=False)
print(score_counts)

Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64


In [None]:
#Balance the dataset by taking the same number of reviews (at most 5000)
#from each of the negative, neutral and positive classes.
df_neg = df[df['Sentiment'] == 0]
df_neu = df[df['Sentiment'] == 1]
df_pos = df[df['Sentiment'] == 2]

#Never request more rows than the smallest class holds; otherwise cap at 5000 per class
min_sample = min(len(df_neg), len(df_neu), len(df_pos), 5000)

#Randomly pick min_sample reviews from each category.
#random_state=42 is a seed: it makes sure that if the code is run again
#tomorrow, the same random reviews will be picked.
df_neg_balanced = df_neg.sample(n=min_sample, random_state=42)
df_neu_balanced = df_neu.sample(n=min_sample, random_state=42)
df_pos_balanced = df_pos.sample(n=min_sample, random_state=42)

#pd.concat combines the 3 balanced piles into one table.
#.sample(frac=1) shuffles the rows; without it there would be all negative
#entries followed by all neutral, then all positive.
#The shuffle is seeded too: an unseeded shuffle changes the row order on every
#run, which would also change which rows a later positional train/test split selects.
#After shuffling, reset_index(drop=True) renumbers the rows from 0.
df_balanced = pd.concat([df_neg_balanced, df_neu_balanced, df_pos_balanced])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Tokenisation

In [None]:
#Only pay attention to the most common words:
#the vocabulary is capped at MAX_WORDS to keep the model small, as it needs to load onto my website.
#Every review is padded or truncated to exactly MAX_LEN tokens.
MAX_WORDS = 5000
MAX_LEN = 100

#Words outside the kept vocabulary are mapped to the '<OOV>' (out of vocabulary) token
#NOTE(review): the tokenizer is fitted on the whole balanced set, i.e. before the
#train/test split, so the vocabulary also reflects the held-out reviews.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df_balanced['Cleaned_Text'].values)

#Convert each sentence into a list of integer word ids, then force a fixed length
#(by default Keras pads and truncates at the start of the sequence)
X = tokenizer.texts_to_sequences(df_balanced['Cleaned_Text'].values)
X = pad_sequences(X, maxlen=MAX_LEN)

#The model's output layer has 3 neurons (neg, neu, pos), so the 'answer key' Y
#must have 3 numbers for every review (one-hot encoding).
#dtype is set explicitly because get_dummies produces booleans on recent pandas
#versions, whereas the loss function works on floating-point targets.
Y = pd.get_dummies(df_balanced['Sentiment'], dtype='float32').values

In [None]:
#Split the data: 80% for training and 20% for testing.
#stratify keeps the negative/neutral/positive proportions the same in both parts,
#so the balance created earlier is not lost by an unlucky random split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['Sentiment'],
)

In [None]:
#Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
#batch shuffling start from the same state on every run.
#NOTE(review): some GPU kernels may still be non-deterministic; confirm if exact repeatability is required.
tf.keras.utils.set_random_seed(42)

#LSTM was chosen as it can handle sequential data like text
model = Sequential([
    #Learn a 128-dimensional vector for each of the MAX_WORDS word ids
    Embedding(MAX_WORDS, 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    #One output per sentiment class; softmax turns them into probabilities
    Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
#In the recorded run val_loss is lowest at epoch 2 and rises afterwards while the
#training loss keeps falling (overfitting). Stop once val_loss has not improved
#for 2 epochs and roll the model back to its best validation weights, so that
#the model saved afterwards is not the overfit one from the last epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train, Y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, Y_test),
    callbacks=[early_stop],
)

Epoch 1/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 287ms/step - accuracy: 0.4443 - loss: 1.0324 - val_accuracy: 0.6330 - val_loss: 0.8024
Epoch 2/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 291ms/step - accuracy: 0.6865 - loss: 0.7253 - val_accuracy: 0.6470 - val_loss: 0.7785
Epoch 3/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 288ms/step - accuracy: 0.7434 - loss: 0.6194 - val_accuracy: 0.6420 - val_loss: 0.7989
Epoch 4/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 285ms/step - accuracy: 0.7910 - loss: 0.5483 - val_accuracy: 0.6423 - val_loss: 0.8270
Epoch 5/5
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 292ms/step - accuracy: 0.8064 - loss: 0.4893 - val_accuracy: 0.6430 - val_loss: 0.8591


In [None]:
#Pickle the fitted tokenizer so the same word -> id mapping can be reloaded later
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

#Write the trained network to disk as a .keras file
model.save('sentiment_model.keras')