In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import sklearn

In [2]:
# Load the training split. The file is read without a header row, so the three
# column names are supplied directly to the reader.
df = pd.read_csv(
    'amazon_review_polarity_csv/train.csv',
    header=None,
    names=['sentiment', 'title', 'text'],
)
df.head()

Unnamed: 0,sentiment,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [3]:
# Drop the review title; only the label and the review body are used below.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
df = df.drop(columns='title', errors='ignore')
df

Unnamed: 0,sentiment,text
0,2,This sound track was beautiful! It paints the ...
1,2,I'm reading a lot of reviews saying that this ...
2,2,This soundtrack is my favorite music of all ti...
3,2,I truly like this soundtrack and I enjoy video...
4,2,"If you've played the game, you know how divine..."
...,...,...
3599995,1,The high chair looks great when it first comes...
3599996,1,I have used this highchair for 2 kids now and ...
3599997,1,"We have a small house, and really wanted two o..."
3599998,1,not sure what this book is supposed to be. It ...


In [4]:
# Recode the raw labels {1, 2} to {0, 1} so they suit a sigmoid output with
# binary cross-entropy.
#
# The guard makes the cell safe to re-run: without it, a second execution
# would map the already-recoded labels through the same dict, turning 1 -> 0
# and 0 -> NaN and silently corrupting the target column.
# NOTE(review): the guard relies on the raw data containing at least one
# label 2, which holds for the frame shown in this notebook.
LABEL_MAP = {1: 0, 2: 1}
if df['sentiment'].eq(2).any():
    df['sentiment'] = df['sentiment'].map(LABEL_MAP)

# Fail loudly if anything other than 0/1 (including NaN) ended up in the target.
assert df['sentiment'].isin([0, 1]).all(), "unexpected values in 'sentiment'"
df

Unnamed: 0,sentiment,text
0,1,This sound track was beautiful! It paints the ...
1,1,I'm reading a lot of reviews saying that this ...
2,1,This soundtrack is my favorite music of all ti...
3,1,I truly like this soundtrack and I enjoy video...
4,1,"If you've played the game, you know how divine..."
...,...,...
3599995,0,The high chair looks great when it first comes...
3599996,0,I have used this highchair for 2 kids now and ...
3599997,0,"We have a small house, and really wanted two o..."
3599998,0,not sure what this book is supposed to be. It ...


In [5]:
# All rows whose recoded label is 0.
is_label_0 = df['sentiment'].eq(0)
df_1 = df.loc[is_label_0]
df_1.shape

(1800000, 2)

In [6]:
# All rows whose recoded label is 1.
is_label_1 = df['sentiment'].eq(1)
df_2 = df.loc[is_label_1]
df_2.shape

(1800000, 2)

In [7]:
# Draw 5000 label-0 reviews. A fixed `random_state` makes the subset (and
# therefore every metric reported further down) reproducible across
# Restart & Run All; without it each run trains on different rows.
df_1_downsampled = df_1.sample(n=5000, random_state=42)
df_1_downsampled.shape

(5000, 2)

In [8]:
# Draw 5000 label-1 reviews. A fixed `random_state` makes the subset (and
# therefore every metric reported further down) reproducible across
# Restart & Run All; without it each run trains on different rows.
df_2_downsampled = df_2.sample(n=5000, random_state=42)
df_2_downsampled.shape

(5000, 2)

In [9]:
# Stack the two equally sized class samples into one balanced frame
# (label-0 rows first, then label-1 rows; original row indices are kept).
df_balanced = pd.concat((df_1_downsampled, df_2_downsampled), axis=0)
df_balanced.shape

(10000, 2)

In [10]:
# Confirm the class balance, then show the combined frame.
class_counts = df_balanced['sentiment'].value_counts()
print(class_counts)
df_balanced

sentiment
0    5000
1    5000
Name: count, dtype: int64


Unnamed: 0,sentiment,text
1866664,0,I left a review September of 2007. I tried the...
3424911,0,"It may not happen to others, but I get a total..."
3208521,0,"You get what you pay for, I got two and one wa..."
3066358,0,You cannot set your height in the English sett...
368423,0,"I waited for the sequel to ""Silence of the Lam..."
...,...,...
1654776,1,This book is not just a knitting book--it is o...
3100129,1,Generally a fascinating look into the life of ...
2351599,1,I was fascinated by this character and the sto...
2825677,1,i realy liked the kunuffle bunny i liked the p...


In [11]:
from tqdm import tqdm  
import spacy
# Only `is_stop`, `is_punct` and `lemma_` are read from the tokens below, so
# the named-entity recognizer is pure overhead on every review; disabling it
# speeds up the preprocessing pass without touching the lemmas.
# NOTE(review): the dependency parser is also unused, but it may influence the
# POS mapping the lemmatizer relies on - confirm before disabling it too.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

def preprocess(cleaned_review):
    """Lemmatize a review and strip stop words and punctuation.

    The text is run through the module-level spaCy pipeline `nlp`. The lemma
    of every token that is neither a stop word nor punctuation is kept, and
    the kept lemmas are returned joined by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(cleaned_review)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)
# Register tqdm's `progress_apply` on pandas objects, then lemmatize every
# review into a new `preprocessed_txt` column while showing a progress bar.
# NOTE(review): the BERT preprocessing layer used later tokenizes text itself;
# check whether stop-word removal / lemmatization here helps or hurts.
tqdm.pandas()
df_balanced = df_balanced.assign(
    preprocessed_txt=df_balanced['text'].progress_apply(preprocess)
)


100%|██████████| 10000/10000 [02:13<00:00, 75.12it/s]


In [12]:
# Inspect the balanced frame, now with `preprocessed_txt` next to the raw `text`.
df_balanced

Unnamed: 0,sentiment,text,preprocessed_txt
1866664,0,I left a review September of 2007. I tried the...,leave review September 2007 try new band come ...
3424911,0,"It may not happen to others, but I get a total...",happen total screen freezeabout 10 minute high...
3208521,0,"You get what you pay for, I got two and one wa...",pay get crack get cause barrel jam get piece f...
3066358,0,You cannot set your height in the English sett...,set height English setting set height scale de...
368423,0,"I waited for the sequel to ""Silence of the Lam...",wait sequel silence lamb 10 year suppose live ...
...,...,...,...
1654776,1,This book is not just a knitting book--it is o...,book knitting book curl read myth adaptation f...
3100129,1,Generally a fascinating look into the life of ...,generally fascinating look life Lincoln contem...
2351599,1,I was fascinated by this character and the sto...,fascinate character story begin end find backd...
2825677,1,i realy liked the kunuffle bunny i liked the p...,realy like kunuffle bunny like picher dettall ...


In [13]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the preprocessed text.
# `random_state` pins the shuffle so the train/test membership, and the
# evaluation numbers derived from it, are reproducible between runs.
# `test_size=0.25` spells out the value that was previously implied.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    df_balanced['preprocessed_txt'],
    df_balanced['sentiment'],
    test_size=0.25,
    stratify=df_balanced['sentiment'],
    random_state=42,
)

In [14]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [15]:
# TF Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Both layers are created with only the handle; no `trainable` argument is
# passed, so the library default applies to each.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [16]:
def get_sentence_embeding(sentences):
    """Return the BERT 'pooled_output' embedding for `sentences`.

    `sentences` is handed unchanged to the module-level `bert_preprocess`
    layer; its result is fed to `bert_encoder`, and the encoder's
    'pooled_output' entry is returned.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

In [17]:
# Symbolic input: one scalar string per example (empty per-example shape).
text_input = tf.keras.layers.Input(
    shape=(),
    dtype=tf.string,
    name='text',
)
# Run the raw strings through the BERT preprocessing layer, then the encoder.
preprocessed_txt = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_txt)

In [18]:
# Classification head on top of the encoder's pooled output:
# dropout (rate 0.1) followed by a single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name='dropout')
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')
l = output_layer(dropout_layer(outputs['pooled_output']))

In [19]:
# Assemble the end-to-end model: raw review string in, sigmoid output of the
# head out. Then print its layer summary.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [20]:
# Metrics tracked during fit/evaluate, in this order:
# binary accuracy, precision, recall.
METRICS = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (tf.keras.metrics.BinaryAccuracy, "accuracy"),
        (tf.keras.metrics.Precision, 'Precision'),
        (tf.keras.metrics.Recall, 'Recall'),
    )
]

In [21]:
# Configure training: binary cross-entropy loss to match the single sigmoid
# output, the 'adam' optimizer, and the metrics listed in METRICS.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [22]:
# Train for 30 epochs on the training split. The returned History object is
# kept in a variable and shown as the cell result.
history = model.fit(x=X_Train, y=Y_Train, epochs=30)
history

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1e3aa5b7520>

In [23]:
# Score the held-out split; the result lists the loss followed by the
# compiled metrics.
model.evaluate(x=X_Test, y=Y_Test)



[0.5364626049995422,
 0.7372000217437744,
 0.7092449069023132,
 0.8040000200271606]

In [24]:
# Predict on the held-out reviews and flatten the result to a 1-D array.
Y_Predicted = model.predict(X_Test).flatten()



In [25]:
# Threshold the predicted scores: strictly above 0.5 becomes 1, otherwise 0.
Y_Predicted = (Y_Predicted > 0.5).astype(int)
Y_Predicted

array([1, 1, 1, ..., 1, 0, 1])

In [26]:
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix of true labels against thresholded predictions.
cm = confusion_matrix(y_true=Y_Test, y_pred=Y_Predicted)
cm

array([[ 838,  412],
       [ 245, 1005]], dtype=int64)

In [27]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_true=Y_Test, y_pred=Y_Predicted)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.67      0.72      1250
           1       0.71      0.80      0.75      1250

    accuracy                           0.74      2500
   macro avg       0.74      0.74      0.74      2500
weighted avg       0.74      0.74      0.74      2500

