# Amazon Fine Food Reviews

## Pretrained embedding layer of word2vec

In [1]:
## Importing necessary libraries

import pandas as pd
import numpy as np
# NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from string import punctuation

from bs4 import BeautifulSoup

In [2]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import GlobalMaxPooling1D
from keras.layers import Dense
from keras.layers import LSTM
from keras.callbacks import *
from sklearn.model_selection import train_test_split

In [3]:
df  = pd.read_csv("ReviewsNew.csv")

In [4]:
# deleting reviews with neutral score == 3
df = df[df.Score != 3.0]

In [5]:
stop_words = set(stopwords.words("english"))
# creating instance for lemmatizer
lemmatizer = WordNetLemmatizer()

In [6]:
print(stop_words)

{'both', "haven't", 'having', 'doesn', 'then', 'that', 'is', 'here', 'before', 'an', 'haven', "wouldn't", 'own', 'm', 'ain', "you've", 'were', 'now', "mightn't", "shouldn't", 'wasn', 'my', 'not', 'than', "isn't", 'himself', 'few', 'yourselves', 'are', 'do', 'up', 'shan', 'too', 'don', "you'd", 'couldn', 'we', 'did', 'into', 'i', 'won', 'which', 'ours', 't', 'had', "that'll", 'you', "hasn't", 'his', 'a', "she's", 'yourself', 'hers', "aren't", 'as', 've', 'aren', 'theirs', 'been', 'be', 'but', 'its', 'more', 'our', 'when', 'just', 'off', 'same', "you'll", 'shouldn', 'needn', 'your', 'will', "you're", 'while', 'such', 'there', 'this', 'because', 'all', 'only', 'll', 'the', 'being', 'down', 'once', 'in', 's', 'ourselves', "doesn't", 'does', 'further', "it's", "hadn't", 'about', 'no', 'above', 'between', 'her', 'why', 'through', 'if', 'am', 'yours', 'it', 'after', 'isn', 'myself', 'where', "couldn't", 'itself', 'them', 'during', 'how', 'those', "wasn't", 'whom', 'from', 'doing', 'at', 'thes

In [7]:
stop_words.remove("not")
stop_words.remove("no")

In [8]:
# creating function for preprocessing text
    
#corpus =[]
def preprocess_text(text):
    """
    This function preprocess the text and return cleaned text
    """
    #removing links
    text = re.sub(r"http\S+","", text) 
    
    #removing html tags and keeping only texts
    text = BeautifulSoup(text,'lxml').get_text() 
    
    # removing words containing numeric digits
    text = re.sub(r"\S*\d\S*","", text).strip() 
    
    #removing non-alphabetic characters
    text = re.sub(r"[^a-zA-Z]+"," ", text) 
    
    # converting words with characters appearing more than 2 times to the normal meaningful words
    text = re.sub(r"(.)\1+",r"\1\1",text)
    
    # converting to lower case and creating list of tokenized words
    text = word_tokenize(text.lower())
    
    # removing stop words
    text = [word for word in text if not word in stop_words]
    
    # removing punctuations
    text = [word for word in text if word not in punctuation ]
    
    #lemmatization (obtaining verb form of word)
    text = [lemmatizer.lemmatize(word) for word in text] 
    
    # creating list of words appeared in all text data
    #corpus.append(text) 
    
    text = " ".join(text)
    
    text.strip()
    
    return text

In [9]:
df['cleanedtext']=df['ReviewText'].apply(preprocess_text)

In [10]:
df.to_csv("NewCleanText.csv", index = False)

In [11]:
df2 = pd.read_csv("NewCleanText.csv")
df2.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText,Helpfulness,Review_type,%Helpful,word_count,cleanedtext
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1.0,1.0,5.0,1303862000.0,Good Quality Dog Food,I have bought several of the Vitality canned ...,1.0,1,more than 75%,50,bought several vitality canned dog food produc...
1,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0.0,0.0,5.0,1351210000.0,Healthy Dog Food,This is a very healthy dog food. Good for the...,0.0,1,not voted,26,healthy dog food good digestion also good smal...
2,B0019CW0HE,A2P6ACFZ8FTNVV,Melissa Benjamin,0.0,1.0,1.0,1331165000.0,Bad,I fed this to my Golden Retriever and he hate...,0.0,0,not voted,39,fed golden retriever hated eat gave terrible d...
3,B006F2NYI2,A132DJVI37RB4X,Scottdrum,2.0,5.0,2.0,1332374000.0,"Not hot, not habanero","I have to admit, I was a sucker for the large...",0.4,0,between 25% to 75%,72,admit sucker large quantity oz shopping hot sa...
4,B000P41A28,A82WIMR4RSVLI,Emrose mom,0.0,1.0,4.0,1337472000.0,The best weve tried so far,We have a 7 week old... He had gas and consti...,0.0,1,not voted,199,week old gas constipation problem first week t...


In [12]:
df2['cleanedtext'][0]

'bought several vitality canned dog food product found good quality product look like stew processed meat smell better labrador finicky appreciates product better'

In [13]:
max_features = 50000
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token="<oov>", char_level=False)

In [14]:
X = df2['cleanedtext'].apply(str)
y = df2['Review_type']

In [15]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y, test_size=0.2,
                                                random_state=2, stratify=y)

In [16]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((291074,), (72769,), (291074,), (72769,))

In [17]:
xtrain.head()

192497    tea pleasant taste bulk form economical sleep ...
107659    people taste vary sure not really big cheese s...
46288     keeping type packaged food around couple year ...
107060    well fourth garden far herb use basil always s...
169693    purchase cento tomato puree tomato caseswe use...
Name: cleanedtext, dtype: object

In [18]:
tokenizer.fit_on_texts(xtrain)

In [19]:
word_index = tokenizer.word_index
print("found %s unique tokens" % len(word_index))

found 86635 unique tokens


In [20]:
word_index

{'<oov>': 1,
 'not': 2,
 'like': 3,
 'taste': 4,
 'good': 5,
 'product': 6,
 'one': 7,
 'great': 8,
 'flavor': 9,
 'love': 10,
 'tea': 11,
 'coffee': 12,
 'food': 13,
 'would': 14,
 'get': 15,
 'make': 16,
 'no': 17,
 'time': 18,
 'really': 19,
 'amazon': 20,
 'much': 21,
 'use': 22,
 'also': 23,
 'price': 24,
 'best': 25,
 'little': 26,
 'find': 27,
 'buy': 28,
 'dog': 29,
 'tried': 30,
 'bag': 31,
 'store': 32,
 'well': 33,
 'even': 34,
 'try': 35,
 'better': 36,
 'cup': 37,
 'chocolate': 38,
 'day': 39,
 'year': 40,
 'box': 41,
 'eat': 42,
 'drink': 43,
 'sugar': 44,
 'first': 45,
 'used': 46,
 'brand': 47,
 'found': 48,
 'water': 49,
 'go': 50,
 'sweet': 51,
 'bought': 52,
 'made': 53,
 'treat': 54,
 'way': 55,
 'free': 56,
 'mix': 57,
 'order': 58,
 'delicious': 59,
 'give': 60,
 'thing': 61,
 'think': 62,
 'since': 63,
 'two': 64,
 'favorite': 65,
 'could': 66,
 'bit': 67,
 'say': 68,
 'add': 69,
 'cat': 70,
 'know': 71,
 'recommend': 72,
 'lot': 73,
 'many': 74,
 'nice': 75,
 'g

In [21]:
maxlen = 200

In [22]:
train_seq = tokenizer.texts_to_sequences(xtrain)
train_pad = pad_sequences(train_seq, maxlen=maxlen)

test_seq = tokenizer.texts_to_sequences(xtest)
test_pad = pad_sequences(test_seq, maxlen=maxlen)

In [23]:
train_pad.shape, test_pad.shape

((291074, 200), (72769, 200))

## Pretrained word2vec

In [24]:
import gensim
from gensim.models import Word2Vec

In [25]:
embedding_wv = gensim.models.KeyedVectors.load_word2vec_format("C:\\AdityaDATA\\pretrained_models\\GoogleNews-vectors-negative300.bin", binary=True)

In [26]:
# Creating embedding mattrix

vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

86636


In [27]:
embedding_matrix = np.zeros((vocab_size,300))

In [28]:
for word, i in tokenizer.word_index.items():
    try:
        em_vec = embedding_wv[word]
        if em_vec is not None:
            embedding_matrix[i] = em_vec
    except:
        pass

In [29]:
from keras.callbacks import *

In [30]:
# Define embedding layer

embed_layer = Embedding(vocab_size,300, weights=[embedding_matrix],
                        input_length= maxlen, trainable=False)

In [31]:
model = Sequential()

model.add(embed_layer)

model.add(LSTM(128, return_sequences=True, dropout=0.2))

model.add(GlobalMaxPooling1D())

#model.add(LSTM(128, return_sequences=False, dropout=0.2))

model.add(Dense(64,activation='relu'))

model.add(Dense(1,activation='sigmoid'))

In [32]:
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss',mode='min', patience=3, verbose=1)
mc = ModelCheckpoint("best_model.h5", monitor='val_accuracy', mode='max',save_best_only=True, verbose=1)

In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 300)          25990800  
                                                                 
 lstm (LSTM)                 (None, 200, 128)          219648    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 26,218,769
Trainable params: 227,969
Non-trainable params: 25,990,800
______________________________________

In [34]:
history = model.fit(train_pad, ytrain, batch_size=256,epochs=10,
                   validation_data=(test_pad, ytest), callbacks=[es,mc])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.92274, saving model to best_model.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.92274 to 0.93472, saving model to best_model.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.93472 to 0.93800, saving model to best_model.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.93800 to 0.93978, saving model to best_model.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.93978
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.93978
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.93978
Epoch 7: early stopping


In [35]:
embedding_matrix.shape

(86636, 300)

In [36]:
import pickle

In [37]:
pickle.dump(tokenizer, open("tokenizer.pkl","wb"))

In [44]:
pickle.dump(stop_words, open("stopwords.pkl","wb"))

In [38]:
from keras.models import load_model

In [39]:
loaded_m = load_model("best_model.h5")

In [50]:
text = """Makes the cat hyper active it grunts some noise and attacks everything in it's way !!!!!
It has edta and some other harmful things smells like vomit kitten likes not gonna order again"""

In [51]:
text = [preprocess_text(text)]
text_seq = tokenizer.texts_to_sequences(text)
text_pad =  pad_sequences(text_seq, maxlen=maxlen)
print(text_pad)
print(text_pad.shape)

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [52]:
print(text)

['make cat hyper active grunt noise attack everything way edta harmful thing smell like vomit kitten like not gon na order']


In [53]:
result = loaded_m.predict(text_pad)

print(result.shape)
print(result)

(1, 1)
[[0.15080856]]


In [54]:
result[0][0]

0.15080856