In [1]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix, r2_score, classification_report
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

# using the SQLite Table to read data.
con=sqlite3.connect(r"C:\Users\garga\3D Objects\ML PROJECTS\amazon fine food reviews NLP\archive (1)\database.sqlite")

#filtering only positive and negative reviews i.e.,
# not taking into consideration those reviews with score=3
filtered_data=pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score !=3
""" , con)

reviews_data=pd.read_csv(r"C:\Users\garga\3D Objects\ML PROJECTS\amazon fine food reviews NLP\archive (1)\reviews.csv")

In [2]:
def partition(score):
    if score<3:
        return 'negative'
    else:
        return 'positive'
    

filtered_data['positivenegative']=filtered_data['Score'].map(partition)
filtered_data['Score']=filtered_data['positivenegative']
filtered_data.drop('positivenegative',axis=1,inplace=True)

In [3]:
filtered_data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,positive,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,negative,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,positive,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,positive,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


## Data Cleaning_Deduplication
###### It is observed that data has duplicate entry. It can be seen by following example.

In [4]:
display=pd.read_sql_query("""
SELECT * 
FROM reviews
WHERE score!=3 AND UserId= "AR5J8UI46CURR"
ORDER BY ProductId
""",con)

In [5]:
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [6]:
#sorting data according to product id
sorted_data=filtered_data.sort_values('ProductId',axis=0,ascending=True)

In [7]:
#deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Summary","Text"},keep='first',inplace=False)
final.shape

(365333, 10)

In [8]:
(final['Id'].size*100)/(filtered_data['Id'].size)

69.4795117665182

##### It was seen that data contains two rows whose value of HelplessnessNumerator is greater than the value of HelplessnessDenominator which is not practically possible. so we have to remove those cases.

In [9]:
display=pd.read_sql_query("""
SELECT *
FROM reviews
WHERE HelpfulnessNumerator>HelpfulnessDenominator
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...
1,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...


In [10]:
final=final[final['HelpfulnessNumerator']<=final['HelpfulnessDenominator']]
final.shape

(365331, 10)

In [11]:
# Before starting next phase of preprocessing let us see if our data is balanced or not
final['Score'].value_counts()

positive    307967
negative     57364
Name: Score, dtype: int64

###### data is not balanced. We have much more positive review that negative review

## Text preprocessing : Lemmitization, Stemming, Stop-word removal

 Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.
Hence in the preprocessing phase we do the following in the order below:
1. Begin by removing the html tags
2. Removing any punctuation of limited set of special character like , or . or # etc
3. Check if the word is made only of english letters
4. Check to see if length of word is greaer than 2
5. Convert all words into lowercase
6. Remove stopwords
7. Finally snowball stemming to words(trimming of words)

After which we collect the words used to describe positive and negative review

In [15]:
# find sentences containing html tags
import re
i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*>',sent))):
        print(i)
        print(sent)
        break;
    i+=1;

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [16]:
import re
# Tutorial about python regular expression: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet  import WordNetLemmatizer
import nltk
nltk.download('stopwords')

stop=set(stopwords.words('english')) #set of stopwords
sno=nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the words of any html tags
    cleanr=re.compile('<.*?>')
    cleantext=re.sub(cleanr,' ',sentence)
    return cleantext
def cleanpunc(sentence): #function tp clean the word of any punctuation
    cleaned=re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned=re.sub(r'[.|,|)|(|\|/]',r'',cleaned)
    return cleaned
print(stop)
print('***********************************')
print(sno.stem('tasteful'))

{"that'll", 'how', 'have', 'ours', 'be', "doesn't", 'all', 'same', 'needn', 'ma', 'below', "wouldn't", 'didn', 'most', 'out', "you'd", 'is', 'as', 'now', 'couldn', 'hasn', 'wasn', 've', "should've", 'ain', 'they', 'not', 'do', 'i', 'for', 'him', 'no', 'more', 'my', "haven't", 'very', 'and', 'd', "it's", 'any', 'too', 'those', 'we', 'o', 'with', 'doing', "won't", 'me', 'then', 'm', "aren't", "isn't", 'can', 'herself', 'hers', "didn't", 'it', 'mightn', 'while', 'don', 't', 'll', 'won', "hasn't", 'there', 'am', 'through', 'she', 'are', 'which', 'or', 'again', 'will', "you'll", 'yours', 'haven', 'on', 'yourself', 'when', 'hadn', 'until', 'above', 'here', 'own', "shouldn't", 'wouldn', "you're", 'if', 'your', "wasn't", 'its', 'ourselves', 'than', 'himself', 'theirs', 'was', "couldn't", 'doesn', 'isn', 'you', 'from', 'yourselves', 'so', "shan't", 'that', 'what', 'shouldn', 'in', 'but', 'why', 'has', 'an', 'by', 'some', 'about', 'whom', 'been', 'the', 'once', 's', 'only', 'after', 'her', 'down

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Code for implementing step-by-step the checks mentioned in the preprocessing
# this code takes time to run
i=0
strl=' '
final_string=[]
all_positive_words=[]
all_negative_words=[]
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    #print(sent)
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_word in cleanpunc(w).split():
            if((cleaned_word.isalpha()) & (len(cleaned_word)>2)):
                if(cleaned_word.lower() not in stop):
                    s=(sno.stem(cleaned_word.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i]=='positive':
                        all_positive_words.append(s)
                    if (final['Score'].values)[i]=='negative':
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    #print(filtered_sentence)
    strl=b" ".join(filtered_sentence)
    #print("*********************************************************")
    
    final_string.append(strl)
    i+=1

In [18]:
final['CleanedText']=final_string #adding a column of cleaned text to final dataframe

In [49]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...


In [56]:
print(final['Text'].values[1])
print('*****************************************************************************************************************')
print(final_string[1])

I grew up reading these Sendak books, and watching the Really Rosie movie that incorporates them, and love them. My son loves them too. I do however, miss the hard cover version. The paperbacks seem kind of flimsy and it takes two hands to keep the pages open.
*****************************************************************************************************************
b'grew read sendak book watch realli rosi movi incorpor love son love howev miss hard cover version paperback seem kind flimsi take two hand keep page open'


In [20]:
# storing this final table into SQLitr table for future
conn=sqlite3.connect('final.sqlite')
c=conn.cursor()
conn.text_factory=str
final.to_sql('Reviews',conn,schema=None,if_exists='replace')

#### Bag of Words(BoW)

In [58]:
count_vec=CountVectorizer() # in-scikit learn
final_count=count_vec.fit_transform(final['CleanedText'])

print(final_count.shape)

print(final_count[1])

(365331, 120723)
  (0, 11819)	1
  (0, 97004)	1
  (0, 61493)	2
  (0, 46290)	1
  (0, 85427)	1
  (0, 92416)	1
  (0, 115261)	1
  (0, 85505)	1
  (0, 88974)	1
  (0, 68288)	1
  (0, 52653)	1
  (0, 51107)	1
  (0, 66491)	1
  (0, 48046)	1
  (0, 24833)	1
  (0, 113698)	1
  (0, 76556)	1
  (0, 92158)	1
  (0, 57239)	1
  (0, 39898)	1
  (0, 103311)	1
  (0, 110449)	1
  (0, 47773)	1
  (0, 56696)	1
  (0, 76121)	1
  (0, 73968)	1


### [7.2.6] Word2Vec

In [21]:
import subprocess
subprocess.check_call(["python", '-m', 'pip', 'install', 'gensim'])

0

In [22]:
# We can use Google news data corpus training output for our data if we dont have large data corpus
# or we can train word2vec by our own data

# Using Google News Word2Vectors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

#in this project we are using pretrained model by google
#its 3.3GB file, once you load this into your memory
# it occupies around 9 GB in RAM
# we will provide a pickle file which contains dict,
# and it contains all our courpus words as keys and model[word] as values
# to use this code snippet, download "GoogleNews-vectors-negative300.bin"
# from https://drive.google.com/file/d/OB7XkCwpI5KDYN1NUTT1SS21pQmM/edit

model=KeyedVectors.load_word2vec_format('C:\\Users\\garga\\Downloads\\archive (1)\\GoogleNews-vectors-negative300.bin.gz', encoding='utf-8', unicode_errors='ignore',binary=True)

In [23]:
model.most_similar(positive=['woman'],topn=10)


[('man', 0.7664012908935547),
 ('girl', 0.7494640946388245),
 ('teenage_girl', 0.7336829900741577),
 ('teenager', 0.631708562374115),
 ('lady', 0.6288785934448242),
 ('teenaged_girl', 0.614178478717804),
 ('mother', 0.6076306104660034),
 ('policewoman', 0.6069462299346924),
 ('boy', 0.5975908637046814),
 ('Woman', 0.5770983099937439)]

In [24]:
model.most_similar(positive=['tasty'],topn=20)

[('delicious', 0.8730388283729553),
 ('scrumptious', 0.8007041215896606),
 ('yummy', 0.7856924533843994),
 ('flavorful', 0.7420164346694946),
 ('delectable', 0.7385422587394714),
 ('juicy_flavorful', 0.7114803791046143),
 ('appetizing', 0.7017217874526978),
 ('crunchy_salty', 0.7012301087379456),
 ('flavourful', 0.6912213563919067),
 ('flavoursome', 0.6857702732086182),
 ('delish', 0.6799105405807495),
 ('delicous', 0.6778561472892761),
 ('savory', 0.6747907400131226),
 ('delectable_dessert', 0.6705280542373657),
 ('generously_portioned', 0.6703447103500366),
 ('braised_oxtail', 0.6700161695480347),
 ('palate_pleasing', 0.6693287491798401),
 ('butternut_soup', 0.6692979335784912),
 ('Sweet_potato_fries', 0.6692339181900024),
 ('crispy_fries', 0.6679065227508545)]

In [25]:
model.most_similar(positive=['like'],topn=20)

[('really', 0.5752447843551636),
 ('weird', 0.5676319599151611),
 ('crazy', 0.5382446050643921),
 ('kind', 0.5310239791870117),
 ('maybe', 0.5220046043395996),
 ('loooove', 0.5187614560127258),
 ('anymore', 0.5177682638168335),
 ('Kinda_reminds', 0.5151873230934143),
 ('definitely', 0.5117843747138977),
 ('kinda_fishy', 0.5090124607086182),
 ('sort', 0.5081151127815247),
 ("don'ta", 0.5079847574234009),
 ('nicer', 0.5077103972434998),
 ('Iike', 0.5074232816696167),
 ('kinda', 0.5046947002410889),
 ('alright', 0.5043249726295471),
 ('differently', 0.5027458071708679),
 ('think', 0.5020042657852173),
 ('loooooove', 0.5017264485359192),
 ('okay', 0.500439465045929)]

In [26]:
model.similarity('tasty','taste')

0.45559818

In [74]:
# Training own Word2Vec model using own text corpus
import subprocess
subprocess.check_call(["python", '-m', 'pip', 'install', 'gensim'])

i=0
list_of_sentence=[]
for sentence in final['Text'].values:
    filtered_sentence=[]
    sentence=cleanhtml(sentence)
    for w in sentence.split():
        for cleaned_word in cleanpunc(w).split():
            if(cleaned_word.isalpha()):
                filtered_sentence.append(cleaned_word.lower())
            else:
                continue
    list_of_sentence.append(filtered_sentence)
                

In [75]:
print(final['Text'].values[0])
print("********************************************************")
print(list_of_sentence[0])

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
********************************************************
['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', '

In [76]:
import gensim
w2v_model=gensim.models.Word2Vec(final['CleanedText'],min_count=5,workers=4)

In [77]:
print(len(w2v_model.wv))

32


In [78]:
w2v_model.wv.most_similar('tasty')

KeyError: "Key 'tasty' not present in vocabulary"

In [32]:
w2v_model.wv.most_similar('like')

[('dislike', 0.5928206443786621),
 ('resemble', 0.5923832058906555),
 ('prefer', 0.5716824531555176),
 ('love', 0.5470753908157349),
 ('enjoy', 0.5408315062522888),
 ('weird', 0.5344480872154236),
 ('mean', 0.5211399793624878),
 ('think', 0.5163823366165161),
 ('overpower', 0.5097289681434631),
 ('overwhelm', 0.4976568818092346)]

here we can see there is difference between the  most similar words when trained from google news or when trained by our own data corpus

In [69]:
print(w2v_model.wv.key_to_index)



In [61]:
w2v_model.wv[final['CleanedText']]

KeyError: "Key 'b'witti littl book make son laugh loud recit car drive along alway sing refrain hes learn whale india droop love new word book introduc silli classic book will bet son still abl recit memori colleg'' not present"

In [40]:
vectorizer=CountVectorizer
count_vect_feat=vectorizer.get_feature_names() #list of words in the BoW
count_vect_feat.index('like')

TypeError: get_feature_names() missing 1 required positional argument: 'self'