In [1]:
%matplotlib inline 

import sqlite3
import pandas as pd 
import numpy as np 
import nltk 
import string 
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics 
from sklearn.metrics import roc_curve, auc

#natural language processing tool kit 
from nltk.stem.porter import PorterStemmer

#this opens a connection between my database and code
con = sqlite3.connect('../DataFiles/database.sqlite')

#selecting only those rows which have score rating as either 1, 2, 3 or 4.
filtered_data = pd.read_sql_query("SELECT * FROM Reviews WHERE Score != 3", con)

#here i am giving a rating to the socre based on their value 1, 2 as negative 4, 5 as postive and neglecting 3
def partition(x):
    if(x<3):
        return 'negative'
    else:
        return 'positive'

#next we need the value of score field with positive nad negative values based on their values 
#selecting the score column 
#it return series 
actualScore = filtered_data['Score']
#map function return a same type of  of data after applying function to each item of a given iterable. here we are applying it on series that means it return series only
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative



In [2]:
filtered_data.shape
filtered_data.head()

#look at the output of the data 
#here time is stored in the unix timestamp format 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Cleaning: Deduplication 
- it is mandatory to remove duplicate data in order to get unbaised results
- if you are giving garbage to the machine learning as a data then it will also give you the garbage

In [3]:
#example of duplicate data 
#here you know that there is this porblem hence it is easy otherwise in real time we need to find this by appling query
display = pd.read_sql_query("""
SELECT *
FROM Reviews 
WHERE Score !=3 AND UserId = "AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

#copy the product it and you can directly paste it into amazon.com/dp/{id}
#dp stands for detail page and id is also called ASIN - amazon standard identification number 
#so we need to dedup the data 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [4]:
#Sorting data based on ProductId 
sorted_data = filtered_data.sort_values('ProductId', axis=0 , ascending=True , inplace=False , kind='quicksort' , na_position='last')

In [5]:
#Deduplication of entries
#find duplicates in such a way that of UserId, ProfileName, Time , Text are same then it is a duplicate
#keep the forst one remove the rest of theem 
#inplace the return the copy of the data 
#visit this function documentation 
#intitally we were having around 50000 rows now after removeing dplicates we are only left with 36000 rows 
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [6]:
#helpfullness numerator cannot be greater than helpfull denominator 
display = pd.read_sql_query("""
SELECT *
FROM Reviews
Where Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""",con)
display


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [7]:
#so we will remove this rows as they does not make any sense now 
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
print(final.shape)

(364171, 10)


In [8]:
#how many positive and negative review are present in our dataset?
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

# Bag of Words
- we are going to apply this on our final code 

In [15]:
#BoW
#see the documentation of CountVectorizer in scikit learn 
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [16]:
import pprint 
type(final_counts)

scipy.sparse.csr.csr_matrix

In [17]:
final_counts.get_shape()
#for each review we have a vector, and every column refer to unique word

(364171, 115281)

In [19]:
#finding sentences containing HTML tags 
import re
i = 0
for sent in final['Text'].values:
    if(len(re.findall('<.*>' , sent))):
        print(i)
        print(sent)
        break;
    i += 1

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [23]:
import string  
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english'))
#initialising the snowball stemer 
sno = nltk.stem.SnowballStemmer('english')

#will the the <data> with space 
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ' , sentence)
    return cleantext 

def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]' , r'' , sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]' , r'' , sentence)
    return cleaned 

print(stop)
print('######################################')
print(sno.stem('tasty'))

{'doing', 'him', 'once', 'on', 'why', 'ain', 'out', 'that', 'because', "aren't", 'itself', 'ours', 'myself', 'has', 'mightn', 'aren', "you've", 'so', 'nor', 'which', 'who', 'until', "hasn't", 'they', 'but', 'more', "mightn't", 'too', 'are', 'through', 'any', 'his', 'where', 'if', 'haven', 'its', 'some', "don't", 'hasn', "isn't", 'how', 'should', 'o', 'no', 'am', 'those', 'very', 'will', 'ma', 'not', 'couldn', 'here', 'do', 'your', 'being', 'above', 'each', "should've", "you're", 'such', 't', 'me', 'be', 'these', 'won', 'again', 'to', 'yours', 'after', 'other', 'down', 'can', 'hers', "that'll", 'further', 'have', 'doesn', 've', 'hadn', 'then', 'mustn', 'didn', 'wouldn', "shan't", 'been', 'isn', 's', 'under', 'he', 'below', 'in', 'for', "weren't", 'a', 'own', 'same', 'my', "wouldn't", 'needn', 'both', 'when', 'i', 'is', 'whom', 'there', 'shan', 'she', 'having', 'this', 'while', "shouldn't", 'most', 'd', 'our', 'yourselves', "you'd", 'over', "it's", 'yourself', "doesn't", 'before', 'himse

In [32]:
#code 
i = 0
strl = ' '
final_string = []
#store words from positive and negative words 
all_positive_words = []
all_negative_words = []
s = ' '
#for each review we are runnug this loop
for sent in final['Text'].values:
    filtered_sentence = []
    #print(sent)
    sent = cleanhtml(sent)
    #we are spliting the each sentence means we are runnig the loop on each word of the review 
    for w in sent.split():
        # for each word we are removing its punctuation 
        for cleaned_words in cleanpunc(w).split():
            #checking if its alphanumeric and its length is greater than 2
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                #here we are removing stop words we are saying this word should not be in stopword
                if((cleaned_words.lower() not in stop)):
                    s = (sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if(final['Score'].values)[i] == 'positive':
                        all_positive_words.append(s)
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    #print(filtered_sentence)
    strl = b" ".join(filtered_sentence)
    #print('###################################################################################################')
    
    final_string.append(strl)
    i += 1

In [33]:
final['CleanedText'] = final_string #adding a column of cleaned text in data 

In [34]:
final.head(3)

#store final table into an Sqlite table for future 
#because we dont want to rerun the above code its ode to store intermediate data 

conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, schema=None, if_exists='replace')

# Bi-Grams and n-Grams 
- we will analysis our postive and negative review 
- we will start by getting the frequency distribution of the word 

In [37]:
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)

print('Most common Postive words : ', freq_dist_positive.most_common(20))
print('\n############################################################################\n')
print('Most commmon Negative words : ', freq_dist_negative.most_common(20))

#now look at the word 'like' in occurs in positive as well as negative 
#there is chances that in negative word it may be 'not like' so we will use bi-grams 

Most common Postive words :  [(b'like', 138531), (b'tast', 126159), (b'good', 107583), (b'love', 106314), (b'flavor', 106287), (b'use', 103251), (b'great', 98289), (b'one', 94769), (b'product', 86413), (b'tri', 85388), (b'tea', 80626), (b'coffe', 75775), (b'make', 74686), (b'get', 71759), (b'food', 62462), (b'would', 55402), (b'time', 53612), (b'buy', 53479), (b'realli', 52433), (b'eat', 51179)]

############################################################################

Most commmon Negative words :  [(b'tast', 33876), (b'like', 32136), (b'product', 27341), (b'one', 20203), (b'flavor', 18758), (b'would', 17927), (b'tri', 17641), (b'use', 15171), (b'good', 14597), (b'coffe', 14187), (b'get', 13733), (b'buy', 13563), (b'order', 12739), (b'food', 12287), (b'tea', 11259), (b'even', 11034), (b'box', 10518), (b'make', 9806), (b'time', 9580), (b'bag', 9459)]


In [45]:
#how to get Bigrams 
#in countVectorizer there is a parameter called ngram_range=(1,2) means between one and two get all n-grams  
#dimesion in bi-gram is more than dimesion in uni gram 
#removing stopword like 'not' should be avoided before building n-grams 
count_vert = CountVectorizer(ngram_range=(1,10))
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [46]:
final_bigram_counts.get_shape()

(364171, 115281)

In [47]:
print(final.shape)

(364171, 11)


# TF-IDF

In [48]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
#there is no stopping , no stemming we jus give the raw data 
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)
#this is sparse matrix 

In [49]:
final_tf_idf.get_shape()

(364171, 2910192)

In [50]:
#list of all the word in corpus univariate as well as bivariat e
features = tf_idf_vect.get_feature_names()
len(features)

2910192

In [51]:
features[100000:100010]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and']

In [53]:
#suppose for review 3 we need to get the vector 
print(final_tf_idf[3,:].toarray()[0])
#there will be some values which are 1

[0. 0. 0. ... 0. 0. 0.]


In [56]:
def top_tfidf_feats(row, features, top_n=25):
    '''get the top n tfidf values in row and return them with their corresponding ranks'''
    topn_ids = np.argsort(row)[: : -1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature' , 'tfidf']
    return df

#this is for review 1 
top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0], features, 25)
    


In [57]:
top_tfidf

Unnamed: 0,feature,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


# Word2Vec
- we can make our own word2Vector or we can use word2vec trained by someone else like in this case we will use it from google


In [63]:
#Using Google News Word2Vectors
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

#google givrs you all this in a file called GoogleNews-vectors-negative300 , its a big table where for every word you have a vector and each word is represented in 300 dimension 
#its 1.9 gb in size 
#it occumpies more than 9 gb of ram  so run this only if you have more than 12 gb of ram
#this model is pretrained 

#model = KeyedVector.load_word2vec_format('GoogleNews-vectors-negative300.bin')
#model.wv['computer']
#model.wv.similarity('woman' , 'man')
#model.wv.most_similar('woman')
#model.wv.most_similar('tasti')
#model.wv.most_similar('tasty')
#model.wv.similarity('tasty' , 'tast')

In [66]:
#train your own w2v
# here we are creating the list of sentences 

import gensim 
i=0 
list_of_sent=[]
for sent in final['Text'].values:
    filtered_sentence = []
    sent = cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue
    list_of_sent.append(filtered_sentence)

In [67]:
print(final['Text'].values[0])
print('\n##################################################\n')
print(list_of_sent[0])

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college

##################################################

['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', 'college']


In [68]:
#to train w2v there is this simple funciton we can use form gensim
#min_count says if the word does not occur five times dont construct the word2vec for it 
#size = what dimesion vector you want, we 
#workers = use all the cores you specify of your system 
w2v_model = gensim.models.Word2Vec(list_of_sent , min_count=5 , size=50 , workers=4)

In [69]:
words = list(w2v_model.wv.vocab)
print(len(words))

33656


In [70]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.9087656736373901),
 ('yummy', 0.8532534837722778),
 ('satisfying', 0.8439520001411438),
 ('delicious', 0.8220393657684326),
 ('filling', 0.8204648494720459),
 ('flavorful', 0.7869648933410645),
 ('addicting', 0.7753177881240845),
 ('nutritious', 0.7666256427764893),
 ('versatile', 0.7622461318969727),
 ('tasteful', 0.7507215142250061)]

In [72]:
w2v_model.wv.most_similar('like')

[('prefer', 0.6951279044151306),
 ('resemble', 0.6779277324676514),
 ('dislike', 0.6543527841567993),
 ('enjoy', 0.6094087362289429),
 ('gross', 0.6017134785652161),
 ('hate', 0.5953317880630493),
 ('mean', 0.5948614478111267),
 ('alright', 0.5923280119895935),
 ('liked', 0.5815554857254028),
 ('fake', 0.5730670690536499)]

In [74]:
count_vect_feat = count_vect.get_feature_names() #list of words in Bow
#for each word we are getting the corresponding similar word 
#i want to find what is the index of like 
count_vect_feat.index('like')
print(count_vect_feat[64055])

like


# Avg W2V, TFIDF-W2V

In [76]:
#average w2v 
#computing average w2v for each review 
sent_vectors = [] #the average w2v for each sentence/review is stored here
for sent in list_of_sent: #for each review/sentence 
    sent_vec = np.zeros(50) # as the word vector are of zero length 
    cnt_words = 0 # num of words with a valid vector in the sentence
    for word in sent:
        try:
            #here we are using our own w2v  model 
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

  from ipykernel import kernelapp as app


364171
50


In [None]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec=w2v_model.wv[word]
            #obtain the tf_idf of the word
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1
print(len(tfidf_sent_vectors))
print(len(tfidf_sent_vectors[0]))