In [1]:
%matplotlib inline 

import sqlite3
import pandas as pd 
import numpy as np 
import nltk 
import string 
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics 
from sklearn.metrics import roc_curve, auc

#natural language processing tool kit 
from nltk.stem.porter import PorterStemmer

#this opens a connection between my database and code
con = sqlite3.connect('../DataFiles/database.sqlite')

#selecting only those rows which have score rating as either 1, 2, 3 or 4.
filtered_data = pd.read_sql_query("SELECT * FROM Reviews WHERE Score != 3", con)

#here i am giving a rating to the socre based on their value 1, 2 as negative 4, 5 as postive and neglecting 3
def partition(x):
    if(x<3):
        return 'negative'
    else:
        return 'positive'

#next we need the value of score field with positive nad negative values based on their values 
#selecting the score column 
#it return series 
actualScore = filtered_data['Score']
#map function return a same type of  of data after applying function to each item of a given iterable. here we are applying it on series that means it return series only
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative



In [2]:
filtered_data.shape
filtered_data.head()

#look at the output of the data 
#here time is stored in the unix timestamp format 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Data Cleaning: Deduplication 
- it is mandatory to remove duplicate data in order to get unbaised results
- if you are giving garbage to the machine learning as a data then it will also give you the garbage

In [3]:
#example of duplicate data 
#here you know that there is this porblem hence it is easy otherwise in real time we need to find this by appling query
display = pd.read_sql_query("""
SELECT *
FROM Reviews 
WHERE Score !=3 AND UserId = "AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

#copy the product it and you can directly paste it into amazon.com/dp/{id}
#dp stands for detail page and id is also called ASIN - amazon standard identification number 
#so we need to dedup the data 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [4]:
#Sorting data based on ProductId 
sorted_data = filtered_data.sort_values('ProductId', axis=0 , ascending=True , inplace=False , kind='quicksort' , na_position='last')

In [5]:
#Deduplication of entries
#find duplicates in such a way that of UserId, ProfileName, Time , Text are same then it is a duplicate
#keep the forst one remove the rest of theem 
#inplace the return the copy of the data 
#visit this function documentation 
#intitally we were having around 50000 rows now after removeing dplicates we are only left with 36000 rows 
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [6]:
#helpfullness numerator cannot be greater than helpfull denominator 
display = pd.read_sql_query("""
SELECT *
FROM Reviews
Where Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""",con)
display


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [7]:
#so we will remove this rows as they does not make any sense now 
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
print(final.shape)

(364171, 10)


In [8]:
#how many positive and negative review are present in our dataset?
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

# Bag of Words
- we are going to apply this on our final code 

In [15]:
#BoW
#see the documentation of CountVectorizer in scikit learn 
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [16]:
import pprint 
type(final_counts)

scipy.sparse.csr.csr_matrix

In [17]:
final_counts.get_shape()
#for each review we have a vector, and every column refer to unique word

(364171, 115281)

In [19]:
#finding sentences containing HTML tags 
import re
i = 0
for sent in final['Text'].values:
    if(len(re.findall('<.*>' , sent))):
        print(i)
        print(sent)
        break;
    i += 1

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [20]:
import string 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english'))
#initialising the snowball stemer 
sno = nltk.stem.SnowballStemmer('english')

#will the the <data> with space 
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ' , sentence)
    return cleantext 

def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]' , r'' , sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]' , r'' , sentence)
    return cleaned 

print(stop)
print('######################################')
print(sno.stem('tasty'))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/aman_garg/nltk_data'
    - '/home/aman_garg/anaconda3/nltk_data'
    - '/home/aman_garg/anaconda3/share/nltk_data'
    - '/home/aman_garg/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
