In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
nltk.download('stopwords')

import seaborn as sns

import string
import sqlite3

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.metrics import confusion_matrix, auc, roc_curve
from nltk.stem.porter import PorterStemmer
import re

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ashish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
conn = sqlite3.connect("datasets/food_reviews.sqlite")

In [4]:
filtered_data = pd.read_sql_query("""
    select * from Reviews
    WHERE Score != 3
""",conn)

In [5]:
actualScores = filtered_data['Score']
posNeg = actualScores.map(lambda i:"Positive" if i>3 else "Negative")
filtered_data["Score"] = posNeg

In [6]:
print(filtered_data.shape)
filtered_data.head(3)

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,Positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,Negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,Positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


## Data Cleaning : DeDuplication
We notice that for same user : Time, Summary, Text are almost same
Reason : Different variations of same product

In [8]:
display = pd.read_sql_query("""
select * from Reviews 
where Score !=3 and UserId = "AR5J8UI46CURR"
ORDER by ProductID
""",conn)

display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [10]:
#Sort data acc o ProductID
sorted_data = filtered_data.sort_values('ProductId', axis=0,ascending = True)

In [12]:
#deduplication
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep='first',inplace=False)
final.shape

(364173, 10)

Previous Data  : 525814 <br>
After Deduplication : 364173

### Another Problem : HelpNum and HelpDenom 
num : ppl who said yes<br>
denom : all ppl <br>
In some cases num > denom

In [18]:
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]
final['Score'].value_counts()

Positive    307061
Negative     57110
Name: Score, dtype: int64

## Bag of Words (BoW)

In [21]:
count_vec = CountVectorizer()
# CountVectorizer(binary=True) if you need Binary BoW
final_counts = count_vec.fit_transform(final['Text'].values)

In [20]:
print(type(final_counts))
print(final_counts.get_shape())

<class 'scipy.sparse.csr.csr_matrix'>
(364171, 115281)


+ final_counts is sparse matrix ie each vector is sparse vector
+ Shape : (364171, 115281)
    + No of records : 364171
    + No of Dimensions ( no of unique words in dict) : 115281

## TEXT PROCESSING : Stemming, Stop Word Removal, Lemmetization

+ Remove HTML tags
+ Remove Special Chars and Puncuations
+ Check if word has only letters
+ Check if len(word) > 2
+ Convert to lowercase
+ Remove Stopwords
+ Stemming - Snowball | Porter

In [26]:
for text in final['Text'].values:
    if (len(re.findall('<.*?>',text))):
        print(text)
        break

I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [34]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
sno = nltk.stem.SnowballStemmer('english')

In [31]:
# set of all english stopwords
stop = set(stopwords.words('english'))

#remove HTML tags
def removeHTML(sentence):
    htmlRe = re.compile('<.*?>')
    cleanedTxt = re.sub(htmlRe, ' ',sentence)
    return cleanedTxt

# remove punctuation from word
def removePunctuations(sentence):
    cleanedTxt = re.sub(r'[?|!|\'|"|#]', r' ',sentence)
    cleanedTxt = re.sub(r'[.|,|)|(|\|/]', r' ',sentence)
    return cleanedTxt

In [38]:
print(sno.stem("Tasteful"))
print(sno.stem("Tasty"))

tast
tasti


### Pre Processing Steps

In [44]:
i = 0
str1 = ' '
final_strings = []
all_pos_words = []
all_neg_words = []
stem_word =''

for sentence in final['Text'].values:
    filtered_sentence = []
    sentence = removeHTML(sentence)
    for word in sentence.split():
        #remove punctuations
        for cleaned_word in removePunctuations(word).split():
            #check len and only numbers
            if cleaned_word.isalpha() and len(cleaned_word)>2:
                #remove stop word
                if cleaned_word.lower() not in stop:
                    stem_word = sno.stem(cleaned_word.lower()).encode('utf-8')
                    
                    filtered_sentence.append(stem_word);
                    
                    #positive and negative class
                    if (final['Score'].values)[i] == 'positive':
                        all_pos_words.append(stem_word)
                    if (final['Score'].values)[i] == 'negative':
                        all_neg_words.append(stem_word)
                #word is stop word
                else:
                    continue
            #word in numeric and len < 2
            else:
                continue 
        #loop ends for one sentence
    str1 = b" ".join(filtered_sentence)
    final_strings.append(str1)
    i+=1
                    

### Add cleaned sentences to dataset and Save to new Dataset

In [47]:
final['CleanedText'] = final_strings

#store as new Dataset
conn = sqlite3.connect("datasets/cleaned_food_reviews.sqlite")
c = conn.cursor()
conn.text_factory = str
final.to_sql('Reviews',conn, if_exists='replace')

### Uni-Grams

In [49]:
# Get frequency distribution of words
freq_pos = nltk.FreqDist(all_pos_words)
freq_neg = nltk.FreqDist(all_neg_words)

print("20 Most common +ve words: ",freq_pos.most_common(20))
print()
print("20 Most common -ve words: ",freq_neg.most_common(20))

[]

**Observation** : 'Like' appears frequently in both +ve and -ve
Logic : 
+ in +ve : like
+ in -ve :  not like

**Solution** : use bi-grams

### Bi grams

In [52]:
count_vec = CountVectorizer( ngram_range=(1,2) )
# n_gram = (1,2) get uni and bi-grams
final_bigram_counts = count_vec.fit_transform(final['Text'].values)

In [55]:
print(final_bigram_counts.get_shape())

(364171, 2910192)


**Observation** : dimensions >>>> BoW

## TF-IDF

In [54]:
tf_id_vect = TfidfVectorizer(ngram_range = (1,2))
final_tf_id_vect = tf_id_vect.fit_transform(final['Text'].values)

In [59]:
print(final_tf_id_vect.shape)
features = tf_id_vect.get_feature_names()
len(features)

(364171, 2910192)


2910192

In [62]:
features[100000:100020]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and',
 'alessi are',
 'alessi at',
 'alessi brand',
 'alessi breadsticks',
 'alessi caffe',
 'alessi cento',
 'alessi chicken',
 'alessi coarse',
 'alessi coffees',
 'alessi decaf']