In [62]:
import re
import time
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
import matplotlib.pyplot as plt  
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

### Read the File

In [63]:
df = pd.read_csv("Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [64]:
df.shape

(568454, 10)

In [65]:
df["Score"].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

In [66]:
df["ProfileName"].value_counts()

C. F. Hill "CFH"                               451
O. Brown "Ms. O. Khannah-Brown"                421
Gary Peterson                                  389
Rebecca of Amazon "The Rebecca Review"         365
Chris                                          363
                                              ... 
Steven Wolff                                     1
joycomeau                                        1
Lizz                                             1
Phyllis A. De Smet-Howard "tweedsmerewillo"      1
srfell17                                         1
Name: ProfileName, Length: 218416, dtype: int64

### Binary classifying the "Score" column

In [67]:
def partition(x):
    if x < 3:
        return 0
    else:
        return 1
df['Score'] = df['Score'].map(partition)           

In [68]:
df["Score"].value_counts()

1    486417
0     82037
Name: Score, dtype: int64

In [69]:
df[(df['ProfileName'] == 'R. Ellis "Bobby"') & (df['Summary'] == 'The price is right')]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
8934,8935,B0007A0AP8,A74SHV5ZD3RLT,"R. Ellis ""Bobby""",15,15,1,1303862400,The price is right,We have a little Maltese that we spoil to no e...
175816,175817,B0014DUUFC,A74SHV5ZD3RLT,"R. Ellis ""Bobby""",15,15,1,1303862400,The price is right,We have a little Maltese that we spoil to no e...
534266,534267,B0007A0AOY,A74SHV5ZD3RLT,"R. Ellis ""Bobby""",15,15,1,1303862400,The price is right,We have a little Maltese that we spoil to no e...
545769,545770,B001E5E1C8,A74SHV5ZD3RLT,"R. Ellis ""Bobby""",15,15,1,1303862400,The price is right,We have a little Maltese that we spoil to no e...


### Eliminating Duplicate features

In [70]:
data = df.drop_duplicates(subset = {"UserId" , "Text"})
data.shape

(393606, 10)

In [71]:
data[data['HelpfulnessNumerator']>data['HelpfulnessDenominator']]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
44736,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,1,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...
64421,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,1,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...


In [72]:
data=data[data.HelpfulnessNumerator<=data.HelpfulnessDenominator]
data.shape

(393604, 10)

In [73]:
data['Score'].value_counts() 

1    336528
0     57076
Name: Score, dtype: int64

In [74]:
data['Text'].values

array(['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
       'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
       'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
    

### Pruning the data

In [75]:
i = 0
for words in data['Text'].values:
    if(len(re.findall('<.*?>', words))):
        print(words,"\n")
        i += 1
    if i == 5:
        break

I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service! 

Twizzlers, Strawberry my childhood favorite candy, made in Lancaster Pennsylvania by Y & S Candies, Inc. one of the oldest confectionery Firms in the United States, now a Subsidiary of the Hershey Company, the Company 

In [76]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'has', "weren't", 'or', 'were', 'down', 'are', 'should', 'very', 'few', 'we', 'because', 'll', 'have', 'itself', 'won', 'mustn', "shan't", 'but', 'these', 'into', 'over', "mightn't", "couldn't", 'is', 'me', 'this', 'couldn', 'all', 'about', 'ours', 'her', 'their', 'until', "didn't", 'am', 'himself', "shouldn't", 'above', 'further', "aren't", 'same', 'doing', 'hers', 'both', 'yourself', 'had', 'don', "won't", "doesn't", 'o', 'you', "it's", 'after', 'as', 'haven', 'was', 'a', 'below', 'no', "wasn't", 'doesn', "you're", 'myself', 't', 'hasn', 'he', 'when', 'each', 'here', 'once', "you'll", "wouldn't", 'y', "you've", 'your', 'and', 'ain', 'own', 'will', 'herself', 'them', 'the', "needn't", 's', 'theirs', 'other', 'his', 'what', "that'll", 'nor', 'being', 'at', 'it', 'those', 'does', 'shouldn', 'in', 'while', 'off', 'up', 'whom', 'there', 'so', 're', 'isn', "mustn't", 'aren', 'my', "isn't", 'they', 'with', 'then', 'didn', 'mightn', 'only', 'an', 'during', 'yours', 'how', 'did', "hadn't", '

In [77]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANKUR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
#stemming
stemming = nltk.stem.SnowballStemmer('english')

In [79]:
print(stemming.stem("operating"))
print(stemming.stem("operated"))
print(stemming.stem("operation"))
print(stemming.stem("operator"))

oper
oper
oper
oper
vaishnav


In [80]:
def data_cleaning(series):
    i = 0
    string = ""
    final_string = []    
    list_of_sent = []    
    cleanr = re.compile('<.*?>') 
    
    for sent in series.values:
        filtered_sent = []
        sent = re.sub(cleanr, ' ', sent) 
        sent = re.sub('[^a-zA-Z0-9\n]', ' ', sent) 
        sent = re.sub('\s+',' ', sent) 
        sent = sent.lower() 
        for word in sent.split():
            if word not in stop_words and len(word)>2:
                word = stemming.stem(word) 
                filtered_sent.append(word)
        list_of_sent.append(filtered_sent) 
        string = " ".join(filtered_sent) 
        final_string.append(string) 
        i+=1
    return final_string, list_of_sent

In [82]:
for x in data['Text'].iloc[:5].values:
    print(x,"\n\n")

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. 


Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". 


This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. 


If you are looking for the

In [84]:
final_string, list_of_sent = data_cleaning(data['Text'].iloc[:5])
for x in final_string:
    print(x,"\n\n")

bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better 


product arriv label jumbo salt peanut peanut actual small size unsalt sure error vendor intend repres product jumbo 


confect around centuri light pillowi citrus gelatin nut case filbert cut tini squar liber coat powder sugar tini mouth heaven chewi flavor high recommend yummi treat familiar stori lewi lion witch wardrob treat seduc edmund sell brother sister witch 


look secret ingredi robitussin believ found got addit root beer extract order good made cherri soda flavor medicin 


great taffi great price wide assort yummi taffi deliveri quick taffi lover deal 




In [86]:
start = time.time()
final_string, list_of_sent = data_cleaning(data['Text'])
end = time.time()
print("Time takes in seconds =", end - start)

Time takes in seconds = 410.6850972175598


In [87]:
data['CleanedText'] = final_string
data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...
...,...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,1,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,great sesam chicken good better restur eaten h...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,0,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...,disappoint flavor chocol note especi weak milk...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,1,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...",star small give one train session tri train do...
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,1,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...,best treat train reward dog good groom lower c...


### Sample dataset

In [88]:
negative = data[data['Score'] == 0][['Score', 'Time', 'CleanedText']]
positive = data[data['Score'] == 1][['Score', 'Time', 'CleanedText']]
data1 = pd.concat([negative.iloc[:5,:], positive.iloc[:5,:]], axis = 0, ignore_index = True)
positive.head()

Unnamed: 0,Score,Time,CleanedText
0,1,1303862400,bought sever vital can dog food product found ...
2,1,1219017600,confect around centuri light pillowi citrus ge...
4,1,1350777600,great taffi great price wide assort yummi taff...
5,1,1342051200,got wild hair taffi order five pound bag taffi...
6,1,1340150400,saltwat taffi great flavor soft chewi candi in...


### Sorting data based on time

In [89]:
data1 = data[['Score', 'Time', 'CleanedText']].copy()
data1.sort_values('Time', inplace = True)
data1

Unnamed: 0,Score,Time,CleanedText
150523,1,939340800,witti littl book make son laugh loud recit car...
150500,1,940809600,rememb see show air televis year ago child sis...
451855,1,944092800,beetlejuic well written movi everyth excel act...
230284,1,944438400,twist rumplestiskin captur film star michael k...
451854,1,946857600,beetlejuic excel funni movi keaton hilari wack...
...,...,...,...
467555,1,1351209600,use alessi decaffen caff expresso year sometim...
217005,0,1351209600,high expect seed disappoint half seed plant fa...
79281,1,1351209600,complaint product like would rate good subject...
414150,1,1351209600,hard find groceri buy case onlin one box make ...


In [90]:
data1['Score'].value_counts()

1    336528
0     57076
Name: Score, dtype: int64

### Deleting the data variable to empty space

In [91]:
del data

In [92]:
data1

Unnamed: 0,Score,Time,CleanedText
150523,1,939340800,witti littl book make son laugh loud recit car...
150500,1,940809600,rememb see show air televis year ago child sis...
451855,1,944092800,beetlejuic well written movi everyth excel act...
230284,1,944438400,twist rumplestiskin captur film star michael k...
451854,1,946857600,beetlejuic excel funni movi keaton hilari wack...
...,...,...,...
467555,1,1351209600,use alessi decaffen caff expresso year sometim...
217005,0,1351209600,high expect seed disappoint half seed plant fa...
79281,1,1351209600,complaint product like would rate good subject...
414150,1,1351209600,hard find groceri buy case onlin one box make ...


### Seperating the target variable from the other features

In [93]:
X = data1['CleanedText']
y = data1['Score']

### Splitting the training and testing data 

In [94]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X[:8000], y[:8000], test_size = 0.3, shuffle = False )

In [95]:
X_train_raw.shape, y_train.shape, X_test_raw.shape, y_test.shape

((5600,), (5600,), (2400,), (2400,))

In [96]:
X_train_raw.values

array(['witti littl book make son laugh loud recit car drive along alway sing refrain learn whale india droop rose love new word book introduc silli classic book will bet son still abl recit memori colleg',
       'rememb see show air televis year ago child sister later bought day thirti someth use seri book amp song student teach preschool amp turn whole school purchas along book children amp tradit live',
       'beetlejuic well written movi everyth excel act special effect delight chose view movi',
       ...,
       'look organ black tea replac tetley tea usual drink stash tea wish less expens feel worth',
       'help recoveri time run keep nasti lactic acid away',
       'review today correct navita natur great brand prefer cacao powder nib fresh factor use pound time awar howev process food bought three pound cacao powder slight differ tast assum maca would tast well differ noth someth would buy tast smokey deep complex take lot somewhat gag ad tbls smoothi realli tell load caca

In [97]:
X_train_raw.value_counts()

mate actual tea tea like beverag origin argentina uruguay primarili consum countri well paraguay southern brazil brew dri leav stemlet perenni yerba mate tree name deriv quichua word mati design gourd tradit serv averag 300 000 ton mate produc year consist similar green tea mate distinct full bittersweet flavor note alfalfa resembl tea milder 196 volatil chemic compound contain yerba mate 144 also contain tea know manifold health benefit among plant ingredi alkaloid xanthin theophyllin theobromin well vitamin sever vitamin numer miner south american guarani indian tradit use plant medicin purpos inter alia stimul central nervous system promot mental physic cardiovascular activ enhanc resist fatigu reduc fever mitig thirst hunger decreas blood pressur diuret laxat purgat sudorif antirheumat legend knowledg infus power first impart young guarani woman father mysteri shaman reward woman faith stay exhaust father tribe continu search arabl land recent year mate becom cultur phenomenon thro

### Text preprocessing with Count BoW technique on training data

In [98]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_raw.values)

In [99]:
X_train.shape

(5600, 12496)

### Text preprocessing with Count BoW technique on testing data

In [100]:
X_test = bow.transform(X_test_raw.values)

In [101]:
X_test.shape

(2400, 12496)

### Standardization by removing mean  and scaling by variance 1 

In [102]:
s = StandardScaler(with_mean=False) 
X_train= s.fit_transform(X_train)
X_test = s.transform(X_test)

### Finding Test Accuracy

In [103]:
clf = SVC(C= 10, gamma = 0.5 )
clf.fit(X_train , y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", acc)

Accuracy = 89.375


In [104]:
confusion_matrix(y_test, y_pred)

array([[   0,  255],
       [   0, 2145]], dtype=int64)

In [105]:
classification_report(y_test,y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00       255\n           1       0.89      1.00      0.94      2145\n\n    accuracy                           0.89      2400\n   macro avg       0.45      0.50      0.47      2400\nweighted avg       0.80      0.89      0.84      2400\n'

### Finding best hyperparameters

In [106]:
start = time.time()
clf = SVC()
param_grid = [{'kernel':['rbf'],'gamma':[50,5,10,0.5],
             'C':[10,0.1,0.001] }]
gsv = GridSearchCV(clf,param_grid,cv=5,n_jobs=-1)
gsv.fit(X_train,y_train)
end = time.time()
print("Time required ="+str(end-start)+"s")

Time required =281.64959621429443s


In [107]:
print("Best HyperParameter: ",gsv.best_params_)
print("Best Accuracy: %.2f%%"%(gsv.best_score_*100))

Best HyperParameter:  {'C': 10, 'gamma': 50, 'kernel': 'rbf'}
Best Accuracy: 89.11%
