### Importing the dependencies:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sqlite3    
import pickle    
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

### Reading the cleaned sqlite file. 
###### (The cleaning process is shown in the previous file.)

In [2]:
# Load the cleaned reviews table from the SQLite file into a DataFrame.
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own copy of final.sqlite before running.
conn = sqlite3.connect('C:\\Users\\HP\\Desktop\\GRE\\ML\\final.sqlite')
try:
    final = pd.read_sql_query("""SELECT * FROM Reviews""", conn)
finally:
    # Always release the database handle, even if the query fails.
    # (Using the connection as a context manager would only manage the
    # transaction, not close the connection.)
    conn.close()
final.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...


### Using a sample dataset since SVM takes quite a long time.

In [3]:
# Split the reviews by label (Score 0 vs. Score 1), keeping only the columns
# needed for modelling, then stack the first five rows of each class into one
# small frame.
# NOTE(review): `data` is assigned again from `final` in the following
# "sorting data based on time" cell, so this 10-row sample is not what the
# later cells operate on.
negative = final.loc[final['Score'] == 0, ['Score', 'Time', 'CleanedText']]
positive = final.loc[final['Score'] == 1, ['Score', 'Time', 'CleanedText']]
data = pd.concat([negative.head(5), positive.head(5)], ignore_index=True)


### sorting data based on time

In [4]:
# Keep only the modelling columns and order the reviews by their Time value
# (ascending), producing a new frame instead of sorting in place.
data = final[['Score', 'Time', 'CleanedText']].sort_values('Time')
data.head(10)

Unnamed: 0,Score,Time,CleanedText
117879,1,939340800,witti littl book make son laugh loud recit car...
117856,1,940809600,rememb see show air televis year ago child sis...
298565,1,944092800,beetlejuic well written movi everyth excel act...
169198,1,944438400,twist rumplestiskin captur film star michael k...
298564,1,946857600,beetlejuic excel funni movi keaton hilari wack...
169259,1,947376000,one movi movi collect fill comedi action whate...
169184,0,948240000,alway enjoy movi funni entertain hesit pick cl...
63292,1,948672000,bought apart infest fruit fli hour trap quot a...
169284,1,951523200,happen say name three time michael keaten star...
298566,1,959990400,get crazi realli imposs today find french vhs ...


### Unbalanced dataset:
(can be balanced by upsampling the negative datapoints.)

In [5]:
# Count how many reviews carry each Score label to check the class balance.
data['Score'].value_counts()

1    306779
0     57078
Name: Score, dtype: int64

### Deleting the `final` variable in order to free memory:

In [6]:
# Drop the name `final`; the columns needed from here on were already
# extracted into `data`.
del final

### Separating the target variable from the features.

In [7]:
# X: the cleaned review text (features); y: the Score label (target).
X, y = data['CleanedText'], data['Score']

### Splitting the training and testing data:
(70:30 ratio)

In [8]:
# Work on the first 8000 rows only (the earliest reviews, since `data` was
# sorted by Time) and hold out the last 30% of them as the test set.
# shuffle=False keeps the row order, so the test rows come after the
# training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X.iloc[:8000],
    y.iloc[:8000],
    test_size=0.3,
    shuffle=False,
)

In [9]:
# Sanity check: sizes of the train/test features and labels after the split.
X_train_raw.shape, y_train.shape, X_test_raw.shape, y_test.shape

((5600,), (5600,), (2400,), (2400,))

In [10]:
# Peek at the raw training documents as a NumPy array of strings.
X_train_raw.values

array(['witti littl book make son laugh loud recit car drive along alway sing refrain learn whale india droop rose love new word book introduc silli classic book will bet son still abl recit memori colleg',
       'rememb see show air televis year ago child sister later bought day thirti someth use seri book amp song student teach preschool amp turn whole school purchas along book children amp tradit live',
       'beetlejuic well written movi everyth excel act special effect delight chose view movi',
       ...,
       'famili love marinad smoki sweet flavor tast wonder chicken love dice chicken use fill soft taco burrito',
       'difficult oatmeal quinoa cook breakfast small grain must shaken apart ad water frequent stir requir avoid sticki gel like mass bottom pan minor annoy asid teff excel flavor nutrit benefit includ rda iron great fruit nut breakfast base veget casserol',
       'spent time recent year old man actual quit famous main thing mind maintain sens regular much younge

### Applying Count BOW text preprocessing technique on training data:

In [11]:
# Learn the bag-of-words vocabulary from the training documents only and
# encode them as a document-term count matrix.
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_raw.to_numpy())

In [12]:
# (number of training documents, vocabulary size)
X_train.shape 

(5600, 12476)

### Applying BOW vectorizer text preprocessing technique on testing data:

In [13]:
# Encode the test documents with the vocabulary fitted on the training data
# (transform only, no refit).
X_test = bow.transform(X_test_raw.to_numpy())

In [14]:
# (number of test documents, vocabulary size)
X_test.shape

(2400, 12476)

### Standardizing features by scaling to unit variance (the mean is not removed, because `with_mean=False` is used for the sparse matrix):

In [15]:
# with_mean=False: features are only divided by their standard deviation,
# not centred. The scaling statistics are learned from the training matrix
# alone and then applied unchanged to both splits.
s = StandardScaler(with_mean=False).fit(X_train)

X_train = s.transform(X_train)
X_test = s.transform(X_test)



### Performing cross-validation to find the best hyperparameters (C, gamma) for the RBF kernel, and reporting the cross-validated accuracy on the training data:

In [29]:
# Time an exhaustive 5-fold grid search over C and gamma for an RBF-kernel
# SVC; n_jobs=-1 runs the candidate fits in parallel on all cores.
start = time.time()
clf = SVC()
param_grid = [
    {
        'kernel': ['rbf'],
        'gamma': [50, 5, 10, 0.5],
        'C': [10, 0.1, 0.001],
    }
]
gsv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
gsv.fit(X_train, y_train)
end = time.time()
print("Time required =" + str(end - start) + "s")

Time required =515.1245949268341s


In [30]:

# Report the winning parameter combination and its mean cross-validated
# score, expressed as a percentage.
print("Best HyperParameter: ", gsv.best_params_)
best_accuracy_pct = gsv.best_score_ * 100
print("Best Accuracy: %.2f%%" % best_accuracy_pct)

Best HyperParameter:  {'C': 10, 'gamma': 50, 'kernel': 'rbf'}
Best Accuracy: 88.34%


### Applying the model on testing data and finding its accuracy.

In [31]:
# Train the final model with the hyperparameters chosen by the grid search.
# The previous version hard-coded gamma = 0.5, which did not match the
# selected combination reported above (gamma = 50); reading the values from
# `gsv.best_params_` keeps this cell in sync with the search.
clf = SVC(**gsv.best_params_)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", acc)

# NOTE(review): the classes are heavily imbalanced, so accuracy alone can
# look good even when the classifier predicts only the majority class;
# check the off-diagonal cells of the confusion matrix (rows = true label,
# columns = predicted label) before trusting the accuracy figure.
confusion_matrix(y_test, y_pred)

Accuracy = 89.41666666666667


array([[   0,  254],
       [   0, 2146]], dtype=int64)