In [11]:
import pandas as pd

# Loading the labelled dataset
filePath = 'dataset/amazon_reviews.txt'
df = pd.read_csv(filePath, delimiter='\t')
df

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...
...,...,...,...,...,...,...,...,...,...
20995,20996,__label2__,4,Y,Shoes,B00BXYM8T8,"Madden Girl Women's Gettaw Pump,Red Patent,7.5...",wide width is great!,"I bought these for work. I have high arches, ..."
20996,20997,__label2__,4,Y,Shoes,B0014C2ORK,"crocs Unisex Classic Clog,Khaki,6 US Men's / 8...",Love crocs!,Crocs are one of only two brands of shoes that...
20997,20998,__label2__,5,Y,Shoes,B000EX8CCQ,Minnetonka Men's 703 Leather Laced Softsole Mo...,I love moccasins This fit like it was custom m...,I love moccasins This fit like it was custom ...
20998,20999,__label2__,5,Y,Shoes,B00748YHVE,Ariat Womens Unbridled Fatbaby 9 B Powder Brown,"This fit well, comfortable, best investment",I wish these were a little more durable. I got...


In [12]:
# Getting only the label and review text and dropping other columns
reviewDf = df.drop(['DOC_ID', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY', 'PRODUCT_ID', 'PRODUCT_TITLE', 'REVIEW_TITLE'], axis="columns")
reviewDf

Unnamed: 0,LABEL,REVIEW_TEXT
0,__label1__,"When least you think so, this product will sav..."
1,__label1__,Lithium batteries are something new introduced...
2,__label1__,I purchased this swing for my baby. She is 6 m...
3,__label1__,I was looking for an inexpensive desk calcolat...
4,__label1__,I only use it twice a week and the results are...
...,...,...
20995,__label2__,"I bought these for work. I have high arches, ..."
20996,__label2__,Crocs are one of only two brands of shoes that...
20997,__label2__,I love moccasins This fit like it was custom ...
20998,__label2__,I wish these were a little more durable. I got...


In [13]:
# Splitting the Review_Text column as x and Label column as y
x = reviewDf.iloc[:,1].values
y = reviewDf.iloc[:,0].values

In [14]:
x[0]

'When least you think so, this product will save the day. Just keep it around just in case you need it for something.'

In [15]:
y[0]

'__label1__'

In [16]:
# Converting the reviews to a matrix of token counts (Bag of words model)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 21000)
reviewText = cv.fit_transform(x).todense()

In [17]:
# Splitting the train and test data (80/20)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(reviewText, y, test_size = 0.2, random_state = 0)

In [22]:
# Using SVM classifier on the train set
from sklearn.svm import LinearSVC
classifer = LinearSVC(max_iter=10000)
classifer.fit(x_train, y_train)

LinearSVC(max_iter=10000)

In [23]:
# Predicting test results
y_pred = classifer.predict(x_test)

In [24]:
# Confusion matrix to see the number of true positive, true negative, false positive and false negative
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1334,  764],
       [ 837, 1265]], dtype=int64)

In [25]:
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Precision = (TP) / (TP + FP)
# Recall = (TP) / (TP + FN)
# F1 Score = (2 * Precision * Recall) / (Precision + Recall)

accuracy = round((1334 + 1265)/(1334 + 1265 + 764 + 837), 2)
precision = round((1334) / (1334 + 764), 2)
recall = round((1334) / (1334 + 837), 2)
f1 = round((2 * precision * recall) / (precision + recall), 2)

print('SVM results')
print('Accuracy = ', accuracy)
print('Precision = ', precision)
print('Recall = ', recall)
print('F1 Score = ', f1)

SVM results
Accuracy =  0.62
Precision =  0.64
Recall =  0.61
F1 Score =  0.62


In [26]:
import pickle

pickleName = "serialized_model.pkl"  

with open(pickleName, 'wb') as file:  
    pickle.dump(classifer, file)