In [1]:
import pandas as pd

# Tab-separated review dump, addressed by a relative path.
filePath = '../dataset/amazon_review_updated_label.txt'
df = pd.read_csv(filePath, sep='\t')

In [2]:
# Drop the columns that are not used by the rest of the notebook.
unused_columns = [
    'DOC_ID',
    'RATING',
    'VERIFIED_PURCHASE',
    'PRODUCT_CATEGORY',
    'PRODUCT_ID',
    'PRODUCT_TITLE',
    'REVIEW_TITLE',
]
reviewDf = df.drop(columns=unused_columns)
reviewDf

Unnamed: 0,LABEL,REVIEW_TEXT
0,Fake,"When least you think so, this product will sav..."
1,Fake,Lithium batteries are something new introduced...
2,Fake,I purchased this swing for my baby. She is 6 m...
3,Fake,I was looking for an inexpensive desk calcolat...
4,Fake,I only use it twice a week and the results are...
...,...,...
20995,Real,"I bought these for work. I have high arches, ..."
20996,Real,Crocs are one of only two brands of shoes that...
20997,Real,I love moccasins This fit like it was custom ...
20998,Real,I wish these were a little more durable. I got...


In [3]:
# Coerce both columns to Python strings, element by element.
# Note: str() turns a missing value (NaN) into the literal text 'nan'.
for _col in ('REVIEW_TEXT', 'LABEL'):
    reviewDf[_col] = reviewDf[_col].apply(str)

In [4]:
# Isolate the review text as the feature array.
# The column is selected by name rather than by position so that a change in
# column order cannot silently swap the features and the labels.
x = reviewDf['REVIEW_TEXT'].values
x[0]

'When least you think so, this product will save the day. Just keep it around just in case you need it for something.'

In [5]:
# Isolate the label as the target array.
# The column is selected by name rather than by position so that a change in
# column order cannot silently swap the labels and the features.
y = reviewDf['LABEL'].values
y[0]

'Fake'

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [7]:
# Using SVM classifier on the train set
from sklearn.svm import LinearSVC
# classifer = LinearSVC(max_iter=df.shape[0])

In [8]:
# Build (but do not fit yet) the TF-IDF -> linear SVM pipeline.
# - lowercase=False: the vectorizer keeps the original casing of the text.
# - max_iter: the solver's iteration cap is taken from the row count of df.
#   NOTE(review): the iteration cap is unrelated to the number of samples;
#   consider a fixed constant instead.
# - random_state: fixed so that repeated runs train the same model, matching
#   the seeded train/test split.
tfidf_lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(lowercase=False)),
    ('svm', LinearSVC(max_iter=df.shape[0], random_state=0)),
])

In [10]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Fit the whole pipeline (TF-IDF vectorizer followed by the linear SVM) on the training split.
tfidf_lr_pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(lowercase=False)),
                ('svm', LinearSVC(max_iter=21000))])

In [12]:
# Predict a label for every review in the held-out test split.
y_pred = tfidf_lr_pipe.predict(X=x_test)

In [13]:
# Confusion matrix of the test split: counts of each actual label against each predicted label.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1373,  725],
       [ 734, 1368]], dtype=int64)

In [14]:
# Per-class precision / recall / F1 report for the test split.
from sklearn.metrics import classification_report
print('SVM results')
# Pin the label order explicitly so that each display name is tied to its
# class instead of relying on the implicit sorted order of the labels.
class_labels = ['Fake', 'Real']
target_names = ['fake','real']
print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names))

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Precision = (TP) / (TP + FP)
# Recall = (TP) / (TP + FN)
# F1 Score = (2 * Precision * Recall) / (Precision + Recall)

SVM results
              precision    recall  f1-score   support

        fake       0.65      0.65      0.65      2098
        real       0.65      0.65      0.65      2102

    accuracy                           0.65      4200
   macro avg       0.65      0.65      0.65      4200
weighted avg       0.65      0.65      0.65      4200



In [15]:
# Put the predicted and the actual test labels side by side in one table.
predicted_df = pd.DataFrame(dict(predict=y_pred, actual=y_test))

In [16]:
# Show the first 100 prediction/actual pairs.
predicted_df.head(n=100)

Unnamed: 0,predict,actual
0,Fake,Real
1,Real,Fake
2,Fake,Fake
3,Fake,Fake
4,Real,Real
...,...,...
95,Real,Real
96,Fake,Real
97,Fake,Fake
98,Fake,Real


In [17]:
# Export the fitted pipeline as a pickle file.
import os
import pickle
filename = "../ML models/serialized_pipeline_model_tfid.pkl"

# Create the target folder first so open() does not fail with
# FileNotFoundError when the directory is missing.
os.makedirs(os.path.dirname(filename), exist_ok=True)

with open(filename, 'wb') as file:
    pickle.dump(tfidf_lr_pipe, file)