In [30]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from joblib import dump, load

In [8]:
df = pd.read_csv('CHP1_fakeReviewData.csv')
df.head()

Unnamed: 0,category,rating,label,text_,tokens,processed_text
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [9]:
df.head()

Unnamed: 0,category,rating,label,text_,tokens,processed_text
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [10]:
def text_process(review):
    nopunc = [char for char in review if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [11]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer

## Creating training and testing data

In [18]:
y_train, y_test, x_train, x_test = train_test_split(df['text_'],df['label'],test_size=0.35)


## Multinomial Naive Bayes.


In [19]:

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])
pipeline.fit(y_train,x_train)
mnb_pred = pipeline.predict(y_test)
mnb_pred
print('Classification Report:',classification_report(x_test,mnb_pred))
print('Confusion Matrix:',confusion_matrix(x_test,mnb_pred))
print('Accuracy Score:',accuracy_score(x_test,mnb_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(x_test,mnb_pred)*100,2)) + '%')


Classification Report:               precision    recall  f1-score   support

          CG       0.82      0.91      0.86      7017
          OR       0.90      0.80      0.84      7076

    accuracy                           0.85     14093
   macro avg       0.86      0.85      0.85     14093
weighted avg       0.86      0.85      0.85     14093

Confusion Matrix: [[6372  645]
 [1435 5641]]
Accuracy Score: 0.8524089973745831
Model Prediction Accuracy: 85.24%



## Random Forest Classifier


In [20]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])
pipeline.fit(y_train,x_train)
rfc_pred = pipeline.predict(y_test)
rfc_pred
print('Classification Report:',classification_report(x_test,rfc_pred))
print('Confusion Matrix:',confusion_matrix(x_test,rfc_pred))
print('Accuracy Score:',accuracy_score(x_test,rfc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(x_test,rfc_pred)*100,2)) + '%')


Classification Report:               precision    recall  f1-score   support

          CG       0.81      0.90      0.85      7017
          OR       0.89      0.80      0.84      7076

    accuracy                           0.85     14093
   macro avg       0.85      0.85      0.85     14093
weighted avg       0.85      0.85      0.85     14093

Confusion Matrix: [[6297  720]
 [1450 5626]]
Accuracy Score: 0.8460228482225218
Model Prediction Accuracy: 84.6%


## Support Vector Machine/Classifier


In [21]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',SVC())
])
pipeline.fit(y_train,x_train)
svc_pred = pipeline.predict(y_test)
svc_pred
print('Classification Report:',classification_report(x_test,svc_pred))
print('Confusion Matrix:',confusion_matrix(x_test,svc_pred))
print('Accuracy Score:',accuracy_score(x_test,svc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(x_test,svc_pred)*100,2)) + '%')


Classification Report:               precision    recall  f1-score   support

          CG       0.91      0.87      0.89      7017
          OR       0.88      0.91      0.90      7076

    accuracy                           0.89     14093
   macro avg       0.89      0.89      0.89     14093
weighted avg       0.89      0.89      0.89     14093

Confusion Matrix: [[6108  909]
 [ 607 6469]]
Accuracy Score: 0.8924288653941673
Model Prediction Accuracy: 89.24%


## Logistic Regression 

In [22]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',LogisticRegression())
])
pipeline.fit(y_train,x_train)
logi_pred = pipeline.predict(y_test)
logi_pred
print('Classification Report:',classification_report(x_test,logi_pred))
print('Confusion Matrix:',confusion_matrix(x_test,logi_pred))
print('Accuracy Score:',accuracy_score(x_test,logi_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(x_test,logi_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.89      0.86      0.87      7017
          OR       0.86      0.89      0.88      7076

    accuracy                           0.88     14093
   macro avg       0.88      0.88      0.88     14093
weighted avg       0.88      0.88      0.88     14093

Confusion Matrix: [[6018  999]
 [ 759 6317]]
Accuracy Score: 0.8752572198964025
Model Prediction Accuracy: 87.53%


# Conclusion

In [25]:
print('Different ML model performances:')
print('\n')
print('Logistic Regression Model Accuracy:',str(np.round(accuracy_score(x_test,logi_pred)*100,2)) + '%')
print('Random Forests Classifier Model Accuracy:',str(np.round(accuracy_score(x_test,rfc_pred)*100,2)) + '%')
print('Support Vector Machines Model Accuracy:',str(np.round(accuracy_score(x_test,svc_pred)*100,2)) + '%')
print('Multinomial Naive Bayes Model Accuracy:',str(np.round(accuracy_score(x_test,mnb_pred)*100,2)) + '%')

Different ML model performances:


Logistic Regression Model Accuracy: 87.53%
Random Forests Classifier Model Accuracy: 84.6%
Support Vector Machines Model Accuracy: 89.24%
Multinomial Naive Bayes Model Accuracy: 85.24%


In [31]:
joblib.dump(logi_pred,'logisticRegre_model.pkl')
joblib.dump(svc_pred,'svc_model.pkl')
joblib.dump(rfc_pred,'randomForest_model.pkl')
joblib.dump(mnb_pred,'naiveBayes_model.pkl')


['naiveBayes_model.pkl']