In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('CHP1_fakeReviewData.csv')
df.head()

Unnamed: 0,category,rating,label,text_,tokens,processed_text
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [4]:
df.head()

Unnamed: 0,category,rating,label,text_,tokens,processed_text
0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [6]:
def text_process(review):
    nopunc = [char for char in review if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [7]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer

## Creating training and testing data

In [9]:
review_train, review_test, label_train, label_test = train_test_split(df['text_'],df['label'],test_size=0.35)

In [10]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

## Training and testing Multinomial Naive Bayes Algorithm on the preprocessed data

In [13]:
pipeline.fit(review_train,label_train)

In [15]:
predictions = pipeline.predict(review_test)
predictions

array(['CG', 'CG', 'OR', ..., 'OR', 'CG', 'CG'], dtype='<U2')

In [16]:
print('Classification Report:',classification_report(label_test,predictions))
print('Confusion Matrix:',confusion_matrix(label_test,predictions))
print('Accuracy Score:',accuracy_score(label_test,predictions))

Classification Report:               precision    recall  f1-score   support

          CG       0.83      0.90      0.87      7153
          OR       0.89      0.81      0.85      6940

    accuracy                           0.86     14093
   macro avg       0.86      0.86      0.86     14093
weighted avg       0.86      0.86      0.86     14093

Confusion Matrix: [[6465  688]
 [1318 5622]]
Accuracy Score: 0.8576598311218335


In [17]:
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,predictions)*100,2)) + '%')

Model Prediction Accuracy: 85.77%


In [18]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])

In [19]:
pipeline.fit(review_train,label_train)

In [20]:
rfc_pred = pipeline.predict(review_test)
rfc_pred

array(['OR', 'CG', 'OR', ..., 'OR', 'CG', 'CG'], dtype=object)

In [22]:
print('Classification Report:',classification_report(label_test,rfc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,rfc_pred))
print('Accuracy Score:',accuracy_score(label_test,rfc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.83      0.89      0.86      7153
          OR       0.88      0.81      0.85      6940

    accuracy                           0.85     14093
   macro avg       0.86      0.85      0.85     14093
weighted avg       0.86      0.85      0.85     14093

Confusion Matrix: [[6392  761]
 [1290 5650]]
Accuracy Score: 0.8544667565458028
Model Prediction Accuracy: 85.45%


In [23]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',DecisionTreeClassifier())
])

In [24]:
pipeline.fit(review_train,label_train)

In [25]:
dtree_pred = pipeline.predict(review_test)
dtree_pred

array(['CG', 'CG', 'OR', ..., 'OR', 'CG', 'OR'], dtype=object)

In [26]:
print('Classification Report:',classification_report(label_test,dtree_pred))
print('Confusion Matrix:',confusion_matrix(label_test,dtree_pred))
print('Accuracy Score:',accuracy_score(label_test,dtree_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,dtree_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.75      0.76      0.75      7153
          OR       0.75      0.74      0.74      6940

    accuracy                           0.75     14093
   macro avg       0.75      0.75      0.75     14093
weighted avg       0.75      0.75      0.75     14093

Confusion Matrix: [[5434 1719]
 [1811 5129]]
Accuracy Score: 0.7495210388135954
Model Prediction Accuracy: 74.95%


In [27]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',KNeighborsClassifier(n_neighbors=2))
])

In [28]:
pipeline.fit(review_train,label_train)

In [29]:
knn_pred = pipeline.predict(review_test)
knn_pred

array(['CG', 'CG', 'CG', ..., 'CG', 'OR', 'CG'], dtype=object)

In [30]:
print('Classification Report:',classification_report(label_test,knn_pred))
print('Confusion Matrix:',confusion_matrix(label_test,knn_pred))
print('Accuracy Score:',accuracy_score(label_test,knn_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,knn_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.55      0.97      0.70      7153
          OR       0.86      0.17      0.28      6940

    accuracy                           0.58     14093
   macro avg       0.70      0.57      0.49     14093
weighted avg       0.70      0.58      0.49     14093

Confusion Matrix: [[6969  184]
 [5784 1156]]
Accuracy Score: 0.5765273540055347
Model Prediction Accuracy: 57.65%


In [31]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',SVC())
])

In [32]:
pipeline.fit(review_train,label_train)

In [33]:
svc_pred = pipeline.predict(review_test)
svc_pred

array(['OR', 'CG', 'OR', ..., 'OR', 'CG', 'OR'], dtype=object)

In [34]:
print('Classification Report:',classification_report(label_test,svc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,svc_pred))
print('Accuracy Score:',accuracy_score(label_test,svc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,svc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.91      0.87      0.89      7153
          OR       0.87      0.91      0.89      6940

    accuracy                           0.89     14093
   macro avg       0.89      0.89      0.89     14093
weighted avg       0.89      0.89      0.89     14093

Confusion Matrix: [[6216  937]
 [ 614 6326]]
Accuracy Score: 0.8899453629461435
Model Prediction Accuracy: 88.99%


In [35]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',LogisticRegression())
])

In [36]:
pipeline.fit(review_train,label_train)

In [37]:
lr_pred = pipeline.predict(review_test)
lr_pred

array(['OR', 'CG', 'OR', ..., 'OR', 'CG', 'CG'], dtype=object)

In [38]:
print('Classification Report:',classification_report(label_test,lr_pred))
print('Confusion Matrix:',confusion_matrix(label_test,lr_pred))
print('Accuracy Score:',accuracy_score(label_test,lr_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,lr_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.89      0.86      0.88      7153
          OR       0.86      0.90      0.88      6940

    accuracy                           0.88     14093
   macro avg       0.88      0.88      0.88     14093
weighted avg       0.88      0.88      0.88     14093

Confusion Matrix: [[6125 1028]
 [ 721 6219]]
Accuracy Score: 0.8758958348116086
Model Prediction Accuracy: 87.59%


# Conclusion

In [39]:
print('Performance of various ML models:')
print('\n')
print('Logistic Regression Prediction Accuracy:',str(np.round(accuracy_score(label_test,lr_pred)*100,2)) + '%')
print('K Nearest Neighbors Prediction Accuracy:',str(np.round(accuracy_score(label_test,knn_pred)*100,2)) + '%')
print('Decision Tree Classifier Prediction Accuracy:',str(np.round(accuracy_score(label_test,dtree_pred)*100,2)) + '%')
print('Random Forests Classifier Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')
print('Support Vector Machines Prediction Accuracy:',str(np.round(accuracy_score(label_test,svc_pred)*100,2)) + '%')
print('Multinomial Naive Bayes Prediction Accuracy:',str(np.round(accuracy_score(label_test,predictions)*100,2)) + '%')

Performance of various ML models:


Logistic Regression Prediction Accuracy: 87.59%
K Nearest Neighbors Prediction Accuracy: 57.65%
Decision Tree Classifier Prediction Accuracy: 74.95%
Random Forests Classifier Prediction Accuracy: 85.45%
Support Vector Machines Prediction Accuracy: 88.99%
Multinomial Naive Bayes Prediction Accuracy: 85.77%
