In [69]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib

In [70]:
df = pd.read_csv('../data/preprocessed_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,category,rating,label,text_
0,0,Home_and_Kitchen_5,5.0,CG,love well made sturdi comfort love pretti
1,1,Home_and_Kitchen_5,5.0,CG,love great upgrad origin 've mine coupl year
2,2,Home_and_Kitchen_5,5.0,CG,pillow save back love look feel pillow
3,3,Home_and_Kitchen_5,1.0,CG,miss inform use great product price
4,4,Home_and_Kitchen_5,5.0,CG,nice set good qualiti set two month


In [71]:
df.dropna(inplace=True)

In [72]:
def text_process(review):
    nopunc = [char for char in review if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [73]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer

In [74]:
bow_transformer.fit(df['text_'])

In [75]:
review4 = df['text_'][3]
review4

'miss inform use great product price'

In [76]:
bow_msg4 = bow_transformer.transform([review4])
print(bow_msg4)
print(bow_msg4.shape)

  (0, 13643)	1
  (0, 15885)	1
  (0, 19892)	1
  (0, 23889)	1
  (0, 24019)	1
  (0, 32221)	1
(1, 34489)


There are 6 unique words in the 4th review.

In [77]:
bow_reviews = bow_transformer.transform(df['text_'])

In [78]:
print("Shape of Bag of Words Transformer for the entire reviews corpus:",bow_reviews.shape)
print("Amount of non zero values in the bag of words model:",bow_reviews.nnz)

Shape of Bag of Words Transformer for the entire reviews corpus: (40431, 34489)
Amount of non zero values in the bag of words model: 1001954


In [79]:
print("Sparsity:",np.round((bow_reviews.nnz/(bow_reviews.shape[0]*bow_reviews.shape[1]))*100,2))

Sparsity: 0.07


In [80]:
tfidf_transformer = TfidfTransformer().fit(bow_reviews)
tfidf_rev4 = tfidf_transformer.transform(bow_msg4)
print(bow_msg4)

  (0, 13643)	1
  (0, 15885)	1
  (0, 19892)	1
  (0, 23889)	1
  (0, 24019)	1
  (0, 32221)	1


In [81]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['mango']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['book']])

10.91422964906803
2.821684385176731


In [82]:
tfidf_reviews = tfidf_transformer.transform(bow_reviews)
print("Shape:",tfidf_reviews.shape)
print("No. of Dimensions:",tfidf_reviews.ndim)

Shape: (40431, 34489)
No. of Dimensions: 2


# Creating training and testing data

In [83]:
review_train, review_test, label_train, label_test = train_test_split(df['text_'],df['label'],test_size=0.2)

# Multinomial Naive Bayes Algorithm

In [84]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

In [85]:
pipeline.fit(review_train,label_train)

In [86]:
predictions = pipeline.predict(review_test)
predictions

array(['OR', 'OR', 'CG', ..., 'OR', 'CG', 'CG'], dtype='<U2')

In [87]:
print('Classification Report:',classification_report(label_test,predictions))
print('Confusion Matrix:',confusion_matrix(label_test,predictions))
print('Accuracy Score:',accuracy_score(label_test,predictions))

Classification Report:               precision    recall  f1-score   support

          CG       0.82      0.89      0.85      4021
          OR       0.88      0.81      0.84      4066

    accuracy                           0.85      8087
   macro avg       0.85      0.85      0.85      8087
weighted avg       0.85      0.85      0.85      8087

Confusion Matrix: [[3583  438]
 [ 786 3280]]
Accuracy Score: 0.8486459750216396


In [88]:
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,predictions)*100,2)) + '%')

Model Prediction Accuracy: 84.86%


In [89]:
joblib.dump(pipeline, '../models/multinomialNB_model.pkl')

['../models/multinomialNB_model.pkl']

# Random Forest Algorithm

In [90]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])

In [91]:
pipeline.fit(review_train,label_train)

In [92]:
rfc_pred = pipeline.predict(review_test)
rfc_pred

array(['OR', 'OR', 'CG', ..., 'OR', 'OR', 'OR'], dtype=object)

In [93]:
print('Classification Report:',classification_report(label_test,rfc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,rfc_pred))
print('Accuracy Score:',accuracy_score(label_test,rfc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.81      0.89      0.85      4021
          OR       0.88      0.79      0.84      4066

    accuracy                           0.84      8087
   macro avg       0.85      0.84      0.84      8087
weighted avg       0.85      0.84      0.84      8087

Confusion Matrix: [[3579  442]
 [ 834 3232]]
Accuracy Score: 0.8422159020650427
Model Prediction Accuracy: 84.22%


In [94]:
joblib.dump(pipeline, '../models/random_forest_classifier__model.pkl')

['../models/random_forest_classifier__model.pkl']

# Decision Tree Classifier Algorithm

In [95]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',DecisionTreeClassifier())
])

In [96]:
pipeline.fit(review_train,label_train)

In [97]:
dtree_pred = pipeline.predict(review_test)
dtree_pred

array(['OR', 'OR', 'OR', ..., 'CG', 'OR', 'OR'], dtype=object)

In [98]:
print('Classification Report:',classification_report(label_test,dtree_pred))
print('Confusion Matrix:',confusion_matrix(label_test,dtree_pred))
print('Accuracy Score:',accuracy_score(label_test,dtree_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,dtree_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.73      0.76      0.74      4021
          OR       0.75      0.72      0.73      4066

    accuracy                           0.74      8087
   macro avg       0.74      0.74      0.74      8087
weighted avg       0.74      0.74      0.74      8087

Confusion Matrix: [[3043  978]
 [1138 2928]]
Accuracy Score: 0.7383454927661679
Model Prediction Accuracy: 73.83%


In [99]:
joblib.dump(pipeline, '../models/decision_tree_classifier_model.pkl')

['../models/decision_tree_classifier_model.pkl']

# KNeighbors Classifier Algorithm

In [100]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',KNeighborsClassifier(n_neighbors=2))
])

In [101]:
pipeline.fit(review_train,label_train)

In [102]:
knn_pred = pipeline.predict(review_test)
knn_pred

array(['CG', 'OR', 'CG', ..., 'CG', 'CG', 'CG'], dtype=object)

In [103]:
print('Classification Report:',classification_report(label_test,knn_pred))
print('Confusion Matrix:',confusion_matrix(label_test,knn_pred))
print('Accuracy Score:',accuracy_score(label_test,knn_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,knn_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.54      0.97      0.69      4021
          OR       0.86      0.19      0.31      4066

    accuracy                           0.58      8087
   macro avg       0.70      0.58      0.50      8087
weighted avg       0.70      0.58      0.50      8087

Confusion Matrix: [[3893  128]
 [3304  762]]
Accuracy Score: 0.5756151848645975
Model Prediction Accuracy: 57.56%


In [104]:
joblib.dump(pipeline, '../models/kneighbors_classifier_model.pkl')

['../models/kneighbors_classifier_model.pkl']

# Support Vector Classifier Algorithm

In [105]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',SVC())
])

In [106]:
pipeline.fit(review_train,label_train)

In [107]:
svc_pred = pipeline.predict(review_test)
svc_pred

array(['OR', 'OR', 'CG', ..., 'OR', 'CG', 'CG'], dtype=object)

In [108]:
print('Classification Report:',classification_report(label_test,svc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,svc_pred))
print('Accuracy Score:',accuracy_score(label_test,svc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,svc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.89      0.87      0.88      4021
          OR       0.88      0.89      0.88      4066

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087

Confusion Matrix: [[3517  504]
 [ 442 3624]]
Accuracy Score: 0.8830221342896006
Model Prediction Accuracy: 88.3%


In [109]:
joblib.dump(pipeline, '../models/svc_model.pkl')

['../models/svc_model.pkl']

# Logistic Regression

In [110]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',LogisticRegression())
])

In [111]:
pipeline.fit(review_train,label_train)

In [112]:
lr_pred = pipeline.predict(review_test)
lr_pred

array(['OR', 'OR', 'CG', ..., 'OR', 'OR', 'CG'], dtype=object)

In [113]:
print('Classification Report:',classification_report(label_test,lr_pred))
print('Confusion Matrix:',confusion_matrix(label_test,lr_pred))
print('Accuracy Score:',accuracy_score(label_test,lr_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,lr_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.87      0.86      0.86      4021
          OR       0.86      0.87      0.87      4066

    accuracy                           0.87      8087
   macro avg       0.87      0.87      0.87      8087
weighted avg       0.87      0.87      0.87      8087

Confusion Matrix: [[3454  567]
 [ 523 3543]]
Accuracy Score: 0.8652157784097935
Model Prediction Accuracy: 86.52%


In [114]:
joblib.dump(pipeline, '../models/logisic_regression_model.pkl')

['../models/logisic_regression_model.pkl']

# Conclusion

In [115]:
print('Performance of various ML models:')
print('\n')
print('Logistic Regression Prediction Accuracy:',str(np.round(accuracy_score(label_test,lr_pred)*100,2)) + '%')
print('K Nearest Neighbors Prediction Accuracy:',str(np.round(accuracy_score(label_test,knn_pred)*100,2)) + '%')
print('Decision Tree Classifier Prediction Accuracy:',str(np.round(accuracy_score(label_test,dtree_pred)*100,2)) + '%')
print('Random Forests Classifier Prediction Accuracy:',str(np.round(accuracy_score(label_test,rfc_pred)*100,2)) + '%')
print('Support Vector Machines Prediction Accuracy:',str(np.round(accuracy_score(label_test,svc_pred)*100,2)) + '%')
print('Multinomial Naive Bayes Prediction Accuracy:',str(np.round(accuracy_score(label_test,predictions)*100,2)) + '%')

Performance of various ML models:


Logistic Regression Prediction Accuracy: 86.52%
K Nearest Neighbors Prediction Accuracy: 57.56%
Decision Tree Classifier Prediction Accuracy: 73.83%
Random Forests Classifier Prediction Accuracy: 84.22%
Support Vector Machines Prediction Accuracy: 88.3%
Multinomial Naive Bayes Prediction Accuracy: 84.86%
