In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
review=pd.read_csv("amazon-review-scraper-labelled1.csv",encoding="ISO-8859-1")

In [3]:
print(review)

    web-scraper-order                              web-scraper-start-url  \
0      1580573669-932  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
1      1580573644-848  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
2     1580574377-3407  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
3     1580573743-1197  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
4     1580574487-3799  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
..                ...                                                ...   
487               NaN                                                NaN   
488               NaN                                                NaN   
489               NaN                                                NaN   
490   1580574456-3683  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
491    1580573630-794  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   

              author                                 title  \
0    Amazon Customer  Tha

In [4]:
review.describe()

Unnamed: 0,web-scraper-order,web-scraper-start-url,author,title,date,content,rating,label
count,465,465,475,465,465,492,475,492
unique,465,1,430,426,85,482,5,4
top,1580574273-3036,https://www.amazon.in/Redmi-Note-Pro-Storage-P...,Amazon Customer,Good,Reviewed in India on 7 November 2019,Good,5.0 out of 5 stars,quality
freq,1,465,42,8,15,4,262,455


In [5]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   web-scraper-order      465 non-null    object
 1   web-scraper-start-url  465 non-null    object
 2   author                 475 non-null    object
 3   title                  465 non-null    object
 4   date                   465 non-null    object
 5   content                492 non-null    object
 6   rating                 475 non-null    object
 7   label                  492 non-null    object
dtypes: object(8)
memory usage: 30.9+ KB


In [6]:
review=review.fillna('')

In [7]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   web-scraper-order      492 non-null    object
 1   web-scraper-start-url  492 non-null    object
 2   author                 492 non-null    object
 3   title                  492 non-null    object
 4   date                   492 non-null    object
 5   content                492 non-null    object
 6   rating                 492 non-null    object
 7   label                  492 non-null    object
dtypes: object(8)
memory usage: 30.9+ KB


In [8]:
print(review)

    web-scraper-order                              web-scraper-start-url  \
0      1580573669-932  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
1      1580573644-848  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
2     1580574377-3407  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
3     1580573743-1197  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
4     1580574487-3799  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
..                ...                                                ...   
487                                                                        
488                                                                        
489                                                                        
490   1580574456-3683  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   
491    1580573630-794  https://www.amazon.in/Redmi-Note-Pro-Storage-P...   

              author                                 title  \
0    Amazon Customer  Tha

In [9]:
def pre_process(text):

    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [10]:
textFeatures = review['content'].copy()
print(textFeatures)

0      Thanks for delivery on 30th Dec because ordere...
1      Hi Team,\n\nI am having a trouble time with th...
2      Redmi's Personal Beast is Overall The Best in ...
3      very bad camera. whenever I start camera it is...
4      It's not a magical device.. But it won't make ...
                             ...                        
487                                    I dont like miui
488                                    miui is not good 
489                                     miui is childish
490    Camera is good. Battery is not as per showing ...
491    Super mobile with best configuration. But more...
Name: content, Length: 492, dtype: object


In [11]:
textFeatures = textFeatures.apply(pre_process)

In [12]:
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)

In [13]:
features_train, features_test, labels_train, labels_test = train_test_split(features, review['label'], test_size=0.3, random_state=111)

In [14]:
features_train

<344x1747 sparse matrix of type '<class 'numpy.float64'>'
	with 5685 stored elements in Compressed Sparse Row format>

In [15]:
features_test

<148x1747 sparse matrix of type '<class 'numpy.float64'>'
	with 2031 stored elements in Compressed Sparse Row format>

In [16]:
mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train,labels_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [17]:
prediction = mnb.predict(features_test)
print("multinomial naive bayes",accuracy_score(labels_test,prediction))

multinomial naive bayes 0.9256756756756757


In [18]:
print("Classification Report\n",classification_report(prediction,labels_test,labels=["spam", "quality"]))

Classification Report
               precision    recall  f1-score   support

        spam       0.09      0.50      0.15         2
     quality       1.00      0.93      0.96       146

   micro avg       0.93      0.93      0.93       148
   macro avg       0.55      0.72      0.56       148
weighted avg       0.99      0.93      0.95       148

