In [1]:
import pandas as pd

from google.colab import files
uploaded = files.upload()

import io
post_covid_df = pd.read_csv(io.BytesIO(uploaded['disinfecting_wipe_Post_Covid.csv']))

Saving disinfecting_wipe_Post_Covid.csv to disinfecting_wipe_Post_Covid.csv


In [2]:
post_covid_df = post_covid_df.drop(columns=['product_url', 'helpfulness_rating'], axis=1)

In [3]:
post_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review
0,disinfecting_wipes,11/2/2020,Post_Covid,Swiszms,5,(4) 80 count canisters are a great value for t...
1,disinfecting_wipes,1/6/2021,Post_Covid,alundra,1,4 out of 4 arrived smashed and damaged. There ...
2,disinfecting_wipes,3/10/2020,Post_Covid,Mandy,5,A absolutely must have for all households and ...
3,disinfecting_wipes,10/23/2020,Post_Covid,Hui,3,A little expensive.I bought from Walmart.It’s ...
4,disinfecting_wipes,1/23/2021,Post_Covid,D. M. Wilkerson,5,A much needed household cleaning product for o...


In [4]:
post_covid_df['rating'] = post_covid_df['rating'].apply(lambda x: 1 if x > 3 else 0)

In [5]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","1","2","3","4",
              "5","6","7","8","9","0","xa","le"]

for char in spec_chars:
    post_covid_df['review'] = post_covid_df['review'].str.replace(char, ' ')


post_covid_df['review'] = post_covid_df['review'].str.split().str.join(" ")

In [6]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenize_clean_text(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    clean_text = nltk.word_tokenize(text)
    clean_text = [token.lower() for token in clean_text if token.lower() not in stop_words]
    clean_text = [lemmatizer.lemmatize(token) for token in clean_text]
    symbols_for_removal = "``~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    clean_text = [token for token in clean_text if token not in symbols_for_removal]
    clean_text = str(' ').join(clean_text)
    return clean_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [7]:
post_covid_df['clean_review'] = post_covid_df.apply(lambda row: tokenize_clean_text(row['review']), axis=1)

In [8]:
post_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review,clean_review
0,disinfecting_wipes,11/2/2020,Post_Covid,Swiszms,1,count canisters are a great value for the mone...,count canister great value money quality sing ...
1,disinfecting_wipes,1/6/2021,Post_Covid,alundra,0,out of arrived smashed and damaged There is no...,arrived smashed damaged longer seal arrived li...
2,disinfecting_wipes,3/10/2020,Post_Covid,Mandy,1,A absolutely must have for all households and ...,absolutely must household garage well
3,disinfecting_wipes,10/23/2020,Post_Covid,Hui,0,A litt expensive I bought from Walmart It’s ch...,litt expensive bought walmart ’ cheaper
4,disinfecting_wipes,1/23/2021,Post_Covid,D. M. Wilkerson,1,A much needed household c aning product for ou...,much needed household c aning product family l...


In [9]:
post_covid_df['clean_review'] = post_covid_df['clean_review'].astype(str)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(post_covid_df['clean_review'], post_covid_df['rating'], random_state=1)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')

X_train_cv = cv.fit_transform(X_train)

X_test_cv = cv.transform(X_test)

In [12]:
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

In [13]:
most_used_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

In [14]:
most_used_words_df.head(15)

Unnamed: 0,0
wipe,228
great,119
c,115
product,106
lysol,77
use,74
good,65
container,60
time,58
price,52


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

nb = MultinomialNB()

nb.fit(X_train_cv, y_train)

y_pred_nb = nb.predict(X_test_cv)
y_pred_nb_prob = nb.predict_proba(X_test_cv)[:, 1]

fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb_prob)
roc_auc_nb = auc(fpr_nb, tpr_nb)

print('Naive Bayes Recall: ', round(recall_score(y_test,y_pred_nb),2))
print('Naive Bayes Precision: ', round(precision_score(y_test,y_pred_nb),2))
print('Naive Bayes F1: ', round(f1_score(y_test,y_pred_nb),2))
print('Naive Bayes Accuracy: ', round(accuracy_score(y_test,y_pred_nb),2))
print("Naive Bayes ROC AUC: %.2f" % roc_auc_nb)

Naive Bayes Recall:  0.91
Naive Bayes Precision:  0.95
Naive Bayes F1:  0.93
Naive Bayes Accuracy:  0.88
Naive Bayes ROC AUC: 0.94
