In [2]:
import pandas as pd

from google.colab import files
uploaded = files.upload()

import io
post_covid_df = pd.read_csv(io.BytesIO(uploaded['Project_Data_Post-Covid.csv']))

Saving Project_Data_Post-Covid.csv to Project_Data_Post-Covid.csv


In [3]:
post_covid_df = post_covid_df.drop(columns=['product_url', 'helpfulness_rating'], axis=1)

In [4]:
post_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review
0,disposable_gloves,4/2/2021,Post_Covid,Den,1,...even with short trimmed nails. I use dispos...
1,disposable_gloves,11/17/2020,Post_Covid,patricia43,5,...they offer the protection I need for chores...
2,disposable_gloves,9/4/2020,Post_Covid,SU,1,1 out of every 2 gloves rips just from the for...
3,disposable_gloves,8/13/2020,Post_Covid,Sheila M.,1,1. They are very small. 2. They are so thin th...
4,disposable_gloves,8/26/2020,Post_Covid,casebes,1,1st glove ripped as i put it on. Ordered mediu...


In [5]:
post_covid_df['rating'] = post_covid_df['rating'].apply(lambda x: 1 if x > 3 else 0)

In [6]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","â€“","1","2","3","4",
              "5","6","7","8","9","0","xa","le"]

for char in spec_chars:
    post_covid_df['review'] = post_covid_df['review'].str.replace(char, ' ')


post_covid_df['review'] = post_covid_df['review'].str.split().str.join(" ")

In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenize_clean_text(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    clean_text = nltk.word_tokenize(text)
    clean_text = [token.lower() for token in clean_text if token.lower() not in stop_words]
    clean_text = [lemmatizer.lemmatize(token) for token in clean_text]
    symbols_for_removal = "``~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    clean_text = [token for token in clean_text if token not in symbols_for_removal]
    clean_text = str(' ').join(clean_text)
    return clean_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
post_covid_df['clean_review'] = post_covid_df.apply(lambda row: tokenize_clean_text(row['review']), axis=1)

In [9]:
post_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review,clean_review
0,disposable_gloves,4/2/2021,Post_Covid,Den,0,even with short trimmed nails I use disposab g...,even short trimmed nail use disposab glove var...
1,disposable_gloves,11/17/2020,Post_Covid,patricia43,1,they offer the protection I need for chores at...,offer protection need chore home including use...
2,disposable_gloves,9/4/2020,Post_Covid,SU,0,out of every gloves rips just from the force o...,every glove rip force trying put light use che...
3,disposable_gloves,8/13/2020,Post_Covid,Sheila M.,0,They are very small They are so thin that I ha...,small thin use two together tear apart putting...
4,disposable_gloves,8/26/2020,Post_Covid,casebes,0,st glove ripped as i put it on Ordered medium ...,st glove ripped put ordered medium fit like ex...


In [10]:
post_covid_df['clean_review'] = post_covid_df['clean_review'].astype(str)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(post_covid_df['clean_review'], post_covid_df['rating'], random_state=1)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')

X_train_cv = cv.fit_transform(X_train)

X_test_cv = cv.transform(X_test)

In [13]:
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

In [14]:
most_used_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

In [15]:
most_used_words_df.head(15)

Unnamed: 0,0
glove,442
hand,164
use,116
good,114
fit,102
small,99
tear,94
easily,93
product,91
rip,82


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

nb = MultinomialNB()

nb.fit(X_train_cv, y_train)

y_pred_nb = nb.predict(X_test_cv)
y_pred_nb_prob = nb.predict_proba(X_test_cv)[:, 1]

fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb_prob)
roc_auc_nb = auc(fpr_nb, tpr_nb)

print('Naive Bayes Recall: ', round(recall_score(y_test,y_pred_nb),2))
print('Naive Bayes Precision: ', round(precision_score(y_test,y_pred_nb),2))
print('Naive Bayes F1: ', round(f1_score(y_test,y_pred_nb),2))
print('Naive Bayes Accuracy: ', round(accuracy_score(y_test,y_pred_nb),2))
print("Naive Bayes ROC AUC: %.2f" % roc_auc_nb)

Naive Bayes Recall:  0.84
Naive Bayes Precision:  0.81
Naive Bayes F1:  0.83
Naive Bayes Accuracy:  0.79
Naive Bayes ROC AUC: 0.87
