In [2]:
import pandas as pd

from google.colab import files
uploaded = files.upload()

import io
pre_covid_df = pd.read_csv(io.BytesIO(uploaded['disinfecting_wipe_Pre_Covid.csv']))

Saving disinfecting_wipe_Pre_Covid.csv to disinfecting_wipe_Pre_Covid.csv


In [3]:
pre_covid_df = pre_covid_df.drop(columns=['product_url', 'helpfulness_rating'], axis=1)

In [4]:
pre_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review
0,disinfecting_wipes,2/22/2018,Pre_Covid,K Lilledahl,1,*****EVERY SINGLE TIME I ORDER THESE THEY COME...
1,disinfecting_wipes,11/11/2019,Pre_Covid,andychen278,5,5 stars
2,disinfecting_wipes,6/2/2018,Pre_Covid,Rachel Anschuetz,5,"80 wipes lasts forever, and they work. they sm..."
3,disinfecting_wipes,10/27/2018,Pre_Covid,Mohabee Serrano,5,"A great product, and it's a bit cheaper than t..."
4,disinfecting_wipes,7/21/2019,Pre_Covid,Robint,4,"A little concerned that when package arrived, ..."


In [5]:
pre_covid_df['rating'] = pre_covid_df['rating'].apply(lambda x: 1 if x > 3 else 0)

In [6]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","1","2","3","4",
              "5","6","7","8","9","0","xa","le"]

for char in spec_chars:
    pre_covid_df['review'] = pre_covid_df['review'].str.replace(char, ' ')


pre_covid_df['review'] = pre_covid_df['review'].str.split().str.join(" ")

In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenize_clean_text(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    clean_text = nltk.word_tokenize(text)
    clean_text = [token.lower() for token in clean_text if token.lower() not in stop_words]
    clean_text = [lemmatizer.lemmatize(token) for token in clean_text]
    symbols_for_removal = "``~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    clean_text = [token for token in clean_text if token not in symbols_for_removal]
    clean_text = str(' ').join(clean_text)
    return clean_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
pre_covid_df['clean_review'] = pre_covid_df.apply(lambda row: tokenize_clean_text(row['review']), axis=1)

In [9]:
pre_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review,clean_review
0,disinfecting_wipes,2/22/2018,Pre_Covid,K Lilledahl,0,EVERY SINGLE TIME I ORDER THESE THEY COME LEAK...,every single time order come leaking lanels so...
1,disinfecting_wipes,11/11/2019,Pre_Covid,andychen278,1,stars,star
2,disinfecting_wipes,6/2/2018,Pre_Covid,Rachel Anschuetz,1,wipes lasts forever and they work they smell c...,wipe last forever work smell c work anything d...
3,disinfecting_wipes,10/27/2018,Pre_Covid,Mohabee Serrano,1,A great product and it s a bit cheaper than th...,great product bit cheaper local dg product
4,disinfecting_wipes,7/21/2019,Pre_Covid,Robint,1,A litt concerned that when package arrived the...,litt concerned package arrived box wet smel li...


In [10]:
pre_covid_df['clean_review'] = pre_covid_df['clean_review'].astype(str)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pre_covid_df['clean_review'], pre_covid_df['rating'], random_state=1)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')

X_train_cv = cv.fit_transform(X_train)

X_test_cv = cv.transform(X_test)

In [13]:
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

In [14]:
most_used_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

In [15]:
most_used_words_df.head(15)

Unnamed: 0,0
c,170
wipe,148
great,83
use,75
product,71
aning,60
love,53
lysol,46
good,46
time,40


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

nb = MultinomialNB()

nb.fit(X_train_cv, y_train)

y_pred_nb = nb.predict(X_test_cv)
y_pred_nb_prob = nb.predict_proba(X_test_cv)[:, 1]

fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb_prob)
roc_auc_nb = auc(fpr_nb, tpr_nb)

print('Naive Bayes Recall: ', round(recall_score(y_test,y_pred_nb),2))
print('Naive Bayes Precision: ', round(precision_score(y_test,y_pred_nb),2))
print('Naive Bayes F1: ', round(f1_score(y_test,y_pred_nb),2))
print('Naive Bayes Accuracy: ', round(accuracy_score(y_test,y_pred_nb),2))
print("Naive Bayes ROC AUC: %.2f" % roc_auc_nb)

Naive Bayes Recall:  0.97
Naive Bayes Precision:  0.94
Naive Bayes F1:  0.96
Naive Bayes Accuracy:  0.93
Naive Bayes ROC AUC: 0.92
