In [23]:
import pandas as pd

from google.colab import files
uploaded = files.upload()

import io
pre_covid_df = pd.read_csv(io.BytesIO(uploaded['Project_Data_Pre-Covid.csv']))

Saving Project_Data_Pre-Covid.csv to Project_Data_Pre-Covid.csv


In [24]:
pre_covid_df = pre_covid_df.drop(columns=['product_url', 'helpfulness_rating'], axis=1)

In [25]:
pre_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review
0,disposable_gloves,4/28/2019,Pre_Covid,Wrich,5,As described. good value!
1,disposable_gloves,5/21/2019,Pre_Covid,Amy H.,5,awesome for the price. fit nice and dont rip s...
2,disposable_gloves,6/19/2019,Pre_Covid,Frankie Boy,2,"Every time i used for washing, there’s a moist..."
3,disposable_gloves,6/21/2019,Pre_Covid,Frances Parish,5,Exactly the right size. Comfortable.
4,disposable_gloves,5/21/2019,Pre_Covid,Stan,5,Exactly what needed and described


In [26]:
pre_covid_df['rating'] = pre_covid_df['rating'].apply(lambda x: 1 if x > 3 else 0)

In [27]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","1","2","3","4",
              "5","6","7","8","9","0","xa","le"]

for char in spec_chars:
    pre_covid_df['review'] = pre_covid_df['review'].str.replace(char, ' ')


pre_covid_df['review'] = pre_covid_df['review'].str.split().str.join(" ")

In [28]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenize_clean_text(text):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    clean_text = nltk.word_tokenize(text)
    clean_text = [token.lower() for token in clean_text if token.lower() not in stop_words]
    clean_text = [lemmatizer.lemmatize(token) for token in clean_text]
    symbols_for_removal = "``~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    clean_text = [token for token in clean_text if token not in symbols_for_removal]
    clean_text = str(' ').join(clean_text)
    return clean_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
pre_covid_df['clean_review'] = pre_covid_df.apply(lambda row: tokenize_clean_text(row['review']), axis=1)

In [30]:
pre_covid_df.head()

Unnamed: 0,product_category,review_date,Date,handle,rating,review,clean_review
0,disposable_gloves,4/28/2019,Pre_Covid,Wrich,1,As described good value,described good value
1,disposable_gloves,5/21/2019,Pre_Covid,Amy H.,1,awesome for the price fit nice and dont rip su...,awesome price fit nice dont rip super easy wou...
2,disposable_gloves,6/19/2019,Pre_Covid,Frankie Boy,0,Every time i used for washing there’s a moistu...,every time used washing ’ moisture inside
3,disposable_gloves,6/21/2019,Pre_Covid,Frances Parish,1,E ctly the right size Comfortab,e ctly right size comfortab
4,disposable_gloves,5/21/2019,Pre_Covid,Stan,1,E ctly what needed and described,e ctly needed described


In [31]:
pre_covid_df['clean_review'] = pre_covid_df['clean_review'].astype(str)

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pre_covid_df['clean_review'], pre_covid_df['rating'], random_state=1)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')

X_train_cv = cv.fit_transform(X_train)

X_test_cv = cv.transform(X_test)

In [34]:
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names())

In [35]:
most_used_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)

In [36]:
most_used_words_df.head(15)

Unnamed: 0,0
glove,8
good,6
use,5
product,5
great,4
fit,4
quality,4
daily,3
job,3
tear,3


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

nb = MultinomialNB()

nb.fit(X_train_cv, y_train)

y_pred_nb = nb.predict(X_test_cv)
y_pred_nb_prob = nb.predict_proba(X_test_cv)[:, 1]

fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb_prob)
roc_auc_nb = auc(fpr_nb, tpr_nb)

print('Naive Bayes Recall: ', round(recall_score(y_test,y_pred_nb),2))
print('Naive Bayes Precision: ', round(precision_score(y_test,y_pred_nb),2))
print('Naive Bayes F1: ', round(f1_score(y_test,y_pred_nb),2))
print('Naive Bayes Accuracy: ', round(accuracy_score(y_test,y_pred_nb),2))
print("Naive Bayes ROC AUC: %.2f" % roc_auc_nb)

Naive Bayes Recall:  0.83
Naive Bayes Precision:  0.71
Naive Bayes F1:  0.77
Naive Bayes Accuracy:  0.62
Naive Bayes ROC AUC: 0.50
