In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('cleaned_data.csv', parse_dates=[5])
df.drop(['Unnamed: 1'], axis=1, inplace=True)

df = df.sort_values('Product').set_index(['Product', df.index])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Reviewer_Name,Reviewer_Location,Review,Review_Date,Review_Time,Rating,good_review,cleaned_reviews,neg,neu,pos,compound,day_of_week,review_hour
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3-D Knit Face Mask,0,Katie,"Fishers, IN",This mask has a wire at the nose for a good fi...,2020-07-10,14:15:35,5.0,1,mask wire nose good fit guy really like design...,0.0,0.391,0.609,0.9268,4,14
3-D Knit Face Mask,1,JD,"Cocoa, FL","Good size, washable is a plus. I like the carb...",2020-07-31,15:26:14,5.0,1,good size washable plus like carbon filter fea...,0.0,0.707,0.293,0.6597,4,15
3-D Knit Face Mask,2,Steph in Plymouth,"Plymouth, MN",I love that this mask fits nice and snug on th...,2020-07-10,13:07:13,4.0,1,love mask fit nice snug side face narrow heck ...,0.0,0.602,0.398,0.9531,4,13
3-D Knit Face Mask,3,babashopper,Illinois,These are the best everyday/everywhere masks--...,2020-07-04,10:54:14,5.0,1,best everyday/everywhere masks---simple seamle...,0.114,0.611,0.274,0.8442,5,10
3-D Knit Face Mask,4,Marcie,"Fairfax, VA",I ordered the gray mask. Everything about the ...,2020-07-12,16:34:43,5.0,1,ordered gray mask everything design material s...,0.064,0.476,0.46,0.959,6,16


In [3]:
df['is_positive'] = df['compound'].apply(lambda x: 1 if x > 0 else 0)

In [11]:
tfidf = TfidfVectorizer(min_df = 10, lowercase=True, stop_words='english')
tfidf_result = tfidf.fit_transform(df["cleaned_reviews"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = df.index
tfidf_df

Unnamed: 0_level_0,Unnamed: 1_level_0,word_absolutely,word_add,word_beautiful,word_best,word_big,word_birthday,word_box,word_buy,word_change,word_coffee,...,word_unique,word_use,word_wait,word_want,word_way,word_wear,word_wife,word_wish,word_work,word_year
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3-D Knit Face Mask,0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
3-D Knit Face Mask,1,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.309738,0.0,0.463744,...,0.000000,0.278860,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
3-D Knit Face Mask,2,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
3-D Knit Face Mask,3,0.0,0.0,0.000000,0.410839,0.000000,0.0,0.0,0.548805,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
3-D Knit Face Mask,4,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.479032,0.0,0.0,0.0,0.390904,0.0,0.0,0.321848,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
World Map Coloring Tablecloth,429,0.0,0.0,0.000000,0.000000,0.258012,0.0,0.0,0.000000,0.0,0.000000,...,0.218361,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
World Map Coloring Tablecloth,430,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.469471,0.0
World Map Coloring Tablecloth,431,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
World Map Coloring Tablecloth,432,0.0,0.0,0.156024,0.000000,0.389920,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [13]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Reviewer_Name,Reviewer_Location,Review,Review_Date,Review_Time,Rating,good_review,cleaned_reviews,neg,neu,...,word_unique,word_use,word_wait,word_want,word_way,word_wear,word_wife,word_wish,word_work,word_year
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3-D Knit Face Mask,0,Katie,"Fishers, IN",This mask has a wire at the nose for a good fi...,2020-07-10,14:15:35,5.0,1,mask wire nose good fit guy really like design...,0.0,0.391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-D Knit Face Mask,1,JD,"Cocoa, FL","Good size, washable is a plus. I like the carb...",2020-07-31,15:26:14,5.0,1,good size washable plus like carbon filter fea...,0.0,0.707,...,0.0,0.27886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-D Knit Face Mask,2,Steph in Plymouth,"Plymouth, MN",I love that this mask fits nice and snug on th...,2020-07-10,13:07:13,4.0,1,love mask fit nice snug side face narrow heck ...,0.0,0.602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-D Knit Face Mask,3,babashopper,Illinois,These are the best everyday/everywhere masks--...,2020-07-04,10:54:14,5.0,1,best everyday/everywhere masks---simple seamle...,0.114,0.611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-D Knit Face Mask,4,Marcie,"Fairfax, VA",I ordered the gray mask. Everything about the ...,2020-07-12,16:34:43,5.0,1,ordered gray mask everything design material s...,0.064,0.476,...,0.0,0.479032,0.0,0.0,0.0,0.390904,0.0,0.0,0.321848,0.0


In [17]:
X = df['cleaned_reviews']
y = df['is_positive']

In [18]:
y.value_counts()

1    383
0     51
Name: is_positive, dtype: int64

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [39]:
def build_model(model_type, X, y, smote=False, test_size=0.3):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=101)
    tfidf = TfidfVectorizer(min_df = 10, lowercase=True, stop_words='english')
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)
    
    if smote == True:
        sm = SMOTE()
        X_train, y_train = sm.fit_resample(X_train, y_train)
        X_test, y_test = sm.fit_resample(X_test, y_test)
    

    if model_type.lower() == 'naive bayes':
        model = MultinomialNB()
        model.fit(X_train, y_train)
    
    elif model_type.lower() == 'support vector':
        model = SVC()
        model.fit(X_train, y_train)
        
    elif model_type.lower() == 'logistic':
        model = LogisticRegression()
        model.fit(X_train, y_train)
        
    elif model_type.lower() == 'xgb':
        model = XGBClassifier()
        model.fit(X_train, y_train)
        
    r_squared = model.score(X_test, y_test)
    print('R-Squared: %0.2f' % (r_squared))
    adj_r_squared = 1 - ((1-r_squared) * len(X) / (len(X) - 1 - 1))
    print('Adjusted R-Squared: %0.2f' % (adj_r_squared))
    print('\n')
    
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print('\n')
    print(classification_report(y_test, y_pred))


In [40]:
build_model('naive bayes', X, y, smote=True)

R-Squared: 0.73
Adjusted R-Squared: 0.73


[[ 72  47]
 [ 18 101]]


              precision    recall  f1-score   support

           0       0.80      0.61      0.69       119
           1       0.68      0.85      0.76       119

    accuracy                           0.73       238
   macro avg       0.74      0.73      0.72       238
weighted avg       0.74      0.73      0.72       238



In [41]:
build_model('support vector', X, y, smote=True)

R-Squared: 0.83
Adjusted R-Squared: 0.83


[[ 86  33]
 [  8 111]]


              precision    recall  f1-score   support

           0       0.91      0.72      0.81       119
           1       0.77      0.93      0.84       119

    accuracy                           0.83       238
   macro avg       0.84      0.83      0.83       238
weighted avg       0.84      0.83      0.83       238



In [42]:
build_model('logistic', X, y, smote=True)

R-Squared: 0.79
Adjusted R-Squared: 0.79


[[ 86  33]
 [ 17 102]]


              precision    recall  f1-score   support

           0       0.83      0.72      0.77       119
           1       0.76      0.86      0.80       119

    accuracy                           0.79       238
   macro avg       0.80      0.79      0.79       238
weighted avg       0.80      0.79      0.79       238



In [43]:
build_model('xgb', X, y, smote=True)

R-Squared: 0.70
Adjusted R-Squared: 0.70


[[ 62  57]
 [ 15 104]]


              precision    recall  f1-score   support

           0       0.81      0.52      0.63       119
           1       0.65      0.87      0.74       119

    accuracy                           0.70       238
   macro avg       0.73      0.70      0.69       238
weighted avg       0.73      0.70      0.69       238

