## YeLP Reviews Sentiment Analysis and Topic Modeling

In [1]:
DF_PATH = '../data/processed/yelp_data_engineered.pkl'

### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Read dataset

In [3]:
df = pd.read_pickle(DF_PATH)
df.shape

(10000, 1004)

In [4]:
df.sample(3)

Unnamed: 0,stars,cool,useful,funny,10,100,12,15,20,25,...,yelp,yes,yet,yogurt,youll,young,youre,youve,yum,yummy
7852,1,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2759,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4431,2,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Drop duplicated cool columns

In [5]:
duplicated_columns = df.columns[df.columns.duplicated(keep='first')]
column_to_drop = duplicated_columns[0] 

index_to_drop = df.columns.get_loc(column_to_drop)

df.drop(df.columns[index_to_drop], axis=1, inplace=True)

In [6]:
X = df.drop('stars', axis=1)
y = df['stars']

### Model creation using cross validation

In [7]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
dt_clf = DecisionTreeClassifier(random_state=42)
svc_clf = SVC(kernel='rbf', random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_clf = XGBClassifier(n_estimators=100, random_state=42)

In [9]:
clf_list = [dt_clf, rf_clf, svc_clf, xgb_clf]
clf_names = ['Decision Tree', 'Random Forest', 'SVC', 'XGBoost']

In [10]:
test_acc = []

for clf, name in zip(clf_list, clf_names):
    scores = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print(f'{name} cross validation scores: {scores}')
    print(f'{name} cross validation score: {scores.mean():.4f}')
    test_acc.append(scores.mean())
    print("===========================================")

Decision Tree cross validation scores: [0.6505 0.6245 0.6355 0.6335 0.6385]
Decision Tree cross validation score: 0.6365
Random Forest cross validation scores: [0.747  0.7125 0.7355 0.744  0.74  ]
Random Forest cross validation score: 0.7358
SVC cross validation scores: [0.7175 0.68   0.6995 0.709  0.715 ]
SVC cross validation score: 0.7042
XGBoost cross validation scores: [0.7765 0.7495 0.7625 0.78   0.7605]
XGBoost cross validation score: 0.7658


### Feature selection to improve accuracy

In [11]:
k_best = SelectKBest(score_func=chi2, k=100)
X_selected = k_best.fit_transform(X, y)

In [12]:
test_acc = []

for clf, name in zip(clf_list, clf_names):
    scores = cross_val_score(clf, X_selected, y, cv=k_fold, scoring='accuracy')
    print(f'{name} cross validation scores: {scores * 100}')
    print(f'{name} cross validation score: {scores.mean() * 100:.4f}')
    test_acc.append(scores.mean())
    print("===========================================")

Decision Tree cross validation scores: [64.45 63.1  64.35 63.85 66.2 ]
Decision Tree cross validation score: 64.3900
Random Forest cross validation scores: [75.55 73.   74.95 74.75 73.9 ]
Random Forest cross validation score: 74.4300
SVC cross validation scores: [70.45 66.85 68.45 69.4  69.6 ]
SVC cross validation score: 68.9500
XGBoost cross validation scores: [76.35 73.4  75.6  75.55 75.8 ]
XGBoost cross validation score: 75.3400
