In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings raised during
# model fitting (e.g. convergence or deprecation notices) — consider narrowing it.
warnings.filterwarnings(action='ignore')

# Load data
# Read the pre-computed sentiment/feature table from the working directory.
yelp_data = pd.read_csv(filepath_or_buffer='yelp_data_sentiment.csv')

In [2]:
# Preview the first five rows to sanity-check the loaded columns.
yelp_data.head(n=5)

Unnamed: 0,Name,Review,Polarity,Sentiment,Positive_Words_P,chocol,cup,amaz,eat,year,...,linguin,linguini,jimmi,player,juliana,pool tabl,castl,falafel,paella,white castl
0,Morris Park Bake Shop,'The chocolate cups are amazing! Have been eat...,0.5,Positive,0.222222,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Morris Park Bake Shop,'Morris Park Bake Shop has become my go to spo...,0.338889,Slightly Positive,0.206897,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Morris Park Bake Shop,'I thought the cookies and biscotti were prett...,0.314583,Slightly Positive,0.130435,0.0,0.0,0.0,0.0,0.160339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Morris Park Bake Shop,'Guys.... so Im a big time biscotti connoisseu...,0.238068,Slightly Positive,0.12766,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.149255,0.0,0.0,0.0
4,Morris Park Bake Shop,'I had a craving for a special type of cake wi...,0.314643,Slightly Positive,0.21875,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from pycm import *

# Features: every column from position 4 onward; target: the Sentiment label.
# NOTE(review): assumes the columns are ordered Name, Review, Polarity, Sentiment,
# then the numeric features (as in the head() preview) — TODO confirm, since the
# slice is positional.
X = yelp_data.iloc[:, 4:]
y = yelp_data['Sentiment']
indices = yelp_data.index  # carried through the split so rows can be traced back

# 80/20 split with a fixed seed; itrain/itest hold the original row labels.
X_train, X_test, y_train, y_test, itrain, itest = train_test_split(
    X, y, indices, train_size=0.8, random_state=7
)

# Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

In [5]:
# Pipeline: standardise the features, then logistic regression (lbfgs solver).
steps = [
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs')),
]
pipeline = Pipeline(steps=steps)

# Candidate values of C (inverse regularisation strength) for the 'lr' step.
parameters = {'lr__C': [0.01, 0.1, 1, 10, 100]}

# 10-fold cross-validated grid search, ranked by accuracy.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)

# Show the winning parameter combination.
clf.best_params_

{'lr__C': 0.1}

In [6]:
filename = 'lr.sav'
# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [33]:
# Reload the persisted logistic-regression search so `clf` refers to it again
# (`clf` is rebound by every other model section in this notebook).
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# artefacts written by this notebook.
with open('lr.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

In [34]:
# Predicted sentiment labels for the held-out rows.
results = clf.predict(X=X_test)

In [35]:
# Score of the fitted search on the held-out split.
test_accuracy = clf.score(X_test, y_test)

# F1 under three averaging schemes, evaluated in order: macro, micro, weighted.
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=scheme)
    for scheme in ('macro', 'micro', 'weighted')
)

# Report all four metrics.
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

Accuracy on test data:  0.6951553930530164
F1 Score (macro):  0.6933462169665948
F1 Score (micro):  0.6951553930530164
F1 Score (weighted):  0.6957884475947633


In [36]:
# Keep the logistic-regression metrics under model-specific names, because the
# generic metric variables are overwritten by each later model section.
lr_acc, lr_f1, lr_f1m, lr_f1w = (
    test_accuracy,
    f1_accuracy,
    f1_accuracym,
    f1_accuracyw,
)

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Pipeline: standardise the features, then a random forest.
# random_state pins the forest's bootstrap/feature sampling so the search result
# is reproducible (same seed value as the train/test split).
steps = [
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=7)),
]
pipeline = Pipeline(steps)

# 'auto' was dropped from the grid: for classifiers it was an alias of 'sqrt'
# (so it searched the same setting twice), it was deprecated in scikit-learn 1.1
# and removed in 1.3, where it makes those candidate fits fail.
# Add e.g. 'log2' or a float fraction here to actually search over max_features.
parameters = {
    'rf__n_estimators': [10, 20, 30, 40, 50],
    'rf__max_features': ['sqrt'],
}

# 10-fold cross-validated grid search, ranked by accuracy.
clf = GridSearchCV(pipeline, parameters, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)

# Show the winning parameter combination.
clf.best_params_

{'rf__max_features': 'sqrt', 'rf__n_estimators': 50}

In [11]:
filename = 'rf.sav'
# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [18]:
# Reload the persisted random-forest search into `clf`.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# artefacts written by this notebook.
with open('rf.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

In [19]:
# Predicted sentiment labels for the held-out rows.
results = clf.predict(X=X_test)

In [20]:
# Score of the fitted search on the held-out split.
test_accuracy = clf.score(X_test, y_test)

# F1 under three averaging schemes, evaluated in order: macro, micro, weighted.
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=scheme)
    for scheme in ('macro', 'micro', 'weighted')
)

# Report all four metrics.
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

Accuracy on test data:  0.6590493601462523
F1 Score (macro):  0.6584350649084096
F1 Score (micro):  0.6590493601462523
F1 Score (weighted):  0.6583161189038397


In [21]:
# Keep the random-forest metrics under model-specific names, because the
# generic metric variables are overwritten by each later model section.
rf_acc, rf_f1, rf_f1m, rf_f1w = (
    test_accuracy,
    f1_accuracy,
    f1_accuracym,
    f1_accuracyw,
)

# Support Vector Classification (SVC)

In [22]:
from sklearn.svm import SVC

# Pipeline: standardise the features, then a linear-kernel support vector classifier.
# NOTE(review): gamma is a coefficient for the rbf/poly/sigmoid kernels, so
# gamma='auto' presumably has no effect with kernel='linear' — confirm and drop.
steps = [
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=False, kernel='linear', gamma='auto')),
]
pipeline = Pipeline(steps=steps)

# Candidate values of the SVC regularisation parameter C.
parameters = {'svc__C': [0.01, 0.1, 1]}

# 3-fold cross-validated grid search, ranked by accuracy
# (fewer folds here than the 10 used for the other models).
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3, scoring="accuracy")
clf.fit(X_train, y_train)

# Show the winning parameter combination.
clf.best_params_

{'svc__C': 0.01}

In [23]:
filename = 'svc.sav'
# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [24]:
# Predicted sentiment labels for the held-out rows.
results = clf.predict(X=X_test)

In [25]:
# Score of the fitted search on the held-out split.
test_accuracy = clf.score(X_test, y_test)

# F1 under three averaging schemes, evaluated in order: macro, micro, weighted.
f1_accuracy, f1_accuracym, f1_accuracyw = (
    f1_score(y_test, results, average=scheme)
    for scheme in ('macro', 'micro', 'weighted')
)

# Report all four metrics.
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

Accuracy on test data:  0.7010968921389397
F1 Score (macro):  0.7017782289569849
F1 Score (micro):  0.7010968921389397
F1 Score (weighted):  0.701150485468546


In [26]:
# Keep the SVC metrics under model-specific names, because the generic metric
# variables are overwritten by each later model section.
svc_acc, svc_f1, svc_f1m, svc_f1w = (
    test_accuracy,
    f1_accuracy,
    f1_accuracym,
    f1_accuracyw,
)

# Gradient Boosted Classifier

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

# Pipeline: standardise the features, then gradient-boosted trees.
# max_features='sqrt' makes each split consider a random feature subset, so the
# estimator is stochastic; random_state pins that sampling so the search result
# is reproducible (same seed value as the train/test split).
steps = [
    ('scaler', StandardScaler()),
    ('gbc', GradientBoostingClassifier(max_features='sqrt', random_state=7)),
]
pipeline = Pipeline(steps)

# 5 x 5 grid over the number of boosting stages and the learning rate.
parameters = {
    'gbc__n_estimators': [10, 50, 100, 200, 500],
    'gbc__learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25],
}

# 10-fold cross-validated grid search, ranked by accuracy.
clf = GridSearchCV(pipeline, parameters, cv=10, scoring="accuracy")
clf.fit(X_train, y_train)

# Show the winning parameter combination.
clf.best_params_

{'gbc__learning_rate': 0.2, 'gbc__n_estimators': 500}

In [28]:
filename = 'gbc.sav'
# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [29]:
# Predicted sentiment labels for the held-out rows.
results = clf.predict(X=X_test)

In [30]:
# Score of the fitted search on the held-out split.
test_accuracy = clf.score(X_test, y_test)

# The former `probs = clf.predict_proba(X_test)[:, 1]` line was removed: the value
# was never used, it cost an extra prediction pass, and with a multi-class target
# column 1 is just whichever class sorts second — not a meaningful "positive" score.

# F1 under the three averaging schemes, same as for the other models.
f1_accuracy = f1_score(y_test, results, average='macro')
f1_accuracym = f1_score(y_test, results, average='micro')
f1_accuracyw = f1_score(y_test, results, average='weighted')

# Report all four metrics.
print("Accuracy on test data: ", test_accuracy)
print('F1 Score (macro): ', f1_accuracy)
print('F1 Score (micro): ', f1_accuracym)
print('F1 Score (weighted): ', f1_accuracyw)

Accuracy on test data:  0.7212065813528337
F1 Score (macro):  0.7239217920748415
F1 Score (micro):  0.7212065813528337
F1 Score (weighted):  0.7219657542040139


In [31]:
# Keep the gradient-boosting metrics under model-specific names for the
# comparison table.
gbc_acc, gbc_f1, gbc_f1m, gbc_f1w = (
    test_accuracy,
    f1_accuracy,
    f1_accuracym,
    f1_accuracyw,
)

# Results

In [37]:
# One row per model with its hold-out metrics, rounded to three decimals.
result1 = pd.DataFrame(
    [
        ('Logistic Regression', lr_acc, lr_f1, lr_f1m, lr_f1w),
        ('Random Forest', rf_acc, rf_f1, rf_f1m, rf_f1w),
        ('SVC', svc_acc, svc_f1, svc_f1m, svc_f1w),
        ('GBC', gbc_acc, gbc_f1, gbc_f1m, gbc_f1w),
    ],
    columns=['Model', 'Accuracy', 'F1_Macro', 'F1_Micro', 'F1_Weighted'],
).round(3)

# Display the comparison table.
result1

Unnamed: 0,Model,Accuracy,F1_Macro,F1_Micro,F1_Weighted
0,Logistic Regression,0.695,0.693,0.695,0.696
1,Random Forest,0.659,0.658,0.659,0.658
2,SVC,0.701,0.702,0.701,0.701
3,GBC,0.721,0.724,0.721,0.722
