# Lab 5: Kaggle Competition

In this lab, I will be building a predictive model for a yelp dataset

## Import Libraries

In [135]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

## Import Data 

In [108]:
df_train = pd.read_csv("../Data/yelp_train.csv")
df_test = pd.read_csv("../Data/yelp_test.csv")

## Data Preprocessing

In [160]:
#split into labels and features
X_train = df_train.drop(["review_id", "is_good_rating"], axis=1)
y_train = df_train.is_good_rating

X_test = df_test.drop('review_id', axis=1)

In [92]:
#categorize businesses
# bus_cat_train = X_train['business_categories'].fillna("None")
# bus_cat_train = bus_cat_train.apply(lambda x : x.split(','))

# mlb = MultiLabelBinarizer()
# bus_cat_train = mlb.fit_transform(bus_cat_train)

# #categorize businesses
# bus_cat_test = X_test['business_categories'].fillna("None")
# bus_cat_test = bus_cat_test.apply(lambda x : x.split(','))

# bus_cat_test = mlb.transform(bus_cat_test)

In [94]:
# bus_cat_train #onehotencoded df of types of businesses

In [93]:
# bus_cat_train = pd.DataFrame(bus_cat_train)
# bus_cat_train.head()

### Differentiate good and bad reviews by keywords

In [161]:
# make some categoricals from text feature
good_tr = X_train.text.apply(lambda x : 1 if re.search(r'good', x) else 0)
great_tr = X_train.text.apply(lambda x : 1 if re.search(r'great', x) else 0)
bad_tr = X_train.text.apply(lambda x : 1 if re.search(r'bad', x) else 0)
amazing_tr = X_train.text.apply(lambda x : 1 if re.search(r'amazing', x) else 0)

In [162]:
# turn into dataframe
common_words_train = pd.DataFrame({'good' : good_tr, 'bad' : bad_tr, 'great' : great_tr, 'amazing' : amazing_tr})

In [163]:
# make some categoricals from text feature repeat for test data
good_tst = X_test.text.apply(lambda x : 1 if re.search(r'good', x) else 0)
great_tst = X_test.text.apply(lambda x : 1 if re.search(r'great', x) else 0)
bad_tst = X_test.text.apply(lambda x : 1 if re.search(r'bad', x) else 0)
amazing_tst = X_test.text.apply(lambda x : 1 if re.search(r'amazing', x) else 0)

In [164]:
# turn into dataframe
common_words_tst = pd.DataFrame({'good' : good_tst, 'bad' : bad_tst, 'great' : great_tst, 'amazing' : amazing_tst})

### Differentiate elite yelpers from non-elite

In [165]:
#turn user elite column into number of times user has been user elite
user_elite_tr = X_train.user_elite.apply(lambda x : 0 if x == 'None' else len(x.split(",")))
user_elite_tst = X_test.user_elite.apply(lambda x : 0 if x == 'None' else len(x.split(",")))

### Calculate diff from start of yelp to the year of the review

In [166]:
review_date_tr = X_train.date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)
start_date_tr = X_train.user_yelping_since.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)

date_diff_tr = review_date_tr-start_date_tr

In [167]:
review_date_tst = X_test.date.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)
start_date_tst = X_test.user_yelping_since.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)

date_diff_tst = review_date_tst-start_date_tst

## Update Training Data and Test Data

In [168]:
new_X_train = X_train.drop(['text', 'business_id', 'user_id', 'business_categories', 'business_latitude', 
              'business_longitude', 'business_state', 'business_city', 'date', 'user_yelping_since'], axis=1)
new_X_train['user_elite'] = user_elite_tr
new_X_train['date_diff'] = date_diff_tr
new_X_train = new_X_train.join(common_words_train)
new_X_train.head()

Unnamed: 0,cool,funny,useful,user_average_stars,user_elite,user_review_count,business_review_count,business_average_stars,date_diff,good,bad,great,amazing
0,0,0,0,2.0,0,5,158,4.0,4,0,0,0,0
1,0,0,0,4.43,0,7,26,3.5,1,0,0,0,0
2,0,0,0,4.09,0,21,189,4.0,4,0,0,0,0
3,1,0,0,3.55,2,83,316,3.0,0,1,0,1,0
4,1,1,1,3.75,0,28,61,3.5,0,0,0,0,0


In [169]:
new_X_test = X_test.drop(['text', 'business_id', 'user_id', 'business_categories', 'business_latitude', 
              'business_longitude', 'business_state', 'business_city', 'date', 'user_yelping_since'], axis=1)
new_X_test['user_elite'] = user_elite_tst
new_X_test['date_diff'] = date_diff_tst
new_X_test = new_X_test.join(common_words_tst)
new_X_test.head()

Unnamed: 0,cool,funny,useful,user_average_stars,user_elite,user_review_count,business_review_count,business_average_stars,date_diff,good,bad,great,amazing
0,2,1,2,3.8,4,369,81,4.0,4,0,1,0,0
1,1,0,2,3.8,10,483,18,5.0,4,0,0,0,0
2,1,1,1,4.14,0,7,572,4.0,1,0,0,0,1
3,1,1,2,4.26,2,131,33,4.0,1,0,0,0,0
4,7,6,11,3.83,6,212,558,4.0,3,1,0,1,1


## Train ML Models

### Logistic Regression 

In [172]:
lr = LogisticRegression(solver='liblinear')
cv_results = cross_validate(lr, new_X_train, y_train, cv=5)
cv_results['test_score']

array([0.80114998, 0.800025  , 0.79954167, 0.79834997, 0.80099585])

In [173]:
lr.fit(new_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [174]:
y_pred = lr.predict(new_X_train)
accuracy_score(y_pred, y_train)

0.8003208333333334

In [175]:
submission = pd.DataFrame(index=df_test.review_id)
submission['is_good_rating'] = lr.predict(new_X_test)

In [176]:
submission.reset_index().to_csv('submission.csv', index=False)

### Linear SVC 

In [143]:
svm = LinearSVC()
cv_results = cross_validate(svm, new_X_train, y_train, cv=5)
cv_results['test_score']



KeyboardInterrupt: 

In [None]:
svm = LinearSVC()
svm.fit(new_X_train, y_train)

In [134]:
accuracy_score(svm.predict(new_X_train), y_train)

0.7427916666666666

### Decision Tree

In [144]:
clf = DecisionTreeClassifier()
cv_results = cross_validate(clf, new_X_train, y_train, cv=5)
cv_results['test_score']

array([0.73083894, 0.7313806 , 0.729375  , 0.72880685, 0.72647347])

In [145]:
clf = DecisionTreeClassifier()
clf.fit(new_X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [146]:
accuracy_score(clf.predict(new_X_train), y_train)

0.9994166666666666