# Lab 5: Kaggle Competition

In this lab, I build a predictive model on a Yelp review dataset to classify whether a review carries a good rating (`is_good_rating`).

## Import Libraries

In [44]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction import DictVectorizer

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score

## Import Data 

In [2]:
# Read the train and test CSVs (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/yelp_train.csv", "../Data/yelp_test.csv")
)

## Data Preprocessing

In [3]:
# Separate the target column from the features; review_id is dropped from both frames.
y_train = df_train["is_good_rating"]
X_train = df_train.drop(columns=["review_id", "is_good_rating"])

X_test = df_test.drop(columns=["review_id"])

In [57]:
# # categorize businesses
# bus_cat_train = X_train['business_categories'].fillna("None")
# bus_cat_train = bus_cat_train.apply(lambda x : x.split(','))

# mlb = MultiLabelBinarizer()
# bus_cat_train = mlb.fit_transform(bus_cat_train)

# #categorize businesses
# bus_cat_test = X_test['business_categories'].fillna("None")
# bus_cat_test = bus_cat_test.apply(lambda x : x.split(','))

# bus_cat_test = mlb.transform(bus_cat_test)

In [55]:
# bus_cat_train #onehotencoded df of types of businesses

In [56]:
# bus_cat_train = pd.DataFrame(bus_cat_train)
# bus_cat_train.head()

In [43]:
# bus_cat_test = pd.DataFrame(bus_cat_test)
# bus_cat_test.head()

### Differentiate good and bad reviews by keywords

In [113]:
# Keywords searched for in the review text; each becomes one 0/1 indicator column.
# Order is significant: it determines the column order of the keyword frame.
words = [
    "good", "great", "bad", "amazing", "awesome",
    "best", "worst", "love", "hate", "favorite",
    "disgusting", "awful", "rude", "subpar", "wonderful",
    "fantastic", "acceptable", "disappointing", "uncool", "delicious",
]

In [114]:
def common_words_df(words, test=False):
    """Build a frame of 0/1 keyword indicators for the review text.

    Reads the module-level ``X_test`` (when ``test`` is truthy) or ``X_train``
    and, for every entry of ``words``, flags the rows whose ``text`` column
    matches it.

    Parameters
    ----------
    words : iterable of str
        Patterns to look for. Each one is passed to ``re`` as a regular
        expression and matched case-sensitively anywhere in the text.
    test : bool, default False
        Select ``X_test`` instead of ``X_train``.

    Returns
    -------
    pandas.DataFrame
        One integer column per word, named after that word, sharing the
        index of the selected frame. With an empty ``words`` the result is an
        empty frame on that index rather than ``None``, so it can still be
        joined.
    """
    data = X_test if test else X_train

    indicators = {}
    for word in words:
        pattern = re.compile(word)
        # Name every column after its keyword. Previously the first keyword's
        # column inherited the Series name 'text', hiding which word it encoded.
        indicators[word] = data.text.apply(
            lambda review, p=pattern: 1 if p.search(review) else 0
        )
    return pd.DataFrame(indicators, index=data.index)

In [115]:
# Keyword indicators for the training reviews.
common_words_train = common_words_df(words, test=False)

In [116]:
# Keyword indicators for the test reviews.
common_words_tst = common_words_df(words, test=True)

### Differentiate elite yelpers from non-elite

In [98]:
# Replace the user_elite string with the number of comma-separated entries it
# holds; the literal string 'None' counts as zero.
def _elite_count(raw):
    if raw == 'None':
        return 0
    return len(raw.split(","))

user_elite_tr = X_train.user_elite.apply(_elite_count)
user_elite_tst = X_test.user_elite.apply(_elite_count)

### Calculate the number of years between the user joining Yelp and the review

In [99]:
# Reduce both date columns to their calendar year, then subtract the
# sign-up year from the review year.
review_date_tr, start_date_tr = (
    X_train[col].apply(lambda stamp: datetime.strptime(stamp, '%Y-%m-%d').year)
    for col in ('date', 'user_yelping_since')
)

date_diff_tr = review_date_tr - start_date_tr

In [100]:
# Same year-difference feature as for the training set, computed on the test set.
review_date_tst, start_date_tst = (
    X_test[col].apply(lambda stamp: datetime.strptime(stamp, '%Y-%m-%d').year)
    for col in ('date', 'user_yelping_since')
)

date_diff_tst = review_date_tst - start_date_tst

## Update Training Data and Test Data

In [117]:
# Columns that are not passed to the model as-is: the free text, identifiers,
# category/location fields, and the raw date strings (summarised by date_diff).
dropped_cols_train = [
    'text', 'business_id', 'user_id', 'business_categories', 'business_latitude',
    'business_longitude', 'business_state', 'business_city', 'date', 'user_yelping_since',
]
new_X_train = (
    X_train
    .drop(columns=dropped_cols_train)
    # user_elite is overwritten in place with its count; date_diff is appended.
    .assign(user_elite=user_elite_tr, date_diff=date_diff_tr)
    .join(common_words_train)
)
# new_X_train = new_X_train.join(bus_cat_train) #add business categories
new_X_train.head()

Unnamed: 0,cool,funny,useful,user_average_stars,user_elite,user_review_count,business_review_count,business_average_stars,date_diff,text,...,disgusting,awful,rude,subpar,wonderful,fantastic,acceptable,disappointing,uncool,delicious
0,0,0,0,2.0,0,5,158,4.0,4,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,4.43,0,7,26,3.5,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,4.09,0,21,189,4.0,4,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,3.55,2,83,316,3.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,3.75,0,28,61,3.5,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
# Standardise the training features. The fitted scaler stays available as `scl`.
# Note: this rebinds new_X_train, so re-running the cell scales already-scaled data.
scl = StandardScaler()
scl.fit(new_X_train)
new_X_train = scl.transform(new_X_train)

In [118]:
# Build the test feature frame with the same drop/assign/join steps as the training frame.
dropped_cols_test = [
    'text', 'business_id', 'user_id', 'business_categories', 'business_latitude',
    'business_longitude', 'business_state', 'business_city', 'date', 'user_yelping_since',
]
new_X_test = (
    X_test
    .drop(columns=dropped_cols_test)
    # user_elite is overwritten in place with its count; date_diff is appended.
    .assign(user_elite=user_elite_tst, date_diff=date_diff_tst)
    .join(common_words_tst)
)
# new_X_test = new_X_test.join(bus_cat_test)
new_X_test.head()

Unnamed: 0,cool,funny,useful,user_average_stars,user_elite,user_review_count,business_review_count,business_average_stars,date_diff,text,...,disgusting,awful,rude,subpar,wonderful,fantastic,acceptable,disappointing,uncool,delicious
0,2,1,2,3.8,4,369,81,4.0,4,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,2,3.8,10,483,18,5.0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,4.14,0,7,572,4.0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,2,4.26,2,131,33,4.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,7,6,11,3.83,6,212,558,4.0,3,1,...,0,0,0,0,0,0,0,0,0,0


In [104]:
# Scale the test features with the scaler already fitted on the training data
# (`scl`). Fitting a fresh StandardScaler on the test set would standardise it
# with different means/variances than the model was trained on, and would also
# overwrite the train-fitted scaler.
new_X_test = scl.transform(new_X_test)

## Train ML Models

### Logistic Regression 

In [127]:
# 5-fold cross-validation of an L2-penalised logistic regression (liblinear, C=100);
# the cell displays the per-fold test scores.
lr = LogisticRegression(penalty='l2', C=100, solver='liblinear')
cv_results = cross_validate(estimator=lr, X=new_X_train, y=y_train, cv=5)
cv_results['test_score']

array([0.82204537, 0.82227454, 0.82170833, 0.81912123, 0.82241297])

In [125]:
# Fit the logistic regression on the full training set.
lr.fit(X=new_X_train, y=y_train)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
# Accuracy on the data the model was fitted on (a training score, not a held-out one).
# accuracy_score expects (y_true, y_pred); the result is the same either way for
# accuracy, but the documented order keeps the call correct if the metric changes.
y_pred = lr.predict(new_X_train)
accuracy_score(y_train, y_pred)

0.8217416666666667

## Grid Search

In [52]:
# Grid search over two model families: logistic regression and random forest.
# The pipeline has a single 'classifier' step which is itself searched over:
# each dict in param_grid swaps in one estimator and tunes only that
# estimator's hyper-parameters via the 'classifier__<param>' keys.
pipe = Pipeline([('classifier', RandomForestClassifier())])

param_grid = [
    {'classifier': [LogisticRegression(solver='liblinear', penalty='l2')],
     # Fixed: this entry previously read `'classifier__C' : 'C': [...]`,
     # which is a syntax error.
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},

    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': list(range(10, 101, 15)),
     'classifier__max_features': list(range(6, 13))}
]

clf = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)

best_clf = clf.fit(new_X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.3min finished


In [53]:
# Training-set accuracy of the grid-search result.
# Arguments are given in the documented (y_true, y_pred) order.
accuracy_score(y_train, best_clf.predict(new_X_train))

0.8006166666666666

In [14]:
# Grid search over the regularisation parameter C of the liblinear / L2
# logistic regression, using 5-fold cross-validation on all available cores.
logreg_base = LogisticRegression(penalty='l2', solver='liblinear')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
clf = GridSearchCV(
    estimator=logreg_base,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
)

best_clf = clf.fit(new_X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   30.8s finished


In [17]:
# List the fields recorded in the grid search's cv_results_ mapping.
best_clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [24]:
# Print the C values tried, then one line of per-candidate test scores for
# each of the five cross-validation folds.
cv_table = best_clf.cv_results_
print(cv_table['param_C'])
for fold in range(5):
    print(cv_table['split{}_test_score'.format(fold)])

[0.001 0.01 0.1 1 10 100 1000]
[0.77833795 0.80100415 0.8015458  0.80114998 0.80117081 0.80131664
 0.80104581]
[0.77902544 0.79925418 0.8002125  0.800025   0.800025   0.80004583
 0.80015   ]
[0.77752083 0.80066667 0.7998125  0.79954167 0.79972917 0.79989583
 0.799875  ]
[0.7777662  0.79847497 0.79859997 0.79834997 0.79853747 0.7984333
 0.79834997]
[0.77843288 0.80097502 0.80116252 0.80099585 0.80141253 0.8014542
 0.8015167 ]


In [15]:
# Training-set accuracy of the logistic-regression grid-search result.
# Arguments are given in the documented (y_true, y_pred) order.
accuracy_score(y_train, best_clf.predict(new_X_train))

0.8003333333333333

# Submission

In [122]:
# One prediction per test review, indexed by review_id. Predictions come from
# the logistic regression `lr`.
submission = pd.DataFrame(
    {'is_good_rating': lr.predict(new_X_test)},
    index=df_test.review_id,
)

In [123]:
# Move review_id from the index into a regular column, then write the CSV without a row index.
submission_out = submission.reset_index()
submission_out.to_csv('submission5.csv', index=False)

### Linear SVC 

In [1]:
# svm = LinearSVC()
# cv_results = cross_validate(svm, new_X_train, y_train, cv=5)
# cv_results['test_score']

In [None]:
# Fit a linear support-vector classifier with default settings on the full training set.
svm = LinearSVC()
svm.fit(X=new_X_train, y=y_train)

In [134]:
# Training-set accuracy of the linear SVC.
# Arguments are given in the documented (y_true, y_pred) order.
accuracy_score(y_train, svm.predict(new_X_train))

0.7427916666666666

### Decision Tree

In [144]:
# 5-fold cross-validation of a default decision tree; the cell displays the per-fold test scores.
clf = DecisionTreeClassifier()
cv_results = cross_validate(estimator=clf, X=new_X_train, y=y_train, cv=5)
cv_results['test_score']

array([0.73083894, 0.7313806 , 0.729375  , 0.72880685, 0.72647347])

In [145]:
# Fit a fresh default decision tree on the full training set.
clf = DecisionTreeClassifier()
clf.fit(X=new_X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [146]:
# Training-set accuracy of the decision tree. A value far above the
# cross-validated scores would indicate that the unconstrained tree overfits.
# Arguments are given in the documented (y_true, y_pred) order.
accuracy_score(y_train, clf.predict(new_X_train))

0.9994166666666666