In [55]:
!pip install imblearn -q
!pip install keras -q
!pip install tensorflow -q

# Get the data
Same as in EDA notbook

In [56]:
import pandas as pd
import folium
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import re
import seaborn as sns
sns.set_style('white')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [57]:
df = pd.read_csv("../data/data_clean_new.csv", 
                          encoding='utf_8', 
                          dtype = 'unicode',
                          parse_dates = True,
                          infer_datetime_format = True,
                          low_memory=False)
df = df.drop("Unnamed: 0", axis = 1)

In [58]:
df.stars.value_counts()

3.5    12833
4.0    12816
3.0     9421
4.5     5881
2.5     5080
2.0     2771
1.5      936
5.0      931
1.0      217
Name: stars, dtype: int64

In [59]:
for column in df.columns :
    if df[column][0] == 'True' or df[column][0] =='False':
    #This tests if the column is a boolean by using the first row for efficiency    
        df[column] = df[column]=='True'
        
    #for some columns we have NaN, in that case we test if we find a True or False value
    elif "True" in df[column].values :
        df[column] = df[column]=='True'
    elif "False" in df[column].values :
        df[column] = df[column]=='True'

In [60]:
cuisine_type = ["American (New)","American (Traditional)","Arts & Entertainment","Asian Fusion","Bakeries","Barbeque","Bars",
"Beer","Breakfast & Brunch","Buffets","Burgers","Cafes","Canadian (New)","Caribbean","Caterers","Chicken Wings",
"Chinese","Cocktail Bars","Coffee & Tea","Comfort Food","Delis","Desserts","Diners","Ethnic Food",
"Event Planning & Services","Fast Food","Food","Food Delivery Services","French","Gastropubs","Gluten-Free",
"Greek","Grocery","Halal","Hot Dogs","Ice Cream & Frozen Yogurt","Indian","Italian","Japanese","Juice Bars & Smoothies",
"Korean","Latin American","Lounges","Mediterranean","Mexican","Middle Eastern","Nightlife","Pizza","Pubs",
"Salad","Sandwiches","Seafood","Soup","Specialty Food","Sports Bars","Steakhouses","Sushi Bars","Tex-Mex",
"Thai","Vegan","Vegetarian","Vietnamese","Wine & Spirits","Wine Bars"]

ambiance = ["romantic","intimate","classy","hipster","divey","touristy","trendy","upscale","casual"]

In [61]:
for column in df[cuisine_type] :
    df[column] = df[column]=="1"

In [62]:
df['stars']=df['stars'].astype(float)
df.Price = pd.to_numeric(df.Price, errors='coerce')
df = df[np.isfinite(df['Price'])]

In [63]:
df.columns

Index(['address', 'business_id', 'city', 'is_open', 'latitude', 'longitude',
       'name', 'postal_code', 'review_count', 'stars', 'state', 'Price',
       'American (New)', 'American (Traditional)', 'Arts & Entertainment',
       'Asian Fusion', 'Bakeries', 'Barbeque', 'Bars', 'Beer',
       'Breakfast & Brunch', 'Buffets', 'Burgers', 'Cafes', 'Canadian (New)',
       'Caribbean', 'Caterers', 'Chicken Wings', 'Chinese', 'Cocktail Bars',
       'Coffee & Tea', 'Comfort Food', 'Delis', 'Desserts', 'Diners',
       'Ethnic Food', 'Event Planning & Services', 'Fast Food', 'Food',
       'Food Delivery Services', 'French', 'Gastropubs', 'Gluten-Free',
       'Greek', 'Grocery', 'Halal', 'Hot Dogs', 'Ice Cream & Frozen Yogurt',
       'Indian', 'Italian', 'Japanese', 'Juice Bars & Smoothies', 'Korean',
       'Latin American', 'Lounges', 'Mediterranean', 'Mexican',
       'Middle Eastern', 'Nightlife', 'Pizza', 'Pubs', 'Salad', 'Sandwiches',
       'Seafood', 'Soup', 'Specialty Food', 'Spo

In [64]:
df["review_count"]= df["review_count"].astype(int)
df["name_length"]= df["name_length"].astype(int)

# Base rate
The base rate is the size of the most common class divided by the size of the dataset.
Our accuracy should be better than the default rate

In [65]:
print("The most common class for the ratings is", df["stars"].mode()[0])

baseRate = df[df["stars"] == 3.5].count()["stars"] / df["stars"].count()
print("The baserate is :", baseRate)

The most common class for the ratings is 3.5
The baserate is : 0.25228663034284704


# Preprocess data

In [66]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

In [67]:
#mean normalisation
#df["normalized_review_count"]=(df["review_count"]-df["review_count"].mean())/df["review_count"].std()

#min-max normalisation 
#df["normalized_review_count"]=(df["review_count"]-df["review_count"].min())/(df["review_count"].max()-df["review_count"].min())

In [68]:
#df["filter_stars"] = df[df["stars"] != 1]["stars"]
#df["filter_stars"] = df["filter_stars"].dropna()

In [69]:
X = df[df['stars'] != 1]
X = X.drop(['stars', "name", "address", "business_id", "city", "state", 'postal_code', 'latitude', 'longitude'], axis = 1)
y = df["stars"][df["stars"] != 1]

In [70]:
from sklearn import preprocessing
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
encoded_y = lab_enc.fit_transform(y) #we label encode the star ratings

In [71]:
# split train/test
from sklearn.model_selection import train_test_split, GridSearchCV
from pprint import pprint
from time import time
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size=0.2, random_state=72)

In [72]:
pd.Series(y_train).value_counts()

4    10268
5    10266
3     7539
6     4702
2     4056
1     2202
0      736
7      729
dtype: int64

In [73]:
med_cl_cnt= int(pd.Series(y_train).value_counts().median())

In [74]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import AllKNN, NearMiss
X_resampled, y_resampled = SMOTE(sampling_strategy = {7:med_cl_cnt, 0:med_cl_cnt, 1:med_cl_cnt, 2:med_cl_cnt}).fit_resample(X_train, y_train) # upsampling
X_resampled, y_resampled = NearMiss(sampling_strategy = {3:med_cl_cnt, 4:med_cl_cnt, 5:med_cl_cnt, 6:med_cl_cnt}).fit_resample(X_resampled, y_resampled) #downsampling

In [75]:
import numpy
unique, counts = numpy.unique(y_resampled, return_counts=True)
dict(zip(unique, counts))

{0: 4379, 1: 4379, 2: 4379, 3: 4379, 4: 4379, 5: 4379, 6: 4379, 7: 4379}

In [None]:
#balance classes

#balanced = pd.DataFrame(columns=df.columns) #create empty df
#for i in df["filter_stars"].dropna().unique():
  #  balanced = balanced.append(df[df["filter_stars"] == i].sample(900))


In [None]:
#balanced["stars"].value_counts()

In [None]:

#X = df[["name_length", "normalized_review_count", "Price", "American (New)", "Sandwiches"]]
#X = balanced.drop(["stars","filter_stars", "name", "address", "business_id", "city", "state", "review_count", 'postal_code'], axis = 1)
#y = balanced["stars"]


In [None]:
parametersRF = {
    'n_estimators': (100,200,300),
    'max_depth': (10,20,30)
}
parametersLR = {
    'C': (0.1, 1,100),
    'solver': (['saga','lbfgs'])
}
parametersNN = {
    'epochs': ([10, 100]),
    'batch_size': ([20,30])
}

# Logistic Regression

In [None]:
LR = LogisticRegression(solver='lbfgs', max_iter=2000, multi_class = "auto")
LR.fit(X_train, y_train)

In [None]:
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(LR, parametersLR, cv=2,
                               n_jobs=-1, verbose=1,scoring='accuracy')

    print("Performing grid search...")
    #print("pipeline:", [name for name, _ in pipeline2.steps])
    print("parameters:")
    pprint(parametersLR)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score for Logistic Regression: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parametersLR.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

The best model according to gridsearch is the one we tested at the beginning with lbfgs solver and C equal to 1.

In [None]:
# decomment, if need to run best model with other params 
#LR = LogisticRegression(solver='lbfgs',C=1, max_iter=2000, multi_class = "auto")
#LR.fit(X_train, y_train)

In [None]:
# train accuracy
LR.score(X_train,y_train)

In [None]:
# test accuracy
LR.score(X_test, y_test)

In [76]:
from sklearn.metrics import classification_report
target_names = ["1.5","2","2.5","3","3.5","4","4.5","5"]

#print(classification_report(y_test, LR.predict(X_test), target_names= target_names))

Our test accuracy is above the baserate but it isn't really a good result

# Random Forest Classifier

In [79]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

clf = RandomForestClassifier(n_estimators = 200,max_depth = 30)
clf.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [80]:
print("Train accuracy :", clf.score(X_resampled, y_resampled))
print("Test accuracy :", clf.score(X_test, y_test))

Train accuracy : 0.8759705412194565
Test accuracy : 0.22903703703703704


In [None]:
if __name__ == "__main__": # just for multiprocessing purposes
    grid_search = GridSearchCV(clf, parametersRF, cv=3,
                               n_jobs=-1, verbose=1,scoring='accuracy')

    print("Performing grid search...")
    #print("Random Forest:", [name for name, _ in clf.steps])
    print("parameters:")
    pprint(parametersRF)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score for Random Forest: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parametersRF.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [83]:
clf = RandomForestClassifier(n_estimators = 100,max_depth = 30)
clf.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [84]:
print("Train accuracy :", clf.score(X_resampled, y_resampled))
print("Test accuracy :", clf.score(X_test, y_test))

Train accuracy : 0.867121488924412
Test accuracy : 0.2308148148148148


In [None]:
print(classification_report(y_test, clf.predict(X_test), target_names= target_names))

In [None]:
model = ExtraTreesClassifier(n_estimators=100)
model.fit(X_train,y_train)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

# Neural network

In [40]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
np.random.seed(1143)

In [41]:
def model_NN():
    model = Sequential()
    model.add(Dense(512, input_shape=(69,)))
    model.add(Activation('relu')) # An "activation" is just a non-linear function applied to the output
                              # of the layer above. Here, with a "rectified linear unit",
                              # we clamp all values below 0 to 0.
                           
    model.add(Dropout(0.2))   # Dropout helps protect the model from memorizing or "overfitting" the training data
    model.add(Dense(8))
    model.add(Activation('softmax')) # This special "softmax" activation among other things,
                                 # ensures the output is a valid probaility distribution, that is
                                 # that its values are all non-negative and sum to 1.
    #optimizer = optimizers.Adam(lr=0.01, decay=1e-6)
    optimizer = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [42]:
model = KerasClassifier(build_fn= model_NN, epochs=100, batch_size=10, verbose=0)

In [43]:
X = df.drop(['stars', "name", "address", "business_id", "city", "state", 'postal_code', "longitude", "latitude"], axis = 1)
y = df["stars"]

In [None]:
#nb_classes = len(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=72)


In [44]:
from keras.utils import to_categorical

y_train = to_categorical(y_resampled, num_classes=8)
y_test = to_categorical(y_test, num_classes=8)

In [48]:
model_hist = model.fit(X_resampled, y_train,
                       batch_size=64, epochs=10,
                       verbose=1, validation_split=0.2)

Train on 28025 samples, validate on 7007 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
score = model.score(X_test, y_test, verbose=0)
score

0.1079506203532219