In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
# Use seaborn's "whitegrid" style as the plotting default for this notebook.
sns.set(style="whitegrid")

In [2]:
# Load the business listing (one row per business) from CSV.
business_df = pd.read_csv('business_list.csv')

In [3]:
# Inspect column names, dtypes and non-null counts of the raw business table.
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16134 entries, 0 to 16133
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   stars         16134 non-null  float64
 1   business_id   16134 non-null  object 
 2   name          16134 non-null  object 
 3   city          16134 non-null  object 
 4   state         16134 non-null  object 
 5   postal_code   16113 non-null  object 
 6   latitude      16134 non-null  float64
 7   longitude     16134 non-null  float64
 8   categories    16134 non-null  object 
 9   review_count  16134 non-null  int64  
dtypes: float64(3), int64(1), object(6)
memory usage: 1.2+ MB


In [4]:
# Binarise the star rating into the classification target:
# 0 for ratings below 4 stars, 1 otherwise.
# NOTE: this overwrites the raw rating in place. After one run every value is
# 0 or 1 (all below 4), so running this cell a second time without reloading
# the CSV would turn every label into 0.
rated_below_four = business_df['stars'].lt(4)
business_df['stars'] = np.where(rated_below_four, 0, 1)

In [5]:
# Class balance of the binarised target (0 = below 4 stars, 1 = 4 stars or more).
business_df['stars'].value_counts()

0    9919
1    6215
Name: stars, dtype: int64

In [6]:
# Load the per-business attribute table from CSV.
attributes_df = pd.read_csv('attributes.csv')

In [7]:
# Inner-join the attribute table onto the business table by business_id.
# Columns present in both inputs come back with _x (business_df) and
# _y (attributes_df) suffixes.
business_df_merged = business_df.merge(attributes_df, on='business_id')

In [8]:
# Inspect the merged frame: row count after the join, column names and dtypes.
business_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10721 entries, 0 to 10720
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   stars_x                            10721 non-null  int32  
 1   business_id                        10721 non-null  object 
 2   name                               10721 non-null  object 
 3   city                               10721 non-null  object 
 4   state                              10721 non-null  object 
 5   postal_code_x                      10721 non-null  object 
 6   latitude                           10721 non-null  float64
 7   longitude                          10721 non-null  float64
 8   categories_x                       10721 non-null  object 
 9   review_count                       10721 non-null  int64  
 10  attributes.RestaurantsTakeOut      10721 non-null  bool   
 11  attributes.GoodForKids             10721 non-null  boo

In [9]:
# Restore the original names of the business_df columns that the merge
# suffixed with _x.
suffixed_to_original = {
    'stars_x': 'stars',
    'postal_code_x': 'postal_code',
    'categories_x': 'categories',
}
business_df_merged = business_df_merged.rename(columns=suffixed_to_original)


In [10]:
# Save the merged table to CSV with a header row and without the index column.
business_df_merged.to_csv('business_df_merged.csv', index=False, header=True)

In [11]:
# Columns excluded from the feature matrix: the target itself, identifier and
# name columns, the duplicated _y columns from the attribute table, and the
# remaining columns listed here.
non_feature_columns = [
    'stars',
    'state',
    'business_id',
    'name',
    'review_count',
    'attributes.RestaurantsTakeOut',
    'attributes.GoodForKids',
    'postal_code_y',
    'categories_y',
    'attributes.RestaurantsDelivery',
    'stars_y',
]

# Target: the binarised star label. Features: everything that is left.
y = business_df_merged['stars']
X = business_df_merged.drop(columns=non_feature_columns)

In [12]:
# Inspect the feature matrix: remaining columns and their dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10721 entries, 0 to 10720
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   city                               10721 non-null  object 
 1   postal_code                        10721 non-null  object 
 2   latitude                           10721 non-null  float64
 3   longitude                          10721 non-null  float64
 4   categories                         10721 non-null  object 
 5   attributes.RestaurantsPriceRange2  10721 non-null  int64  
 6   attributes.WiFi                    10721 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 670.1+ KB


In [13]:
# Inspect the target series: length, non-null count and dtype.
y.info()

<class 'pandas.core.series.Series'>
Int64Index: 10721 entries, 0 to 10720
Series name: stars
Non-Null Count  Dtype
--------------  -----
10721 non-null  int32
dtypes: int32(1)
memory usage: 125.6 KB


In [14]:
from sklearn import preprocessing

# Integer-encode every string (object-dtype) feature column of X in place.
#
# A separate fitted encoder is kept for each column, keyed by column name.
# Re-using a single LabelEncoder across columns (re-fitting it each time)
# would leave only the last column's mapping available, which makes it
# impossible to encode new raw data the same way when the trained model is
# used later.
label_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = preprocessing.LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Re-inspect X after label encoding to check the dtypes of the encoded columns.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10721 entries, 0 to 10720
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   city                               10721 non-null  int32  
 1   postal_code                        10721 non-null  int32  
 2   latitude                           10721 non-null  float64
 3   longitude                          10721 non-null  float64
 4   categories                         10721 non-null  int32  
 5   attributes.RestaurantsPriceRange2  10721 non-null  int64  
 6   attributes.WiFi                    10721 non-null  int32  
dtypes: float64(2), int32(4), int64(1)
memory usage: 502.5 KB


In [17]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range. The scaler is fitted on the
# training split only and then applied to both splits, so the test rows do
# not influence the scaling parameters.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Logistic regression with default settings, trained on the min-max scaled
# training features and evaluated on the scaled test features.
logreg = LogisticRegression()
logreg.fit(rescaledX_train, y_train)
y_pred = logreg.predict(rescaledX_test)

print("Logistic regression classifier: ")

# Print each test-set metric, in this order, formatted to three decimals.
for template, metric in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('Accuracy: %.3f', accuracy_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % metric(y_test, y_pred))

# Confusion matrix of true labels against predicted labels.
cf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cf_mtx)


Logistic regression classifier: 
Precision: 0.667
Recall: 0.019
Accuracy: 0.647
F1 Score: 0.037
Confusion Matrix:
[[2060   11]
 [1124   22]]


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Random forest with default hyper-parameters and a fixed seed, trained on
# the min-max scaled training features and evaluated on the scaled test
# features.
rf = RandomForestClassifier(random_state=0)
rf.fit(rescaledX_train, y_train)
y_pred = rf.predict(rescaledX_test)

print("Random forest classifier: ")

# Print each test-set metric, in this order, formatted to three decimals.
for template, metric in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('Accuracy: %.3f', accuracy_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % metric(y_test, y_pred))

# Confusion matrix of true labels against predicted labels.
cf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cf_mtx)


Random forest classifier: 
Precision: 0.878
Recall: 0.795
Accuracy: 0.888
F1 Score: 0.835
Confusion Matrix:
[[1945  126]
 [ 235  911]]


In [20]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from xgboost.sklearn import XGBClassifier

# XGBoost classifier with default hyper-parameters and a fixed seed, trained
# on the min-max scaled training features and evaluated on the scaled test
# features.
xgb = XGBClassifier(random_state=0)
xgb.fit(rescaledX_train, y_train)
y_pred = xgb.predict(rescaledX_test)

print("XGB classifier: ")

# Print each test-set metric, in this order, formatted to three decimals.
for template, metric in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('Accuracy: %.3f', accuracy_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % metric(y_test, y_pred))

# Confusion matrix of true labels against predicted labels.
cf_mtx = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cf_mtx)

  from pandas import MultiIndex, Int64Index


XGB classifier: 
Precision: 0.844
Recall: 0.692
Accuracy: 0.845
F1 Score: 0.760
Confusion Matrix:
[[1924  147]
 [ 353  793]]


In [21]:
import pickle

# Persist the fitted random forest to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() would leave the handle open.
#
# NOTE: rf was trained on label-encoded, min-max scaled features, so anyone
# loading this file must apply the same encoding and the fitted `scaler` to
# raw inputs before calling predict. Only load this file from a trusted
# location, since unpickling can execute arbitrary code.
with open('rf_model.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)