In [1]:
%matplotlib inline
import urllib.parse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import db_password

In [2]:
# Creating engine and connection to the SQL database

# Percent-encode the password before embedding it in the URL: a raw "@", "/"
# or "%" would otherwise be read as URL syntax and corrupt the credentials.
# NOTE(review): assumes config.db_password holds the raw (not already
# percent-encoded) password - confirm.
escaped_password = urllib.parse.quote(db_password, safe="")
db_string = f"postgresql://postgres:{escaped_password}@127.0.0.1:5432/yelp_DB"
engine = create_engine(db_string)

# Load the whole 'merged_data' table into a DataFrame.
business_df = pd.read_sql_table('merged_data', engine)

In [3]:
# Preview the first rows of the loaded table to sanity-check the import.
business_df.head()

Unnamed: 0,postal_code,restaurant_id,restaurants_name,address,city,states,latitude,longitude,stars_rating,review_count,...,wifi,good_for_groups,wheelchair_accessible,happy_hour,noise_level,dietary_restrictions,total_estimate_households_per_zip,total_estimate_married_couple_family_households,total_estimate_nonfamily_households,median_income_dollars
0,83709,oCVcvXmtVJKAH8vpFCoVyg,#1 Mongolian BBQ - Best Stir Fried Noodles In ...,"8249 W Overland Rd, Ste 180",Boise,ID,43.589722,-116.285309,3.5,51,...,True,True,False,False,4,False,21787.0,12577.0,6435.0,74941.0
1,19104,k_43W3zbbEuvYuLZviSJKA,&pizza - UPenn,3925 Walnut St,Philadelphia,PA,39.954236,-75.201192,4.0,61,...,False,True,True,False,4,False,16508.0,2296.0,10575.0,28603.0
2,19102,wuH4TPUo8oJo4E59xZKsNg,&pizza - Walnut,430 Walnut St,Philadelphia,PA,39.949207,-75.16592,4.5,364,...,True,True,True,False,4,False,3365.0,765.0,2525.0,110096.0
3,19428,VZFQS0SXzXPuxISbAgIVBA,'feine,812 Fayette St,Conshohocken,PA,40.080194,-75.300277,4.5,166,...,True,True,False,False,1,False,8479.0,3332.0,4239.0,98031.0
4,37212,UBCOE-7SXWrg2kPLSuF4YQ,'za,2005 Belcourt Ave,Nashville,TN,36.136669,-86.800017,4.0,83,...,True,False,True,False,4,False,7149.0,2539.0,4043.0,71699.0


In [4]:
# List the columns available in the loaded table.
business_df.columns

Index(['postal_code', 'restaurant_id', 'restaurants_name', 'address', 'city',
       'states', 'latitude', 'longitude', 'stars_rating', 'review_count',
       'restaurants_delivery', 'outdoor_seating', 'accepts_creditcards',
       'price_range', 'alcohol', 'good_for_kids', 'reservations',
       'restaurants_takeout', 'wifi', 'good_for_groups',
       'wheelchair_accessible', 'happy_hour', 'noise_level',
       'dietary_restrictions', 'total_estimate_households_per_zip',
       'total_estimate_married_couple_family_households',
       'total_estimate_nonfamily_households', 'median_income_dollars'],
      dtype='object')

In [5]:
# Bucket restaurants by star rating. With pd.cut's default right-closed bins,
# ratings in (0.9, 3.5] become "Lower Performance" and ratings in (3.5, 5]
# become "Higher Performance".
rating_bin_edges = [0.9, 3.5, 5]
performance_labels = ["Lower Performance", "Higher Performance"]

business_df["category"] = pd.cut(
    business_df["stars_rating"],
    bins=rating_bin_edges,
    labels=performance_labels,
)

In [6]:
def changeStatus(status):
    """Encode a performance-category label as a binary class.

    Parameters
    ----------
    status : str
        Category label, expected to be "Lower Performance" or
        "Higher Performance".

    Returns
    -------
    int or float
        0 for "Lower Performance", 1 for "Higher Performance", and NaN for
        any other value (missing or unrecognized labels), so that bad input
        is never silently counted as the positive class.

    Notes
    -----
    A later cell defines another function with this same name (the state
    encoder), so this definition is only in effect until that cell runs.
    """
    if status == "Lower Performance":
        return 0
    if status == "Higher Performance":
        return 1
    # Anything else used to fall through to 1; surface it as missing instead.
    return float("nan")
    
# Translate each category label into its class code, then store the result
# as a numeric column.
encoded_labels = business_df["category"].apply(changeStatus)
business_df["category_encoded"] = pd.to_numeric(encoded_labels)

In [7]:
# Encode the `states` column (two-letter state abbreviations) as integer codes

def changeStatus(status):
    """Encode a two-letter state abbreviation as an integer code.

    Note: this rebinds the name `changeStatus`, which an earlier cell used
    for the performance-category encoder.

    Parameters
    ----------
    status : str
        State abbreviation such as 'AZ' or 'PA'.

    Returns
    -------
    int
        1-11 for the eleven states listed below, and 12 for every other
        value (any other state, as well as non-string input such as NaN).
    """
    state_codes = {
        'AZ': 1, 'CA': 2, 'DE': 3, 'FL': 4, 'ID': 5, 'IL': 6,
        'IN': 7, 'LA': 8, 'NJ': 9, 'NV': 10, 'PA': 11,
    }
    other_code = 12
    # Non-string values can never equal one of the abbreviations above.
    if not isinstance(status, str):
        return other_code
    return state_codes.get(status, other_code)
# Encode the state abbreviations as integer codes with changeStatus.
# The dtype guard keeps this cell idempotent: re-running it on the
# already-encoded column would send every integer through changeStatus's
# catch-all branch and overwrite the whole column with 12.
if not pd.api.types.is_numeric_dtype(business_df["states"]):
    business_df["states"] = business_df["states"].apply(changeStatus)

# Coerce median income to numeric (unparseable values become NaN), then drop
# the rows that have no usable income value.
business_df["median_income_dollars"] = pd.to_numeric(business_df["median_income_dollars"], errors='coerce')
business_df = business_df.dropna(subset=['median_income_dollars'])


In [8]:
# Re-list the columns to confirm that 'category' and 'category_encoded' were added.
business_df.columns

Index(['postal_code', 'restaurant_id', 'restaurants_name', 'address', 'city',
       'states', 'latitude', 'longitude', 'stars_rating', 'review_count',
       'restaurants_delivery', 'outdoor_seating', 'accepts_creditcards',
       'price_range', 'alcohol', 'good_for_kids', 'reservations',
       'restaurants_takeout', 'wifi', 'good_for_groups',
       'wheelchair_accessible', 'happy_hour', 'noise_level',
       'dietary_restrictions', 'total_estimate_households_per_zip',
       'total_estimate_married_couple_family_households',
       'total_estimate_nonfamily_households', 'median_income_dollars',
       'category', 'category_encoded'],
      dtype='object')

In [9]:
# Define the features set

# Model inputs: the encoded state, review count, restaurant attribute columns,
# and the zip-code-level household / income columns.
feature_columns = [
    'states',
    'review_count',
    'restaurants_delivery',
    'outdoor_seating',
    'accepts_creditcards',
    'price_range',
    'alcohol',
    'good_for_kids',
    'reservations',
    'restaurants_takeout',
    'wifi',
    'good_for_groups',
    'wheelchair_accessible',
    'happy_hour',
    'noise_level',
    'dietary_restrictions',
    'total_estimate_households_per_zip',
    'total_estimate_married_couple_family_households',
    'total_estimate_nonfamily_households',
    'median_income_dollars',
]
X = business_df[feature_columns]

In [10]:
# Define the target set

# The label to predict is the numeric performance class built earlier.
target_column = 'category_encoded'
y = business_df[target_column]

In [11]:
# Split the data into training and testing sets

# Stratifying on y keeps the class proportions the same in both splits.
# No test_size is passed, so train_test_split's default split ratio applies.
split_options = {"random_state": 1, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Train the EasyEnsembleClassifier

# Fixed random_state so repeated runs build the same ensemble.
ensemble_params = {"n_estimators": 100, "random_state": 1}
model = EasyEnsembleClassifier(**ensemble_params)

model.fit(X_train, y_train)

In [13]:
# Predict on the held-out test split and report its balanced accuracy
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6668370836281106

In [14]:
# Predict on the training split and report its balanced accuracy, for
# comparison with the test-split score
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.6740226306985269

In [15]:
# Confusion matrix for the test split (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2020,  964],
       [1094, 2093]])

In [16]:
# Build the imbalanced-learn classification report for the test split and print it
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.65      0.68      0.66      0.66      0.67      0.45      2984
          1       0.68      0.66      0.68      0.67      0.67      0.44      3187

avg / total       0.67      0.67      0.67      0.67      0.67      0.44      6171

