 ## Easy Ensemble AdaBoost Classifier

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


from sqlalchemy import create_engine
from config import db_password
from sqlalchemy import inspect

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score,confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

In [2]:
# Load the `merged_data` table from the local PostgreSQL database into a DataFrame.
# NOTE(review): db_password is interpolated into the URL as-is; a password
# containing characters such as "@" or "/" would presumably need URL-escaping — confirm.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/yelp_DB"
engine = create_engine(db_string)
df = pd.read_sql_table(table_name='merged_data', con=engine)

# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,postal_code,restaurant_id,restaurants_name,address,city,states,latitude,longitude,stars_rating,review_count,...,wifi,good_for_groups,wheelchair_accessible,happy_hour,noise_level,dietary_restrictions,total_estimate_households_per_zip,total_estimate_married_couple_family_households,total_estimate_nonfamily_households,median_income_dollars
0,83709,oCVcvXmtVJKAH8vpFCoVyg,#1 Mongolian BBQ - Best Stir Fried Noodles In ...,"8249 W Overland Rd, Ste 180",Boise,ID,43.589722,-116.285309,3.5,51,...,True,True,False,False,4,False,21787.0,12577.0,6435.0,74941.0
1,19104,k_43W3zbbEuvYuLZviSJKA,&pizza - UPenn,3925 Walnut St,Philadelphia,PA,39.954236,-75.201192,4.0,61,...,False,True,True,False,4,False,16508.0,2296.0,10575.0,28603.0
2,19102,wuH4TPUo8oJo4E59xZKsNg,&pizza - Walnut,430 Walnut St,Philadelphia,PA,39.949207,-75.16592,4.5,364,...,True,True,True,False,4,False,3365.0,765.0,2525.0,110096.0
3,19428,VZFQS0SXzXPuxISbAgIVBA,'feine,812 Fayette St,Conshohocken,PA,40.080194,-75.300277,4.5,166,...,True,True,False,False,1,False,8479.0,3332.0,4239.0,98031.0
4,37212,UBCOE-7SXWrg2kPLSuF4YQ,'za,2005 Belcourt Ave,Nashville,TN,36.136669,-86.800017,4.0,83,...,True,False,True,False,4,False,7149.0,2539.0,4043.0,71699.0


## Data Cleaning

In [3]:
# Show the dtype of every column before any cleaning is applied.
df.dtypes

postal_code                                         object
restaurant_id                                       object
restaurants_name                                    object
address                                             object
city                                                object
states                                              object
latitude                                           float64
longitude                                          float64
stars_rating                                       float64
review_count                                         int64
restaurants_delivery                                  bool
outdoor_seating                                       bool
accepts_creditcards                                   bool
price_range                                         object
alcohol                                               bool
good_for_kids                                         bool
reservations                                          bo

In [4]:
# Bucket each restaurant by its star rating.
# With pd.cut's default right-closed intervals the buckets are
# (0.9, 3.5] -> "Lower Performance" and (3.5, 5] -> "Higher Performance";
# ratings outside (0.9, 5] become NaN.
rating_bin_edges = [0.9, 3.5, 5]
rating_bin_labels = ["Lower Performance", "Higher Performance"]
df["category"] = pd.cut(
    df["stars_rating"],
    bins=rating_bin_edges,
    labels=rating_bin_labels,
)



In [5]:
def changeStatus(status):
    """Encode a performance label as an integer class.

    Returns 0 when ``status`` equals "Lower Performance" and 1 for every
    other value (including "Higher Performance").
    """
    return 0 if status == "Lower Performance" else 1
    
# Map every category label through changeStatus and coerce the result to a
# numeric dtype, storing it as the model's target column.
df["category_encoded"] = pd.to_numeric(df["category"].apply(changeStatus))


In [6]:
# Encode the two-letter abbreviations in the "states" column as integers.
# Each state's code is its 1-based position in this list (AZ -> 1 ... TN -> 13).
state_abbreviations = [
    "AZ", "CA", "DE", "FL", "ID", "IL", "IN",
    "LA", "MO", "NJ", "NV", "PA", "TN",
]
states_num = {
    abbreviation: code
    for code, abbreviation in enumerate(state_abbreviations, start=1)
}

# Look every value up in the mapping; a state that is not listed raises KeyError.
df["states"] = df["states"].apply(states_num.__getitem__)



In [7]:
# Columns that must hold a non-zero integer for a row to be kept.
# They are processed in this order, one after the other.
REQUIRED_INT_COLUMNS = [
    "price_range",
    "noise_level",
    "total_estimate_households_per_zip",
    "total_estimate_married_couple_family_households",
    "total_estimate_nonfamily_households",
    "median_income_dollars",
]


def drop_missing_or_zero(frame, column):
    """Cast ``column`` to int64 and drop the rows where it is missing or zero.

    Missing values in ``column`` are first filled with 0, the column is cast
    to ``int64``, and every row whose value is then 0 is removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    column : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` stored as int64 and the original index
        labels of the surviving rows preserved.
    """
    as_int = frame[column].fillna(0).astype("int64")
    # Rows are selected with a boolean mask; for a unique index this removes
    # exactly the rows a label-based drop of the zero rows would remove.
    # The trailing copy() keeps later column assignments on the result from
    # triggering pandas' SettingWithCopyWarning.
    return frame.assign(**{column: as_int}).loc[as_int != 0].copy()


# One pass per column replaces six copy-pasted fill/cast/drop cells.
for column in REQUIRED_INT_COLUMNS:
    df = drop_missing_or_zero(df, column)

In [14]:
# List the columns available once the cleaning cells have run.
df.columns

Index(['postal_code', 'restaurant_id', 'restaurants_name', 'address', 'city',
       'states', 'latitude', 'longitude', 'stars_rating', 'review_count',
       'restaurants_delivery', 'outdoor_seating', 'accepts_creditcards',
       'price_range', 'alcohol', 'good_for_kids', 'reservations',
       'restaurants_takeout', 'wifi', 'good_for_groups',
       'wheelchair_accessible', 'happy_hour', 'noise_level',
       'dietary_restrictions', 'total_estimate_households_per_zip',
       'total_estimate_married_couple_family_households',
       'total_estimate_nonfamily_households', 'median_income_dollars',
       'category', 'category_encoded', 'unencoded_city'],
      dtype='object')

In [16]:
# Feature matrix: the encoded state, review count, restaurant attributes and
# the per-zip household / income estimates.
feature_columns = [
    'states',
    'review_count',
    'restaurants_delivery',
    'outdoor_seating',
    'accepts_creditcards',
    'price_range',
    'alcohol',
    'good_for_kids',
    'reservations',
    'restaurants_takeout',
    'wifi',
    'good_for_groups',
    'wheelchair_accessible',
    'happy_hour',
    'noise_level',
    'total_estimate_households_per_zip',
    'total_estimate_married_couple_family_households',
    'total_estimate_nonfamily_households',
    'median_income_dollars',
]
X = df[feature_columns]

In [17]:
# Target vector: the integer-encoded performance category
# (0 = "Lower Performance", 1 = everything else, per changeStatus).
y = df.loc[:, "category_encoded"]

## Split the data into training and testing sets

In [18]:
# Split features and target into training and testing subsets.
# The split is stratified on y so both subsets keep the same class
# proportions, and seeded for reproducibility; test_size is left at the
# library default.
split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [19]:
# Build an EasyEnsembleClassifier with 100 estimators and a fixed seed,
# then fit it on the training subset.
model = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
)

model.fit(X_train, y_train)

In [20]:
# Predict on the held-out test subset and report its balanced accuracy.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6668370836281106

In [21]:
# Predict on the training subset and report its balanced accuracy, for
# comparison with the test score.
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)


0.6740226306985269

In [22]:
# Confusion matrix for the test predictions
# (rows are the true classes, columns the predicted classes).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2020,  964],
       [1094, 2093]])

In [23]:
# Build the imbalanced-learn classification report for the test predictions
# and print it so the plain-text table keeps its alignment.
report = classification_report_imbalanced(y_test, y_pred)
print(report)


                   pre       rec       spe        f1       geo       iba       sup

          0       0.65      0.68      0.66      0.66      0.67      0.45      2984
          1       0.68      0.66      0.68      0.67      0.67      0.44      3187

avg / total       0.67      0.67      0.67      0.67      0.67      0.44      6171



In [24]:
# Re-display the column names; the modelling cells above add no columns,
# so this repeats the listing shown after cleaning.
df.columns

Index(['postal_code', 'restaurant_id', 'restaurants_name', 'address', 'city',
       'states', 'latitude', 'longitude', 'stars_rating', 'review_count',
       'restaurants_delivery', 'outdoor_seating', 'accepts_creditcards',
       'price_range', 'alcohol', 'good_for_kids', 'reservations',
       'restaurants_takeout', 'wifi', 'good_for_groups',
       'wheelchair_accessible', 'happy_hour', 'noise_level',
       'dietary_restrictions', 'total_estimate_households_per_zip',
       'total_estimate_married_couple_family_households',
       'total_estimate_nonfamily_households', 'median_income_dollars',
       'category', 'category_encoded', 'unencoded_city'],
      dtype='object')