# Easy Ensemble AdaBoost Classifier

In AdaBoost, a model is trained and then evaluated. After evaluating the errors of the first model, another model is trained. 

Each new model gives extra weight to the errors made by the previous model, so that similar errors become less likely in subsequent models. The errors from the second model are then given extra weight for the third model, and so on. This process is repeated until the error rate is minimized.

However, AdaBoost is more sensitive to overfitting than Random Forests.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
# from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

# Load the merged dataset (path is relative to this notebook's location).
business_df = pd.read_csv("../../../Data/new_merged_datasets.csv")
# Alternative input file, currently disabled:
#business_df = pd.read_csv("../../../Data/merged_datasets.csv")

# Categorize restaurants by star rating. pd.cut builds right-closed intervals
# by default, so the buckets are (0.9, 3.5] -> "Lower Performance" and
# (3.5, 5] -> "Higher Performance"; ratings outside those bins become NaN.
business_df["Category"] = pd.cut(
    business_df["Stars_Rating"],
    bins=[0.9, 3.5, 5],
    labels=["Lower Performance", "Higher Performance"],
)

def changeStatus(status):
    """Encode a performance label as an integer.

    Returns 0 when ``status`` equals "Lower Performance" and 1 for every
    other value.
    """
    return 0 if status == "Lower Performance" else 1
    
# Encode the performance label with changeStatus and store it as a numeric
# column that can serve as the model target.
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

# Convert the income columns to numbers; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
business_df["Median_Income(dollars)"] = pd.to_numeric(
    business_df["Median_Income(dollars)"], errors="coerce"
)
business_df["Mean_Income(dollars)"] = pd.to_numeric(
    business_df["Mean_Income(dollars)"], errors="coerce"
)

# Drop rows without a usable median income (only that column is checked).
business_df = business_df.dropna(subset=["Median_Income(dollars)"])

In [7]:
# # Creating engine and connection to the SQL database
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/yelp_DB"
# engine = create_engine(db_string)
# df = pd.read_sql_table('business', engine)

In [12]:
# Inspect the dtype pandas assigned to each column of business_df.
business_df.dtypes

Unnamed: 0                                            int64
Restaurant_ID                                        object
Restaurants_Name                                     object
Address                                              object
City                                                 object
State                                                object
Postal_Code                                           int64
Latitude                                            float64
Longitude                                           float64
Stars_Rating                                        float64
Review_Count                                          int64
Restaurants_Delivery                                  int64
Outdoor_Seating                                       int64
Accepts_CreditCards                                   int64
Price_Range                                           int64
Alcohol                                               int64
Good_For_Kids                           

In [13]:
# List every column name available in business_df.
business_df.columns

Index(['Unnamed: 0', 'Restaurant_ID', 'Restaurants_Name', 'Address', 'City',
       'State', 'Postal_Code', 'Latitude', 'Longitude', 'Stars_Rating',
       'Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions', 'Total_Estimate_Households_per_Zip',
       'Total_Estimate_Married-couple_Family_households',
       'Total_Estimate_Nonfamily_households', 'Percentage_EH<$10,000',
       'Percentage_EH_$10,000-$14,999', 'Percentage_EH_$15,000-$24,999',
       'Percentage_EH_$25,000-$34,999', 'Percentage_EH_$35,000-$49,999',
       'Percentage_EH_$50,000-$74,999', 'Percentage_EH_$75,000-$99,999',
       'Percentage_EH_$100,000-$149,999', 'Percentage_EH_$150,000-$199,999',
       'Percentage_EH_$200,000<', 'Median_Income(dollars)',
       'Mean_Income(dolla

In [None]:
# # 48.92%
# X = business_df[['Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
#        'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
#        'Reservations', 'Good_For_Groups',
#        'Wheelchair_Accessible', 'Noise_Level', 'WiFi', 'Dietary_Restrictions', 'Happy_Hour', 'Restaurants_TakeOut',
#       'Total_Estimate_Married-couple_Family_households', 'Total_Estimate_Nonfamily_households',
#                  'Median_Income(dollars)', 'Population']]

In [2]:
# 48.93%
# NOTE(review): the figure above is presumably a score recorded from an
# earlier run with this feature set — confirm what it measures or remove it.

# Feature matrix: the 19 columns of business_df used as model inputs.
X = business_df[
    [
        "Review_Count",
        "Restaurants_Delivery",
        "Outdoor_Seating",
        "Accepts_CreditCards",
        "Price_Range",
        "Alcohol",
        "Good_For_Kids",
        "Reservations",
        "Restaurants_TakeOut",
        "WiFi",
        "Good_For_Groups",
        "Wheelchair_Accessible",
        "Happy_Hour",
        "Noise_Level",
        "Dietary_Restrictions",
        "Total_Estimate_Married-couple_Family_households",
        "Total_Estimate_Nonfamily_households",
        "Median_Income(dollars)",
        "Population",
    ]
]

In [3]:
# Define the target set: the encoded performance label column of business_df
y = business_df["Category_Encoded"]

In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# stratify=y keeps the class proportions of y the same in both splits;
# random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

In [6]:
# Scale the Training and Testing Data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no statistics from the
# test split are used when fitting.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits.
# NOTE(review): in the cells visible here the model is fit and evaluated on
# the unscaled X_train / X_test, so these scaled arrays appear unused.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [7]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# n_estimators=100 sets the size of the ensemble; random_state=1 fixes the
# seed so repeated runs give the same model.
model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# NOTE(review): the model is fit on the unscaled X_train, not X_train_scaled.
# This call is the last expression of the cell, so its result is displayed.
model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [8]:
# Calculate the balanced accuracy score on the held-out test split
from sklearn.metrics import balanced_accuracy_score
# y_pred is reused by the confusion-matrix and classification-report cells.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6666856086914428

In [9]:
# Balanced accuracy on the training split, for comparison with the test
# score (a training score far above the test score would suggest overfitting).
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.6758395525660825

In [20]:
# Display the confusion matrix
# NOTE(review): the saved output of this cell is a 4x4 matrix, but y comes from
# "Category_Encoded", which is derived from only two rating buckets. The output
# looks stale (from an earlier run with a different target) — re-run to confirm.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[ 437,  244,  395,  129],
       [ 832, 1267,  645,  763],
       [  41,   13,  185,   15],
       [ 139,  303,  163,  617]])

In [21]:
# Print the imbalanced classification report
# NOTE(review): the saved output lists four classes (Average, Good, Poor,
# Successful), which does not match the two-bucket target built in this
# notebook. It looks stale — re-run to confirm.
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))



                   pre       rec       spe        f1       geo       iba       sup

    Average       0.30      0.36      0.80      0.33      0.54      0.28      1205
       Good       0.69      0.36      0.79      0.48      0.53      0.27      3507
       Poor       0.13      0.73      0.80      0.23      0.76      0.58       254
 Successful       0.40      0.50      0.82      0.45      0.64      0.40      1222

avg / total       0.54      0.40      0.80      0.43      0.57      0.31      6188

