# Balanced Random Forest Classifier

A random forest model combines many decision trees into a forest of trees. Random forest models:
- Are robust against overfitting because each tree is a weak learner trained on a different piece of the data.
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets. 

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
#from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

# Load the merged dataset from disk.
business_df = pd.read_csv("../../Data/new_merged_datasets.csv")

# Categorizing restaurants based on stars ratings.
# pd.cut uses right-closed bins by default, so these edges produce
# (0.9, 2] -> "Poor", (2, 3] -> "Average", (3, 4] -> "Good", (4, 5] -> "Successful".
rating_bin_edges = [0.9, 2, 3, 4, 5]
rating_labels = ["Poor", "Average", "Good", "Successful"]
business_df["Category"] = pd.cut(
    business_df["Stars_Rating"],
    bins=rating_bin_edges,
    labels=rating_labels,
)

def changeStatus(status):
    """Map a rating category label to its integer code.

    "Poor" -> 0, "Average" -> 1, "Good" -> 2. Any other value
    (including "Successful") falls through to 3.
    """
    for code, label in enumerate(("Poor", "Average", "Good")):
        if status == label:
            return code
    # Everything that is not one of the three labels above is encoded as 3.
    return 3

# Integer-encode the rating category with changeStatus, then make sure the
# resulting column is numeric.
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

# Convert both income columns to numbers; values that cannot be parsed become NaN.
for income_column in ("Median_Income(dollars)", "Mean_Income(dollars)"):
    business_df[income_column] = pd.to_numeric(business_df[income_column], errors='coerce')

# Keep only the rows that have a median income value.
business_df = business_df.dropna(subset=['Median_Income(dollars)'])

In [6]:
# List the column labels available in business_df
business_df.columns

Index(['Unnamed: 0', 'Restaurant_ID', 'Restaurants_Name', 'Address', 'City',
       'State', 'Postal_Code', 'Latitude', 'Longitude', 'Stars_Rating',
       'Review_Count', 'Restaurants_Delivery', 'Outdoor_Seating',
       'Accepts_CreditCards', 'Price_Range', 'Alcohol', 'Good_For_Kids',
       'Reservations', 'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions', 'Total_Estimate_Households_per_Zip',
       'Total_Estimate_Married-couple_Family_households',
       'Total_Estimate_Nonfamily_households', 'Median_Income(dollars)',
       'Mean_Income(dollars)', 'Population', 'Category', 'Category_Encoded'],
      dtype='object')

In [7]:
# Define features set: the columns of business_df used as model inputs
feature_columns = [
    'Review_Count',
    'Restaurants_Delivery',
    'Outdoor_Seating',
    'Accepts_CreditCards',
    'Price_Range',
    'Alcohol',
    'Good_For_Kids',
    'Reservations',
    'Restaurants_TakeOut',
    'WiFi',
    'Good_For_Groups',
    'Wheelchair_Accessible',
    'Happy_Hour',
    'Noise_Level',
    'Dietary_Restrictions',
    'Total_Estimate_Married-couple_Family_households',
    'Total_Estimate_Nonfamily_households',
    'Median_Income(dollars)',
    'Population',
]
X = business_df[feature_columns]

In [8]:
# Define the target: the "Category" column derived from Stars_Rating
y = business_df["Category"]
# Display the target series
y

0              Good
1              Good
2        Successful
3           Average
4              Good
            ...    
15602    Successful
15603       Average
15604    Successful
15605       Average
15606    Successful
Name: Category, Length: 15607, dtype: category
Categories (4, object): ['Poor' < 'Average' < 'Good' < 'Successful']

In [9]:
from sklearn.model_selection import train_test_split

# Split the features and target into training and testing sets.
# stratify=y keeps the class proportions of y in both sets, and
# random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [10]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier

# imbalanced-learn 0.13 changed the defaults of sampling_strategy, replacement
# and bootstrap. They are pinned here to the earlier defaults so the model is
# configured the same way regardless of the installed version.
model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
    sampling_strategy="auto",
    replacement=False,
    bootstrap=True,
)

# Fit on the training split only; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [11]:
from sklearn.metrics import balanced_accuracy_score

# Predict the test-set categories, then calculate the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.48076813002290175

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

# Reuse y_pred from the balanced-accuracy cell instead of predicting again.
# With no `labels` argument, the rows (true) and columns (predicted) follow the
# sorted class labels: Average, Good, Poor, Successful.
confusion_matrix(y_test, y_pred)

array([[307, 161, 194,  89],
       [538, 733, 300, 645],
       [ 40,   8, 101,  11],
       [ 99, 174,  74, 428]])

In [13]:
from imblearn.metrics import classification_report_imbalanced

# Build the imbalanced classification report for the test predictions and print it
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.31      0.41      0.79      0.35      0.57      0.31       751
       Good       0.68      0.33      0.80      0.45      0.51      0.25      2216
       Poor       0.15      0.63      0.85      0.24      0.73      0.52       160
 Successful       0.36      0.55      0.76      0.44      0.65      0.41       775

avg / total       0.53      0.40      0.79      0.42      0.56      0.31      3902



In [14]:
# List the features sorted in descending order by feature importance.
# Each importance is paired with its column name; sorting the pairs in
# reverse puts the largest importance first.
feature_importance = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)

for importance, feature_name in feature_importance:
    print('{} : ({})'.format(feature_name, importance))

Review_Count : (0.1818176931847286)
Median_Income(dollars) : (0.1191585189207469)
Total_Estimate_Nonfamily_households : (0.11835912542916352)
Population : (0.1171339306227998)
Total_Estimate_Married-couple_Family_households : (0.1157440300852224)
Noise_Level : (0.04903911666356362)
Price_Range : (0.040374875940393525)
Wheelchair_Accessible : (0.03737702224474798)
Restaurants_Delivery : (0.031576919479167485)
WiFi : (0.029712084788837828)
Outdoor_Seating : (0.029409325579123374)
Reservations : (0.026159265328701747)
Good_For_Groups : (0.023279528717917924)
Happy_Hour : (0.023037947193747053)
Good_For_Kids : (0.02184532763562573)
Accepts_CreditCards : (0.013240534763468042)
Alcohol : (0.012054446708330715)
Restaurants_TakeOut : (0.010448587802385419)
Dietary_Restrictions : (0.00023171891132823917)
