# Balanced Random Forest Classifier

A random forest model combines many decision trees, each a weak learner on its own, into a single ensemble (the "forest"). Random forest models:
- Are robust against overfitting because each of those weak learners is trained on a different random subset of the data.
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets. 

In [19]:
import pandas as pd

from sqlalchemy import create_engine
#from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

# Load the merged business/census dataset from its relative path.
business_df = pd.read_csv("../../Data/business_census_merged_dataset.csv")

# Bucket each restaurant's star rating into four ordered categories.
# pd.cut intervals are right-inclusive by default, so the bins are:
#   (0.9, 2] -> "Poor", (2, 3] -> "Average", (3, 4] -> "Good", (4, 5] -> "Successful"
business_df["Category"] = pd.cut(
    business_df["Stars_Rating"],
    bins=[0.9, 2, 3, 4, 5],
    labels=["Poor", "Average", "Good", "Successful"],
)

# Encode a rating category label as an ordinal class code (0 = "Poor" ... 3 = "Successful")
def changeStatus(status):
    """Map a rating category label to its ordinal class code.

    Parameters
    ----------
    status : str
        One of "Poor", "Average", "Good" or "Successful".

    Returns
    -------
    int or float
        0, 1, 2 or 3 for the four labels, in that order. A missing label
        (None / NaN / pd.NA) is returned as NaN so that it stays visibly
        missing instead of being counted as a class.

    Raises
    ------
    ValueError
        If ``status`` is a non-missing value that is not one of the four
        known labels.
    """
    category_codes = {"Poor": 0, "Average": 1, "Good": 2, "Successful": 3}

    try:
        return category_codes[status]
    except (KeyError, TypeError):
        # Not one of the four labels (TypeError covers unhashable input);
        # decide below whether it is a missing value or a genuine error.
        pass

    # Previously every unrecognised value fell through to 3 ("Successful"),
    # which would silently mislabel restaurants with no rating category.
    if pd.api.types.is_scalar(status) and pd.isna(status):
        return float("nan")

    raise ValueError("Unrecognized rating category: {!r}".format(status))

# Encode the category labels as integer class codes and store them as a numeric column.
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

# Coerce both income columns to numbers; entries that cannot be parsed become NaN.
for income_column in ("Median_Income(dollars)", "Mean_Income(dollars)"):
    business_df[income_column] = pd.to_numeric(
        business_df[income_column], errors="coerce"
    )

# Keep only the rows that have a usable median income.
business_df = business_df.dropna(subset=["Median_Income(dollars)"])

In [16]:
# Show the column names of the prepared DataFrame
business_df.columns

Index(['Restaurant_ID', 'Restaurants_Name', 'Address', 'City', 'State',
       'Postal_Code', 'Latitude', 'Longitude', 'Stars_Rating', 'Review_Count',
       'Restaurants_Delivery', 'Outdoor_Seating', 'Accepts_CreditCards',
       'Price_Range', 'Alcohol', 'Good_For_Kids', 'Reservations',
       'Restaurants_TakeOut', 'WiFi', 'Good_For_Groups',
       'Wheelchair_Accessible', 'Happy_Hour', 'Noise_Level',
       'Dietary_Restrictions', 'Total_Estimate_Households_per_Zip',
       'Total_Estimate_Married-couple_Family_households',
       'Total_Estimate_Nonfamily_households', 'Median_Income(dollars)',
       'Mean_Income(dollars)', 'Category', 'Category_Encoded'],
      dtype='object')

In [20]:
# Define the feature set.
# Column order is kept exactly as before: it determines the order of
# model.feature_importances_ reported later in the notebook.
X = business_df[
    [
        # Restaurant attributes
        "Review_Count",
        "Restaurants_Delivery",
        "Outdoor_Seating",
        "Accepts_CreditCards",
        "Price_Range",
        "Alcohol",
        "Good_For_Kids",
        "Reservations",
        "Restaurants_TakeOut",
        "WiFi",
        "Good_For_Groups",
        "Wheelchair_Accessible",
        "Happy_Hour",
        "Noise_Level",
        "Dietary_Restrictions",
        # Household and income estimates
        "Total_Estimate_Married-couple_Family_households",
        "Total_Estimate_Nonfamily_households",
        "Median_Income(dollars)",
        "Total_Estimate_Households_per_Zip",
    ]
]

In [21]:
# Define the target: the integer-encoded rating category
y = business_df.loc[:, "Category_Encoded"]
y

0        2
1        3
2        2
3        1
4        2
        ..
24734    2
24735    2
24736    1
24737    2
24738    1
Name: Category_Encoded, Length: 24739, dtype: int64

In [22]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# stratify=y keeps the class proportions of y the same in both splits;
# random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [23]:
# Fit a BalancedRandomForestClassifier on the training data.
# No separate resampling step is run here; any class balancing happens
# inside the classifier itself (see the imbalanced-learn documentation).

from imblearn.ensemble import BalancedRandomForestClassifier

# 100 trees, with a fixed random_state so repeated runs build the same forest.
model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [24]:
# Calculate the balanced accuracy score on the test set
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.49589057442868234

In [26]:
# Balanced accuracy on the training set, for comparison with the test-set score
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.7177471558695625

In [27]:
# Display the confusion matrix (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

# Predictions are recomputed here so this cell does not rely on an earlier one.
y_pred = model.predict(X_test)

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 169,   49,   17,   19],
       [ 333,  505,  243,  124],
       [ 478,  870, 1142, 1015],
       [ 143,  131,  247,  700]])

In [28]:
# Print the imbalanced classification report for the test-set predictions
from imblearn.metrics import classification_report_imbalanced

imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.15      0.67      0.84      0.25      0.75      0.55       254
          1       0.32      0.42      0.79      0.37      0.58      0.32      1205
          2       0.69      0.33      0.81      0.44      0.51      0.25      3505
          3       0.38      0.57      0.77      0.45      0.66      0.43      1221

avg / total       0.54      0.41      0.80      0.42      0.56      0.31      6185



In [29]:
# Rank the features from most to least important.
# Each entry is an (importance, column name) pair, sorted in descending order.
feature_importance = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)

for importance, feature_name in feature_importance:
    print('{} : ({})'.format(feature_name, importance))

Review_Count : (0.1788782644804728)
Median_Income(dollars) : (0.12409864300839364)
Total_Estimate_Nonfamily_households : (0.12318118521064529)
Total_Estimate_Married-couple_Family_households : (0.12190302917874356)
Total_Estimate_Households_per_Zip : (0.11831846990534008)
Noise_Level : (0.045687785963008765)
Wheelchair_Accessible : (0.0383297964864004)
Price_Range : (0.038186494595161595)
Restaurants_Delivery : (0.030192196693723458)
WiFi : (0.02965648391663924)
Outdoor_Seating : (0.02864042314770798)
Reservations : (0.025703636772338192)
Good_For_Kids : (0.022383717220160096)
Happy_Hour : (0.021432945246647628)
Good_For_Groups : (0.021212739305355152)
Alcohol : (0.011983872635595891)
Accepts_CreditCards : (0.011129879368267703)
Restaurants_TakeOut : (0.009002795556490533)
Dietary_Restrictions : (7.764130890807689e-05)
