# Balanced Random Forest Classifier

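A random forest model combines many decision trees into a single ensemble (a "forest"). A *balanced* random forest additionally under-samples the majority class in each bootstrap sample, so every tree is trained on class-balanced data. Random forest models:
- Are robust against overfitting because each tree (a weak learner) is trained on a different random subset of the data.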
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets. 

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
# from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

# Load the merged restaurant dataset
business_df = pd.read_csv("../../../Data/new_merged_datasets.csv")

# Bucket restaurants by star rating: (0.9, 3] -> "Lower Performance",
# (3, 5] -> "Higher Performance" (pd.cut intervals are right-inclusive by default)
business_df["Category"] = pd.cut(
    x=business_df["Stars_Rating"],
    bins=[0.9, 3, 5],
    labels=["Lower Performance", "Higher Performance"],
)
def changeStatus(status):
    """Encode a performance category label as a binary class value.

    Parameters
    ----------
    status : str or missing value
        Category label. The recognised labels are "Lower Performance" and
        "Higher Performance".

    Returns
    -------
    int or float
        0 for "Lower Performance", 1 for "Higher Performance", and NaN for
        anything else (including a missing label).
    """
    # Explicit mapping instead of a catch-all `else`: an unrated or misspelt
    # label must not be silently counted as "Higher Performance".
    codes = {"Lower Performance": 0, "Higher Performance": 1}
    return codes.get(status, float("nan"))

# Encode the performance label with changeStatus and store it as a numeric column
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

# Convert the income columns to numbers; values that cannot be parsed become NaN
for income_col in ("Median_Income(dollars)", "Mean_Income(dollars)"):
    business_df[income_col] = pd.to_numeric(business_df[income_col], errors="coerce")

# Discard rows that have no median income
business_df = business_df.dropna(subset=["Median_Income(dollars)"])

In [3]:
# Columns used as model inputs: review count, restaurant attributes,
# and the household / income / population columns
feature_columns = [
    "Review_Count",
    "Restaurants_Delivery",
    "Outdoor_Seating",
    "Accepts_CreditCards",
    "Price_Range",
    "Alcohol",
    "Good_For_Kids",
    "Reservations",
    "Restaurants_TakeOut",
    "WiFi",
    "Good_For_Groups",
    "Wheelchair_Accessible",
    "Happy_Hour",
    "Noise_Level",
    "Dietary_Restrictions",
    "Total_Estimate_Married-couple_Family_households",
    "Total_Estimate_Nonfamily_households",
    "Median_Income(dollars)",
    "Population",
]

# Feature matrix
X = business_df[feature_columns]

In [2]:
# Target vector: the 0/1 encoded performance category
y = business_df.loc[:, "Category_Encoded"]
y

0        1
1        1
2        1
3        0
4        1
        ..
15602    1
15603    0
15604    1
15605    0
15606    1
Name: Category_Encoded, Length: 15607, dtype: int64

In [4]:
# Split the data into training and testing sets, stratified on the target
# so both sets keep the same class proportions
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [5]:
# Fit a balanced random forest (100 trees, fixed seed) on the training split
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [6]:
# Calculate the balanced accuracy score on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6773283019421463

In [7]:
# Balanced accuracy on the training split; compare with the test score
# to see how much the model overfits
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9069223052056627

In [8]:
# Confusion matrix for the test split (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 655,  257],
       [1087, 1903]])

In [9]:
# Imbalanced classification report for the test-split predictions
# NOTE(review): the output saved with this cell lists four classes
# (Average/Good/Poor/Successful), which does not match the binary 0/1 target
# used here -- it looks stale; re-run the cell to refresh it.
from imblearn.metrics import classification_report_imbalanced

report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.32      0.42      0.79      0.36      0.57      0.32      1205
       Good       0.69      0.32      0.81      0.44      0.51      0.25      3507
       Poor       0.15      0.69      0.84      0.25      0.76      0.57       254
 Successful       0.37      0.57      0.77      0.45      0.66      0.42      1222

avg / total       0.54      0.40      0.80      0.42      0.56      0.31      6188



In [10]:
# Rank the features from most to least important according to the fitted forest
feature_importance = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)

for importance, feature in feature_importance:
    print(f"{feature} : ({importance})")

Review_Count : (0.17690743696866845)
Total_Estimate_Nonfamily_households : (0.12357466970917706)
Median_Income(dollars) : (0.12126275616379076)
Total_Estimate_Married-couple_Family_households : (0.12121503581610554)
Population : (0.12114808007835083)
Noise_Level : (0.04729202043767742)
Wheelchair_Accessible : (0.03965837090756363)
Price_Range : (0.03876032198733259)
Restaurants_Delivery : (0.030308077070677376)
WiFi : (0.028523627145452594)
Outdoor_Seating : (0.02768732789012317)
Reservations : (0.027235191528856513)
Good_For_Kids : (0.022386263251048564)
Good_For_Groups : (0.021824627224959866)
Happy_Hour : (0.021096803550480353)
Alcohol : (0.011373362176144576)
Accepts_CreditCards : (0.010333855073241836)
Restaurants_TakeOut : (0.009164208909037318)
Dietary_Restrictions : (0.0002479641113116617)
