# Balanced Random Forest Classifier

A random forest model is an ensemble that combines the predictions of many decision trees. Random forest models:
- Are robust against overfitting because each tree (a weak learner) is trained on a different random sample of the data.
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets. 

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
# from config import db_password
from sqlalchemy import inspect
import matplotlib.pyplot as plt
import pandas as pd

# Load the input dataset.
# Alternative input (currently unused): "../../../Data/merged_datasets.csv"
business_df = pd.read_csv("../../../Data/business_census_merged_dataset.csv")

# Bucket each restaurant by its star rating:
#   (0.9, 3.5] -> "Lower Performance"
#   (3.5, 5]   -> "Higher Performance"
# A rating that falls in neither interval (or is missing) gets no category (NaN).
rating_bin_edges = [0.9, 3.5, 5]
rating_bin_labels = ["Lower Performance", "Higher Performance"]
business_df["Category"] = pd.cut(
    business_df["Stars_Rating"],
    bins=rating_bin_edges,
    labels=rating_bin_labels,
)

def changeStatus(status):
    """Encode a performance category label as a binary class value.

    Parameters
    ----------
    status : str or missing value
        "Lower Performance" or "Higher Performance". A missing value
        (None / NaN) is accepted and passed through as NaN.

    Returns
    -------
    int or float
        0 for "Lower Performance", 1 for "Higher Performance", and NaN when
        ``status`` is missing.

    Raises
    ------
    ValueError
        If ``status`` is a non-missing value other than the two known labels.
    """
    # A rating that fell outside the category bins has no label. Keep it
    # missing instead of silently counting it as "Higher Performance".
    if pd.isna(status):
        return np.nan
    if status == "Lower Performance":
        return 0
    if status == "Higher Performance":
        return 1
    # Anything else is a mistyped or unexpected label; fail loudly rather
    # than guess a class for it.
    raise ValueError("Unknown performance category: {!r}".format(status))
    
# Numeric target column derived from the text category via changeStatus.
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

# Force both income columns to numeric; values that cannot be parsed become NaN.
for income_column in ("Median_Income(dollars)", "Mean_Income(dollars)"):
    business_df[income_column] = pd.to_numeric(business_df[income_column], errors='coerce')

# Only the median income is required downstream: drop rows where it is missing.
business_df = business_df.dropna(subset=['Median_Income(dollars)'])

In [9]:
# Define features set
# Column names used as model inputs, in the order they are passed to the model.
feature_columns = [
    'Review_Count',
    'Restaurants_Delivery',
    'Outdoor_Seating',
    'Accepts_CreditCards',
    'Price_Range',
    'Alcohol',
    'Good_For_Kids',
    'Reservations',
    'Restaurants_TakeOut',
    'WiFi',
    'Good_For_Groups',
    'Wheelchair_Accessible',
    'Happy_Hour',
    'Noise_Level',
    'Dietary_Restrictions',
    'Total_Estimate_Married-couple_Family_households',
    'Total_Estimate_Nonfamily_households',
    'Median_Income(dollars)',
    'Total_Estimate_Households_per_Zip',
]
X = business_df[feature_columns]

In [10]:
# Define the target: the encoded category
# (0 = "Lower Performance", 1 = "Higher Performance").
y = business_df.loc[:, "Category_Encoded"]
y

0        1
1        1
2        1
3        0
4        1
        ..
24734    1
24735    1
24736    0
24737    0
24738    0
Name: Category_Encoded, Length: 24739, dtype: int64

In [11]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# stratify=y keeps the class proportions the same in both splits;
# random_state pins the shuffle so the split is reproducible.
# test_size is not passed, so the library default applies.
split_parts = train_test_split(X, y, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Scale the Training and Testing Data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so that no statistics from the
# test split influence the transform.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-fitted transform to both splits.
# NOTE(review): the model cells visible in this notebook fit and predict on the
# unscaled X_train / X_test; these scaled arrays are not consumed there.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [13]:
# Fit a BalancedRandomForestClassifier (100 trees, fixed seed) on the
# training split.
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier(random_state=1, n_estimators=100)

# Last expression of the cell: fit() returns the estimator, which is displayed.
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [14]:
# Calculate the balanced accuracy score on the test split
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6528860743697744

In [15]:
# Same metric on the training split, for comparison with the test score:
# a training score far above the test score points to overfitting.
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9994714966059557

In [16]:
# Display the confusion matrix (rows: actual class, columns: predicted class)
from sklearn.metrics import confusion_matrix

# Predictions are recomputed here so this cell does not rely on y_pred
# still being present from an earlier cell.
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[2007,  979],
       [1172, 2027]])

In [17]:
# Print the imbalanced classification report for the test-split predictions
from imblearn.metrics import classification_report_imbalanced

report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.63      0.67      0.63      0.65      0.65      0.43      2986
          1       0.67      0.63      0.67      0.65      0.65      0.42      3199

avg / total       0.65      0.65      0.65      0.65      0.65      0.43      6185



In [11]:
# Rank the features from most to least important.
# Each entry is an (importance, column name) pair, so sorting orders by
# importance first and by column name only to break ties.
feature_importance = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)

for importance, feature_name in feature_importance:
    print('{} : ({})'.format(feature_name, importance))

Review_Count : (0.23555666334841727)
Median_Income(dollars) : (0.11437666106759317)
Total_Estimate_Nonfamily_households : (0.11204395379599824)
Total_Estimate_Married-couple_Family_households : (0.1116961260119542)
Population : (0.10953046460287853)
Noise_Level : (0.043150654332244034)
Wheelchair_Accessible : (0.03888393741460815)
Price_Range : (0.033559477617293436)
WiFi : (0.028222008061795843)
Restaurants_Delivery : (0.027727436272503823)
Reservations : (0.02714784435280517)
Outdoor_Seating : (0.02435587586520726)
Happy_Hour : (0.022187130115292997)
Good_For_Kids : (0.020527968423447278)
Good_For_Groups : (0.019486150053464196)
Accepts_CreditCards : (0.01262716209245474)
Alcohol : (0.009519549503033244)
Restaurants_TakeOut : (0.009282629043565034)
Dietary_Restrictions : (0.00011830802544342473)
