# Balanced Random Forest Classifier

A random forest model combines the predictions of many decision trees into a single ensemble (the "forest"). Random forest models:
- Are robust against overfitting because each tree (a weak learner) is trained on a different random subset of the data.
- Can be used to rank the importance of input variables in a natural way.
- Can handle thousands of input variables without variable deletion.
- Are robust to outliers and nonlinear data.
- Run efficiently on large datasets. 

In [6]:
import pandas as pd

# Load the cleaned business data (path is relative to this notebook's folder)
business_df = pd.read_csv("../../Data/01_Clean_Business_Data.csv")

# Categorize restaurants by star rating.
# pd.cut intervals are right-inclusive by default, so the edges below give:
#   (0.9, 2] -> "Poor", (2, 3] -> "Average", (3, 4] -> "Good", (4, 5] -> "Successful"
# Ratings that fall outside (0.9, 5], or are missing, get no label (NaN).
business_df["Category"] = pd.cut(
    business_df["Stars_Rating"],
    bins=[0.9, 2, 3, 4, 5],
    labels=["Poor", "Average", "Good", "Successful"],
)

def changeStatus(status):
    """Convert a rating category label into its integer code.

    Args:
        status: Category label to encode.

    Returns:
        0 for "Poor", 1 for "Average", 2 for "Good", and 3 for every
        other value (which includes "Successful").
    """
    # The position of each label in this tuple is its code.
    for code, label in enumerate(("Poor", "Average", "Good")):
        if status == label:
            return code
    # Catch-all: anything not matched above is encoded as the top class.
    return 3

# Encode each category label as an integer code and store it as a numeric column
business_df["Category_Encoded"] = pd.to_numeric(
    business_df["Category"].apply(changeStatus)
)

In [7]:
# Define the feature set (model inputs): one column per line for easy editing
X = business_df[
    [
        "Review_Count",
        "Restaurants_Delivery",
        "Outdoor_Seating",
        "Restaurants_TakeOut",
        "WiFi",
        "Restaurants_Reservations",
        "Good_For_Groups",
        "Wheelchair_Accessible",
        "Happy_Hour",
        "Dietary_Restrictions",
    ]
]
X

Unnamed: 0,Review_Count,Restaurants_Delivery,Outdoor_Seating,Restaurants_TakeOut,WiFi,Restaurants_Reservations,Good_For_Groups,Wheelchair_Accessible,Happy_Hour,Dietary_Restrictions
0,80,0,0,1,1,0,0,0,0,0
1,6,1,1,1,0,0,1,1,0,0
2,19,0,1,1,1,0,1,0,0,0
3,10,1,1,1,0,0,0,1,0,0
4,10,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
49852,998,1,1,1,1,0,1,0,0,0
49853,11,1,0,1,0,0,0,0,0,0
49854,33,0,1,0,1,0,1,0,0,0
49855,35,1,0,1,1,0,1,0,0,0


In [8]:
# Define the target: the integer-encoded rating category stored in "Category_Encoded"
y = business_df["Category_Encoded"]
y

0        2
1        0
2        1
3        0
4        2
        ..
49852    3
49853    1
49854    2
49855    3
49856    3
Name: Category_Encoded, Length: 49857, dtype: int64

In [9]:
# Split the data into training and testing sets.
# No test_size is passed, so scikit-learn's default hold-out (25% of the rows)
# is used; stratify=y keeps the class proportions the same in both splits.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [10]:
# Fit a BalancedRandomForestClassifier on the training split.
# Per the imbalanced-learn documentation, the classifier randomly under-samples
# the bootstrap sample used for each tree to balance the classes, so no
# separate resampling step is performed before fitting.

from imblearn.ensemble import BalancedRandomForestClassifier

# 100 trees; random_state is fixed so the fitted forest is reproducible
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

model.fit(X_train, y_train)  

BalancedRandomForestClassifier(random_state=1)

In [11]:
# Calculate the balanced accuracy score (average of per-class recall) on the test set
from sklearn.metrics import balanced_accuracy_score

# y_pred is kept in a variable because later cells reuse it
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.4025564949612325

In [12]:
# Balanced accuracy on the training set, for comparison with the test-set score
# (a training score well above the test score is a sign of overfitting)
y_pred_train = model.predict(X_train)
balanced_accuracy_score(y_train, y_pred_train)

0.5744872186000443

In [13]:
# Display the confusion matrix for the test set.
# scikit-learn convention: rows are the true classes, columns are the
# predicted classes, both in sorted label order.
from sklearn.metrics import confusion_matrix

# Predictions are recomputed here from the fitted model and the test features
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 733,  178,   67,  139],
       [1191,  746,  511,  514],
       [1407, 1250, 1601, 1744],
       [ 532,  333,  481, 1038]])

In [14]:
# Print the imbalanced classification report for the test-set predictions.
# Column abbreviations (per the imbalanced-learn documentation):
#   pre = precision, rec = recall, spe = specificity, f1 = F1 score,
#   geo = geometric mean, iba = index balanced accuracy, sup = support
from imblearn.metrics import classification_report_imbalanced

# print() is used because the report is a pre-formatted multi-line string
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.19      0.66      0.72      0.29      0.69      0.47      1117
          1       0.30      0.25      0.81      0.27      0.45      0.19      2962
          2       0.60      0.27      0.84      0.37      0.47      0.21      6002
          3       0.30      0.44      0.76      0.36      0.58      0.32      2384

avg / total       0.44      0.33      0.81      0.34      0.51      0.25     12465



In [15]:
# List the features sorted in descending order by feature importance.
# Each entry is an (importance, column name) pair, so sorting the pairs in
# reverse puts the largest importance first.
feature_importance = sorted(
    zip(model.feature_importances_, X.columns),
    reverse=True,
)

for importance, feature_name in feature_importance:
    print('{} : ({})'.format(feature_name, importance))

Review_Count : (0.7324759019773165)
Wheelchair_Accessible : (0.0534501806425577)
Restaurants_Delivery : (0.04476882428200106)
Good_For_Groups : (0.0330113525477062)
Restaurants_Reservations : (0.031482544035938115)
Outdoor_Seating : (0.030084200886671443)
WiFi : (0.028996683862190835)
Happy_Hour : (0.02367716768572259)
Restaurants_TakeOut : (0.021585882284394214)
Dietary_Restrictions : (0.0004672617955013318)
