In [1]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


### Import Data

In [2]:
data = pd.read_csv('../data/cleaned_data_v2.csv')

In [3]:
data = data.rename(columns={
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? '
    :'What was the length of the tenancy',
    'What was the amount of the rental deposit? ':'rental deposit amount',
    'If any rent increases occurred, what was the rent after the increase(s)?':'was there an rent increases',
    'Over how many months did the arrears accumulate? ':'Over how many months did the arrears accumulate?',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ':'Does the tenant made a payment on the arrears after the eviction notice',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ':'What were the specific mental, medical, or physical conditions of the tenant, if any?'})


In [17]:
data.loc[:,'L1 present?':]

Unnamed: 0,L1 present?,L2 present?,L3 present?,L4 present?,L8 present?,L9 present?,N5 present?,N6 present?,N7 present?,N8 present?,T1 present?,T2 present?,T3 present?,T5 present?,T6 present?
0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0
692,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
693,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
694,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Make Train, Test

In [4]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns = ["What was the outcome of the case?"]), train_df["What was the outcome of the case?"]
X_test, y_test = test_df.drop(columns = ["What was the outcome of the case?"]), test_df["What was the outcome of the case?"]
X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,L9 present?,N5 present?,N6 present?,N7 present?,N8 present?,T1 present?,T2 present?,T3 present?,T5 present?,T6 present?
195,TSL-90833-17,Roger Rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
553,TNL-07861-18,Nancy Morris,Toronto,0,1,0,1,0,0,24.0,...,0,0,0,0,0,0,0,0,0,0
598,TSL-96267-18,David Mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
645,TEL-81094-17,Shelby Whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
634,TEL-80073-17,Jim McMaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,1,0,0,0


### Preprocessing

In [21]:
categorical_features = ['Who was the member adjudicating the decision?',
                        'What was the location of the landlord tenant board?',
                        'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
                        'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?'
                        ]

numeric_features = [
'What was the length of the tenancy',
'What was the monthly rent?',
'rental deposit amount',
'What was the total amount of arrears?',
'Over how many months did the arrears accumulate?'
]
ordinal_features = [
'Was the tenant employed at the time of the hearing?',
'Payment Plan'
]
drop_features = ['case number']
ordering_ordinal = [
    [-1,0,1],
    [0, 0.5, 1]
]
passthrough_feats = list(
    set(X_train.columns)
    - set(categorical_features)
    - set(numeric_features)
    - set(ordinal_features)
    - set(drop_features)
)

In [22]:
preprocessor = make_column_transformer(
        ("passthrough", passthrough_feats),
        (OneHotEncoder(handle_unknown="ignore"),categorical_features),
        (StandardScaler(),numeric_features),
        (OrdinalEncoder(categories = ordering_ordinal),ordinal_features),
        ("drop",drop_features))
preprocessor

In [23]:
pipe_rf = make_pipeline(preprocessor, RandomForestClassifier(random_state=123))

### Without hypterparameter optimization

In [24]:
scoring_metrics = ["accuracy"]
pd.DataFrame(cross_validate(pipe_rf, X_train, y_train, cv=5, scoring = scoring_metrics, return_train_score=True)).agg(['mean', 'std']).round(3).T



Unnamed: 0,mean,std
fit_time,0.105,0.002
score_time,0.008,0.0
test_accuracy,0.772,0.05
train_accuracy,1.0,0.0


In [25]:
param_dist = { 
    "randomforestclassifier__n_estimators": np.arange(50, 100, 10),
    "randomforestclassifier__max_depth": [None, 5,10,15]
}

random_search = RandomizedSearchCV(
            pipe_rf, param_distributions=param_dist, n_iter=20, 
            n_jobs=-1, verbose = 1, scoring= scoring_metrics, 
            refit = 'accuracy', return_train_score=True, random_state=123)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits




In [26]:
print(random_search.best_params_)
print(random_search.best_score_)

{'randomforestclassifier__n_estimators': 90, 'randomforestclassifier__max_depth': None}
0.7752252252252252


In [33]:
best_model = RandomForestClassifier(n_estimators = 90, max_depth = None) 
pipe_rf2 = make_pipeline(preprocessor, best_model)

In [34]:
pd.DataFrame(cross_validate(pipe_rf2, X_train, y_train, cv=5, scoring = scoring_metrics, return_train_score=True)).agg(['mean', 'std']).round(3).T



Unnamed: 0,mean,std
fit_time,0.095,0.002
score_time,0.008,0.0
test_accuracy,0.763,0.05
train_accuracy,1.0,0.0


### Result

Seems like randomize search did not have luck to find the best parameter. The reason being the accuracy after hyperparameter tuning is lower than the accuracy we have originally with cross-validation. We can also see how they perform in test below

In [35]:
pipe_rf2.fit(X_train, y_train)
pipe_rf2.score(X_test, y_test)

0.7642857142857142

In [36]:
pipe_rf.fit(X_train, y_train)
pipe_rf.score(X_test, y_test)

0.7571428571428571

In [37]:
ohe_feature_names = (pipe_rf.named_steps['columntransformer'].named_transformers_['onehotencoder'].get_feature_names_out(categorical_features).tolist())
feature_names = (
    ohe_feature_names+ numeric_features + ordinal_features + passthrough_feats
)

In [38]:
data_importance = {
    "Importance": pipe_rf.named_steps["randomforestclassifier"].feature_importances_,
}
pd.DataFrame(data=data_importance, index=feature_names).sort_values(
    by="Importance", ascending=False
)[:10]

Unnamed: 0,Importance
was there an rent increases,0.106813
Did the decision state the landlord was represented?,0.077908
"Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?",0.051158
N5 present?,0.050302
Who was the member adjudicating the decision?_Cristina De Leon-Culp,0.04405
Who was the member adjudicating the decision?_Melanie Love,0.038937
Did the decision state the landlord attended the hearing?,0.035754
L3 present?,0.035595
Who was the member adjudicating the decision?_Lorraine Mathers,0.024648
Who was the member adjudicating the decision?_Nancy Morris,0.0214
