In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)

# Load data

In [2]:
# Read the cleaned dataset from a path relative to the working directory.
data = pd.read_csv('../data/cleaned_data_v2.csv')
# Print column names, non-null counts and dtypes as a quick sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 33 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    696 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                         

In [3]:
# Shorten the verbose question-style headers (several carry a trailing space)
# so the columns can be referenced by the shorter names in later cells.
column_renames = {
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ':
        'What was the length of the tenancy',
    'What was the amount of the rental deposit? ':
        'rental deposit amount',
    'If any rent increases occurred, what was the rent after the increase(s)?':
        'was there an rent increases',
    'Over how many months did the arrears accumulate? ':
        'Over how many months did the arrears accumulate?',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ':
        'Does the tenant made a payment on the arrears after the eviction notice',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ':
        'What were the specific mental, medical, or physical conditions of the tenant, if any?',
}
data = data.rename(columns=column_renames)

In [4]:
# Hold out 20% of the rows as a test set (fixed seed), then separate the
# target column from the predictors in each split.
target_column = "What was the outcome of the case?"

train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,"If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?",Did the member find the tenant had sufficient income to pay rent?,Did the decision mention the tenant lost their job leading up to or during the period of the hearing?,"Did the decision mention any other extenuating circumstances experienced by the tenant leading up to or during the period of the claim (e.g. hospitalization, death in the family, etc.)?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?",Did the decision state the tenant was given prior notice for the eviction?,Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Payment Plan
195,TSL-90833-17,Roger Rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,1,0,0,0,0,0,[],1,0.0
553,TNL-07861-18,Nancy Morris,Toronto,0,1,0,1,0,0,24.0,...,1,0,0,1,0,1,0,[],0,0.5
598,TSL-96267-18,David Mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,1,0,[],0,0.0
645,TEL-81094-17,Shelby Whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,1,0,1,1,"['L1', 'L2']",0,0.0
634,TEL-80073-17,Jim McMaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,1,1,"['L1', 'L2', 'T2']",0,0.0


In [5]:
# Column names, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556 entries, 195 to 510
Data columns (total 32 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    556 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                       

In [6]:
# Number of training rows per outcome class (shows the class imbalance).
y_train.value_counts()

No relief                   352
Postponement of eviction    124
Payment plan                 64
Relief                       12
Conditional Order             4
Name: What was the outcome of the case?, dtype: int64

## Preprocess data with pipeline

In [7]:
# Each group of columns is routed to a different transformer when the
# preprocessor is built.

# Text columns that get one-hot encoded.
categorical_features = [
    'Who was the member adjudicating the decision?',
    'What was the location of the landlord tenant board?',
    'Which other specific applications of the landlord or the tenant were mentioned?',
]

# Numeric columns that get standardised.
numeric_features = [
    'What was the length of the tenancy',
    'What was the monthly rent?',
    'rental deposit amount',
    'What was the total amount of arrears?',
    'Over how many months did the arrears accumulate?',
]

# Coded columns that get ordinal encoded.
ordinal_features = [
    'Was the tenant employed at the time of the hearing?',
    'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
    'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?',
    'Payment Plan',
]

# Identifier column that is removed before modelling.
drop_features = ['case number']

# Allowed values, lowest to highest, for each ordinal feature; the i-th list
# belongs to the i-th entry of ordinal_features.
ordering_ordinal = [
    [-1, 0, 1],
    [-1, 0, 1],
    [-1, 0, 1],
    [0, 0.5, 1],
]

# Every remaining column is passed through unchanged. Walk X_train.columns
# instead of using set arithmetic: set iteration order depends on per-process
# string hashing, which would make the feature order (and potentially the
# fitted model) differ between kernel restarts despite the fixed random seeds.
assigned_features = set(
    categorical_features + numeric_features + ordinal_features + drop_features
)
passthrough_feats = [
    column for column in X_train.columns if column not in assigned_features
]

In [8]:
# Column-wise preprocessing. The order of the specs fixes the order of the
# output columns: one-hot columns, passthrough columns, scaled numeric
# columns, ordinal columns (the identifier column is dropped).
transformer_specs = [
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("passthrough", passthrough_feats),
    (StandardScaler(), numeric_features),
    (OrdinalEncoder(categories=ordering_ordinal), ordinal_features),
    ("drop", drop_features),
]
preprocessor = make_column_transformer(*transformer_specs)
preprocessor

## Baseline model

In [9]:
# adapt from DSCI573 lecture note
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        extra keyword arguments forwarded unchanged to ``cross_validate``
        (e.g. ``scoring``, ``return_train_score``)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)", with one
        entry per key of the dictionary returned by ``cross_validate``
    """

    # Build the frame once; each column is one quantity reported by
    # cross_validate (fit_time, score_time, test_*, train_*).
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by iteration instead of `series[i]`: integer lookup on
    # a string-labelled Series is positional-by-fallback, which pandas has
    # deprecated.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [10]:
from sklearn.dummy import DummyClassifier

# Cross-validation summaries are collected here, keyed by model name.
results_dict = {}
scoring_metrics = ["accuracy"]

# Baseline: the shared preprocessing followed by a dummy classifier.
dummy = make_pipeline(preprocessor, DummyClassifier(random_state=123))
dummy_cv_summary = mean_std_cross_val_scores(
    dummy,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)
results_dict["dummy"] = dummy_cv_summary

# One row per model, one column per cross-validation quantity.
results_df = pd.DataFrame(results_dict).T
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.014 (+/- 0.005),0.008 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)


_With DummyClassifier's default settings (no tuned hyperparameters), the mean cross-validation accuracy of our baseline model is 0.633. Because this is a multi-class classification problem with class imbalance, this score simply mirrors the share of the majority class ("No relief", 352 of 556 training cases) and is not very informative on its own._

# LightGBM model

In [11]:
from lightgbm.sklearn import LGBMClassifier

# Same preprocessing as the baseline, followed by LightGBM with its default
# hyperparameters (only the seed is fixed).
pipe_lgbm = make_pipeline(preprocessor, LGBMClassifier(random_state=123))

lgbm_cv_summary = mean_std_cross_val_scores(
    pipe_lgbm,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)
results_dict["lgbm"] = lgbm_cv_summary

# Rebuild the comparison table so it includes the new row.
results_df = pd.DataFrame(results_dict).T
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.014 (+/- 0.005),0.008 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)
lgbm,0.193 (+/- 0.034),0.009 (+/- 0.001),0.750 (+/- 0.025),1.000 (+/- 0.000)


_With default hyperparameters, our LightGBM model achieves a better mean cross-validation score (0.750) than the baseline model (0.633). However, the model is clearly overfitting: the mean train score (1.000) is 0.25 higher than the mean cross-validation score._

# Hyperparameter optimization

In [12]:
# Candidate values sampled by the randomized search; keys use the
# "<pipeline step>__<parameter>" naming of pipe_lgbm.
# NOTE(review): LightGBM documents max_depth <= 0 as "no limit", so the value 0
# in this range means unlimited depth — confirm that is intended.
param_grid = {
    "lgbmclassifier__learning_rate": np.arange(0.01, 0.2, 0.001),
    "lgbmclassifier__max_depth": np.arange(0, 12, 1),
    "lgbmclassifier__num_leaves": np.arange(10, 80, 2),
    "lgbmclassifier__class_weight": ["balanced", None],
}

# 30 random draws, scored on accuracy; the best draw is refit on all of X_train.
random_search = RandomizedSearchCV(
    estimator=pipe_lgbm,
    param_distributions=param_grid,
    n_iter=30,
    scoring=scoring_metrics,
    refit='accuracy',
    n_jobs=-1,
    random_state=123,
    return_train_score=True,
)
random_search.fit(X_train, y_train)

# Keep only the columns of interest and show one column per candidate,
# ordered from best (rank 1) to worst.
report_columns = [
    "mean_test_accuracy",
    "param_lgbmclassifier__learning_rate",
    "param_lgbmclassifier__max_depth",
    "param_lgbmclassifier__num_leaves",
    "param_lgbmclassifier__class_weight",
    "mean_fit_time",
    "rank_test_accuracy",
]
results = (
    pd.DataFrame(random_search.cv_results_)
    .loc[:, report_columns]
    .set_index("rank_test_accuracy")
    .sort_index()
    .T
)
results



rank_test_accuracy,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
mean_test_accuracy,0.778813,0.777011,0.775209,0.762613,0.760811,0.759041,0.751786,0.750016,0.748279,0.746445,...,0.726625,0.724871,0.723053,0.719434,0.715878,0.697957,0.687178,0.685425,0.661953,0.61694
param_lgbmclassifier__learning_rate,0.117,0.109,0.13,0.187,0.017,0.017,0.105,0.167,0.195,0.127,...,0.185,0.066,0.118,0.166,0.047,0.025,0.019,0.015,0.052,0.029
param_lgbmclassifier__max_depth,1.0,1.0,1.0,2.0,7.0,9.0,5.0,7.0,5.0,5.0,...,0.0,5,4,3,5,9,11,9,2,1
param_lgbmclassifier__num_leaves,14.0,36.0,52.0,66.0,64.0,58.0,72.0,72.0,56.0,40.0,...,36.0,76,24,24,30,76,70,36,50,36
param_lgbmclassifier__class_weight,,,,,,,,,,,...,,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
mean_fit_time,0.067573,0.056241,0.065615,0.073467,0.175733,0.219866,0.176721,0.181274,0.129539,0.15296,...,0.203693,0.138929,0.114771,0.093195,0.177596,0.201779,0.273525,0.230737,0.108869,0.066162


In [14]:
# Unfitted pipeline configured with the winning hyperparameters of the search.
# The values are read from the fitted search object rather than re-typed from
# the printed table, so this cell cannot go stale if the search is re-run and
# a different candidate wins. The keys of best_params_ already use the
# "lgbmclassifier__<param>" names that make_pipeline assigns to this step.
best_pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=123),
).set_params(**random_search.best_params_)

In [15]:
# Cross-validate the tuned pipeline with the same metric as the other models.
best_lgbm_cv_summary = mean_std_cross_val_scores(
    best_pipe_lgbm,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)
results_dict["best_lgbm"] = best_lgbm_cv_summary

# Rebuild the comparison table so it includes the tuned model.
results_df = pd.DataFrame(results_dict).T
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.014 (+/- 0.005),0.008 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)
lgbm,0.193 (+/- 0.034),0.009 (+/- 0.001),0.750 (+/- 0.025),1.000 (+/- 0.000)
best_lgbm,0.044 (+/- 0.010),0.010 (+/- 0.001),0.779 (+/- 0.044),0.816 (+/- 0.006)


_As shown above, we tuned four hyperparameters: learning_rate, max_depth, num_leaves, and class_weight. The highest mean cross-validation score we obtained is 0.779, with learning_rate=0.117, max_depth=1, num_leaves=14, and class_weight=None. This is better than the default hyperparameters, which gave a mean cross-validation score of 0.750. Additionally, the model is no longer badly overfit: the mean train score (0.816) is only 0.037 higher than the mean cross-validation score._

# Interpretation and feature importances 

In [16]:
# eli5 is required for the next cell. Uncomment to install it into the kernel's environment:
# %pip install eli5

In [17]:
import eli5

# Fit the tuned pipeline on the full training split; its fitted steps are
# inspected below.
best_pipe_lgbm.fit(X_train, y_train)

# Names of the one-hot columns, read from the encoder of the pipeline that is
# actually being explained (rather than from the separately fitted search
# estimator), so the labels always describe this model's own inputs.
categorical_feature_names = (
    best_pipe_lgbm
    .named_steps["columntransformer"]
    .named_transformers_["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

# Must follow the transformer order used when `preprocessor` was built:
# one-hot, passthrough, numeric, ordinal.
feature_names = (
    categorical_feature_names + passthrough_feats + numeric_features + ordinal_features
)

# eli5 feature importance weights for LGBM classifier
eli5.explain_weights(
    best_pipe_lgbm.named_steps["lgbmclassifier"], feature_names=feature_names
)

Weight,Feature
0.3717,Payment Plan
0.1387,Did the decision state the tenant attended the hearing?
0.0808,What was the total amount of arrears?
0.0802,"Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?"
0.0394,Over how many months did the arrears accumulate?
0.0335,Did the member find the tenant had sufficient income to pay rent?
0.0283,rental deposit amount
0.0270,Did the decisions state postponement would result in the tenant accruing additional arrears?
0.0263,"What were the specific mental, medical, or physical conditions of the tenant, if any?"
0.0254,Did the decision state the tenant was represented?


_Based on the eli5 results above, "Payment Plan" is the most important feature for the LGBM classifier, with a weight of 0.3717. The second most important feature, "Did the decision state the tenant attended the hearing?", has a weight of 0.1387. All other features have weights below 0.09, which indicates they are less important._

# Conclusion

In [18]:
# Accuracy on the held-out test split. This relies on best_pipe_lgbm having
# been fitted on the training split in the feature-importance cell; run that
# cell first.
test_score = best_pipe_lgbm.score(X_test, y_test)
test_score

0.7785714285714286

_The test score for my LightGBM model is 0.779, which agrees with the mean validation score of 0.779 from before. Since the validation score and the test score are similar, I don't think there is much optimization bias; if there were, the validation score would be noticeably higher than the test score. Therefore, I have decent trust in my model and believe it would perform well on deployment data._