In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)

# Load data

In [2]:
# Read the cleaned dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/cleaned_data_v2.csv')
# Print column dtypes and non-null counts as a quick sanity check of the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 47 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    696 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                         

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)

# Separate the outcome column (the prediction target) from the features in each split.
X_train = train_df.drop(columns=["What was the outcome of the case?"])
y_train = train_df["What was the outcome of the case?"]

X_test = test_df.drop(columns=["What was the outcome of the case?"])
y_test = test_df["What was the outcome of the case?"]

X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,L9 present?,N5 present?,N6 present?,N7 present?,N8 present?,T1 present?,T2 present?,T3 present?,T5 present?,T6 present?
195,TSL-90833-17,roger rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
553,TNL-07861-18,nancy morris,Toronto,0,1,0,1,0,0,24.0,...,0,0,0,0,0,0,0,0,0,0
598,TSL-96267-18,david mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
645,TEL-81094-17,shelby whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
634,TEL-80073-17,jim mcmaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# Column dtypes and non-null counts for the training features only.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556 entries, 195 to 510
Data columns (total 46 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    556 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                       

In [5]:
# Class distribution of the training target (used to judge class imbalance).
y_train.value_counts()

No relief            352
Relief               200
Conditional Order      4
Name: What was the outcome of the case?, dtype: int64

## Preprocess data with pipeline

In [6]:
# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    'Who was the member adjudicating the decision?',
    'What was the location of the landlord tenant board?'
]

# Columns that are standardised with StandardScaler by the preprocessor.
numeric_features = [
    'What was the length of the tenancy',
    'What was the monthly rent?',
    'rental deposit amount',
    'What was the total amount of arrears?',
    'Over how many months did the arrears accumulate?'
]

# Columns that are ordinal-encoded using the explicit category orders in
# `ordering_ordinal` below (matched to this list by position).
ordinal_features = [
    'Was the tenant employed at the time of the hearing?',
    'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
    'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?',
    'Payment Plan'
]

# Identifier column that is excluded from the model inputs.
drop_features = ['case number']

# One category ordering per entry of `ordinal_features`, in the same order.
# NOTE(review): -1/0/1 presumably encode "not stated"/"no"/"yes" and 0/0.5/1 a
# graded payment-plan value -- confirm against the data-cleaning step.
ordering_ordinal = [
    [-1,0,1],
    [-1,0,1],
    [-1,0,1],
    [0, 0.5, 1]
]

# Every column not claimed by one of the groups above is passed through
# unchanged. The list is built by filtering X_train.columns rather than by set
# arithmetic: iteration order of a set of strings can change between
# interpreter runs (hash randomization), which would shuffle the model's input
# columns on every kernel restart and make the reported scores irreproducible.
already_assigned = set(
    categorical_features + numeric_features + ordinal_features + drop_features
)
passthrough_feats = [
    column for column in X_train.columns if column not in already_assigned
]

In [7]:
# Build the column-wise preprocessing step. The transformer order below fixes
# the column order of the transformed matrix, so it must not be rearranged
# without updating any code that reconstructs feature names from these lists.
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
standard_scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder(categories=ordering_ordinal)

preprocessor = make_column_transformer(
    (onehot_encoder, categorical_features),
    ("passthrough", passthrough_feats),
    (standard_scaler, numeric_features),
    (ordinal_encoder, ordinal_features),
    ("drop", drop_features),
)
preprocessor

## Baseline model

In [8]:
# adapt from DSCI573 lecture note
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        Extra keyword arguments forwarded unchanged to ``cross_validate``
        (for example ``scoring`` or ``return_train_score``).

    Returns
    ----------
        pandas Series indexed by the keys of the ``cross_validate`` result;
        each value is the string "<mean> (+/- <std>)" with three decimals,
        computed across the cross-validation folds.
    """
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values directly instead of indexing with integers: integer keys
    # on a string-labelled Series rely on a deprecated positional fallback.
    formatted = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=formatted, index=mean_scores.index)

In [9]:
from sklearn.dummy import DummyClassifier

# Shared across the model-comparison cells: the metric list and the results table source.
scoring_metrics = ["accuracy"]
results_dict = {}

# Baseline: a dummy classifier behind the same preprocessing as the real models.
dummy = make_pipeline(preprocessor, DummyClassifier(random_state=123))
results_dict["dummy"] = mean_std_cross_val_scores(
    dummy,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)

# One row per model, one column per cross-validation statistic.
results_df = pd.DataFrame(results_dict).transpose()
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.013 (+/- 0.006),0.009 (+/- 0.005),0.633 (+/- 0.004),0.633 (+/- 0.001)


_With the default settings of DummyClassifier, the mean cross-validation accuracy of our baseline model is 0.633, which is simply the share of the majority class ("No relief", 352 of 556 training cases). Since this is a multi-class classification problem with class imbalance, this accuracy score is low and not quite representative._

# LightGBM model

In [10]:
from lightgbm.sklearn import LGBMClassifier

# LightGBM with default hyperparameters behind the shared preprocessing step.
pipe_lgbm = make_pipeline(preprocessor, LGBMClassifier(random_state=123))

results_dict["lgbm"] = mean_std_cross_val_scores(
    pipe_lgbm,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)

# Rebuild the comparison table so it includes the new row.
results_df = pd.DataFrame(results_dict).transpose()
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.013 (+/- 0.006),0.009 (+/- 0.005),0.633 (+/- 0.004),0.633 (+/- 0.001)
lgbm,0.108 (+/- 0.011),0.008 (+/- 0.001),0.795 (+/- 0.023),1.000 (+/- 0.000)


_With default hyperparameters, our LightGBM model gets a better mean cross-validation score (0.795) than the baseline model (0.633). However, the model is clearly overfit: the mean train score (1.000) is 0.205 higher than the mean cv score._

# Hyperparameter optimization

In [11]:
# Search space for the LightGBM step; keys follow the "<step>__<param>" pipeline convention.
# NOTE(review): LightGBM documents max_depth <= 0 as "no limit", so the value 0
# below means unconstrained depth rather than a depth of zero -- confirm this is intended.
param_grid = {
    "lgbmclassifier__learning_rate": np.arange(0.01, 0.3, 0.001),
    "lgbmclassifier__max_depth": np.arange(0, 12, 1),
    "lgbmclassifier__num_leaves": np.arange(2, 40, 2),
    "lgbmclassifier__min_split_gain": np.arange(0, 15, 1),
    "lgbmclassifier__class_weight": ["balanced", None],
}

# Sample 20 candidate settings; the seed makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(
    estimator=pipe_lgbm,
    param_distributions=param_grid,
    n_iter=20,
    scoring=scoring_metrics,
    refit='accuracy',
    return_train_score=True,
    n_jobs=-1,
    random_state=123,
)
random_search.fit(X_train, y_train)

# Columns of cv_results_ worth reporting, with the rank used as the index.
report_columns = [
    "mean_test_accuracy",
    "param_lgbmclassifier__learning_rate",
    "param_lgbmclassifier__max_depth",
    "param_lgbmclassifier__num_leaves",
    "param_lgbmclassifier__min_split_gain",
    "param_lgbmclassifier__class_weight",
    "mean_fit_time",
    "rank_test_accuracy",
]

# Best candidate first; transposed so each candidate is a column.
results = (
    pd.DataFrame(random_search.cv_results_)[report_columns]
    .set_index("rank_test_accuracy")
    .sort_index()
    .T
)
results



rank_test_accuracy,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
mean_test_accuracy,0.812999,0.802172,0.798584,0.784218,0.778781,0.760795,0.759025,0.757191,0.753636,0.75362,0.748295,0.746396,0.739221,0.71768,0.713948,0.712355,0.708768,0.706902,0.703346,0.6889
param_lgbmclassifier__learning_rate,0.228,0.024,0.099,0.071,0.044,0.082,0.091,0.045,0.291,0.04,0.141,0.031,0.222,0.184,0.217,0.018,0.236,0.066,0.299,0.091
param_lgbmclassifier__max_depth,1.0,8.0,8.0,5.0,9,10.0,0.0,9.0,7.0,1.0,2,9.0,8.0,1,7.0,2,2,9,8,2
param_lgbmclassifier__num_leaves,4.0,36.0,2.0,14.0,26,22.0,34.0,22.0,10.0,38.0,34,16.0,8.0,20,26.0,12,16,8,22,28
param_lgbmclassifier__min_split_gain,0.0,0.0,1.0,2.0,0,8.0,6.0,7.0,5.0,4.0,1,6.0,10.0,6,13.0,5,7,10,7,14
param_lgbmclassifier__class_weight,,,,,balanced,,,,,,balanced,,,balanced,,balanced,balanced,balanced,balanced,balanced
mean_fit_time,0.043507,0.125424,0.045657,0.039141,0.128243,0.036843,0.036678,0.040125,0.043218,0.049142,0.047262,0.042129,0.039412,0.038817,0.041523,0.053366,0.039881,0.04756,0.040763,0.042257


In [12]:
# Build the tuned (still unfitted) pipeline from the parameters the search
# actually selected instead of copying them by hand from the results table, so
# this cell cannot drift out of sync when the search is re-run.
# `best_params_` uses the same "lgbmclassifier__<param>" keys as the search
# space, which `set_params` routes to the LightGBM step.
best_pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=123),
).set_params(**random_search.best_params_)

In [13]:
# Cross-validate the tuned pipeline with the same metric and settings as the other models.
results_dict["best_lgbm"] = mean_std_cross_val_scores(
    best_pipe_lgbm,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)

# Rebuild the comparison table so it includes the tuned model.
results_df = pd.DataFrame(results_dict).transpose()
results_df



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.013 (+/- 0.006),0.009 (+/- 0.005),0.633 (+/- 0.004),0.633 (+/- 0.001)
lgbm,0.108 (+/- 0.011),0.008 (+/- 0.001),0.795 (+/- 0.023),1.000 (+/- 0.000)
best_lgbm,0.026 (+/- 0.006),0.007 (+/- 0.001),0.813 (+/- 0.029),0.853 (+/- 0.007)


_As shown above, we tuned five hyperparameters: learning_rate, max_depth, num_leaves, min_split_gain and class_weight. The highest mean cv score we got is 0.813 with learning_rate=0.228, max_depth=1, num_leaves=4, min_split_gain=0 and class_weight=None. This is better than with the default hyperparameters, which gave a mean cv score of 0.795. Additionally, we can see that the model is much less overfit, with the mean train score (0.853) being only 0.04 higher than the mean cv score._

# Interpretation and feature importances 

In [14]:
# Requires the eli5 package; if it is missing, install it into the kernel's environment with: %pip install eli5

In [15]:
import eli5

# Fit the tuned pipeline first so the feature names can be read from the same
# fitted transformer that feeds the model being explained (rather than from
# the separate estimator held by the search object).
best_pipe_lgbm.fit(X_train, y_train)

# Expanded one-hot column names, e.g. "<original column>_<category>".
categorical_feature_names = (
    best_pipe_lgbm
    .named_steps["columntransformer"]
    .named_transformers_["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

# The concatenation order must mirror the transformer order inside the
# preprocessor: one-hot, passthrough, scaled numeric, ordinal.
feature_names = (
    categorical_feature_names + passthrough_feats + numeric_features + ordinal_features
)

fitted_lgbm = best_pipe_lgbm.named_steps["lgbmclassifier"]

# Guard against silently mislabelled importances if the column groups change.
assert len(feature_names) == fitted_lgbm.n_features_, (
    "feature_names does not match the number of columns seen by the model"
)

# eli5 feature importance weights for LGBM classifier
eli5.explain_weights(fitted_lgbm, top=None, feature_names=feature_names)

Weight,Feature
0.2275,Did the decision state the tenant attended the hearing?
0.2175,Payment Plan
0.1102,What was the total amount of arrears?
0.0836,"Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?"
0.0608,rental deposit amount
0.0533,"What were the specific mental, medical, or physical conditions of the tenant, if any?"
0.0453,Did the decision state the tenant was represented?
0.0298,Did the decision state the tenant was collecting a subsidy?
0.0278,"If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?"
0.0225,What was the location of the landlord tenant board?_London


_Based on the eli5 results above, "Did the decision state the tenant attended the hearing?" is the strongest variable used by the LGBM classifier, with a weight of 0.2275. The second strongest variable, "Payment Plan", gets a weight of 0.2175. Next, the variable "What was the total amount of arrears?" gets a weight of 0.1102. All other variables have a weight lower than 0.09, which indicates they are not as important._

# Conclusion

In [16]:
# Score the tuned pipeline on the held-out test split. This relies on
# best_pipe_lgbm already having been fit on the training split in an earlier cell.
test_score = best_pipe_lgbm.score(X_test, y_test)
test_score

0.7857142857142857

_The test score for our LightGBM model is 0.786, which is reasonably close to the mean validation score of 0.813 from before. Since the validation score and the test score are similar, we do not see evidence of strong optimization bias; if there were, the validation score would be much higher than the test score. Therefore, we have decent trust in the model and believe it would do well on deployment data._