In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
import pandas as pd
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)
import numpy as np

# Load data

In [2]:
data = pd.read_csv('../data/cleaned_data_v2.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 33 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    696 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                         

In [3]:
data = data.rename(columns={
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? '
    :'What was the length of the tenancy',
    'What was the amount of the rental deposit? ':'rental deposit amount',
    'If any rent increases occurred, what was the rent after the increase(s)?':'was there an rent increases',
    'Over how many months did the arrears accumulate? ':'Over how many months did the arrears accumulate?',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ':'Does the tenant made a payment on the arrears after the eviction notice',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ':'What were the specific mental, medical, or physical conditions of the tenant, if any?'})

In [4]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns = ["What was the outcome of the case?"]), train_df["What was the outcome of the case?"]
X_test, y_test = test_df.drop(columns = ["What was the outcome of the case?"]), test_df["What was the outcome of the case?"]
X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,"If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?",Did the member find the tenant had sufficient income to pay rent?,Did the decision mention the tenant lost their job leading up to or during the period of the hearing?,"Did the decision mention any other extenuating circumstances experienced by the tenant leading up to or during the period of the claim (e.g. hospitalization, death in the family, etc.)?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?",Did the decision state the tenant was given prior notice for the eviction?,Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Payment Plan
195,TSL-90833-17,Roger Rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,1,0,0,0,0,0,[],1,0.0
553,TNL-07861-18,Nancy Morris,Toronto,0,1,0,1,0,0,24.0,...,1,0,0,1,0,1,0,[],0,0.5
598,TSL-96267-18,David Mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,1,0,[],0,0.0
645,TEL-81094-17,Shelby Whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,1,0,1,1,"['L1', 'L2']",0,0.0
634,TEL-80073-17,Jim McMaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,1,1,"['L1', 'L2', 'T2']",0,0.0


In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556 entries, 195 to 510
Data columns (total 32 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    556 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                       

In [6]:
y_train.value_counts()

No relief                   352
Postponement of eviction    124
Payment plan                 64
Relief                       12
Conditional Order             4
Name: What was the outcome of the case?, dtype: int64

there is an imbalanced class: 352's No relief and 124's Postponement of eviction and others. Therefore, we should add `class_weight = 'balanced'` in estimator's parameter.

In [7]:
X_train['If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?'].unique()

array([ 0,  1, -1], dtype=int64)

## Preprocess data with pipeline

In [8]:
categorical_features = ['Who was the member adjudicating the decision?',
                        'What was the location of the landlord tenant board?',
                        'Which other specific applications of the landlord or the tenant were mentioned?'
                        ]

numeric_features = [
'What was the length of the tenancy',
'What was the monthly rent?',
'rental deposit amount',
'What was the total amount of arrears?',
'Over how many months did the arrears accumulate?',

]
ordinal_features = [
'Was the tenant employed at the time of the hearing?',
'Payment Plan',
'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?'
]
drop_features = ['case number']
ordering_ordinal = [
    [-1,0,1],
    [0, 0.5, 1],
    [ -1, 0, 1],
    [ -1, 0, 1]
]

passthrough_feats = list(
    set(X_train.columns)
    - set(categorical_features)
    - set(numeric_features)
    - set(ordinal_features)
    - set(drop_features)
)

In [9]:
preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"),categorical_features),
        ("passthrough", passthrough_feats),
        (StandardScaler(),numeric_features),
        (OrdinalEncoder(categories = ordering_ordinal),ordinal_features),
        ("drop",drop_features))
preprocessor

In [10]:
# column_names = (
#     preprocessor.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
#     + passthrough_feats    
#     + numeric_features
#     + ordinal_features
# )
# len(column_names)

## Baseline model

In [11]:
results = {}
scoring_metrics = ["accuracy"]

In [12]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [13]:
from sklearn.dummy import DummyClassifier
results["dummy"] = mean_std_cross_val_scores(
    DummyClassifier(), X_train, y_train, return_train_score=True,scoring = scoring_metrics
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.001 (+/- 0.000),0.000 (+/- 0.000),0.633 (+/- 0.004),0.633 (+/- 0.001)


## Multinomial logistic regression


In [14]:
pipe = make_pipeline(preprocessor, LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))


results["LR"] = mean_std_cross_val_scores(
    pipe, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.001 (+/- 0.000),0.000 (+/- 0.000),0.633 (+/- 0.004),0.633 (+/- 0.001)
LR,0.049 (+/- 0.005),0.006 (+/- 0.001),0.741 (+/- 0.054),0.859 (+/- 0.010)


### Hyperparameter tuning
Use randomizedSearchCV to find the best C parameters value.

In [15]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV


lr = LogisticRegression(random_state=123, multi_class='multinomial', solver='lbfgs', max_iter=1000) #class_weight = 'balanced',solver='lbfgs', 
pipe_lr = make_pipeline(preprocessor, lr)

param_dist = { 
    "logisticregression__C": loguniform(1e-3, 1e3),
    "logisticregression__class_weight": ["balanced", None]
}

random_search = RandomizedSearchCV(
            pipe_lr, param_distributions=param_dist, n_iter=20, 
            n_jobs=-1, verbose = 1, scoring= scoring_metrics, 
            refit = 'accuracy', return_train_score=True, random_state=123)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




In [16]:
random_search.best_params_

{'logisticregression__C': 0.36913136191432777,
 'logisticregression__class_weight': None}

In [17]:
random_search.best_score_

0.7464607464607464

In [18]:
lr_2 = LogisticRegression(random_state=123, C = 0.36913136191432777, solver='lbfgs', multi_class='multinomial', max_iter=1000) 
pipe_lr2 = make_pipeline(preprocessor, lr_2)

results["LR_noweighted_Best"] = mean_std_cross_val_scores(
    pipe_lr2, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.001 (+/- 0.000),0.000 (+/- 0.000),0.633 (+/- 0.004),0.633 (+/- 0.001)
LR,0.049 (+/- 0.005),0.006 (+/- 0.001),0.741 (+/- 0.054),0.859 (+/- 0.010)
LR_noweighted_Best,0.041 (+/- 0.002),0.006 (+/- 0.000),0.746 (+/- 0.039),0.826 (+/- 0.014)


## set class_weight = 'balanced'

In [19]:
lr3 = LogisticRegression(random_state=123, class_weight = 'balanced',solver='lbfgs', multi_class='multinomial', max_iter=1000)
pipe_lr3 = make_pipeline(preprocessor, lr3)

param_dist = { 
    "logisticregression__C": loguniform(1e-3, 1e3)
}

random_search = RandomizedSearchCV(
            pipe_lr3, param_distributions=param_dist, n_iter=20, 
            n_jobs=-1, verbose = 1, scoring= scoring_metrics, 
            refit = 'accuracy', return_train_score=True, random_state=123)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




In [20]:
print(random_search.best_params_)
print(random_search.best_score_)

{'logisticregression__C': 1.5463515822289584}
0.6494369369369368


In [21]:
lr_3 = LogisticRegression(random_state=123, C = 1.5463515822289584,class_weight = 'balanced', solver='lbfgs', multi_class='multinomial') 
pipe_lr3 = make_pipeline(preprocessor, lr_3)

results["LR_weighted_Best"] = mean_std_cross_val_scores(
    pipe_lr3, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.001 (+/- 0.000),0.000 (+/- 0.000),0.633 (+/- 0.004),0.633 (+/- 0.001)
LR,0.049 (+/- 0.005),0.006 (+/- 0.001),0.741 (+/- 0.054),0.859 (+/- 0.010)
LR_noweighted_Best,0.041 (+/- 0.002),0.006 (+/- 0.000),0.746 (+/- 0.039),0.826 (+/- 0.014)
LR_weighted_Best,0.036 (+/- 0.002),0.006 (+/- 0.001),0.649 (+/- 0.088),0.823 (+/- 0.008)


## Feature Importance

In [27]:
pipe_lr2.fit(X_train, y_train)

In [28]:
column_names = (
    pipe_lr2.named_steps['columntransformer'].named_transformers_['onehotencoder'].get_feature_names_out(categorical_features).tolist()
    + passthrough_feats    
    + numeric_features
    + ordinal_features
)
len(column_names)

166

In [29]:
pipe_lr2.named_steps['logisticregression'].coef_.shape

(5, 166)

The coefficients have an array of (5, 166) because we have a multiclass cases here (5 classes). Thus, an one-vs-rest strategy is applied. Sklearn creates 5 classifiers, that is why we have (5, 166) coefficient arrays. Then, for the feature selection we need to use squared coefficient approach. [reference paper](https://www.nature.com/articles/s41374-021-00662-x)

In [30]:
def squared_coef(coefs):
    """ 
    For each feature,  calculated the squared-coefficient of each
    feature by adding together the squared values of the feature's coefficient
    for each class.
    """
    squared_coef = np.square(coefs)
    results = np.sum(squared_coef, axis=0)
    return results


In [31]:
squared_coef(pipe_lr2.named_steps['logisticregression'].coef_).shape

(166,)

In [32]:
coefs= (
    pd.DataFrame(squared_coef(pipe_lr2.named_steps['logisticregression'].coef_), column_names, columns=["coef"]) # check first classifier
    .query("coef != 0")
    .sort_values("coef",ascending=False)
    .reset_index()
    .rename(columns={"index": "variable"})
)
# coefs.style.background_gradient('PuOr')

In [33]:
coefs

Unnamed: 0,variable,coef
0,Payment Plan,3.388722
1,Did the decision state the tenant attended the...,1.828997
2,Did the member find the tenant had sufficient ...,1.579449
3,Did the decisions state postponement would res...,1.517150
4,Did the decision mention the tenant’s difficul...,1.380098
...,...,...
161,Who was the member adjudicating the decision?_...,0.000638
162,Which other specific applications of the landl...,0.000428
163,Which other specific applications of the landl...,0.000344
164,Who was the member adjudicating the decision?_...,0.000328


In [34]:
import altair as alt

alt.Chart(
    coefs,
    title=pipe_lr.named_steps['logisticregression'].__str__()[:-2] + " Coefficients",
).mark_bar().encode(y=alt.Y("variable", sort="-x"), x="coef")

## Conclusion:

It seems like if we add the `class_weight = 'balanced'` and default LR, the classfier is overfit with the training data. Thus, we will set the `class_weight = None`

That is `pipe_lr2`

In [35]:
pipe_lr2.fit(X_train, y_train)

In [36]:
pipe_lr2.score(X_test, y_test)

0.7428571428571429

In [38]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.001 (+/- 0.000),0.000 (+/- 0.000),0.633 (+/- 0.004),0.633 (+/- 0.001)
LR,0.049 (+/- 0.005),0.006 (+/- 0.001),0.741 (+/- 0.054),0.859 (+/- 0.010)
LR_noweighted_Best,0.041 (+/- 0.002),0.006 (+/- 0.000),0.746 (+/- 0.039),0.826 (+/- 0.014)
LR_weighted_Best,0.036 (+/- 0.002),0.006 (+/- 0.001),0.649 (+/- 0.088),0.823 (+/- 0.008)
