In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)

# Load data

In [2]:
data = pd.read_csv('../data/cleaned_data_v2.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 33 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    696 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                         

In [3]:
data = data.rename(columns={
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? '
    :'What was the length of the tenancy',
    'What was the amount of the rental deposit? ':'rental deposit amount',
    'If any rent increases occurred, what was the rent after the increase(s)?':'was there an rent increases',
    'Over how many months did the arrears accumulate? ':'Over how many months did the arrears accumulate?',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ':'Does the tenant made a payment on the arrears after the eviction notice',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ':'What were the specific mental, medical, or physical conditions of the tenant, if any?'})

In [4]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns = ["What was the outcome of the case?"]), train_df["What was the outcome of the case?"]
X_test, y_test = test_df.drop(columns = ["What was the outcome of the case?"]), test_df["What was the outcome of the case?"]
X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,"If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?",Did the member find the tenant had sufficient income to pay rent?,Did the decision mention the tenant lost their job leading up to or during the period of the hearing?,"Did the decision mention any other extenuating circumstances experienced by the tenant leading up to or during the period of the claim (e.g. hospitalization, death in the family, etc.)?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?",Did the decision state the tenant was given prior notice for the eviction?,Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Payment Plan
195,TSL-90833-17,Roger Rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,1,0,0,0,0,0,[],1,0.0
553,TNL-07861-18,Nancy Morris,Toronto,0,1,0,1,0,0,24.0,...,1,0,0,1,0,1,0,[],0,0.5
598,TSL-96267-18,David Mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,1,0,[],0,0.0
645,TEL-81094-17,Shelby Whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,1,0,1,1,"['L1', 'L2']",0,0.0
634,TEL-80073-17,Jim McMaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,1,1,"['L1', 'L2', 'T2']",0,0.0


In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556 entries, 195 to 510
Data columns (total 32 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    556 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                       

In [6]:
y_train.value_counts()

No relief                   352
Postponement of eviction    124
Payment plan                 64
Relief                       12
Conditional Order             4
Name: What was the outcome of the case?, dtype: int64

there is an imbalanced class: 352's No relief and 124's Postponement of eviction and others. Therefore, we should add `class_weight = 'balanced'` in estimator's parameter.

## Preprocess data with pipeline

In [7]:
categorical_features = ['Who was the member adjudicating the decision?',
                        'What was the location of the landlord tenant board?',
                        'Which other specific applications of the landlord or the tenant were mentioned?'
                        ]

numeric_features = [
'What was the length of the tenancy',
'What was the monthly rent?',
'rental deposit amount',
'What was the total amount of arrears?',
'Over how many months did the arrears accumulate?',

]
ordinal_features = [
'Was the tenant employed at the time of the hearing?',
'Payment Plan'
]
drop_features = ['case number']
ordering_ordinal = [
    [-1,0,1],
    [0, 0.5, 1]
]

In [8]:
preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"),categorical_features),
        (StandardScaler(),numeric_features),
        (OrdinalEncoder(categories = ordering_ordinal),ordinal_features),
        ("drop",drop_features))
preprocessor

## Baseline model

In [9]:
from sklearn.dummy import DummyClassifier

cross_val_results = {}
dc = DummyClassifier()
scoring_metrics = ["accuracy"]
cross_val_results['dummy'] = pd.DataFrame(cross_validate(dc, X_train, y_train, cv=5, scoring = scoring_metrics, return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['dummy']



Unnamed: 0,mean,std
fit_time,0.001,0.0
score_time,0.0,0.0
test_accuracy,0.633,0.004
train_accuracy,0.633,0.001


## logistic regression

Use randomizedSearchCV to find the best C parameter's value.

In [10]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV


lr = LogisticRegression(random_state=123, multi_class='multinomial', solver='lbfgs') #class_weight = 'balanced',solver='lbfgs', 
pipe_lr = make_pipeline(preprocessor, lr)

param_dist = { 
    "logisticregression__C": loguniform(1e-3, 1e3),
    "logisticregression__class_weight": ["balanced", None]
}

random_search = RandomizedSearchCV(
            pipe_lr, param_distributions=param_dist, n_iter=20, 
            n_jobs=-1, verbose = 1, scoring= scoring_metrics, 
            refit = 'accuracy', return_train_score=True, random_state=123)

random_search.fit(X_train, y_train)




Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [11]:
random_search.best_params_

{'logisticregression__C': 0.36913136191432777,
 'logisticregression__class_weight': None}

In [12]:
random_search.best_score_

0.7032496782496783

In [13]:
lr_2 = LogisticRegression(random_state=123, C = 0.36913136191432777, solver='lbfgs', multi_class='multinomial') 
pipe_lr2 = make_pipeline(preprocessor, lr_2)

cross_val_results['logreg_best'] = pd.DataFrame(cross_validate(pipe_lr2, X_train, y_train, cv=5, scoring = scoring_metrics, return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['logreg_best']



Unnamed: 0,mean,std
fit_time,0.023,0.001
score_time,0.004,0.0
test_accuracy,0.703,0.03
train_accuracy,0.746,0.008


## set class_weight = 'balanced'

In [14]:
lr3 = LogisticRegression(random_state=123, class_weight = 'balanced',solver='lbfgs', multi_class='multinomial')
pipe_lr3 = make_pipeline(preprocessor, lr3)

param_dist = { 
    "logisticregression__C": loguniform(1e-3, 1e3)
}

random_search = RandomizedSearchCV(
            pipe_lr3, param_distributions=param_dist, n_iter=20, 
            n_jobs=-1, verbose = 1, scoring= scoring_metrics, 
            refit = 'accuracy', return_train_score=True, random_state=123)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
print(random_search.best_params_)
print(random_search.best_score_)

{'logisticregression__C': 766.6289057556017}
0.5377734877734878


In [16]:
lr_3 = LogisticRegression(random_state=123, C = 766.6289057556017,class_weight = 'balanced', solver='lbfgs', multi_class='multinomial') 
pipe_lr3 = make_pipeline(preprocessor, lr_3)

cross_val_results['logreg_best'] = pd.DataFrame(cross_validate(pipe_lr3, X_train, y_train, cv=5, scoring = scoring_metrics, return_train_score=True)).agg(['mean', 'std']).round(3).T
cross_val_results['logreg_best']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,mean,std
fit_time,0.033,0.005
score_time,0.004,0.001
test_accuracy,0.538,0.054
train_accuracy,0.753,0.015


## Conclusion:

It seems like if we add the `class_weight = 'balanced'`, the classfier is overfit with the training data. Thus, we will set the `class_weight = None`

In [17]:
pipe_lr2.fit(X_train, y_train)

In [18]:
pipe_lr2.score(X_test, y_test)

0.7071428571428572