In [1]:
from sklearn import svm
from sklearn.dummy import DummyRegressor
import pandas as pd
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)
import numpy as np

# Load data

In [2]:
# Read the cleaned dataset from its relative path and print a per-column
# summary (non-null counts and dtypes).
data_path = '../data/cleaned_data_v2.csv'
data = pd.read_csv(data_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 696 entries, 0 to 695
Data columns (total 47 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    696 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                         

In [3]:
# Map the long survey-question headers to shorter column names.
# Note that several of the original headers end with a trailing space.
column_aliases = {
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ':
        'What was the length of the tenancy',
    'What was the amount of the rental deposit? ':
        'rental deposit amount',
    'If any rent increases occurred, what was the rent after the increase(s)?':
        'was there an rent increases',
    'Over how many months did the arrears accumulate? ':
        'Over how many months did the arrears accumulate?',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ':
        'Does the tenant made a payment on the arrears after the eviction notice',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ':
        'What were the specific mental, medical, or physical conditions of the tenant, if any?',
}
data = data.rename(columns=column_aliases)

In [4]:
# Hold out 20% of the rows for testing, then separate the target column
# from the features in each partition.
target_column = "What was the outcome of the case?"

train_df, test_df = train_test_split(data, test_size=0.2, random_state=123)

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

X_train.head()

Unnamed: 0,case number,Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,Did the decision state the tenant was collecting a subsidy?,What was the length of the tenancy,...,L9 present?,N5 present?,N6 present?,N7 present?,N8 present?,T1 present?,T2 present?,T3 present?,T5 present?,T6 present?
195,TSL-90833-17,roger rodrigues,Toronto,1,0,0,0,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
553,TNL-07861-18,nancy morris,Toronto,0,1,0,1,0,0,24.0,...,0,0,0,0,0,0,0,0,0,0
598,TSL-96267-18,david mungovan,Toronto,1,0,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
645,TEL-81094-17,shelby whittick,Whitby,0,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,0,0,0,0
634,TEL-80073-17,jim mcmaster,Toronto,1,1,0,1,0,0,6.306049,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# Per-column non-null counts and dtypes for the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 556 entries, 195 to 510
Data columns (total 46 columns):
 #   Column                                                                                                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                         --------------  -----  
 0   case number                                                                                                                                                                                                                    556 non-null    object 
 1   Who was the member adjudicating the decision?                                                                                       

In [6]:
# Number of training rows per outcome class.
y_train.value_counts()

No relief            352
Relief               200
Conditional Order      4
Name: What was the outcome of the case?, dtype: int64

The classes are imbalanced: 352 `No relief` cases versus 200 `Relief` and only 4 `Conditional Order`. Therefore, we should consider passing `class_weight='balanced'` to the estimator.

In [7]:
# Inspect which codes occur in the employment-stability column.
employment_stability_col = 'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?'
X_train[employment_stability_col].unique()

array([ 0,  1, -1], dtype=int64)

## Preprocess data with pipeline

In [8]:
# Nominal columns: one-hot encoded.
categorical_features = [
    'Who was the member adjudicating the decision?',
    'What was the location of the landlord tenant board?',
]

# Continuous columns: standardized.
numeric_features = [
    'What was the length of the tenancy',
    'What was the monthly rent?',
    'rental deposit amount',
    'What was the total amount of arrears?',
    'Over how many months did the arrears accumulate?',
]

# Columns whose codes have a meaningful order: ordinal encoded.
ordinal_features = [
    'Was the tenant employed at the time of the hearing?',
    'Payment Plan',
    'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
    'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?',
]

# Identifier column: carries no predictive signal, so it is dropped.
drop_features = ['case number']

# Category order for each ordinal feature; entry i belongs to ordinal_features[i].
ordering_ordinal = [
    [-1, 0, 1],
    [0, 0.5, 1],
    [-1, 0, 1],
    [-1, 0, 1],
]
assert len(ordering_ordinal) == len(ordinal_features), (
    "each ordinal feature needs exactly one category ordering"
)

# Every remaining column is passed through unchanged. The list is built by
# filtering X_train.columns in order instead of using set arithmetic: set
# iteration order for strings depends on the interpreter's hash seed, so the
# previous list(set(...) - ...) produced a different column order on each
# kernel restart.
assigned_features = set(
    categorical_features + numeric_features + ordinal_features + drop_features
)
passthrough_feats = [
    column for column in X_train.columns if column not in assigned_features
]

In [9]:
# One transformer per feature group. The order of the tuples fixes the order
# of the output columns, so it is kept exactly as before.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
standard_scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder(categories=ordering_ordinal)

preprocessor = make_column_transformer(
    (one_hot_encoder, categorical_features),
    ("passthrough", passthrough_feats),
    (standard_scaler, numeric_features),
    (ordinal_encoder, ordinal_features),
    ("drop", drop_features),
)
preprocessor

## Baseline model

In [10]:
# Accumulates one formatted cross-validation summary per model name.
results = dict()
# Metrics passed to scikit-learn's cross-validation helpers.
scoring_metrics = ["accuracy"]

In [11]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarize cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (e.g. ``scoring``,
        ``return_train_score``)

    Returns
    ----------
        pandas Series indexed by the keys returned from ``cross_validate``;
        each value is a string of the form "mean (+/- std)" with three
        decimals, computed across the folds
    """
    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Iterate over the values directly. Indexing these string-labelled Series
    # with an integer position (series[i]) is deprecated in pandas.
    formatted = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=formatted, index=mean_scores.index)

In [16]:
from sklearn.dummy import DummyClassifier

# Baseline: a DummyClassifier with default settings, cross-validated on the
# raw training data.
dummy_clf = DummyClassifier()
results["dummy"] = mean_std_cross_val_scores(
    dummy_clf,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.002 (+/- 0.000),0.001 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)


### Linear SVM


In [17]:
# Untuned LinearSVC on top of the shared preprocessor. random_state is fixed
# to the same seed used for the other LinearSVC models in this notebook so
# that this row is reproducible across runs.
pipe = make_pipeline(preprocessor, svm.LinearSVC(random_state=123))

results["Linear_SVC"] = mean_std_cross_val_scores(
    pipe, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.002 (+/- 0.000),0.001 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)
Linear_SVC,0.150 (+/- 0.029),0.033 (+/- 0.023),0.754 (+/- 0.046),0.862 (+/- 0.012)


### Hyperparameter tuning
Use `RandomizedSearchCV` to find the best value of the `C` parameter, together with the `class_weight` and `multi_class` settings.

In [18]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

# Estimator to tune, wrapped with the shared preprocessor.
linear_svc = svm.LinearSVC(max_iter=1000, random_state=123)
pipe_linear_svc = make_pipeline(preprocessor, linear_svc)

# Search space: C is sampled log-uniformly over [1e-3, 1e3]; the other two
# parameters are chosen from the listed options.
param_dist = dict(
    linearsvc__C=loguniform(1e-3, 1e3),
    linearsvc__class_weight=["balanced", None],
    linearsvc__multi_class=["ovr", "crammer_singer"],
)

# 20 sampled candidates; the pipeline is refit using the best "accuracy" score.
random_search = RandomizedSearchCV(
    estimator=pipe_linear_svc,
    param_distributions=param_dist,
    n_iter=20,
    scoring=scoring_metrics,
    refit='accuracy',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=123,
)

random_search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits




In [19]:
# Parameter combination selected by the randomized search (refit on accuracy).
random_search.best_params_

{'linearsvc__C': 0.35890684548321944,
 'linearsvc__class_weight': 'balanced',
 'linearsvc__multi_class': 'crammer_singer'}

In [20]:
# Cross-validated accuracy of the selected parameter combination.
random_search.best_score_

0.7627413127413127

In [21]:
# Rebuild the pipeline with the hyperparameters chosen by the randomized
# search. They are read from random_search.best_params_ rather than copied by
# hand from the printed output, so this cell stays correct if the search is
# re-run with a different seed, space, or data.
linear_svc_2 = svm.LinearSVC(random_state=123, max_iter=1000)
pipe_linear_svc_2 = make_pipeline(preprocessor, linear_svc_2)
# Keys are of the form "linearsvc__<param>", which Pipeline.set_params routes
# to the LinearSVC step.
pipe_linear_svc_2.set_params(**random_search.best_params_)

results["Linear_SVC_Best"] = mean_std_cross_val_scores(
    pipe_linear_svc_2, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T



Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy
dummy,0.002 (+/- 0.000),0.001 (+/- 0.001),0.633 (+/- 0.004),0.633 (+/- 0.001)
Linear_SVC,0.150 (+/- 0.029),0.033 (+/- 0.023),0.754 (+/- 0.046),0.862 (+/- 0.012)
Linear_SVC_Best,0.074 (+/- 0.024),0.011 (+/- 0.004),0.763 (+/- 0.058),0.836 (+/- 0.015)


## Feature Importance

In [22]:
# Fit the tuned pipeline on the full training set so that its learned
# coefficients can be inspected.
pipe_linear_svc_2.fit(X_train, y_train)



In [23]:
# Reconstruct the names of the transformed columns in the order the
# ColumnTransformer emits them: one-hot, passthrough, numeric, ordinal.
fitted_preprocessor = pipe_linear_svc_2.named_steps['columntransformer']
fitted_one_hot = fitted_preprocessor.named_transformers_['onehotencoder']
one_hot_names = fitted_one_hot.get_feature_names_out(categorical_features).tolist()

column_names = one_hot_names + passthrough_feats + numeric_features + ordinal_features
len(column_names)

126

In [26]:
# Shape of the fitted coefficient matrix; the second dimension should match
# len(column_names).
pipe_linear_svc_2.named_steps["linearsvc"].coef_.shape

(3, 126)

In [27]:
def squared_coef(coefs):
    """
    Collapse a coefficient matrix into one non-negative value per feature.

    Every coefficient is squared element-wise and the squares are then
    summed along axis 0, so an input of shape (rows, n_features) yields an
    array of shape (n_features,).
    """
    per_entry_squares = np.square(coefs)
    return np.sum(per_entry_squares, axis=0)


In [28]:
# Sanity check: the aggregation should leave one value per transformed feature.
squared_coef(pipe_linear_svc_2.named_steps['linearsvc'].coef_).shape

(126,)

In [30]:
# Per-feature importance: squared coefficients summed over all rows of coef_.
summed_squares = squared_coef(pipe_linear_svc_2.named_steps["linearsvc"].coef_)
coef_table = pd.DataFrame(summed_squares, column_names, columns=["coef"])

# Drop features whose value is exactly zero and rank the rest, largest first.
coef_table = coef_table.query("coef != 0")
coef_table = coef_table.sort_values("coef", ascending=False)

# Move the feature names out of the index into a "variable" column.
coefs = coef_table.reset_index().rename(columns={"index": "variable"})

In [31]:
# Features ranked by summed squared coefficient (exact zeros already removed).
coefs

Unnamed: 0,variable,coef
0,What was the location of the landlord tenant b...,1.134795e+00
1,Did the decision mention the tenant’s difficul...,6.235867e-01
2,Did the decision state the tenant was represen...,6.037648e-01
3,"What were the specific mental, medical, or phy...",6.019093e-01
4,Who was the member adjudicating the decision?_...,5.743243e-01
...,...,...
117,Who was the member adjudicating the decision?_...,3.641211e-34
118,Who was the member adjudicating the decision?_...,2.337030e-34
119,Who was the member adjudicating the decision?_...,6.808463e-35
120,L4 present?,6.211312e-35


In [32]:
import altair as alt

# Horizontal bar chart of the per-feature importance, largest bar on top.
# The title uses the estimator's class name. Slicing str(estimator)[:-2] only
# strips "()" when the estimator has no non-default parameters; for the tuned
# model it put the whole parameter repr, minus its last two characters, into
# the title.
svc_step = pipe_linear_svc_2.named_steps["linearsvc"]

alt.Chart(
    coefs,
    title=f"{type(svc_step).__name__} Coefficients",
).mark_bar().encode(y=alt.Y("variable", sort="-x"), x="coef")