# Decision Trees and Ensemble Learning

## Credit Risk Scoring Project

- Dataset: https://github.com/gastonstat/CreditScoring

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

## Data Cleaning and Preparation

- Downloading the dataset
- Re-encoding the categorical variables
- Train/Validation/Test split

In [2]:
# Downloading the dataset: the CSV is read directly from its raw GitHub URL,
# so this cell needs network access.
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-06-trees/CreditScoring.csv"
df = pd.read_csv(data)

In [3]:
# Preview the first rows of the raw data; the categorical columns are still integer codes.
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [4]:
# Normalise every column name to lower case so the columns can be accessed
# uniformly (e.g. df.status) in the rest of the notebook.
df.columns = [column_name.lower() for column_name in df.columns]
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [5]:
# Distribution of the raw target codes; note the single row with code 0.
df.status.value_counts()

status
1    3200
2    1254
0       1
Name: count, dtype: int64

In [6]:
# Lookup tables translating the integer codes of each categorical column into
# text labels. Every table maps code 0 to the label "unk".
status_values = {1: "ok", 2: "default", 0: "unk"}
home_values = {
    1: "rent",
    2: "owner",
    3: "private",
    4: "ignore",
    5: "parents",
    6: "other",
    0: "unk",
}
marital_values = {
    1: "single",
    2: "married",
    3: "widow",
    4: "separated",
    5: "divorced",
    0: "unk",
}
records_values = {1: "no", 2: "yes", 0: "unk"}
job_values = {1: "fixed", 2: "partime", 3: "freelance", 4: "others", 0: "unk"}

# Re-encode each column with its table. Series.map yields NaN for any value
# that is not a key of the table, so this cell must run exactly once on the
# freshly loaded data: a second run would turn the text labels into NaN.
for column_name, code_labels in (
    ("status", status_values),
    ("home", home_values),
    ("marital", marital_values),
    ("records", records_values),
    ("job", job_values),
):
    df[column_name] = df[column_name].map(code_labels)

In [7]:
# Check the re-encoding: the categorical columns now hold text labels instead of codes.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [8]:
# Summary statistics of the numeric columns, rounded to whole numbers.
# The maxima of income, assets and debt (99999999) stand out as implausible.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [9]:
# Confirm the suspicious maximum: 99999999 looks like a placeholder rather than a real income.
df.income.max()

99999999

In [10]:
# 99999999 is treated as a missing-value code in these three columns: replace
# it with NaN so it stops distorting the summary statistics.
sentinel_columns = ["income", "assets", "debt"]
df[sentinel_columns] = df[sentinel_columns].replace(99999999.0, np.nan)

In [11]:
# Re-check the summary: income, assets and debt now have lower counts (NaNs are
# excluded) and far smaller means and maxima.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [12]:
# Discard the rows whose target is "unk" and renumber the remaining rows from 0.
is_known_status = df["status"].ne("unk")
df = df.loc[is_known_status].reset_index(drop=True)

In [None]:
# Split the data 60/20/20 into train/validation/test:
# first hold out 20% as the test set, then take 25% of the remaining 80%
# (i.e. 20% of the whole) as the validation set. random_state fixes the shuffle.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each split from 0; df_full_train keeps its shuffled index.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [19]:
# Binary targets as NumPy arrays: 1 when the client defaulted, 0 otherwise.
y_train, y_val, y_test = (
    frame["status"].eq("default").astype("int").to_numpy()
    for frame in (df_train, df_val, df_test)
)

In [20]:
# Remove the target column from the three frames whose labels were extracted
# into y_train / y_val / y_test, so the label cannot end up among the features.
# df_full_train deliberately keeps "status": no y_full_train has been extracted
# from it, so deleting the column there would lose those labels for good.
del df_train["status"]
del df_val["status"]
del df_test["status"]

In [21]:
# Inspect the training features; the status column is no longer present.
df_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,12,private,24,28,married,no,fixed,45,102.0,,,700,1229
1,4,other,60,28,married,no,fixed,60,143.0,0.0,0.0,1150,1630
2,0,owner,60,27,married,no,fixed,35,295.0,6000.0,0.0,1950,2208
3,6,owner,48,28,married,no,fixed,45,114.0,3500.0,0.0,650,1255
4,12,owner,36,42,married,no,freelance,60,0.0,15000.0,0.0,1000,1500


## Decision trees

- What a [decision tree](https://www.geeksforgeeks.org/machine-learning/building-and-implementing-decision-tree-classifiers-with-scikit-learn-a-comprehensive-guide/) looks like
- Training a decision tree
- Overfitting 
- Controlling the size of a tree

In [27]:
# Decision tree example
def assess_risk(client):
    """Classify a client with a tiny hand-written decision tree.

    Parameters
    ----------
    client : dict
        Must contain the key "records". Clients with records == "yes" also
        need "job"; all other clients need "assets".

    Returns
    -------
    str
        "default" or "ok".
    """
    if client["records"] == "yes":
        # The dataset spells this job category "partime" (see job_values),
        # so the comparison must use the same spelling to ever match.
        if client["job"] == "partime":
            return "default"
        else:
            return "ok"
    else:
        # A missing (NaN) assets value compares False here, so such clients
        # fall through to "default".
        if client["assets"] > 6000:
            return "ok"
        else:
            return "default"

In [28]:
# Use the first training row, converted to a plain dict, as an example client.
x = df_train.iloc[0].to_dict()
x

{'seniority': 12,
 'home': 'private',
 'time': 24,
 'age': 28,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 45,
 'income': 102.0,
 'assets': nan,
 'debt': nan,
 'amount': 700,
 'price': 1229}

In [29]:
# This client has no records and a NaN assets value; NaN > 6000 is False,
# so the hand-written tree answers "default".
assess_risk(x)

'default'

In [31]:
# Replace missing values with 0, then turn each row into a dict, the input
# format passed to DictVectorizer below.
train_dicts = df_train.fillna(0).to_dict(orient="records")

In [32]:
# Fit the vectorizer on the training rows and build a dense feature matrix
# (sparse=False). String features become indicator columns such as "records=yes".
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_train

array([[2.80e+01, 7.00e+02, 0.00e+00, ..., 0.00e+00, 1.20e+01, 2.40e+01],
       [2.80e+01, 1.15e+03, 0.00e+00, ..., 0.00e+00, 4.00e+00, 6.00e+01],
       [2.70e+01, 1.95e+03, 6.00e+03, ..., 0.00e+00, 0.00e+00, 6.00e+01],
       ...,
       [3.20e+01, 1.49e+03, 0.00e+00, ..., 1.00e+00, 9.00e+00, 6.00e+01],
       [2.10e+01, 1.40e+03, 3.00e+03, ..., 0.00e+00, 0.00e+00, 6.00e+01],
       [2.60e+01, 1.40e+03, 0.00e+00, ..., 0.00e+00, 8.00e+00, 6.00e+01]])

In [33]:
# Training target vector: 1 = default, 0 = otherwise.
y_train

array([1, 0, 0, ..., 1, 0, 0])

In [34]:
# Train a tree with default settings (no depth limit).
# NOTE(review): no random_state is passed, so the fitted tree and the scores
# below may differ slightly between runs; consider fixing a seed.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [35]:
# Prepare the validation set the same way as the training set. Only transform()
# is called, so the feature columns learned from the training data are reused.
val_dicts = df_val.fillna(0).to_dict(orient="records")
X_val = dv.transform(val_dicts)

In [36]:
# Score the validation set: take the predicted probability of class 1 (default)
# and measure ranking quality with ROC AUC.
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

0.6451597655511438

In [None]:
# Overfitting check: the same tree scores a perfect AUC on the data it was
# trained on but only about 0.65 on validation, i.e. it memorised the training set.
y_pred = dt.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_pred)

1.0

In [48]:
# Retrain with the depth capped at 3 to limit how closely the tree can fit
# the training data.
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [49]:
# Report ROC AUC of the current tree on the training and validation sets;
# a small gap between the two indicates less overfitting.
for label, features, target in (
    ("train:", X_train, y_train),
    ("val:", X_val, y_val),
):
    y_pred = dt.predict_proba(features)[:, 1]
    auc = roc_auc_score(target, y_pred)
    print(label, auc)

train: 0.7769739797926325
val: 0.7519852524106636


In [50]:
# Print the learned rules as text, labelling each split with the feature names
# produced by the DictVectorizer instead of column indices.
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- income <= 89.50
|   |--- seniority <= 2.50
|   |   |--- records=yes <= 0.50
|   |   |   |--- class: 1
|   |   |--- records=yes >  0.50
|   |   |   |--- class: 1
|   |--- seniority >  2.50
|   |   |--- records=yes <= 0.50
|   |   |   |--- class: 0
|   |   |--- records=yes >  0.50
|   |   |   |--- class: 1
|--- income >  89.50
|   |--- records=yes <= 0.50
|   |   |--- job=partime <= 0.50
|   |   |   |--- class: 0
|   |   |--- job=partime >  0.50
|   |   |   |--- class: 0
|   |--- records=yes >  0.50
|   |   |--- seniority <= 4.50
|   |   |   |--- class: 1
|   |   |--- seniority >  4.50
|   |   |   |--- class: 0

