# Trees

In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [63]:
# Load the raw credit-scoring table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='CreditScoring.csv')

In [64]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


## Data Cleaning

In [65]:
def standardise_colnames(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` whose column names are lower-cased with spaces turned into underscores.

  The input frame is left untouched; only the returned copy carries the new names.
  """
  renamed = df.copy()
  lowered = renamed.columns.str.lower()
  renamed.columns = lowered.str.replace(' ', '_')
  return renamed

In [66]:
# Normalise the column names of the freshly loaded frame.
df = standardise_colnames(df=df)

In [67]:
# Check the renamed columns on the first five rows.
df.head(n=5)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [68]:
# Decode the numeric status codes into labels; codes missing from the mapping become NaN.
status_values = {0: 'unk', 1: 'ok', 2: 'default'}
df['status'] = df['status'].map(status_values)

In [69]:
# Decode the numeric home codes into labels; codes missing from the mapping become NaN.
home_values = {
  0: 'unk', 1: 'rent', 2: 'owner', 3: 'private',
  4: 'ignore', 5: 'parents', 6: 'other',
}
df['home'] = df['home'].map(home_values)

In [70]:
# Decode the numeric marital-status codes into labels; codes missing from the mapping become NaN.
marital_values = {
  0: 'unk', 1: 'single', 2: 'married',
  3: 'widow', 4: 'separated', 5: 'divorced',
}
df['marital'] = df['marital'].map(marital_values)

In [71]:
# Decode the numeric records codes into labels; codes missing from the mapping become NaN.
records_values = {0: 'unk', 1: 'no', 2: 'yes'}
df['records'] = df['records'].map(records_values)

In [72]:
# Decode the numeric job codes into labels; codes missing from the mapping become NaN.
job_values = {
  0: 'unk', 1: 'fixed', 2: 'partime',
  3: 'freelance', 4: 'others',
}
df['job'] = df['job'].map(job_values)

In [73]:
# Summary statistics of the numeric columns, rounded to whole numbers for readability.
numeric_summary = df.describe()
numeric_summary.round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [74]:
# 99999999 is the placeholder that appears as the max of income, assets and debt in
# the summary above; treat it as "missing". Replace that literal rather than each
# column's max(): max() would blank out a genuine maximum if this cell were re-run
# or if a column did not contain the placeholder at all.
MISSING_SENTINEL = 99999999
for col in ['income', 'assets', 'debt']:
  df[col] = df[col].replace(MISSING_SENTINEL, np.nan)

In [75]:
# Keep only rows whose status is not 'unk', then renumber the index from zero.
has_known_status = df['status'] != 'unk'
df = df[has_known_status].reset_index(drop=True)

## Validation Framework

In [None]:
# Two-stage split with a fixed seed: hold out 20% for test, then 25% of the remaining
# 80% for validation, i.e. roughly 60% train / 20% validation / 20% test overall.
SPLIT_SEED = 11
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=SPLIT_SEED)

In [78]:
# Discard the shuffled row labels left by the split so every partition is indexed from zero.
df_train, df_val, df_test = (
  part.reset_index(drop=True)
  for part in (df_train, df_val, df_test)
)

In [80]:
# Binary targets as integer arrays: 1 where status is 'default', 0 otherwise.
y_train, y_val, y_test = (
  (part['status'] == 'default').astype(int).values
  for part in (df_train, df_val, df_test)
)

In [85]:
# Remove the target column from the feature frames so it cannot be used as a feature.
# Rebinding the result of a non-inplace drop avoids mutating the frames in place;
# errors='ignore' keeps the cell a harmless no-op if the column is already gone.
df_train = df_train.drop(columns='status', errors='ignore')
df_val = df_val.drop(columns='status', errors='ignore')
df_test = df_test.drop(columns='status', errors='ignore')

In [86]:
# Display the training features (the 'status' column is dropped in the preceding cell).
df_train

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,2,parents,36,21,single,no,freelance,35,221.0,0.0,0.0,500,650
1,3,parents,60,23,single,no,fixed,45,113.0,4000.0,0.0,900,1105
2,10,parents,12,37,married,no,fixed,90,250.0,3000.0,0.0,240,1488
3,20,private,48,38,married,no,fixed,60,57.0,2500.0,0.0,900,1246
4,4,rent,60,37,single,yes,fixed,79,436.0,0.0,0.0,1500,1550
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2667,1,parents,30,22,single,no,fixed,35,80.0,4000.0,0.0,600,850
2668,2,rent,60,28,single,no,fixed,43,81.0,0.0,0.0,1000,1135
2669,0,parents,36,24,single,no,freelance,45,0.0,250000.0,0.0,700,1214
2670,0,other,36,31,married,no,fixed,75,144.0,0.0,0.0,450,500
