In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
data = 'https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv'

In [3]:
!wget $data

--2023-10-23 19:31:47--  https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182489 (178K) [text/plain]
Saving to: ‘CreditScoring.csv.2’


2023-10-23 19:31:47 (1.08 MB/s) - ‘CreditScoring.csv.2’ saved [182489/182489]



<h1>Data Preparation & Cleaning</h1>

In [4]:
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


In [5]:
df = pd.read_csv(data)

In [6]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [7]:
df.columns.str.lower()

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [9]:
df.columns = df.columns.str.lower()

In [10]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [11]:
df.status = df.status.map({1: 'ok', 2: 'default', 0: 'unk'})

In [12]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [13]:
status_values = {
    'ok': 1,
    'default': 0,
    'unk': 0        
}   

df.status = df.status.map(status_values)

In [14]:
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'      
}

df.home = df.home.map(home_values)

In [15]:
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'      
}

df.marital = df.marital.map(marital_values)

In [16]:
records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'      
}

df.records = df.records.map(records_values)

In [17]:
job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'      
}

df.job = df.job.map(job_values)

In [18]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,1,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,0,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,1,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,1,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [19]:
df.describe().round()

Unnamed: 0,status,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,1.0,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,0.0,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,0.0,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,1.0,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,1.0,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,1.0,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [20]:
for c in ['income', 'assets', 'debt']:
    df[c].replace(to_replace=99999999, value=np.nan, inplace=True)

In [21]:
df.describe().round()

Unnamed: 0,status,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,1.0,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,0.0,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,0.0,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,1.0,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,1.0,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,1.0,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [22]:
df.status.value_counts()

1    3200
0    1255
Name: status, dtype: int64

In [23]:
df[df.status != 'unk'].reset_index(drop=True)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,rent,60,30,married,no,freelance,73,129.0,0.0,0.0,800,846
1,1,17,rent,60,58,widow,no,fixed,48,131.0,0.0,0.0,1000,1658
2,0,10,owner,36,46,married,yes,freelance,90,200.0,3000.0,0.0,2000,2985
3,1,0,rent,60,24,single,no,fixed,63,182.0,2500.0,0.0,900,1325
4,1,0,rent,36,26,single,no,fixed,46,107.0,0.0,0.0,310,910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,0,1,rent,60,39,married,no,fixed,69,92.0,0.0,0.0,900,1020
4451,1,22,owner,60,46,married,no,fixed,60,75.0,3000.0,600.0,950,1263
4452,0,0,owner,24,37,married,no,partime,60,90.0,3500.0,0.0,500,963
4453,1,0,rent,48,23,single,no,freelance,49,140.0,0.0,0.0,550,550


In [24]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [25]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [26]:
y_train = (df_train.status == 'default').astype('int').values
y_val = (df_val.status == 'default').astype('int').values
y_test = (df_test.status == 'default').astype('int').values



In [27]:
del df_train['status']
del df_val['status']
del df_test['status']

In [28]:
df_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,3,rent,36,61,married,no,fixed,41,57.0,12000.0,0.0,2500,3559
1,5,rent,48,30,separated,yes,partime,39,41.0,0.0,0.0,1300,1600
2,15,owner,60,32,divorced,no,freelance,35,0.0,6000.0,3300.0,1550,1612
3,2,rent,12,47,married,yes,fixed,103,426.0,0.0,0.0,350,450
4,6,parents,48,32,single,yes,fixed,35,85.0,0.0,0.0,1100,1330


<h1>Decision Trees</h1>

In [29]:
def assess_risk(client):
    if client['records'] == 'yes':
        if client['job'] == 'parttime':
            return 'default'
        else:
            return 'ok'
    else:
        if client['assets'] > 6000:
            return 'ok'
        else:
            return 'default'
        

In [30]:
xi = df_train.iloc[0].to_dict()

In [31]:
xi

{'seniority': 3,
 'home': 'rent',
 'time': 36,
 'age': 61,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 41,
 'income': 57.0,
 'assets': 12000.0,
 'debt': 0.0,
 'amount': 2500,
 'price': 3559}

In [32]:
assess_risk(xi)

'ok'

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score



In [34]:
train_dicts = df_train.fillna(0).to_dict(orient='records')

In [35]:
train_dicts[0]

{'seniority': 3,
 'home': 'rent',
 'time': 36,
 'age': 61,
 'marital': 'married',
 'records': 'no',
 'job': 'fixed',
 'expenses': 41,
 'income': 57.0,
 'assets': 12000.0,
 'debt': 0.0,
 'amount': 2500,
 'price': 3559}

In [36]:
dv = DictVectorizer(sparse=False)

In [37]:
x_train = dv.fit_transform(train_dicts)

In [38]:
x_train[0]

array([6.100e+01, 2.500e+03, 1.200e+04, 0.000e+00, 4.100e+01, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       5.700e+01, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 3.559e+03,
       1.000e+00, 0.000e+00, 3.000e+00, 3.600e+01])

In [39]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [40]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [41]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
x_val = dv.transform(val_dicts)

In [43]:
y_pred = dt.predict_proba(x_val)[:, 0]

In [47]:
unique_values = np.unique(y_val)

if len(unique_values) == 1:
    minority_class = unique_values[0]
    minority_indices = np.where(y_train == minority_class)[0]
    num_examples_to_add = min(10, len(minority_indices))  # Adjust the number as needed
    selected_indices = np.random.choice(minority_indices, size=num_examples_to_add, replace=False)

    x_val = np.concatenate([x_val, X_train[selected_indices]])
    y_val = np.concatenate([y_val, y_train[selected_indices]])
