# Chapter 6 - Decision Tree Model

## Project: Credit Risk Score.

In [22]:
# Importing Python Packages. 

# Data Manipulation. 
import pandas as pd 
import numpy as np

# Data Visualization Packages. 
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Gathering Package. 
import wget 

# Data preparation packages.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import DecisionTreeClassifier

# Other
from IPython.display import display 
%matplotlib inline

ImportError: cannot import name 'DecisionTreeClassifier' from 'sklearn.model_selection' (C:\Users\User1\anaconda3\envs\ml-zoomcamp\lib\site-packages\sklearn\model_selection\__init__.py)

In [21]:
sklearn.model_selection?

Object `sklearn.model_selection` not found.


## Data Gathering.

In [2]:
# Data url link
url = 'https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv'

# Downloading data with wget.
# wget.download(url, 'data/credit_scoring.csv')

In [3]:
# Loading dataset.
credit_score = pd.read_csv('data/credit_scoring.csv')

# Viewing 
credit_score.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


## Data Cleaning & Preparation. 

In [4]:
# Changing header to small cap.
credit_score.columns = credit_score.columns.str.replace(' ', '_').str.lower()

# Checking data columns names.
credit_score.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [5]:
credit_score.status.value_counts()

1    3200
2    1254
0       1
Name: status, dtype: int64

In [6]:
# Replacing numbers with values.
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

credit_score.status = credit_score.status.map(status_values)

# Check header. 
credit_score.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [7]:
# Converting Values to str values. 
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignorfe',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

credit_score.home = credit_score.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

credit_score.marital = credit_score.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

credit_score.records = credit_score.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

credit_score.job = credit_score.job.map(job_values)

In [8]:
# Checking result.
credit_score.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [9]:
# Statistic overview.
credit_score.describe()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,7.987205,46.441751,37.077666,55.568799,763317.0,1060341.0,404382.0,1039.021773,1462.875645
std,8.173444,14.655225,10.984856,19.515878,8703625.0,10217570.0,6344253.0,474.543007,628.089913
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.5
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,100000000.0,100000000.0,100000000.0,5000.0,11140.0


In [10]:
# Removing Outlier with None. 

# Removing outliers for Debt column. 
credit_score.debt.replace(99999999, np.nan, inplace = True)

# Removing outlier for assets column.
credit_score.assets.replace(99999999, np.nan, inplace = True)

# Removing outlier for income column.
credit_score.income.replace(99999999, np.nan, inplace = True)

# Veiwing changes. 
credit_score[['income', 'debt', 'assets']].max()

income       959.0
debt       30000.0
assets    300000.0
dtype: float64

In [11]:
# Converting data to dict.
credit_dict = credit_score.to_dict(orient = 'records')

# Check result.
credit_dict

[{'status': 'ok',
  'seniority': 9,
  'home': 'rent',
  'time': 60,
  'age': 30,
  'marital': 'married',
  'records': 'no',
  'job': 'freelance',
  'expenses': 73,
  'income': 129.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 800,
  'price': 846},
 {'status': 'ok',
  'seniority': 17,
  'home': 'rent',
  'time': 60,
  'age': 58,
  'marital': 'widow',
  'records': 'no',
  'job': 'fixed',
  'expenses': 48,
  'income': 131.0,
  'assets': 0.0,
  'debt': 0.0,
  'amount': 1000,
  'price': 1658},
 {'status': 'default',
  'seniority': 10,
  'home': 'owner',
  'time': 36,
  'age': 46,
  'marital': 'married',
  'records': 'yes',
  'job': 'freelance',
  'expenses': 90,
  'income': 200.0,
  'assets': 3000.0,
  'debt': 0.0,
  'amount': 2000,
  'price': 2985},
 {'status': 'ok',
  'seniority': 0,
  'home': 'rent',
  'time': 60,
  'age': 24,
  'marital': 'single',
  'records': 'no',
  'job': 'fixed',
  'expenses': 63,
  'income': 182.0,
  'assets': 2500.0,
  'debt': 0.0,
  'amount': 900,
  'price': 13

In [17]:
# Splitting dataset. 
# Full_Train & Test Dataset.
credit_train_full, credit_test = train_test_split(credit_score, test_size= 0.2, random_state = 10)

# Train & Validation Dataset.
credit_train, credit_val = train_test_split(credit_train_full, test_size = 0.25, random_state = 10)

# Reset index values. 
credit_train_full = credit_train_full.reset_index(drop=True)
credit_test = credit_test.reset_index(drop = True)
credit_train = credit_train.reset_index(drop = True)
credit_val = credit_val.reset_index(drop = True)

# Extract Dependent feature.
y_train = (credit_train['status']== 'ok').astype(int).values
y_test = (credit_test['status'] == 'ok').astype(int).values
y_val = (credit_val['status'] == 'ok').astype(int).values

# Delete dependent columns from dataset. 
del credit_train['status']
del credit_test['status']
del credit_val['status']

# Show all dataset. 
display(credit_train)
display(credit_test)
display(credit_val)

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,32,owner,36,56,married,no,fixed,60,152.0,7000.0,0.0,1000,2950
1,11,rent,60,38,married,no,fixed,82,71.0,0.0,0.0,900,977
2,8,rent,36,36,married,no,fixed,78,179.0,0.0,0.0,900,1503
3,3,owner,48,26,single,no,fixed,35,157.0,7000.0,0.0,1200,2053
4,0,owner,18,20,single,no,partime,35,180.0,4000.0,0.0,700,960
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2668,1,rent,36,47,married,yes,freelance,105,0.0,0.0,0.0,500,500
2669,2,owner,60,23,married,no,fixed,45,78.0,5000.0,6300.0,1075,1123
2670,6,owner,48,33,married,no,freelance,45,78.0,3500.0,0.0,1100,1526
2671,8,owner,24,33,married,no,fixed,75,315.0,4500.0,0.0,400,1780


Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,3,parents,18,42,married,no,freelance,75,276.0,1500.0,0.0,750,1758
1,3,parents,60,30,single,no,fixed,35,86.0,0.0,0.0,750,829
2,1,parents,60,36,married,no,partime,60,60.0,0.0,0.0,1400,1612
3,1,other,36,28,single,no,freelance,35,100.0,0.0,0.0,1950,2074
4,30,owner,24,57,married,no,freelance,45,40.0,17000.0,0.0,400,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,15,owner,60,32,divorced,no,freelance,35,0.0,6000.0,3300.0,1550,1612
887,3,parents,24,25,single,yes,fixed,35,128.0,0.0,0.0,300,1397
888,0,owner,48,44,married,no,partime,75,120.0,4000.0,0.0,900,1204
889,6,rent,24,28,married,no,fixed,62,166.0,0.0,0.0,500,1150


Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,0,parents,60,23,single,no,partime,35,205.0,0.0,0.0,1400,1426
1,0,owner,60,30,married,yes,fixed,75,255.0,4000.0,0.0,1500,1749
2,25,rent,60,52,married,no,fixed,73,127.0,0.0,0.0,1030,1250
3,22,other,24,36,married,no,fixed,45,176.0,0.0,0.0,700,1428
4,18,owner,48,32,married,no,fixed,75,71.0,4000.0,39.0,660,1461
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,7,parents,60,33,single,no,partime,35,70.0,0.0,0.0,1300,1438
887,8,owner,36,53,married,no,fixed,45,254.0,6000.0,0.0,350,575
888,3,parents,60,39,married,no,fixed,45,77.0,0.0,0.0,1000,1094
889,1,parents,24,29,single,no,partime,35,147.0,0.0,0.0,650,1370


## Training Model: Decision Tree.