# Chapter 6 - Decision Tree Model

## Project: Credit Risk Score.

In [12]:
# Importing Python Packages. 

# Data Manipulation. 
import pandas as pd 
import numpy as np

# Data Visualization Packages. 
import seaborn as sns 
import matplotlib.pyplot as plt

# Data Gathering Package. 
import wget 

# Data preparation packages.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Other
%matplotlib inline

## Data Gathering.

In [13]:
# Data url link
url = 'https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv'

# Downloading data with wget.
# wget.download(url, 'data/credit_scoring.csv')

In [14]:
# Loading dataset.
credit_score = pd.read_csv('data/credit_scoring.csv')

# Viewing 
credit_score.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [15]:
# Changing header to small cap.
credit_score.columns = credit_score.columns.str.replace(' ', '_').str.lower()

# Checking data columns names.
credit_score.columns

Index(['status', 'seniority', 'home', 'time', 'age', 'marital', 'records',
       'job', 'expenses', 'income', 'assets', 'debt', 'amount', 'price'],
      dtype='object')

In [16]:
credit_score.status.value_counts()

1    3200
2    1254
0       1
Name: status, dtype: int64

In [17]:
# Replacing numbers with values.
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

credit_score.status = credit_score.status.map(status_values)

# Check header. 
credit_score.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,1,60,30,2,1,3,73,129,0,0,800,846
1,ok,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,default,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,ok,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,ok,0,1,36,26,1,1,1,46,107,0,0,310,910


In [18]:
# Converting Values to str values. 
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignorfe',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

credit_score.home = credit_score.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

credit_score.marital = credit_score.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

credit_score.records = credit_score.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

credit_score.job = credit_score.job.map(job_values)

In [19]:
# Checking result.
credit_score.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [23]:
# Statistic overview.
credit_score.describe()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,7.987205,46.441751,37.077666,55.568799,763317.0,1060341.0,404382.0,1039.021773,1462.875645
std,8.173444,14.655225,10.984856,19.515878,8703625.0,10217570.0,6344253.0,474.543007,628.089913
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.5
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,100000000.0,100000000.0,100000000.0,5000.0,11140.0


In [28]:
# Removing Outlier with None. 

# Removing outliers for Debt column. 
credit_score.debt.replace(99999999, np.nan, inplace = True)

# Removing outlier for assets column.
credit_score.debt.replace(99999999, np.nan, inplace = T)

# Removing outlier for income column.
credit_score.income.replace(99999999, np.nan)

# Veiwing changes. 
credit_score[['income', 'debt', 'assets']].max()

income    99999999
debt      99999999
assets    99999999
dtype: int64