In [27]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.dpi']=200

In [3]:
# reading the csv file
df = pd.read_csv(r"D:\College\Academics\SEM 4\New Generation Database\Datasets\credit.csv")

In [4]:
# displaying the dataset
display(df.head(),df.tail(),df.sample(5))

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
995,unknown,12,good,furniture/appliances,1736,< 100 DM,4 - 7 years,3,4,31,none,own,1,unskilled,1,no,no
996,< 0 DM,30,good,car,3857,< 100 DM,1 - 4 years,4,4,40,none,own,1,management,1,yes,no
997,unknown,12,good,furniture/appliances,804,< 100 DM,> 7 years,4,4,38,none,own,1,skilled,1,no,no
998,< 0 DM,45,good,furniture/appliances,1845,< 100 DM,1 - 4 years,4,4,23,none,other,1,skilled,1,yes,yes
999,1 - 200 DM,45,critical,car,4576,100 - 500 DM,unemployed,3,4,27,none,own,1,skilled,1,no,no


Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
939,unknown,24,critical,car,6842,unknown,1 - 4 years,2,4,46,none,own,2,management,2,yes,no
616,1 - 200 DM,60,poor,furniture/appliances,9157,unknown,1 - 4 years,2,2,27,none,other,1,management,1,no,no
605,< 0 DM,24,very good,furniture/appliances,2828,500 - 1000 DM,1 - 4 years,4,4,22,store,own,1,skilled,1,yes,no
598,unknown,18,critical,car,2775,< 100 DM,4 - 7 years,2,2,31,bank,own,2,skilled,1,no,yes
658,1 - 200 DM,30,perfect,business,4221,< 100 DM,1 - 4 years,2,1,28,none,own,2,skilled,1,no,no


In [5]:
# checking the shape of the data
df.shape

(1000, 17)

There are **1000** rows and **17** columns in the dataset

In [6]:
# checkinhg the summary statistics
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


### Inference:

- There are **7** numerical atttributes

- `months_loan_duration ranges` from **4** to **72**, that means the loan duration can range from __4 months__ to up to __6 years__ and the average loan duration is __20 months__.

- `amount` ranges from __250__ to __18424__ and the average amount is around __3000__

- `percent_of_income` ranges from __1%__ to __4%__ average percent of income being around __3%__

- `years_at_residence` ranges from __1__ to __4__ years and the average year of residence is around __3__

- `age` ranges from  __19 years__ to **75 years** and the average age is __35 years__

- `existing_loan_count` ranges from __1__ to __4__ and the average existing loan count is __1__ 

- `dependents` ranges from __1__ to __2__ and the average amount of dependent is __1__

In [7]:
# checking dataframe information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

### Inference
There are total 17 column
from which, 7 are numerical and 10 are objects

In [8]:
# viewing the data
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


On futher inspection, we can see the object variable are not all nominal variable, it also contains ordinal categories, so preprocessing needs to be done

In [9]:
# converting the object columns into Categorical datatype
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = pd.Categorical(df[i])

In [10]:
# checking that dataframe to validate changes
df.dtypes

checking_balance        category
months_loan_duration       int64
credit_history          category
purpose                 category
amount                     int64
savings_balance         category
employment_duration     category
percent_of_income          int64
years_at_residence         int64
age                        int64
other_credit            category
housing                 category
existing_loans_count       int64
job                     category
dependents                 int64
phone                   category
default                 category
dtype: object

In [11]:
# creating a variable for category columns
cat_cols = ['checking_balance',
             'credit_history',
             'purpose',
             'savings_balance',
             'employment_duration',
             'other_credit',
             'housing',
             'job',
             'phone',
             'default']

In [12]:
# running value count check for all the categorical columns
for i in df[cat_cols]:
    print("==="*15)
    print(i)
    print("--"*5)
    print(df[i].value_counts())

checking_balance
----------
checking_balance
unknown       394
< 0 DM        274
1 - 200 DM    269
> 200 DM       63
Name: count, dtype: int64
credit_history
----------
credit_history
good         530
critical     293
poor          88
very good     49
perfect       40
Name: count, dtype: int64
purpose
----------
purpose
furniture/appliances    473
car                     337
business                 97
education                59
renovations              22
car0                     12
Name: count, dtype: int64
savings_balance
----------
savings_balance
< 100 DM         603
unknown          183
100 - 500 DM     103
500 - 1000 DM     63
> 1000 DM         48
Name: count, dtype: int64
employment_duration
----------
employment_duration
1 - 4 years    339
> 7 years      253
4 - 7 years    174
< 1 year       172
unemployed      62
Name: count, dtype: int64
other_credit
----------
other_credit
none     814
bank     139
store     47
Name: count, dtype: int64
housing
----------
housing
own      

All the categorical attributes and their's value counts for each distinct value, using that infomation, performing pre processing

# Preprocessing: Turning categorical columns into ordinal values

In [13]:
# getting the keyvalue pair dictionary
d = {}
for i in ["checking_balance",
          "credit_history",
          "savings_balance",
          "employment_duration",
          "phone","default"]:
    d[i] = df[i].value_counts().to_dict()

In [14]:
# making the categorical columns into ordinal
pp = {
 'checking_balance': {'unknown': 0,
                      '< 0 DM': 1,
                      '1 - 200 DM': 2,
                      '> 200 DM': 3},
 'credit_history': {'good': 2,
                      'critical': 0,
                      'poor': 1,
                      'very good': 3,
                      'perfect': 4},
 'savings_balance': {'< 100 DM': 1,
                      'unknown': 0,
                      '100 - 500 DM': 2,
                      '500 - 1000 DM': 3,
                      '> 1000 DM': 4},
 'employment_duration': {'1 - 4 years': 2,
                      '> 7 years': 4,
                      '4 - 7 years': 3,
                      '< 1 year': 1,
                      'unemployed': 0},
            'phone': {'no': 0,
                       'yes': 1},
            'default': {'no': 0,
                        'yes': 1}
}

In [16]:
# making a list of all the columns for dummies
obj = ['purpose','housing','other_credit','job']

df = df.replace(pp)
df = pd.get_dummies(df, columns= obj)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   checking_balance              1000 non-null   category
 1   months_loan_duration          1000 non-null   int64   
 2   credit_history                1000 non-null   category
 3   amount                        1000 non-null   int64   
 4   savings_balance               1000 non-null   category
 5   employment_duration           1000 non-null   category
 6   percent_of_income             1000 non-null   int64   
 7   years_at_residence            1000 non-null   int64   
 8   age                           1000 non-null   int64   
 9   existing_loans_count          1000 non-null   int64   
 10  dependents                    1000 non-null   int64   
 11  phone                         1000 non-null   category
 12  default                       1000 non-null   cat

# Splitting the data

In [19]:
# creating independent variable
X = df.drop("default",axis = 1)
# creating dependent variable
y = df['default']

In [20]:
# creating a train and test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state= 23)

# Building Decision Tree

In [23]:
# creating the model object and fitting the variable
tree = DecisionTreeClassifier(criterion='gini', random_state=23)
tree.fit(X_train, y_train)

In [26]:
print("Train Data Accuracy:",tree.score(X_train, y_train))
print("Test Data Accuracy:",tree.score(X_test, y_test))

Train Data Accuracy: 1.0
Test Data Accuracy: 0.6933333333333334


# Identifying Hyper-parameters: Pre-pruning

In [28]:
md = [3, 5, 10, 25]
ml = [15, 30, 70]
ms = [15, 30, 70]
model_performance_data = []

In [None]:
for i in md:
    for j in ml:
        for k in ms: