# Loan Risk analysis using Decision Tree

In [1]:
import pandas as pd
import numpy as np
import json
import math
print('Modules imported')

Modules imported


In [5]:
#Importing Dataset
loans=pd.read_csv(r'lending-club-data.csv')
loans.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,sub_grade_num,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,0.4,1.0,1.0,1.0,0,8.1435,20141201T000000,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,0.8,1.0,1.0,1.0,1,2.3932,20161201T000000,1,1,1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,1.0,0,8.25955,20141201T000000,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,0.2,1.0,1.0,1.0,0,8.27585,20141201T000000,0,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.9,156.46,A,A4,...,0.8,1.0,1.0,1.0,0,5.21533,20141201T000000,1,1,1


In [7]:
#Extracting label feature of safe or unsafe loan from 'bad loans' column of dataset
loans['safe_loans']=loans['bad_loans'].apply(lambda x:+1 if x==0 else -1)
loans.drop('bad_loans',axis=1)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,delinq_2yrs_zero,pub_rec_zero,collections_12_mths_zero,short_emp,payment_inc_ratio,final_d,last_delinq_none,last_record_none,last_major_derog_none,safe_loans
0,1077501,1296599,5000,5000,4975,36 months,10.65,162.87,B,B2,...,1.0,1.0,1.0,0,8.143500,20141201T000000,1,1,1,1
1,1077430,1314167,2500,2500,2500,60 months,15.27,59.83,C,C4,...,1.0,1.0,1.0,1,2.393200,20161201T000000,1,1,1,-1
2,1077175,1313524,2400,2400,2400,36 months,15.96,84.33,C,C5,...,1.0,1.0,1.0,0,8.259550,20141201T000000,1,1,1,1
3,1076863,1277178,10000,10000,10000,36 months,13.49,339.31,C,C1,...,1.0,1.0,1.0,0,8.275850,20141201T000000,0,1,1,1
4,1075269,1311441,5000,5000,5000,36 months,7.90,156.46,A,A4,...,1.0,1.0,1.0,0,5.215330,20141201T000000,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122602,9856168,11708132,6000,6000,6000,60 months,23.40,170.53,E,E5,...,0.0,1.0,1.0,1,4.487630,20190101T000000,0,1,0,-1
122603,9795013,11647121,15250,15250,15250,36 months,17.57,548.05,D,D2,...,0.0,0.0,1.0,0,10.117800,20170101T000000,0,0,0,1
122604,9695736,11547808,8525,8525,8525,60 months,18.25,217.65,D,D3,...,0.0,1.0,1.0,0,6.958120,20190101T000000,0,1,0,-1
122605,9684700,11536848,22000,22000,22000,60 months,19.97,582.50,D,D5,...,1.0,0.0,1.0,0,8.961540,20190101T000000,1,0,1,-1


In [9]:
#Extracting few columns from dataset for ease of work
features=['grade','term','home_ownership','emp_length_num']
target=['safe_loans']

loans=loans[features+target]
loans.head()

Unnamed: 0,grade,term,home_ownership,emp_length_num,safe_loans
0,B,36 months,RENT,11,1
1,C,60 months,RENT,1,-1
2,C,36 months,RENT,11,1
3,C,36 months,RENT,11,1
4,A,36 months,RENT,4,1


# Handling imbalance in dataset

In [21]:
#We use the simple approach of undersampling the larger class(safe_loans) in order to balance out our dataset.
safe_loans_raw = loans.loc[loans['safe_loans'] == 1]
risky_loans_raw = loans.loc[loans['safe_loans'] == -1]

# Since there are less risky loans than safe loans, we find the ratio of the sizes and use that percentage to undersample the safe loans.
percentage = float(len(risky_loans_raw))/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(frac=percentage,replace=False, random_state = 1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print("Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data)))
print("Percentage of risky loans                :", len(risky_loans) / float(len(loans_data)))
print("Total number of loans in our new dataset :", len(loans_data))

Percentage of safe loans                 : 0.5
Percentage of risky loans                : 0.5
Total number of loans in our new dataset : 46300


# Train Validation split

In [39]:
loans_data.head()
train_data=loans_data.sample(frac=0.8)
valid_data=loans_data.drop(train_data.index)

#indexing the dataframes
idx_tr=list(range(len(train_data)))
train_data['idx_tr']=idx_tr
train_data.set_index(['idx_tr'],inplace=True)

idx_valid=list(range(len(valid_data)))
valid_data['idx_valid']=idx_valid
valid_data.set_index(['idx_valid'],inplace=True)

# Decision Tree Functions

In [40]:
#Creating Dataframes to arrange data where required 
def select_data(data,feature,value_of_the_attri):
    raw_select=data[data[feature]==value_of_the_attri]
    tmp_idx=list(range(len(raw_select)))
    raw_select['idx']=tmp_idx
    raw_select.set_index(['idx'],inplace=True)
    return raw_select

# Selecting Entropy as measure of node impurity

In [41]:
#Calculation of ENTROPY
def intermediate_node_Entropy(labels_in_node):
    if len(labels_in_node)==0:
        return 0
    label_idx=list(range(len(labels_in_node)))
    labels=pd.DataFrame(labels_in_node,index=label_idx)
    safe=0
    risky=0
    denom=len(labels)
    for j in range(len(labels)):
        if labels.iloc[j,-1]==+1:
            safe=safe+1
        elif labels.iloc[j,-1]==-1:
            risky=risky+1
    
    p1=safe/denom
    p0=risky/denom
    if p1!=0 and p0!=0:
        entropy=-(p1*math.log2(p1))-(p0*math.log2(p0))
        return entropy
    else:
        return 0

#Calculation of GAIN
def Gain(feature_entropy,len_data,sub_feature_lengths,sub_feature_entropies):
    tmp4=0
    for feat_attri in sub_feature_lengths.keys():
        tmp4=(sub_feature_lengths[feat_attri]/len_data)*sub_feature_entropies[feat_attri]
    gain=feature_entropy-tmp4
    return gain


In [45]:
#Early stopping condition 2
def crossed_min_node_size(data,min_node_size):
    if len(data)<=min_node_size:
        return True
    else: return False

In [43]:
#Finding best feature to split on
def best_splitting_feature(data,features,target):
    best_feature=None
    best_gain=-1
    ent_feat=intermediate_node_Entropy(data[target])
    feat_gains={}
    
    for feature in features:
        
        sub_features_ent={}
        sub_features_len={}
        
        for feat_attri in data[feature].unique():
            sub_features_ent[feat_attri]=intermediate_node_Entropy(select_data(data,feature,feat_attri)[target])
            sub_features_len[feat_attri]=len(select_data(data,feature,feat_attri))
            
        feat_gains[feature]=Gain(ent_feat,len(data),sub_features_len,sub_features_ent)
        
    for key,value in feat_gains.items():
        if value>best_gain:
            best_gain=value
            best_feature=key
    return best_feature,best_gain


In [46]:
#Creating leaf node
def create_leaf(target_values):
    leaf={'splitting_feature':None,'left':None,'right':None,'is_leaf':True}
    num_ones=len(target_values[target_values==1])
    num_minus_ones=len(target_values[target_values==-1])
    if num_ones>num_minus_ones:
        leaf['prediction']='safe'
    elif num_minus_ones>num_ones:
        leaf['prediction']= 'unsafe'
    else:
        leaf['prediction']='unsafe'
    return leaf

# Creating Decision Tree

In [47]:
def decision_tree_create(data,features,target,current_depth=0,max_depth=10, min_node_size=1):
    
    remaining_features=features[:]
    target_values=data[target]
    
    print('____________________________________________________________________________________________')
    print("Subtree, depth=%s(%s datapoints)" %(current_depth,len(target_values)))
    
    #Stopping condition 1, if there are no mistakes at this node
    tmp3=intermediate_node_Entropy(target_values.tolist())
    if tmp3==0:
        print("Stopping condition 1 no more mistakes reached")
        return create_leaf(target_values)
    
    #Stopping condition 2, if there are no more features to split on
    if remaining_features==[]:
        print("Stopping condition 2 no more features reached")
        return create_leaf(target_values)
    
    #Early stopping condition 1, max depth reached
    if current_depth>=max_depth:
        print("Early stopping condition 1 max depth reached")
        return create_leaf(target_values)
    
    #Early stopping condition 2:
    if crossed_min_node_size(data,min_node_size):
        print("Early stopping condition 2 minimum node size reached")
        return create_leaf(target_values)    
    
    #Otherwise, split
    print('split_feat ongoing')
    print("Remaining features are",remaining_features)
    split_feat,split_gain=best_splitting_feature(data,features,target)
    print("splitting feature is %s and its gain is %s" %(split_feat,split_gain))
    
    feature_values={}
    for value in data[split_feat].unique():
        feature_values[value]=select_data(data,split_feat,value)
    
    remaining_features.remove(split_feat)
    print("SPlit on feature %s; of lengths:" %(split_feat))
    for key in feature_values.keys():
        print("length of branch on value %s is %s" %(key,len(feature_values[key])))
        
    #Create node if split is perfect
    for key in feature_values.keys():
        if len(feature_values[key])==len(data):
             print("Creating leaf node as data is completely inclined to", key)
             return create_leaf(feature_values[key][target])
        
    #Creating tree branches through recursion
    print("recursing")
    DT_split={}
    for key in feature_values.keys():
        DT_split[key]=decision_tree_create(feature_values[key],remaining_features,target,current_depth+1,max_depth)
    
    return {'is_leaf':False,'prediction':None,'splitting_feature':split_feat,'branch':DT_split}


In [48]:
my_decision_tree_new=decision_tree_create(train_data,train_data.columns.tolist()[:-1],'safe_loans',0,6,100)


____________________________________________________________________________________________
Subtree, depth=0(37040 datapoints)
split_feat ongoing
Remaining features are ['grade', 'term', 'home_ownership', 'emp_length_num']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


splitting feature is home_ownership and its gain is 0.9983842141830634
SPlit on feature home_ownership; of lengths:
length of branch on value MORTGAGE is 17072
length of branch on value RENT is 16809
length of branch on value OWN is 3099
length of branch on value OTHER is 60
recursing
____________________________________________________________________________________________
Subtree, depth=1(17072 datapoints)
split_feat ongoing
Remaining features are ['grade', 'term', 'emp_length_num']
splitting feature is grade and its gain is 0.98564976662538
SPlit on feature grade; of lengths:
length of branch on value D is 2722
length of branch on value B is 4781
length of branch on value C is 4258
length of branch on value A is 2809
length of branch on value E is 1549
length of branch on value F is 733
length of branch on value G is 220
recursing
____________________________________________________________________________________________
Subtree, depth=2(2722 datapoints)
split_feat ongoing
Remain

SPlit on feature term; of lengths:
length of branch on value  60 months is 40
length of branch on value  36 months is 75
recursing
____________________________________________________________________________________________
Subtree, depth=4(40 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(75 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=2(4781 datapoints)
split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is emp_length_num and its gain is 0.9381066455645585
SPlit on feature emp_length_num; of lengths:
length of branch on value 8 is 313
length of branch on value 1 is 289
length of branch on value 5 is 305
length of branch on value 9 is 204
length of branch on value 7 is 286
length of branch on value 3 is 378
lengt

splitting feature is emp_length_num and its gain is 0.9426692889156355
SPlit on feature emp_length_num; of lengths:
length of branch on value 8 is 254
length of branch on value 0 is 161
length of branch on value 11 is 1528
length of branch on value 9 is 205
length of branch on value 7 is 285
length of branch on value 6 is 343
length of branch on value 4 is 285
length of branch on value 5 is 236
length of branch on value 10 is 206
length of branch on value 2 is 220
length of branch on value 3 is 291
length of branch on value 1 is 244
recursing
____________________________________________________________________________________________
Subtree, depth=3(254 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.3150053058526072
SPlit on feature term; of lengths:
length of branch on value  60 months is 79
length of branch on value  36 months is 175
recursing
_________________________________________________________________________________

Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(163 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=2(2809 datapoints)
split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is term and its gain is 0.725841631820507
SPlit on feature term; of lengths:
length of branch on value  36 months is 2713
length of branch on value  60 months is 96
recursing
____________________________________________________________________________________________
Subtree, depth=3(2713 datapoints)
split_feat ongoing
Remaining features are ['emp_length_num']
splitting feature is emp_length_num and its gain is 0.6948478653473461
SPlit on feature emp_length_num; of lengths:
length of branch on value 0 is 113
length of branch on value 7 is 170
length of branch on

splitting feature is term and its gain is 0.6213844959926926
SPlit on feature term; of lengths:
length of branch on value  60 months is 78
length of branch on value  36 months is 31
recursing
____________________________________________________________________________________________
Subtree, depth=4(78 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(31 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=3(133 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.6995858063957843
SPlit on feature term; of lengths:
length of branch on value  60 months is 99
length of branch on value  36 months is 34
recursing
____________________________________________________________________________________________
Subt

SPlit on feature term; of lengths:
length of branch on value  36 months is 10
length of branch on value  60 months is 39
recursing
____________________________________________________________________________________________
Subtree, depth=4(10 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(39 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=2(220 datapoints)
split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is emp_length_num and its gain is 0.8244381401496109
SPlit on feature emp_length_num; of lengths:
length of branch on value 10 is 4
length of branch on value 8 is 15
length of branch on value 7 is 18
length of branch on value 6 is 12
length of branch on value 11 is 84
length of branch on value 1 is 10
length of b

split_feat ongoing
Remaining features are ['grade', 'term', 'emp_length_num']
splitting feature is grade and its gain is 0.9881163179618385
SPlit on feature grade; of lengths:
length of branch on value A is 1992
length of branch on value C is 4360
length of branch on value B is 4533
length of branch on value D is 3353
length of branch on value E is 1623
length of branch on value F is 763
length of branch on value G is 185
recursing
____________________________________________________________________________________________
Subtree, depth=2(1992 datapoints)
split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is term and its gain is 0.831398751225316
SPlit on feature term; of lengths:
length of branch on value  36 months is 1960
length of branch on value  60 months is 32
recursing
____________________________________________________________________________________________
Subtree, depth=3(1960 datapoints)
split_feat ongoing
Remaining features are ['emp_

Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(166 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=3(149 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.8127804039634506
SPlit on feature term; of lengths:
length of branch on value  36 months is 137
length of branch on value  60 months is 12
recursing
____________________________________________________________________________________________
Subtree, depth=4(137 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(12 datapoints)
Stopping condition 2 no more features reached
_____________________________________________________________________

split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is emp_length_num and its gain is 0.9203932159634667
SPlit on feature emp_length_num; of lengths:
length of branch on value 3 is 399
length of branch on value 1 is 372
length of branch on value 7 is 200
length of branch on value 2 is 314
length of branch on value 4 is 323
length of branch on value 8 is 163
length of branch on value 11 is 672
length of branch on value 5 is 268
length of branch on value 9 is 147
length of branch on value 6 is 283
length of branch on value 0 is 111
length of branch on value 10 is 101
recursing
____________________________________________________________________________________________
Subtree, depth=3(399 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.8479438720284114
SPlit on feature term; of lengths:
length of branch on value  36 months is 336
length of branch on value  60 months is 63
recursing
_____________

splitting feature is emp_length_num and its gain is 0.8571396689710742
SPlit on feature emp_length_num; of lengths:
length of branch on value 11 is 341
length of branch on value 9 is 60
length of branch on value 8 is 88
length of branch on value 6 is 127
length of branch on value 4 is 143
length of branch on value 3 is 184
length of branch on value 1 is 207
length of branch on value 7 is 101
length of branch on value 5 is 125
length of branch on value 2 is 147
length of branch on value 0 is 49
length of branch on value 10 is 51
recursing
____________________________________________________________________________________________
Subtree, depth=3(341 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.4881741115620033
SPlit on feature term; of lengths:
length of branch on value  60 months is 208
length of branch on value  36 months is 133
recursing
_____________________________________________________________________________________

SPlit on feature emp_length_num; of lengths:
length of branch on value 11 is 176
length of branch on value 3 is 90
length of branch on value 5 is 65
length of branch on value 2 is 52
length of branch on value 7 is 44
length of branch on value 6 is 56
length of branch on value 8 is 48
length of branch on value 4 is 72
length of branch on value 9 is 34
length of branch on value 0 is 24
length of branch on value 1 is 78
length of branch on value 10 is 24
recursing
____________________________________________________________________________________________
Subtree, depth=3(176 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.6893486607092406
SPlit on feature term; of lengths:
length of branch on value  60 months is 154
length of branch on value  36 months is 22
recursing
____________________________________________________________________________________________
Subtree, depth=4(154 datapoints)
Stopping condition 2 no more features 

____________________________________________________________________________________________
Subtree, depth=4(22 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=3(24 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.0
SPlit on feature term; of lengths:
length of branch on value  60 months is 24
Creating leaf node as data is completely inclined to  60 months
____________________________________________________________________________________________
Subtree, depth=2(185 datapoints)
split_feat ongoing
Remaining features are ['term', 'emp_length_num']
splitting feature is emp_length_num and its gain is 0.755718356658901
SPlit on feature emp_length_num; of lengths:
length of branch on value 5 is 11
length of branch on value 11 is 51
length of branch on value 3 is 17
length of branch on value 2 is 23
length of branch o

splitting feature is emp_length_num and its gain is 0.9373708364012239
SPlit on feature emp_length_num; of lengths:
length of branch on value 11 is 867
length of branch on value 4 is 243
length of branch on value 3 is 261
length of branch on value 7 is 182
length of branch on value 9 is 127
length of branch on value 0 is 252
length of branch on value 2 is 187
length of branch on value 8 is 177
length of branch on value 6 is 242
length of branch on value 10 is 106
length of branch on value 1 is 261
length of branch on value 5 is 194
recursing
____________________________________________________________________________________________
Subtree, depth=2(867 datapoints)
split_feat ongoing
Remaining features are ['grade', 'term']
splitting feature is grade and its gain is 0.9585935950231171
SPlit on feature grade; of lengths:
length of branch on value G is 17
length of branch on value B is 227
length of branch on value D is 156
length of branch on value E is 79
length of branch on value C is

SPlit on feature term; of lengths:
length of branch on value  60 months is 12
length of branch on value  36 months is 1
recursing
____________________________________________________________________________________________
Subtree, depth=4(12 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(1 datapoints)
Stopping condition 1 no more mistakes reached
____________________________________________________________________________________________
Subtree, depth=3(61 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.871932869529625
SPlit on feature term; of lengths:
length of branch on value  36 months is 49
length of branch on value  60 months is 12
recursing
____________________________________________________________________________________________
Subtree, depth=4(49 datapoints)
Stopping condition 2 no more features

splitting feature is term and its gain is 0.6292492238560345
SPlit on feature term; of lengths:
length of branch on value  36 months is 18
length of branch on value  60 months is 1
recursing
____________________________________________________________________________________________
Subtree, depth=4(18 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(1 datapoints)
Stopping condition 1 no more mistakes reached
____________________________________________________________________________________________
Subtree, depth=3(11 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.6990199844009082
SPlit on feature term; of lengths:
length of branch on value  60 months is 7
length of branch on value  36 months is 4
recursing
____________________________________________________________________________________________
Subtree, 

splitting feature is term and its gain is 0.8843437937362254
SPlit on feature term; of lengths:
length of branch on value  36 months is 41
length of branch on value  60 months is 4
recursing
____________________________________________________________________________________________
Subtree, depth=4(41 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(4 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=3(40 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.5708516725326996
SPlit on feature term; of lengths:
length of branch on value  36 months is 22
length of branch on value  60 months is 18
recursing
____________________________________________________________________________________________
Subtree

splitting feature is grade and its gain is 0.9991421039919088
SPlit on feature grade; of lengths:
length of branch on value D is 57
length of branch on value E is 29
length of branch on value A is 34
length of branch on value F is 8
length of branch on value B is 61
length of branch on value C is 71
length of branch on value G is 1
recursing
____________________________________________________________________________________________
Subtree, depth=3(57 datapoints)
split_feat ongoing
Remaining features are ['term']
splitting feature is term and its gain is 0.21106012178153388
SPlit on feature term; of lengths:
length of branch on value  60 months is 12
length of branch on value  36 months is 45
recursing
____________________________________________________________________________________________
Subtree, depth=4(12 datapoints)
Stopping condition 2 no more features reached
____________________________________________________________________________________________
Subtree, depth=4(45 dat

splitting feature is emp_length_num and its gain is 0.9634582986483032
SPlit on feature emp_length_num; of lengths:
length of branch on value 11 is 17
length of branch on value 1 is 8
length of branch on value 0 is 2
length of branch on value 10 is 1
length of branch on value 7 is 3
length of branch on value 2 is 6
length of branch on value 4 is 10
length of branch on value 9 is 1
length of branch on value 3 is 4
length of branch on value 5 is 3
length of branch on value 6 is 3
length of branch on value 8 is 2
recursing
____________________________________________________________________________________________
Subtree, depth=2(17 datapoints)
split_feat ongoing
Remaining features are ['grade', 'term']
splitting feature is grade and its gain is 0.9774178175281716
SPlit on feature grade; of lengths:
length of branch on value G is 1
length of branch on value B is 9
length of branch on value A is 1
length of branch on value C is 5
length of branch on value D is 1
recursing
________________

In [50]:
#Classifying a data point as safe or unsafe Loan
def classify(tree,x,annotate=False):
    if tree['is_leaf']:
        if annotate:
            print("At leaf, prediciting: %s" %tree['prediction'])
        return tree['prediction']
    else:
        split_branch=x[tree['splitting_feature']]
        if annotate:
            print("Split on %s=%s" %(tree['splitting_feature'],split_branch))
        
        return classify(tree['branch'][split_branch],x,annotate)

print("\n For an example, validation point 1 is\n", valid_data.iloc[1])
print("\n\nIts predicted class using the Decision Tree is", classify(my_decision_tree_new,valid_data.iloc[1]),"\n\n")
print("\nIts predicted class using the Decision Tree with anotation is", classify(my_decision_tree_new,valid_data.iloc[1],True),"\n\n")



 For an example, validation point 1 is
 grade                      D
term               36 months
home_ownership          RENT
emp_length_num             6
safe_loans                -1
Name: 1, dtype: object


Its predicted class using the Decision Tree is unsafe 


Split on home_ownership=RENT
Split on grade=D
Split on emp_length_num=6
Split on term= 36 months
At leaf, prediciting: unsafe

Its predicted class using the Decision Tree with anotation is unsafe 




# Classification Error

In [51]:
#Classification error of the trained Decision Tree
def classification_error(tree,data):
    #classifying each row in data
    predictions=np.array([])
    unknowns=np.array([])
    
    for x in range(len(data)):
        try:
            classify(tree,data.iloc[x,:])
        except(KeyError):
            unknowns=np.append(unknowns,[x],axis=0)
            print("Key Error, unclassified value detected.")
        else:
            predictions=np.append(predictions,[classify(tree,data.iloc[x,:])],axis=0)
            
    print("List of indices where unclassified value detected is",unknowns)
        
    incorrect=0
    truths=np.array(data['safe_loans'])
    truths=np.delete(truths,[unknowns.reshape([1,len(unknowns)]).astype(int)])
    for j in range(len(truths)):
        
        if j not in unknowns:
            if predictions[j]=='safe' and truths[j]!=1:
                incorrect=incorrect+1
            elif predictions[j]=='unsafe' and truths[j]!=-1:
                incorrect=incorrect+1
        elif j in unknowns:
            continue
    #print("Classifiction error is", incorrect/len(predictions))
    return incorrect/len(predictions)


In [52]:
error_tree_valid=classification_error(my_decision_tree_new,valid_data)
print("Accuracy of the decision tree on validation set is %s" %(1-error_tree_valid))
error_tree_train=classification_error(my_decision_tree_new,train_data)
print("Accuracy of the decision tree on training set is %s" %(1-error_tree_train))

Key Error, unclassified value detected.
Key Error, unclassified value detected.
List of indices where unclassified value detected is [1012. 5785.]
Accuracy of the decision tree on validation set is 0.6085547634478289
List of indices where unclassified value detected is []
Accuracy of the decision tree on training set is 0.6255399568034556


In [53]:
#Conclusion

"Other impurtiy measures such as Gini Index and Misclassification Error could also be used. Other methods for imbalanced dataset"
"handling can also be implemnted."

"This program shows how to implement a Decision Tree from scratch and the working behind it."

'This program shows how to implement a Decision Tree from scratch and the working behind it.'