# Feature Engineering 

In this notebook we create some custom features that should help with the classification task.

In [1]:
import pandas as pd 
import numpy as np 

# Read the cleaned training data from disk into the notebook-wide `train` frame.
train = pd.read_csv(
    filepath_or_buffer='ML_Artivatic_dataset/df_train_clean.csv',
)
# Trailing expression so the notebook renders the loaded frame.
train

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,grade,sub_grade,emp_length,home_ownership,...,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,verification_status_joint,last_week_pay,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
0,58189336,14350,14350,14350.0,36,19.19,4,22,9,4,...,0.0,74.0,0,0,26,0.0,0.0,28699.0,30800.0,0
1,70011223,4800,4800,4800.0,36,10.99,1,8,10,1,...,0.0,44.0,0,0,9,0.0,0.0,9974.0,32900.0,0
2,70255675,10000,10000,10000.0,36,7.26,0,3,2,4,...,0.0,44.0,0,0,9,0.0,65.0,38295.0,34900.0,0
3,1893936,15000,15000,15000.0,36,19.72,3,19,1,5,...,0.0,44.0,0,0,135,0.0,0.0,55564.0,24700.0,0
4,7652106,16000,16000,16000.0,36,10.64,1,6,1,5,...,0.0,44.0,0,0,96,0.0,0.0,47159.0,47033.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532423,31296187,20000,20000,20000.0,36,12.49,1,9,1,1,...,0.0,44.0,0,0,65,0.0,0.0,83087.0,24800.0,0
532424,29403184,12000,12000,12000.0,60,14.99,2,14,1,1,...,0.0,44.0,0,0,70,0.0,0.0,227812.0,17800.0,0
532425,7357607,18725,18725,18725.0,60,20.80,4,20,8,5,...,0.0,44.0,0,0,9,0.0,0.0,26010.0,24200.0,1
532426,23182668,21000,21000,21000.0,60,16.29,3,16,0,5,...,0.0,44.0,0,0,78,0.0,0.0,29197.0,23300.0,0


How big the loan is relative to the borrower's earnings: the ratio of annual income to the loan amount (`annual_inc / funded_amnt_inv`).

In [2]:
# Ratio of 'annual_inc' to 'funded_amnt_inv'. Note the column is named
# 'loan_to_income' although the value computed is income divided by loan amount.
# NOTE(review): a zero 'funded_amnt_inv' would yield inf/NaN here - confirm none exist.
train['loan_to_income'] = train['annual_inc'].div(train['funded_amnt_inv'])


All these attributes indicate that the repayment did not go smoothly. All the amounts calculated are ratios, 
such as recovery to the loan amount. This column gives a magnitude of how far the repayment has gone off course, in terms of ratios.

In [3]:
# Express each repayment-trouble amount as a fraction of 'funded_amnt_inv'.
# NOTE(review): a zero 'funded_amnt_inv' would yield inf/NaN ratios - confirm none exist.
funded = train['funded_amnt_inv']
late_fee_ratio = train['total_rec_late_fee'].div(funded)
recovery_ratio = train['recoveries'].div(funded)
recovery_fee_ratio = train['collection_recovery_fee'].div(funded)
collections_ratio = train['collections_12_mths_ex_med'].div(funded)

# 'acc_now_delinq' is added as-is (not scaled); the ratios are accumulated
# on top of it one after another.
train['bad_state'] = (
    train['acc_now_delinq']
    .add(late_fee_ratio)
    .add(recovery_ratio)
    .add(recovery_fee_ratio)
    .add(collections_ratio)
)

# For the sake of this model only a flag is kept: every strictly positive
# score is overwritten with 1, since the raw magnitude computed above did not
# show a benefit. Rows that are not > 0 are left untouched.
train.loc[train['bad_state'].gt(0), 'bad_state'] = 1

Total number of available/unused 'credit lines'

In [4]:
# Difference between 'total_acc' and 'open_acc', stored as 'avl_lines'.
train['avl_lines'] = train['total_acc'].sub(train['open_acc'])

Interest paid so far (received interest plus received late fees)

In [5]:
# Sum of 'total_rec_int' and 'total_rec_late_fee'; late fees are included
# in this figure alongside the interest.
train['int_paid'] = train['total_rec_int'].add(train['total_rec_late_fee'])

Calculating the share of EMIs paid so far, as a percentage of the loan term expressed in weeks

In [6]:
# Denominator: 'term' divided by 12, times 52, plus 1 (the code treats 'term'
# as a month count and converts it to weeks).
term_weeks = train['term'].div(12).mul(52).add(1)

# 'last_week_pay' as a percentage of that week count.
train['emi_paid_progress_perc'] = train['last_week_pay'].div(term_weeks).mul(100)

Calculating total repayments received so far, in terms of EMI or recoveries after charge off

In [7]:
# Same percentage formula as 'emi_paid_progress_perc':
# 'last_week_pay' over (term / 12 * 52 + 1), times 100.
emi_share = train['last_week_pay'].div(train['term'].div(12).mul(52).add(1)).mul(100)

# 'recoveries' as a percentage of 'funded_amnt_inv'.
# NOTE(review): a zero 'funded_amnt_inv' would yield inf/NaN here - confirm none exist.
recovered_share = train['recoveries'].div(train['funded_amnt_inv']).mul(100)

# Total progress is the sum of the two percentages.
train['total_repayment_progress'] = emi_share.add(recovered_share)

In [8]:
# Display `train` again so the newly added feature columns are visible.
train 

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,grade,sub_grade,emp_length,home_ownership,...,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status,loan_to_income,bad_state,avl_lines,int_paid,emi_paid_progress_perc,total_repayment_progress
0,58189336,14350,14350,14350.0,36,19.19,4,22,9,4,...,0.0,28699.0,30800.0,0,2.000000,0.0,14.0,1173.84,16.560510,16.560510
1,70011223,4800,4800,4800.0,36,10.99,1,8,10,1,...,0.0,9974.0,32900.0,0,13.541667,0.0,7.0,83.95,5.732484,5.732484
2,70255675,10000,10000,10000.0,36,7.26,0,3,2,4,...,65.0,38295.0,34900.0,0,4.500000,0.0,14.0,56.47,5.732484,5.732484
3,1893936,15000,15000,15000.0,36,19.72,3,19,1,5,...,0.0,55564.0,24700.0,0,7.000000,0.0,11.0,4858.62,85.987261,85.987261
4,7652106,16000,16000,16000.0,36,10.64,1,6,1,5,...,0.0,47159.0,47033.0,0,3.250000,0.0,16.0,2296.41,61.146497,61.146497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532423,31296187,20000,20000,20000.0,36,12.49,1,9,1,1,...,0.0,83087.0,24800.0,0,3.750000,0.0,22.0,2595.45,41.401274,41.401274
532424,29403184,12000,12000,12000.0,60,14.99,2,14,1,1,...,0.0,227812.0,17800.0,0,4.916667,0.0,19.0,2182.92,26.819923,26.819923
532425,7357607,18725,18725,18725.0,60,20.80,4,20,8,5,...,0.0,26010.0,24200.0,1,2.269907,0.0,12.0,645.32,3.448276,3.448276
532426,23182668,21000,21000,21000.0,60,16.29,3,16,0,5,...,0.0,29197.0,23300.0,0,2.380952,0.0,7.0,4619.79,29.885057,29.885057


In [10]:
# so the new features generated are -
# Engineered columns added to `train` in the cells above.
features_new = [
    'loan_to_income',
    'bad_state',
    'avl_lines',
    'int_paid',
    'emi_paid_progress_perc',
    'total_repayment_progress',
]

# Columns that the earlier feature-selection step found useful.
features_selected = [
    'revol_bal', 'total_rev_hi_lim', 'collection_recovery_fee', 'delinq_2yrs',
    'revol_util', 'pub_rec', 'recoveries', 'open_acc',
    'inq_last_6mths', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
    'term', 'dti', 'last_week_pay', 'grade',
    'purpose', 'home_ownership', 'verification_status', 'initial_list_status',
]

# Final column order: identifier first, then selected and engineered
# features, with the 'loan_status' target last.
tot_features = ['member_id', *features_selected, *features_new, 'loan_status']

# Keep only those columns, in that order, and display the result.
df = train[tot_features]
df

Unnamed: 0,member_id,revol_bal,total_rev_hi_lim,collection_recovery_fee,delinq_2yrs,revol_util,pub_rec,recoveries,open_acc,inq_last_6mths,...,home_ownership,verification_status,initial_list_status,loan_to_income,bad_state,avl_lines,int_paid,emi_paid_progress_perc,total_repayment_progress,loan_status
0,58189336,22515.0,30800.0,0.0,0.0,73.1,1.0,0.0,14.0,1.0,...,4,1,0,2.000000,0.0,14.0,1173.84,16.560510,16.560510,0
1,70011223,7624.0,32900.0,0.0,0.0,23.2,0.0,0.0,6.0,1.0,...,1,1,1,13.541667,0.0,7.0,83.95,5.732484,5.732484,0
2,70255675,10877.0,34900.0,0.0,0.0,31.2,0.0,0.0,5.0,0.0,...,4,0,1,4.500000,0.0,14.0,56.47,5.732484,5.732484,0
3,1893936,13712.0,24700.0,0.0,0.0,55.5,0.0,0.0,10.0,2.0,...,5,0,0,7.000000,0.0,11.0,4858.62,85.987261,85.987261,0
4,7652106,35835.0,47033.0,0.0,0.0,76.2,0.0,0.0,11.0,0.0,...,5,2,1,3.250000,0.0,16.0,2296.41,61.146497,61.146497,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532423,31296187,15775.0,24800.0,0.0,0.0,63.6,1.0,0.0,12.0,0.0,...,1,1,0,3.750000,0.0,22.0,2595.45,41.401274,41.401274,0
532424,29403184,9453.0,17800.0,0.0,0.0,53.1,1.0,0.0,10.0,0.0,...,1,0,1,4.916667,0.0,19.0,2182.92,26.819923,26.819923,0
532425,7357607,12085.0,24200.0,0.0,0.0,49.9,0.0,0.0,14.0,1.0,...,5,2,0,2.269907,0.0,12.0,645.32,3.448276,3.448276,1
532426,23182668,20902.0,23300.0,0.0,0.0,89.7,0.0,0.0,7.0,1.0,...,5,1,1,2.380952,0.0,7.0,4619.79,29.885057,29.885057,0


In [11]:
# Write the selected columns to disk; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='ML_Artivatic_dataset/final_train.csv',
    index=False,
)
