In [25]:
import pandas as pd
import numpy as np
import time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [26]:
# Load the captured financial records from CSV into a DataFrame and preview them
data = pd.read_csv(filepath_or_buffer='Financial Data.csv')
data.head(n=5)

Unnamed: 0,entry_id,age,pay_schedule,home_owner,income,months_employed,years_employed,current_address_year,personal_account_m,personal_account_y,...,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,e_signed
0,7629673,40,bi-weekly,1,3135,0,3,3,6,2,...,550,36200,0.737398,0.903517,0.487712,0.515977,0.580918,0.380918,10,1
1,3560428,61,weekly,0,3180,0,6,3,2,7,...,600,30150,0.73851,0.881027,0.713423,0.826402,0.73072,0.63072,9,0
2,6934997,23,weekly,0,1540,6,0,0,7,1,...,450,34550,0.642993,0.766554,0.595018,0.762284,0.531712,0.531712,7,0
3,5682812,40,bi-weekly,0,5230,0,6,1,2,7,...,700,42150,0.665224,0.960832,0.767828,0.778831,0.792552,0.592552,8,1
4,5335819,33,semi-monthly,0,3590,0,5,2,2,8,...,1100,53850,0.617361,0.85756,0.613487,0.665523,0.744634,0.744634,12,0


## Feature Engineering

In [27]:
# EDA showed that months_employed does not carry accurate information (it looks
# faulty), so the feature is removed before modelling
data = data.drop(columns=['months_employed'])
data.head(n=5)

Unnamed: 0,entry_id,age,pay_schedule,home_owner,income,years_employed,current_address_year,personal_account_m,personal_account_y,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,e_signed
0,7629673,40,bi-weekly,1,3135,3,3,6,2,1,550,36200,0.737398,0.903517,0.487712,0.515977,0.580918,0.380918,10,1
1,3560428,61,weekly,0,3180,6,3,2,7,1,600,30150,0.73851,0.881027,0.713423,0.826402,0.73072,0.63072,9,0
2,6934997,23,weekly,0,1540,0,0,7,1,1,450,34550,0.642993,0.766554,0.595018,0.762284,0.531712,0.531712,7,0
3,5682812,40,bi-weekly,0,5230,6,1,2,7,1,700,42150,0.665224,0.960832,0.767828,0.778831,0.792552,0.592552,8,1
4,5335819,33,semi-monthly,0,3590,5,2,2,8,1,1100,53850,0.617361,0.85756,0.613487,0.665523,0.744634,0.744634,12,0


In [28]:
# Personal Account tenure information is given in 2 separate columns, which defines the information is scattered.
# We will create a derived column which will contain the total month information of the personal account
data['Personal_Account_Month'] = (data['personal_account_m'] + (data['personal_account_m']*12))

# Evaluating the new column createdT
data[['Personal_Account_Month','personal_account_m','personal_account_m']].head()

Unnamed: 0,Personal_Account_Month,personal_account_m,personal_account_m.1
0,78,6,6
1,26,2,2
2,91,7,7
3,26,2,2
4,26,2,2


In [29]:
# Removing the columns 'personal_account_m' and 'personal_account_m'
data = data.drop(['personal_account_m','personal_account_m'], axis = 1)
data.head()

Unnamed: 0,entry_id,age,pay_schedule,home_owner,income,years_employed,current_address_year,personal_account_y,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,e_signed,Personal_Account_Month
0,7629673,40,bi-weekly,1,3135,3,3,2,1,550,36200,0.737398,0.903517,0.487712,0.515977,0.580918,0.380918,10,1,78
1,3560428,61,weekly,0,3180,6,3,7,1,600,30150,0.73851,0.881027,0.713423,0.826402,0.73072,0.63072,9,0,26
2,6934997,23,weekly,0,1540,0,0,1,1,450,34550,0.642993,0.766554,0.595018,0.762284,0.531712,0.531712,7,0,91
3,5682812,40,bi-weekly,0,5230,6,1,7,1,700,42150,0.665224,0.960832,0.767828,0.778831,0.792552,0.592552,8,1,26
4,5335819,33,semi-monthly,0,3590,5,2,8,1,1100,53850,0.617361,0.85756,0.613487,0.665523,0.744634,0.744634,12,0,26


### Treating the categorical variables

In [30]:
# One-hot encode the categorical columns; every dummy level is kept at this
# stage (drop_first=False) and the resulting column list is shown
data = pd.get_dummies(data=data, drop_first=False)
data.columns

Index(['entry_id', 'age', 'home_owner', 'income', 'years_employed',
       'current_address_year', 'personal_account_y', 'has_debt',
       'amount_requested', 'risk_score', 'risk_score_2', 'risk_score_3',
       'risk_score_4', 'risk_score_5', 'ext_quality_score',
       'ext_quality_score_2', 'inquiries_last_month', 'e_signed',
       'Personal_Account_Month', 'pay_schedule_bi-weekly',
       'pay_schedule_monthly', 'pay_schedule_semi-monthly',
       'pay_schedule_weekly'],
      dtype='object')

In [31]:
# Keeping every one-hot level risks the dummy variable trap, so one of the
# generated columns is removed.
# pay_schedule_semi-monthly is the level chosen for removal, as it is considered
# the one that provides the least information.
data = data.drop(columns=['pay_schedule_semi-monthly'])
data.head(n=5)

Unnamed: 0,entry_id,age,home_owner,income,years_employed,current_address_year,personal_account_y,has_debt,amount_requested,risk_score,...,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,e_signed,Personal_Account_Month,pay_schedule_bi-weekly,pay_schedule_monthly,pay_schedule_weekly
0,7629673,40,1,3135,3,3,2,1,550,36200,...,0.487712,0.515977,0.580918,0.380918,10,1,78,1,0,0
1,3560428,61,0,3180,6,3,7,1,600,30150,...,0.713423,0.826402,0.73072,0.63072,9,0,26,0,0,1
2,6934997,23,0,1540,0,0,1,1,450,34550,...,0.595018,0.762284,0.531712,0.531712,7,0,91,0,0,1
3,5682812,40,0,5230,6,1,7,1,700,42150,...,0.767828,0.778831,0.792552,0.592552,8,1,26,1,0,0
4,5335819,33,0,3590,5,2,8,1,1100,53850,...,0.613487,0.665523,0.744634,0.744634,12,0,26,0,0,0


In [32]:
# Split off the columns that must not be part of the training features:
#   - entry_id is unique per customer, so it adds no useful signal for the model
#   - e_signed is the dependent variable we want to predict
# Both are kept in their own variables before being removed from the frame.

user_id = data['entry_id']
target = data['e_signed']
data = data.drop(columns=['e_signed', 'entry_id'])
data.head(n=5)

Unnamed: 0,age,home_owner,income,years_employed,current_address_year,personal_account_y,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,Personal_Account_Month,pay_schedule_bi-weekly,pay_schedule_monthly,pay_schedule_weekly
0,40,1,3135,3,3,2,1,550,36200,0.737398,0.903517,0.487712,0.515977,0.580918,0.380918,10,78,1,0,0
1,61,0,3180,6,3,7,1,600,30150,0.73851,0.881027,0.713423,0.826402,0.73072,0.63072,9,26,0,0,1
2,23,0,1540,0,0,1,1,450,34550,0.642993,0.766554,0.595018,0.762284,0.531712,0.531712,7,91,0,0,1
3,40,0,5230,6,1,7,1,700,42150,0.665224,0.960832,0.767828,0.778831,0.792552,0.592552,8,26,1,0,0
4,33,0,3590,5,2,8,1,1100,53850,0.617361,0.85756,0.613487,0.665523,0.744634,0.744634,12,26,0,0,0


In [33]:
# Hold out 20% of the rows as test data; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=0,
)

# Report the shape of each of the four resulting pieces, one per line
print(
    f"Shape of X train is : {X_train.shape}",
    f"Shape of X test is : {X_test.shape}",
    f"Shape of y train is : {y_train.shape}",
    f"Shape of y test is : {y_test.shape}",
    sep="\n",
)

Shape of X train is : (14326, 20)
Shape of X test is : (3582, 20)
Shape of y train is : (14326,)
Shape of y test is : (3582,)


In [34]:
# The features live on very different scales, so they are standardised with
# StandardScaler. The scaler is fitted on the training rows only and the same
# fitted parameters are then applied to the test rows.
scale = StandardScaler()
scale.fit(X_train)

# transform() returns a plain array, so rebuild DataFrames with the original
# column labels and row index
train_scaled = scale.transform(X_train)
test_scaled = scale.transform(X_test)
X_train = pd.DataFrame(train_scaled, columns=X_train.columns.values, index=X_train.index.values)
X_test = pd.DataFrame(test_scaled, columns=X_test.columns.values, index=X_test.index.values)

In [35]:
# Preview the first rows of the scaled training features
X_train.head(n=5)

Unnamed: 0,age,home_owner,income,years_employed,current_address_year,personal_account_y,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,Personal_Account_Month,pay_schedule_bi-weekly,pay_schedule_monthly,pay_schedule_weekly
5957,-0.005991,-0.856349,-0.242358,-1.567782,-0.573557,0.254942,0.507058,-0.64258,-1.905162,-1.190501,-0.766112,0.979893,-0.22938,-1.060059,-1.057386,1.784494,1.162481,-1.224638,-0.30236,1.976827
16854,-0.50989,-0.856349,-0.052592,0.206559,-0.93797,-0.260265,0.507058,-0.785621,-0.749184,1.254553,0.607772,-0.396208,0.823356,0.041801,-0.672086,-0.663369,-0.190159,-1.224638,3.307315,-0.505861
4429,-0.76184,1.167748,-0.412149,-1.124197,-0.93797,0.254942,-1.972161,-0.64258,-1.437575,0.53259,-1.485619,-0.08686,-0.775633,0.588358,-0.84155,1.240525,-0.641039,0.816568,-0.30236,-0.505861
13607,1.421722,1.167748,-0.638537,0.650144,2.341745,2.31577,0.507058,-0.785621,-0.680994,-0.64246,1.696278,-1.25298,-1.027524,1.532749,0.817422,-0.119399,0.260721,0.816568,-0.30236,-0.505861
3750,-1.601671,-0.856349,-1.041374,1.093729,2.341745,-0.260265,0.507058,-0.499539,0.780214,-0.248764,-0.315136,-0.910402,0.225551,0.108871,1.541402,-0.935353,1.162481,0.816568,-0.30236,-0.505861


In [36]:
# Preview the first rows of the scaled test features
X_test.head(n=5)

Unnamed: 0,age,home_owner,income,years_employed,current_address_year,personal_account_y,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,Personal_Account_Month,pay_schedule_bi-weekly,pay_schedule_monthly,pay_schedule_weekly
3629,-1.013789,-0.856349,-0.242358,-1.124197,-1.302383,-0.260265,0.507058,0.358706,-0.944012,0.548408,-1.07635,0.059153,0.326152,0.314517,-0.399633,0.696555,-0.641039,-1.224638,-0.30236,-0.505861
1820,-1.013789,-0.856349,-0.335577,-0.237027,-1.302383,0.254942,-1.972161,-0.213457,0.767226,-0.024644,-0.047016,-0.029979,0.641494,-0.781613,-0.063715,-1.207338,-0.641039,0.816568,-0.30236,-0.505861
6685,0.245958,-0.856349,-0.598586,2.86807,0.884094,-0.775472,0.507058,0.215665,0.757484,-1.051686,-0.521003,-1.033452,-1.29393,-0.505566,-0.503429,-0.391384,-0.641039,-1.224638,-0.30236,-0.505861
17241,0.329941,-0.856349,0.237051,-0.237027,2.341745,2.31577,0.507058,2.718881,1.264037,-0.268713,-0.313492,-0.381701,1.167423,-0.616575,0.816657,-0.119399,0.260721,-1.224638,-0.30236,1.976827
8332,-0.50989,1.167748,3.357207,-0.237027,-0.93797,-1.290679,0.507058,2.790402,1.189353,-0.224014,1.102305,0.23754,0.350924,1.006524,1.0072,-0.663369,-0.641039,-1.224638,-0.30236,-0.505861


In [42]:
# Display the test-set labels (e_signed) produced by the train/test split
y_test

3629     1
1820     1
6685     0
17241    1
8332     1
        ..
7546     1
9836     1
7446     1
9526     1
13946    1
Name: e_signed, Length: 3582, dtype: int64

In [44]:
# Persist the four splits to separate CSV files for the model-building step.
# The row index is not written; all columns of each frame are exported.
exports = {
    'xtrain.csv': X_train,
    'xtest.csv': X_test,
    'ytrain.csv': y_train,
    'ytest.csv': y_test,
}
for csv_name, split in exports.items():
    split.to_csv(csv_name, index=False)