In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for the yeo-johnson transformation
import scipy.stats as stats
import configparser
import klib

import warnings
warnings.filterwarnings('ignore')

In [2]:
from Adjuct_Memory import memoryManager

In [3]:
config = configparser.ConfigParser()
config.read('config.ini')

['config.ini']

| Variable Name	| Description	| Type
| ---	| ---	| ---
|SeriousDlqin2yrs|	Person experienced 90 days past due delinquency or worse 	|Y/N
RevolvingUtilizationOfUnsecuredLines|	Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits|	percentage
age|	Age of borrower in years	|integer
NumberOfTime30-59DaysPastDueNotWorse	|Number of times borrower has been 30-59 days past due but no worse in the last 2 years.	|integer
DebtRatio|	Monthly debt payments, alimony, living costs divided by monthly gross income	|percentage
MonthlyIncome|	Monthly income	|real
NumberOfOpenCreditLinesAndLoans	|Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards)|	integer
NumberOfTimes90DaysLate|	Number of times borrower has been 90 days or more past due.|	integer
NumberRealEstateLoansOrLines	|Number of mortgage and real estate loans including home equity lines of credit	|integer
NumberOfTime60-89DaysPastDueNotWorse|	Number of times borrower has been 60-89 days past due but no worse in the last 2 years.	|integer
NumberOfDependents	|Number of dependents in family excluding themselves (spouse, children etc.)|	integer


In [4]:
# load dataset
# Locations come from config.ini. "/" is used as the separator because it is
# accepted on Windows as well as POSIX systems, whereas "\\" only works on Windows.
train_cfg = config['Training_Data']
test_cfg = config['Testing_Data']
data = pd.read_csv(train_cfg['file_path'] + '/' + train_cfg['file_name'])
data_t = pd.read_csv(test_cfg['file_path'] + '/' + test_cfg['file_name'])
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
# drop 'Unnamed: 0': it is only a running row number carried over from the CSV, not a feature
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data_t.drop(['Unnamed: 0'], axis=1, inplace=True)
data.shape,data_t.shape

((150000, 11), (101503, 11))

In [6]:
train_data = memoryManager.reduce_mem_usage(data)
test_data = memoryManager.reduce_mem_usage(data_t)

Memory usage of dataframe is 12.59 MB
Memory usage after optimization is: 2.72 MB
Decreased by 78.4%
Memory usage of dataframe is 8.52 MB
Memory usage after optimization is: 2.52 MB
Decreased by 70.5%


In [7]:
transformed_data = train_data.copy()

In [8]:
from Preprocessing import Data_Preprocessing

In [None]:
# countplot to evaluate target distribution # SeriousDlqin2yrs
# The column is passed as `x=`: current seaborn treats the first positional
# argument as `data`, so a bare Series is no longer read as the variable to count.
sns.countplot(x=train_data['SeriousDlqin2yrs'])
plt.ylabel('Count of SeriousDlqin2yrs')
plt.xlabel('SeriousDlqin2yrs')
plt.show()

In [None]:
(len(train_data[train_data.SeriousDlqin2yrs==1])/len(train_data))*100

In [None]:
train_data.isna().sum()

> - `MonthlyIncome` and `NumberOfDependents` have null values

In [None]:
train_data.describe().T

> - Except for `age`, all other independent variables are highly skewed

In [None]:
Target = ['SeriousDlqin2yrs']

Percentage = ['RevolvingUtilizationOfUnsecuredLines','DebtRatio']

Real = ['MonthlyIncome']

Numeric_Param = ["NumberOfDependents",
                 "NumberRealEstateLoansOrLines",
                 "NumberOfOpenCreditLinesAndLoans",
                 "age"]

Delay_param = ["NumberOfTime30-59DaysPastDueNotWorse",
               "NumberOfTime60-89DaysPastDueNotWorse",
               "NumberOfTimes90DaysLate"]

In [None]:
# Pairwise correlations of the training columns, computed once and reused below.
corr = train_data.corr()
# Upper-triangle array; only consumed if the commented-out `mask=matrix` is re-enabled.
matrix = np.triu(corr)

plt.figure(figsize=(20, 7))
sns.heatmap(corr, annot=True, cmap="Greens")  # , mask=matrix)

In [None]:
train_data[Delay_param][train_data[Delay_param].apply(lambda x: min(x) != max(x), 1)]['NumberOfTime30-59DaysPastDueNotWorse'].value_counts()

In [None]:
train_data[Delay_param[2]].value_counts()

In [None]:
pd.DataFrame([train_data[Delay_param[0]].value_counts(),train_data[Delay_param[1]].value_counts(),train_data[Delay_param[2]].value_counts()]).T

In [None]:
# Rows whose 30-59 day counter is 96 or 98 are reset to 0 in all three
# past-due counters (presumably placeholder codes rather than real counts -
# TODO confirm against the data dictionary).
# The mask must be captured once, BEFORE any column is overwritten: recomputing
# it after the first assignment matches no rows, so the other columns would be
# left untouched.
placeholder_rows = transformed_data[Delay_param[0]].isin([96, 98])
transformed_data.loc[placeholder_rows, Delay_param] = 0

In [None]:
# Same clean-up as for the training frame: rows whose 30-59 day counter is 96
# or 98 get all three past-due counters reset to 0.
# The mask is captured once, BEFORE any column is overwritten; recomputing it
# after the first assignment would match no rows.
placeholder_rows_test = test_data[Delay_param[0]].isin([96, 98])
test_data.loc[placeholder_rows_test, Delay_param] = 0

In [None]:
transformed_data[transformed_data[Delay_param[0]].isin([96,98])]

In [None]:
96*0.2+96*0.3+96*0.5

In [None]:
transformed_data['Weighted_Delay_sum'] = transformed_data[Delay_param[0]]*0.2+\
                                         transformed_data[Delay_param[1]]*0.3+\
                                         transformed_data[Delay_param[2]]*0.5

In [None]:
test_data['Weighted_Delay_sum'] = test_data[Delay_param[0]]*0.2+\
                                         test_data[Delay_param[1]]*0.3+\
                                         test_data[Delay_param[2]]*0.5

In [None]:
#transformed_data.reset_index(inplace=True)

In [None]:
#transformed_data.drop(Delay_param,axis=1,inplace=True)

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(transformed_data.loc[transformed_data.SeriousDlqin2yrs==0,'Weighted_Delay_sum'], ax=ax[0])
sns.distplot(transformed_data.loc[transformed_data.SeriousDlqin2yrs==1,'Weighted_Delay_sum'], ax=ax[1],color='r')
plt.show()

In [None]:
sns.barplot(x='SeriousDlqin2yrs',y='Weighted_Delay_sum',data=transformed_data)

In [None]:
#seaborn.barplot(x=df.reputation.value_counts().index, y=df.reputation.value_counts())

In [None]:
# fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(15, 15))
# plt.subplots_adjust(hspace=0.5)

# # loop through tickers and axes
# for ticker, ax in zip(day_30, axs.ravel()):
#     sns.barplot(ax=ax,x=train_data.groupby('NumberOfTime30-59DaysPastDueNotWorse').get_group(ticker)['SeriousDlqin2yrs'].value_counts().index, y=train_data.groupby('NumberOfTime30-59DaysPastDueNotWorse').get_group(ticker)['SeriousDlqin2yrs'].value_counts())
#     ax.set_title(str(ticker)+" time")

# plt.show()

In [None]:
# fig, axs = plt.subplots(nrows=4, ncols=3, figsize=(15, 15))
# plt.subplots_adjust(hspace=0.5)

# # loop through tickers and axes
# for ticker, ax in zip(day_60, axs.ravel()):
#     sns.barplot(ax=ax,x=train_data.groupby('NumberOfTime60-89DaysPastDueNotWorse').get_group(ticker)['SeriousDlqin2yrs'].value_counts().index, y=train_data.groupby('NumberOfTime60-89DaysPastDueNotWorse').get_group(ticker)['SeriousDlqin2yrs'].value_counts())
#     ax.set_title(str(ticker)+" time")

# plt.show()

In [None]:
# fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(20, 15))

# # loop through tickers and axes
# for ticker, ax in zip(day_90, axs.ravel()):
#     sns.barplot(ax=ax,x=train_data.groupby('NumberOfTimes90DaysLate').get_group(ticker)['SeriousDlqin2yrs'].value_counts().index, y=train_data.groupby('NumberOfTimes90DaysLate').get_group(ticker)['SeriousDlqin2yrs'].value_counts())
#     ax.set_title(str(ticker)+' time')

# plt.show()

In [None]:
#sns.FacetGrid(train_data, col = 'age', row = 'NumberOfTimes90DaysLate').map(sns.barplot,'SeriousDlqin2yrs')

In [None]:
# for i in train_data['SumOfDefaults'].index:    
#     if train_data['SumOfDefaults'][i]!=0:
#         d_30 = (train_data['NumberOfTime30-59DaysPastDueNotWorse'][i]/train_data['SumOfDefaults'][i])
#         d_60 = (train_data['NumberOfTime60-89DaysPastDueNotWorse'][i]/train_data['SumOfDefaults'][i])
#         d_90 = (train_data['NumberOfTimes90DaysLate'][i]/train_data['SumOfDefaults'][i])
#         sum_d = (d_30*1)+(d_60*2)+(d_90*3)
#         train_data.loc[i,'Weighted_SumOfDefaults'] = sum_d
#     else:
#         train_data.loc[i,'Weighted_SumOfDefaults'] = train_data['SumOfDefaults'][i]

# Function to check Outliers

In [None]:
def mark_outliers_zscore(feature, threshold = 3):
    """Return the index labels of values whose absolute z-score is >= ``threshold``.

    Parameters
    ----------
    feature : pandas.Series (or any 1-D array-like)
        Values to screen. Missing values are ignored when the mean and standard
        deviation are computed and are never reported as outliers.
    threshold : float, default 3
        Minimum absolute z-score for a value to count as an outlier.

    Returns
    -------
    list
        Index labels of the outlying entries, in their original order.
        The number of outliers is also printed.
    """
    values = pd.Series(feature)
    # Work on a float64 array: low-precision columns (e.g. after memory
    # down-casting) can overflow while the mean/std are accumulated.
    # nan_policy='omit' keeps a single NaN from turning every z-score into NaN,
    # which would otherwise report "0 outliers" for columns with missing data.
    z = np.abs(stats.zscore(values.to_numpy(dtype=float), nan_policy='omit'))
    # NaN compares False, so missing entries are simply skipped here.
    res = values.index[z >= threshold].tolist()
    print("Number of outliers : ", len(res))
    return res

# RevolvingUtilizationOfUnsecuredLines

In [None]:
klib.dist_plot(train_data.loc[train_data['SeriousDlqin2yrs']==0,'RevolvingUtilizationOfUnsecuredLines'])
klib.dist_plot(train_data.loc[train_data['SeriousDlqin2yrs']==1,'RevolvingUtilizationOfUnsecuredLines'])
plt.grid()
plt.title("Distribution of RevolvingUtilizationOfUnsecuredLines");

In [None]:
from sklearn.preprocessing import MinMaxScaler
mx = MinMaxScaler()

In [None]:
plt.rcParams["figure.figsize"] = [10.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(2, 1)
sns.boxplot(train_data.loc[train_data['SeriousDlqin2yrs']==0,'RevolvingUtilizationOfUnsecuredLines'], ax=ax[0])
sns.boxplot(train_data.loc[train_data['SeriousDlqin2yrs']==1,'RevolvingUtilizationOfUnsecuredLines'], ax=ax[1],color='r')
plt.show()

In [None]:
train_data.loc[(train_data['SeriousDlqin2yrs']==0)&(train_data['RevolvingUtilizationOfUnsecuredLines']<=1),'RevolvingUtilizationOfUnsecuredLines']

In [None]:
klib.dist_plot(train_data.loc[(train_data['SeriousDlqin2yrs']==0)&(train_data['RevolvingUtilizationOfUnsecuredLines']<=1),'RevolvingUtilizationOfUnsecuredLines'])
klib.dist_plot(train_data.loc[(train_data['SeriousDlqin2yrs']==1)&(train_data['RevolvingUtilizationOfUnsecuredLines']<=1),'RevolvingUtilizationOfUnsecuredLines'])
plt.grid()

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==0)&(train_data['RevolvingUtilizationOfUnsecuredLines']<=1),'RevolvingUtilizationOfUnsecuredLines'], ax=ax[0])
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==1)&(train_data['RevolvingUtilizationOfUnsecuredLines']<=1),'RevolvingUtilizationOfUnsecuredLines'], ax=ax[1],color='r')
plt.show()

In [None]:
print(train_data[train_data.RevolvingUtilizationOfUnsecuredLines>1].shape)
print(train_data[train_data.RevolvingUtilizationOfUnsecuredLines>1]['SeriousDlqin2yrs'].value_counts())
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==0)&(train_data['RevolvingUtilizationOfUnsecuredLines']>1),'RevolvingUtilizationOfUnsecuredLines'], ax=ax[0])
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==1)&(train_data['RevolvingUtilizationOfUnsecuredLines']>1),'RevolvingUtilizationOfUnsecuredLines'], ax=ax[1],color='r')
plt.show()

In [None]:
train_data[train_data.RevolvingUtilizationOfUnsecuredLines>=0.2]['SeriousDlqin2yrs'].value_counts()

In [None]:
train_data[(train_data.RevolvingUtilizationOfUnsecuredLines>0.4)&(train_data.RevolvingUtilizationOfUnsecuredLines<=0.99999)]['SeriousDlqin2yrs'].value_counts()

In [None]:
res = mark_outliers_zscore(train_data.RevolvingUtilizationOfUnsecuredLines)

>- The data contains many outliers: there is a huge gap between the maximum value and the median.
>- Only a few records have `RevolvingUtilizationOfUnsecuredLines` greater than `1`, roughly 2% of the whole data set.
>- Whenever `RevolvingUtilizationOfUnsecuredLines` is more than 8000, the target is always `1`.



# MonthlyIncome and Debtratio

In [None]:
interval = (0,18, 25, 35, 60, 110)
cats = ['Child','Student', 'Young', 'Adult', 'Old']
train_data['age_cat'] = pd.cut(train_data.age, interval, labels=cats)

In [None]:
transformed_data['age_cat'] = pd.cut(transformed_data.age, interval, labels=cats)

In [None]:
test_data['age_cat'] = pd.cut(test_data.age, interval, labels=cats)

In [None]:
res = mark_outliers_zscore(train_data.MonthlyIncome)

In [None]:
res = mark_outliers_zscore(train_data.DebtRatio)

In [None]:
# MonthlyIncome distribution, drawn separately for each target class
klib.dist_plot(train_data.loc[train_data['SeriousDlqin2yrs']==0,'MonthlyIncome'])
klib.dist_plot(train_data.loc[train_data['SeriousDlqin2yrs']==1,'MonthlyIncome'])
plt.grid()
# the title previously named RevolvingUtilizationOfUnsecuredLines, which is not the plotted feature
plt.title("Distribution of MonthlyIncome");

In [None]:
plt.rcParams["figure.figsize"] = [10.00, 3.50]
#plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(2, 1)
sns.boxplot(train_data.loc[train_data['SeriousDlqin2yrs']==0,'MonthlyIncome'], ax=ax[0],color='b')
sns.boxplot(train_data.loc[train_data['SeriousDlqin2yrs']==1,'MonthlyIncome'], ax=ax[1],color='r')
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==0),'MonthlyIncome'], ax=ax[0],color='b',rug=True)
sns.distplot(train_data.loc[(train_data['SeriousDlqin2yrs']==1),'MonthlyIncome'], ax=ax[1],color='r',rug=True)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(np.log(train_data.loc[(train_data['SeriousDlqin2yrs']==0),'MonthlyIncome']+1), ax=ax[0],color='b',rug=True)
sns.distplot(np.log(train_data.loc[(train_data['SeriousDlqin2yrs']==1),'MonthlyIncome']+1), ax=ax[1],color='r',rug=True)
plt.show()

In [None]:
transformed_data['MonthlyIncome_lg'] = np.log(transformed_data['MonthlyIncome']+1)
test_data['MonthlyIncome_lg'] = np.log(test_data['MonthlyIncome']+1)

In [None]:
klib.dist_plot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==0,'MonthlyIncome_lg'])
klib.dist_plot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==1,'MonthlyIncome_lg'])
plt.grid()
plt.title("Distribution of MonthlyIncome_lg");

In [None]:
plt.rcParams["figure.figsize"] = [10.00, 3.50]
#plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(2, 1)
sns.boxplot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==0,'MonthlyIncome_lg'], ax=ax[0],color='b')
sns.boxplot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==1,'MonthlyIncome_lg'], ax=ax[1],color='r')
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [10.00, 3.50]
#plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(2, 1)
sns.boxplot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==0,'MonthlyIncome_lg'], ax=ax[0],color='b')
sns.boxplot(transformed_data.loc[transformed_data['SeriousDlqin2yrs']==1,'MonthlyIncome_lg'], ax=ax[1],color='r')
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(transformed_data.loc[(transformed_data['SeriousDlqin2yrs']==0),'MonthlyIncome_lg'], ax=ax[0],color='b',rug=True)
sns.distplot(transformed_data.loc[(transformed_data['SeriousDlqin2yrs']==1),'MonthlyIncome_lg'], ax=ax[1],color='r',rug=True)
plt.show()

In [None]:
#train_data.drop(train_data[train_data.age==0].index,axis=0,inplace=True)

In [None]:
train_data.loc[train_data.age==0,:]

In [None]:
transformed_data.loc[transformed_data.age==0,:]

In [None]:
test_data.loc[test_data.age==0,:]

In [None]:
train_data.drop(train_data[train_data.age==0].index,axis=0,inplace=True)
transformed_data.drop(transformed_data[transformed_data.age==0].index,axis=0,inplace=True)

In [None]:
print(train_data.loc[:,'SeriousDlqin2yrs'].value_counts())
sns.barplot(x='age_cat',y='MonthlyIncome',data=train_data,hue='SeriousDlqin2yrs')

In [None]:
train_data.loc[train_data.MonthlyIncome<=1]

In [None]:
train_data.loc[(train_data.MonthlyIncome<=1)&(train_data.age_cat=='Student')&(train_data.DebtRatio>1)]

In [None]:
print(train_data.loc[train_data.MonthlyIncome<=1,'SeriousDlqin2yrs'].value_counts())
sns.barplot(x='age_cat',y='MonthlyIncome',data=train_data[train_data.MonthlyIncome<=1],hue='SeriousDlqin2yrs',palette='rainbow')

In [None]:
sns.boxplot(x='age_cat',y='MonthlyIncome',data=train_data[train_data.MonthlyIncome>1],hue='SeriousDlqin2yrs',palette='rainbow')

In [None]:
sns.boxplot(x='age_cat',y='MonthlyIncome',data=train_data[train_data.MonthlyIncome<=1])

> - `MonthlyIncome` is also highly skewed and contains outliers.
> - There are records with `MonthlyIncome` between 0 and 10, for which we don't have an interpretation.
> - For these records the `DebtRatio` is larger, as it is the ratio of debt to income.
> - `DebtRatio` depends on `MonthlyIncome`, so `MonthlyIncome` needs to be cleaned. It also contains null values.
> - Outliers in `MonthlyIncome` can be handled with a `log transform`.

In [None]:
np.log(1), np.exp(8.412796020507812)

In [None]:
transformed_data['MonthlyIncome_lg'].mean()

In [None]:
child = 0 #transformed_data.loc[transformed_data.age_cat=='Child','MonthlyIncome_lg'].median()

In [None]:
student = transformed_data.loc[transformed_data.age_cat=='Student','MonthlyIncome_lg'].median()

In [None]:
young = transformed_data.loc[transformed_data.age_cat=='Young','MonthlyIncome_lg'].median()

In [None]:
adult = transformed_data.loc[transformed_data.age_cat=='Adult','MonthlyIncome_lg'].median()

In [None]:
old = transformed_data.loc[transformed_data.age_cat=='Old','MonthlyIncome_lg'].median()

In [None]:
transformed_data.loc[(transformed_data.age_cat=='Child')&(transformed_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = 0

In [None]:
transformed_data.loc[(transformed_data.age_cat=='Student')&(transformed_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = student

In [None]:
transformed_data.loc[(transformed_data.age_cat=='Young')&(transformed_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = young

In [None]:
transformed_data.loc[(transformed_data.age_cat=='Adult')&(transformed_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = adult

In [None]:
transformed_data.loc[(transformed_data.age_cat=='Old')&(transformed_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = old

In [None]:
# Fill missing MonthlyIncome_lg in the test frame, one value per age bucket.
# NOTE(review): the medians below are computed from test_data itself and reuse
# the names student/young/adult/old, so the training medians computed in the
# earlier cells are overwritten from here on. Confirm that imputing the test
# set with its own statistics (rather than the training medians) is intended.
child = 0 #test_data.loc[test_data.age_cat=='Child','MonthlyIncome_lg'].median()
student = test_data.loc[test_data.age_cat=='Student','MonthlyIncome_lg'].median()
young = test_data.loc[test_data.age_cat=='Young','MonthlyIncome_lg'].median()
adult = test_data.loc[test_data.age_cat=='Adult','MonthlyIncome_lg'].median()
old = test_data.loc[test_data.age_cat=='Old','MonthlyIncome_lg'].median()

# 'Child' rows get a literal 0; every other bucket gets the median computed above.
# Rows whose age_cat is itself missing match none of these masks and stay NaN.
test_data.loc[(test_data.age_cat=='Child')&(test_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = 0
test_data.loc[(test_data.age_cat=='Student')&(test_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = student
test_data.loc[(test_data.age_cat=='Young')&(test_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = young
test_data.loc[(test_data.age_cat=='Adult')&(test_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = adult
test_data.loc[(test_data.age_cat=='Old')&(test_data.MonthlyIncome_lg.isna()),'MonthlyIncome_lg'] = old

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(transformed_data.loc[(transformed_data['SeriousDlqin2yrs']==0),'MonthlyIncome_lg'], ax=ax[0],color='b',rug=True)
sns.distplot(transformed_data.loc[(transformed_data['SeriousDlqin2yrs']==1),'MonthlyIncome_lg'], ax=ax[1],color='r',rug=True)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [15.00, 3.50]
plt.rcParams["figure.autolayout"] = True
fig, ax = plt.subplots(1, 2)
sns.distplot(test_data.loc[:,'MonthlyIncome_lg'], ax=ax[0],color='b',rug=True)
plt.show()

In [None]:
transformed_data.isna().sum()

In [None]:
test_data.isna().sum()

In [None]:
#transformed_data.dropna(subset='age_cat',inplace=True)

# Analyzing DebtRatio

In [None]:
res = mark_outliers_zscore(train_data.DebtRatio)

In [None]:
klib.dist_plot(np.log(train_data.DebtRatio+0.0001))

In [None]:
transformed_data[(transformed_data.index.isin(res))]

In [None]:
train_data[(train_data.index.isin(res))]

> - Out of 659 outliers, 616 records have a null `MonthlyIncome`.
> - For these records, dividing `DebtRatio` by `MonthlyIncome` might adjust the outliers.

In [None]:
transformed_data['DebtRatio_transformed'] = transformed_data['DebtRatio']
test_data['DebtRatio_transformed'] = test_data['DebtRatio']

In [None]:
transformed_data.loc[(transformed_data.index.isin(res)),'DebtRatio_transformed']

In [None]:
def debt(df, outlier_idx=None):
    """Rescale ``DebtRatio_transformed`` of flagged rows by the row's monthly income.

    For every row whose index label is in ``outlier_idx`` the monthly income is
    recovered from ``MonthlyIncome_lg`` as ``exp(value) - 1``. Where that income
    is greater than 1, ``DebtRatio_transformed`` is divided by it; rows with an
    income of 1 or less, or a missing income, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DebtRatio_transformed`` and ``MonthlyIncome_lg``.
        It is modified in place.
    outlier_idx : iterable of index labels, optional
        Rows to process. When omitted, the notebook-level ``res`` list is used,
        which keeps existing ``debt(df)`` calls working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the update.
    """
    if outlier_idx is None:
        outlier_idx = res  # notebook-level list of outlier labels

    flagged = df.index.isin(outlier_idx)
    # undo the log(x + 1) transform to get the income back on its original scale
    income = np.exp(df.loc[flagged, 'MonthlyIncome_lg']) - 1
    # NaN > 1 is False, so rows without an income are skipped as well
    rescale = income.index[income > 1]
    if len(rescale):
        df.loc[rescale, 'DebtRatio_transformed'] = (
            df.loc[rescale, 'DebtRatio_transformed'] / income.loc[rescale]
        )
    return df

In [None]:
# `debt` picks its rows from the notebook-level `res` list of outlier labels.
# At this point `res` holds the labels flagged on train_data.DebtRatio, so it
# applies to the training frame (the cells below inspect exactly these rows).
train_outlier_labels = res
transformed_data = debt(transformed_data)

# The test frame has its own, unrelated row labels: flag its outliers
# separately instead of reusing the training labels.
res = mark_outliers_zscore(test_data.DebtRatio)
test_data = debt(test_data)

# put the training labels back so the following cells keep working on them
res = train_outlier_labels

In [None]:
transformed_data.loc[(transformed_data.index.isin(res)),'DebtRatio_transformed']

In [None]:
# a = transformed_data.loc[(transformed_data.index.isin(res)),'DebtRatio_transformed']

In [None]:
# b = np.exp(transformed_data.loc[(transformed_data.index.isin(res)),'MonthlyIncome_lg'])-1

In [None]:
res = mark_outliers_zscore(transformed_data['DebtRatio_transformed'])

In [None]:
res = mark_outliers_zscore(test_data['DebtRatio_transformed'])

### skewness
·> skewness is between `-0.5 and 0.5`, the data are fairly `symmetrical`

·> skewness is between `-1 and -0.5` or between `0.5 and 1`, the data are `moderately skewed`

·> skewness is less than `-1 or greater than 1`, the data are `highly skewed` 

### Kurtosis - determine the volume of the outlier
·> `Kurtosis > 3` --> `leptokurtic` --> `distribution is tall and thin`

·> `Kurtosis < 3` --> `platykurtic` --> `moderately spread out`

·> `Kurtosis = 3`--> `mesokurtic`(looks more close to a normal distribution) ==> between `leptokurtic` and `platykurtic`

In [None]:
train_data[Numeric_Param]

In [None]:
train_data.NumberOfDependents.value_counts()

In [None]:
transformed_data['NumberOfDependentsTR'] = transformed_data['NumberOfDependents']
test_data['NumberOfDependentsTR'] = test_data['NumberOfDependents']
transformed_data.loc[transformed_data.NumberOfDependentsTR>=7,'NumberOfDependentsTR'] = 7
test_data.loc[test_data.NumberOfDependentsTR>=7,'NumberOfDependentsTR'] = 7

In [None]:
transformed_data.NumberOfDependentsTR.value_counts()

In [None]:
train_data.NumberRealEstateLoansOrLines.value_counts()

In [None]:
transformed_data['NumberRealEstateLoansOrLinesTR'] = transformed_data['NumberRealEstateLoansOrLines']
test_data['NumberRealEstateLoansOrLinesTR'] = test_data['NumberRealEstateLoansOrLines']

transformed_data.loc[transformed_data.NumberRealEstateLoansOrLinesTR>=11,'NumberRealEstateLoansOrLinesTR'] = 11
test_data.loc[test_data.NumberRealEstateLoansOrLinesTR>=11,'NumberRealEstateLoansOrLinesTR'] = 11

In [None]:
train_data.NumberOfOpenCreditLinesAndLoans.value_counts()

In [None]:
transformed_data['NumberOfOpenCreditLinesAndLoansTR'] = transformed_data['NumberOfOpenCreditLinesAndLoans']
test_data['NumberOfOpenCreditLinesAndLoansTR'] = test_data['NumberOfOpenCreditLinesAndLoans']

transformed_data.loc[transformed_data.NumberOfOpenCreditLinesAndLoansTR>=25,'NumberOfOpenCreditLinesAndLoansTR'] = 25
test_data.loc[test_data.NumberOfOpenCreditLinesAndLoansTR>=25,'NumberOfOpenCreditLinesAndLoansTR'] = 25

In [None]:
transformed_data.isna().sum()

In [None]:
transformed_data.loc[transformed_data.NumberOfDependents.isna(),'NumberOfDependents'] = 0
test_data.loc[test_data.NumberOfDependents.isna(),'NumberOfDependents'] = 0

transformed_data.loc[transformed_data.NumberOfDependentsTR.isna(),'NumberOfDependentsTR'] = 0
test_data.loc[test_data.NumberOfDependentsTR.isna(),'NumberOfDependentsTR'] = 0

# Clustering on dataset

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
X = train_data[['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines']].copy()

In [None]:
# X.fillna(-1,inplace=True)

In [None]:
X.tail(10)

In [None]:
# Standardise the clustering input and keep it as a DataFrame with X's labels:
# later cells slice it with boolean masks and `.iloc`, which a bare ndarray
# would not support. (Previously the scaled result was immediately replaced by
# the unscaled X, so K-Means ran on raw values.)
scaled_df = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
kmeans = KMeans(
init="random",
n_clusters=4,
n_init=10,
max_iter=500,
random_state=42)

In [None]:
kmeans.fit(scaled_df)

In [None]:
kmeans.inertia_

In [None]:
kmeans.cluster_centers_

In [None]:
kmeans.n_iter_

In [None]:
np.unique(kmeans.labels_,return_counts=True)

In [None]:
label = kmeans.fit_predict(scaled_df)
label_0 = scaled_df[label == 0]
label_1 = scaled_df[label == 1]
label_2 = scaled_df[label == 2]
label_3 = scaled_df[label == 3]

In [None]:
np.unique(label,return_counts=True)

In [None]:
# rows assigned to cluster 0 (`filtered_label0` is only created further down,
# so it does not exist yet on a fresh top-to-bottom run; `label_0` holds the same rows)
label_0

In [None]:
# first column of the cluster-0 rows (`label_0` is already defined at this point,
# unlike `filtered_label0`, which is created in a later cell)
label_0.iloc[:,0]

In [None]:
filtered_label0 = scaled_df[label == 0]

filtered_label1 = scaled_df[label == 1]
 
filtered_label2 = scaled_df[label == 2]

filtered_label3 = scaled_df[label == 3]
 
#Plotting the results
plt.scatter(filtered_label0.iloc[:,0] , filtered_label0.iloc[:,1] , color = 'blue')
plt.scatter(filtered_label1.iloc[:,0] , filtered_label1.iloc[:,1] , color = 'red')
plt.scatter(filtered_label2.iloc[:,0] , filtered_label2.iloc[:,1] , color = 'green')
plt.scatter(filtered_label3.iloc[:,0] , filtered_label3.iloc[:,1] , color = 'orange')
plt.show()

In [None]:
# Elbow curve: fit K-Means for k = 1..10 with the same settings as the model
# above and record the within-cluster sum of squares (inertia) for each k.
# `sse` was never computed anywhere in the notebook, so it is built here.
cluster_counts = range(1, 11)
sse = []
for k in cluster_counts:
    km = KMeans(init="random", n_clusters=k, n_init=10, max_iter=500, random_state=42)
    km.fit(scaled_df)
    sse.append(km.inertia_)

plt.style.use("fivethirtyeight")
plt.plot(cluster_counts, sse)
plt.xticks(cluster_counts)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

# Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix,log_loss

In [None]:
def data_build(df):
    """Split ``df`` into features/target and a standardised 70/30 train/test split.

    The target is the ``SeriousDlqin2yrs`` column; every other column is a
    feature. The scaler is fitted on the training part only and then applied to
    both parts.

    Returns
    -------
    tuple
        ``(X, y, X_train, y_train, X_test, y_test)`` where ``X`` and ``y`` are
        the unscaled full feature frame and target, and ``X_train`` / ``X_test``
        are the scaled arrays.
    """
    target = 'SeriousDlqin2yrs'
    features = df.drop(columns=target)
    labels = df[target]

    train_part, test_part, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=0
    )

    # fit on the training rows only, then reuse those statistics for the held-out rows
    scaler = StandardScaler().fit(train_part)
    return (
        features,
        labels,
        scaler.transform(train_part),
        labels_train,
        scaler.transform(test_part),
        labels_test,
    )

In [None]:
def modelling(model, X_train, y_train, X_test, y_test,env = True):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict`` (and ``predict_proba`` when
        ``env`` is False)
    X_train, y_train : training features and target
    X_test : features to predict
    y_test : true labels for ``X_test``; only read when ``env`` is True
    env : bool, default True
        True  - evaluation mode: inputs are used as given, the metrics are
                printed and the predicted class labels are returned.
        False - inference mode: a StandardScaler is fitted on ``X_train`` and
                applied to both inputs, and class probabilities are returned.

    Returns
    -------
    array
        Class labels (``env=True``) or the ``predict_proba`` matrix (``env=False``).
    """
    if env:
        model_ = model.fit(X_train, y_train)
        predictions = model_.predict(X_test)

        # Log loss is defined on predicted probabilities; feeding it hard 0/1
        # labels only yields a rescaled error rate. Use probabilities whenever
        # the estimator provides them.
        if hasattr(model_, 'predict_proba'):
            loss = log_loss(y_test, model_.predict_proba(X_test), labels=model_.classes_)
        else:
            loss = log_loss(y_test, predictions)

        print("="*100)
        print('log loss, ', loss)
        print('Accuracy Score, ', accuracy_score(y_test,predictions))
        print('Recall Score, ', recall_score(y_test,predictions))
        print('Precision Score, ', precision_score(y_test,predictions))
        print("="*100)
        print('Confusion Matrix, \n', confusion_matrix(y_test,predictions))
    else:
        # scale with statistics from the training rows only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        model_ = model.fit(X_train, y_train)
        predictions = model_.predict_proba(X_test)
    return predictions

In [None]:
age_map = {'Child':0,'Student':1, 'Young':2, 'Adult':3, 'Old':4}
transformed_data.age_cat = transformed_data.age_cat.map(age_map)

In [None]:
age_map = {'Child':0,'Student':1, 'Young':2, 'Adult':3, 'Old':4}
test_data.age_cat = test_data.age_cat.map(age_map)

In [None]:
transformed_data['MonthlyIncome_nolg_'] = np.exp(transformed_data['MonthlyIncome_lg'])-1
test_data['MonthlyIncome_nolg_'] = np.exp(test_data['MonthlyIncome_lg'])-1

In [None]:
transformed_data.MonthlyIncome_lg

In [None]:
transformed_feature = ['SeriousDlqin2yrs',
                       'RevolvingUtilizationOfUnsecuredLines',
                       'Weighted_Delay_sum', 
                       'age',
                       'MonthlyIncome_nolg_', 
                       'DebtRatio', 
                       'NumberOfDependentsTR',
                       'NumberRealEstateLoansOrLinesTR', 
                       'NumberOfOpenCreditLinesAndLoansTR']

In [None]:
transformed_data[transformed_feature]

In [None]:
transformed_data_bkp = transformed_data.copy()
test_data_bkp = test_data.copy()

In [None]:
df = transformed_data[transformed_feature]

In [None]:
X, y, X_train, y_train, X_test, y_test = data_build(df)

In [None]:
pred = modelling(DecisionTreeClassifier(random_state=0),X_train, y_train, X_test, y_test)

In [None]:
pred = modelling(DecisionTreeClassifier(random_state=0),X_train, y_train, X_test, y_test)

In [None]:
df_test = test_data[transformed_feature]
df_test.drop('SeriousDlqin2yrs',axis=1)

In [None]:
Xtest = df_test.drop('SeriousDlqin2yrs',axis=1)

In [None]:
pred = modelling(RandomForestClassifier(random_state=0),X, y, Xtest, None, False)

In [None]:
pred

In [None]:
# probability of the positive class (second column of predict_proba)
Xtest['Probability'] = pred[:,1]
# 1-based row ids derived from the frame itself instead of a hard-coded row
# count, so the cell keeps working if the test file changes size
Xtest['Id'] = range(1, len(Xtest) + 1)

In [None]:
Xtest[['Id','Probability']].to_csv('DataSet/submission_14.csv',index=False)