In [None]:
library(ggplot2)
library(survival)
library(plyr)
library(dplyr)
library(stringr)
library(data.table)
library(tidyr)
library(corrplot)
library(Metrics)
library(caret)
library(dummies)
library(rpart)
library(rpart.plot)
library(e1071)
library(randomForest)
library(glmnet)
library(gbm)
library(Matrix)
library(iterators)
library(xgboost)
library(parallel)
library(parallelMap) 
library(caretEnsemble)
library(ensembleR)
library(caTools)
library(mlbench)
library(party)
library(ranger)
library(lars)
# Apply a large penalty against scientific notation so numbers are printed
# in fixed notation.
options(scipen = 999)


> Reading in cleaned data file from first kernel

In [None]:
# Load the cleaned data set from the earlier kernel's output and print its
# structure (column names, types, first values).
df <- read.csv(
  file = "../input/maindffinal20210912/maindffinal20210912.csv"
)

str(object = df)


> Summarizing data file

In [None]:
# Per-column summary of the freshly loaded data frame.
summary(object = df)

> Casting these columns to factor data types for readability

In [None]:
# Columns to be stored as factors.
factor_cols <- c(
  "NAME_CONTRACT_TYPE",
  "CODE_GENDER",
  "FLAG_OWN_CAR",
  "FLAG_OWN_REALTY",
  "NAME_INCOME_TYPE",
  "NAME_EDUCATION_TYPE",
  "NAME_FAMILY_STATUS",
  "NAME_HOUSING_TYPE",
  "data_split"
)

# Convert every listed column with as.factor() in a single assignment.
df[factor_cols] <- lapply(df[factor_cols], as.factor)

> Checking that the data type conversions took effect

In [None]:
# Summarise again; factor columns are now reported as level counts.
summary(object = df)

> Exploratory analysis of the data file

In [None]:
# Stacked bar chart: loan count per contract type, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = factor(NAME_CONTRACT_TYPE), fill = factor(TARGET))) +
  geom_bar() +
  labs(x = "Contract type", y = "Count loans")


> Over 90% of loans given out are cash loans, they also have the highest default rates

In [None]:
# Stacked bar chart: loan count per gender code, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = factor(CODE_GENDER), fill = factor(TARGET))) +
  geom_bar() +
  labs(x = "Gender", y = "Count loans")

> More females than males have taken loans, defaults are also higher in females than males

In [None]:
# Stacked bar chart: loan count by car-ownership flag, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = factor(FLAG_OWN_CAR), fill = factor(TARGET))
) +
  geom_bar() +
  labs(x = "Owns car", y = "Count loans")

> Most of the loan borrowers do not own cars, non car owners also default more than car owners

In [None]:
# Stacked bar chart: loan count by realty-ownership flag, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = factor(FLAG_OWN_REALTY), fill = factor(TARGET))
) +
  geom_bar() +
  labs(x = "Owns realty", y = "Count loans")

> Majority of borrowers own realty, realty owners default more than non realty owners

In [None]:
# Stacked histogram of CNT_CHILDREN (bin width 0.5), filled by TARGET.
df %>%
  ggplot(mapping = aes(x = CNT_CHILDREN, fill = factor(TARGET))) +
  geom_histogram(binwidth = 0.5)


> Most borrowers do not have children; defaulting and number of children are negatively correlated

In [None]:

# Stacked histogram of AMT_INCOME_TOTAL, filled by TARGET; only rows with an
# income of at most 500000 are plotted.
ggplot(
  data = dplyr::filter(df, AMT_INCOME_TOTAL <= 500000),
  mapping = aes(x = AMT_INCOME_TOTAL, fill = factor(TARGET))
) +
  geom_histogram()


> There are outliers in the income amount variable that have been excluded. The majority of customers have a total income below the mean, and defaulting is correlated with income amount

In [None]:
# Stacked histogram of CNT_FAM_MEMBERS (bin width 0.5), filled by TARGET.
df %>%
  ggplot(mapping = aes(x = CNT_FAM_MEMBERS, fill = factor(TARGET))) +
  geom_histogram(binwidth = 0.5)


> Majority of borrowers have only 1 family member and also have the highest default. These are probably couples with imbalanced incomes

In [None]:
# Stacked histogram of REGION_RATING_CLIENT (bin width 0.5), filled by TARGET.
df %>%
  ggplot(mapping = aes(x = REGION_RATING_CLIENT, fill = factor(TARGET))) +
  geom_histogram(binwidth = 0.5)

> Client rating 2 has the most borrowers and also highest defaults

In [None]:
# Stacked histogram of AMT_CREDIT with default binning, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = AMT_CREDIT, fill = factor(TARGET))
) +
  geom_histogram()

> Credit amount and default are correlated

In [None]:
# Stacked histogram of AMT_GOODS_PRICE with default binning, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = AMT_GOODS_PRICE, fill = factor(TARGET))
) +
  geom_histogram()

> Goods price amount and default are correlated

In [None]:
# Stacked bar chart: loan count per income type, filled by TARGET.
# Axis labels are dodged over three rows so long category names do not overlap.
df %>%
  ggplot(mapping = aes(x = factor(NAME_INCOME_TYPE), fill = factor(TARGET))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  labs(x = "Income type", y = "Count loans")

In [None]:
# Stacked bar chart: loan count per education type, filled by TARGET.
# Axis labels are dodged over three rows so long category names do not overlap.
df %>%
  ggplot(mapping = aes(x = factor(NAME_EDUCATION_TYPE), fill = factor(TARGET))) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  labs(x = "Education type", y = "Count loans")

> Most of the borrowers have a secondary/special education type, they also mostly earn their incomes from employment or running businesses.

In [None]:
# Stacked bar chart: loan count per family status, filled by TARGET.
# Axis labels are dodged over three rows so long category names do not overlap.
ggplot(
  data = df,
  mapping = aes(x = factor(NAME_FAMILY_STATUS), fill = factor(TARGET))
) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  labs(x = "Family status", y = "Count loans")

> Most borrowers are married couples with no children, have secondary education and come from regions with a bad client rating. They also have the highest defaults

In [None]:
# Stacked bar chart: loan count per housing type, filled by TARGET.
# Axis labels are dodged over three rows so long category names do not overlap.
ggplot(
  data = df,
  mapping = aes(x = factor(NAME_HOUSING_TYPE), fill = factor(TARGET))
) +
  geom_bar() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  labs(x = "Housing type", y = "Count loans")

In [None]:
# Stacked histogram of REGION_POPULATION_RELATIVE with default binning,
# filled by TARGET.
df %>%
  ggplot(mapping = aes(x = REGION_POPULATION_RELATIVE, fill = factor(TARGET))) +
  geom_histogram()

> Relative region population has no strong relationship with defaulting

In [None]:
# Stacked histogram of DAYS_BIRTH with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = DAYS_BIRTH, fill = factor(TARGET))) +
  geom_histogram()

> The younger the borrower is,  the more likely they are to default

In [None]:
# Stacked histogram of DAYS_EMPLOYED, filled by TARGET; rows with values
# above 100000 are left out of the plot.
ggplot(
  data = dplyr::filter(df, DAYS_EMPLOYED <= 100000),
  mapping = aes(x = DAYS_EMPLOYED, fill = factor(TARGET))
) +
  geom_histogram()


Borrowers employed more recently are more likely to default than those who have been in employment longer

In [None]:
# Stacked histogram of EXT_SOURCE_1 with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = EXT_SOURCE_1, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of EXT_SOURCE_2 with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = EXT_SOURCE_2, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of EXT_SOURCE_3 with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = EXT_SOURCE_3, fill = factor(TARGET))) +
  geom_histogram()

> A lot of borrowers have an external source 1 score of 0.5, which also shows the highest default. The external source 2 score is widely spread out: the higher the score, the more borrowers there are and the higher the default rate. The external source 3 score is less spread out, with more borrowers falling at the 0.5 score and having the highest default rates

In [None]:
# Stacked histogram of Countpreviousapplicactions with default binning,
# filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = Countpreviousapplicactions, fill = factor(TARGET))
) +
  geom_histogram()

> Borrowers with fewer previous applications have higher defaults than those with more applications. This is most likely because those who have borrowed more have established a borrowing history, making lending decisions more informed

In [None]:
# Stacked histogram of Cash.loans with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = Cash.loans, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of Consumer.loans with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = Consumer.loans, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of Revolving.loans with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = Revolving.loans, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of Approved with default binning, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = Approved, fill = factor(TARGET))
) +
  geom_histogram()

> Customers with fewer approved loans have more defaults than those with more approved loans

In [None]:
# Stacked histogram of Countloans with default binning, filled by TARGET.
ggplot(
  data = df,
  mapping = aes(x = Countloans, fill = factor(TARGET))
) +
  geom_histogram()

> Same pattern as number of previous application, explanation holds

In [None]:
# Stacked histogram of Canceled with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = Canceled, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of Refused with default binning, filled by TARGET.
df %>%
  ggplot(mapping = aes(x = Refused, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
  # Stacked histogram of Unused.offer with default binning, filled by TARGET.
  ggplot(
    data = df,
    mapping = aes(x = Unused.offer, fill = factor(TARGET))
  ) +
    geom_histogram()

> Borrowers who have no unused loan offers or were never refused a loan default more than those who have

In [None]:
  # Stacked histogram of Averagedayscredit with default binning, filled by
  # TARGET.
  ggplot(
    data = df,
    mapping = aes(x = Averagedayscredit, fill = factor(TARGET))
  ) +
    geom_histogram()

> The sooner borrowers had asked for a bureau credit debt the more likely they are to default

In [None]:

# Stacked histogram of Averagedayscreditoverdue, filled by TARGET; only rows
# with a value of at most 0.5 are plotted.
ggplot(
  data = dplyr::filter(df, Averagedayscreditoverdue <= 0.5),
  mapping = aes(x = Averagedayscreditoverdue, fill = factor(TARGET))
) +
  geom_histogram()

In [None]:
# Stacked histogram of Credittoannuityratio with default binning, filled by
# TARGET.
df %>%
  ggplot(mapping = aes(x = Credittoannuityratio, fill = factor(TARGET))) +
  geom_histogram()

In [None]:
# Stacked histogram of Paymenttoinstalmentratio, filled by TARGET; only rows
# with a ratio of at most 1 are plotted.
ggplot(
  data = dplyr::filter(df, Paymenttoinstalmentratio <= 1),
  mapping = aes(x = Paymenttoinstalmentratio, fill = factor(TARGET))
) +
  geom_histogram()

In [None]:
# Stacked histogram of Daysinstalmenttodaysentryration, filled by TARGET;
# only rows with a value of at most 1 are plotted.
ggplot(
  data = dplyr::filter(df, Daysinstalmenttodaysentryration <= 1),
  mapping = aes(x = Daysinstalmenttodaysentryration, fill = factor(TARGET))
) +
  geom_histogram()

In [None]:
# Stacked histogram of Downpaymenttocreditratio, filled by TARGET; only rows
# with a ratio of at most 1 are plotted.
ggplot(
  data = dplyr::filter(df, Downpaymenttocreditratio <= 1),
  mapping = aes(x = Downpaymenttocreditratio, fill = factor(TARGET))
) +
  geom_histogram()

In [None]:
# Stacked histogram of averageinterestrate (bin width 0.3), filled by TARGET.
df %>%
  ggplot(mapping = aes(x = averageinterestrate, fill = factor(TARGET))) +
  geom_histogram(binwidth = 0.3)

The loans attracted a very small interest rate

In [None]:
# Stacked histogram of averagecreditcardblancepermonth (bin width 0.3),
# filled by TARGET.
# NOTE(review): a 0.3 bin width assumes this column is on a small scale
# (e.g. a ratio) - confirm, otherwise the number of bins becomes very large.
df %>%
  ggplot(
    mapping = aes(x = averagecreditcardblancepermonth, fill = factor(TARGET))
  ) +
  geom_histogram(binwidth = 0.3)


Majority of customers exhausted their credit card limits every month and defaulted more than those who did not exhaust