In [None]:
# Load the Home Credit Default Risk input tables used by this notebook.

# Main application tables.
train <- read.csv(file = "../input/home-credit-default-risk/application_train.csv")
test <- read.csv(file = "../input/home-credit-default-risk/application_test.csv")

# Auxiliary tables that are aggregated per SK_ID_CURR further down.
bureau <- read.csv(file = "../input/home-credit-default-risk/bureau.csv")
#bureaubalance <- read.csv("../input/home-credit-default-risk/bureau_balance.csv")
creditcardbalance <- read.csv(file = "../input/home-credit-default-risk/credit_card_balance.csv")
installmentspayments <- read.csv(file = "../input/home-credit-default-risk/installments_payments.csv")
previousapplication <- read.csv(file = "../input/home-credit-default-risk/previous_application.csv")
#POS <- read.csv("../input/home-credit-default-risk/POS_CASH_balance.csv")


In [None]:
library(ggplot2)
library(survival)
library(plyr)
library(dplyr)
library(stringr)
library(data.table)
library(tidyr)
library(corrplot)
library(Metrics)
library(caret)
library(dummies)
library(rpart)
library(rpart.plot)
library(e1071)
library(randomForest)
library(glmnet)
library(gbm)
library(Matrix)
library(iterators)
library(xgboost)
library(parallel)
library(parallelMap) 
library(caretEnsemble)
library(ensembleR)
library(caTools)
library(mlbench)
library(party)
library(ranger)
library(lars)
library(tidyverse)

# Missing value imputation and feature engineering

**Previous instalments dataframe - filtered to include only the last 360 days of previous instalments**

In [None]:
# Replace missing values in every numeric/logical column of
# installmentspayments with that column's mean (computed ignoring NAs).
# seq_len() is used instead of 1:ncol() so a zero-column frame is a no-op.
for (i in seq_len(ncol(installmentspayments))) {
  # mean() only accepts numeric/logical input; on any other column type it
  # warns and returns NA, so those columns are skipped (they were left
  # unchanged anyway).
  if (is.numeric(installmentspayments[[i]]) || is.logical(installmentspayments[[i]])) {
    installmentspayments[is.na(installmentspayments[[i]]), i] <-
      mean(installmentspayments[[i]], na.rm = TRUE)
  }
}

In [None]:
# Keep only the rows whose DAYS_ENTRY_PAYMENT is -360 or greater.
installmentspayments <- filter(installmentspayments, DAYS_ENTRY_PAYMENT >= -360)

In [None]:
# One row per SK_ID_CURR with two ratios of column totals:
#   Paymenttoinstalmentratio        = sum(AMT_PAYMENT) / sum(AMT_INSTALMENT)
#   Daysinstalmenttodaysentryration = sum(DAYS_INSTALMENT) / sum(DAYS_ENTRY_PAYMENT)
installmentspaymentssummary <- summarise(
  group_by(installmentspayments, SK_ID_CURR),
  Paymenttoinstalmentratio = sum(AMT_PAYMENT) / sum(AMT_INSTALMENT),
  Daysinstalmenttodaysentryration = sum(DAYS_INSTALMENT) / sum(DAYS_ENTRY_PAYMENT)
)
# A zero denominator yields +/-Inf; turn those cells into NA.
installmentspaymentssummary[sapply(installmentspaymentssummary, is.infinite)] <- NA

**Previous applications dataframe - limited to include only the last 360 days of previous applications**

In [None]:
# Replace missing values in every numeric/logical column of
# previousapplication with that column's mean (computed ignoring NAs).
# seq_len() is used instead of 1:ncol() so a zero-column frame is a no-op.
for (i in seq_len(ncol(previousapplication))) {
  # mean() only accepts numeric/logical input; on any other column type
  # (e.g. the NAME_* columns used below as grouping keys, if they are read as
  # text/factors) it warns and returns NA, so such columns are skipped. They
  # were effectively left unchanged before as well.
  if (is.numeric(previousapplication[[i]]) || is.logical(previousapplication[[i]])) {
    previousapplication[is.na(previousapplication[[i]]), i] <-
      mean(previousapplication[[i]], na.rm = TRUE)
  }
}

In [None]:
# Keep only the rows whose DAYS_DECISION is -360 or greater.
previousapplication <- filter(previousapplication, DAYS_DECISION >= -360)

In [None]:
# Number of rows in previousapplication for each SK_ID_CURR.
previousapplicationsummary <- summarise(
  group_by(previousapplication, SK_ID_CURR),
  Countpreviousapplicactions = n()
)
# Row counts per (SK_ID_CURR, NAME_CONTRACT_TYPE) pair ...
previousapplicationcontract <- summarise(
  group_by(previousapplication, SK_ID_CURR, NAME_CONTRACT_TYPE),
  Countloans = n()
)
# ... reshaped so each NAME_CONTRACT_TYPE value becomes its own count column.
previousapplicationcontrantlong <- spread(
  previousapplicationcontract,
  key = NAME_CONTRACT_TYPE,
  value = Countloans
)
# Combinations that never occurred come out of spread() as NA; store 0.
previousapplicationcontrantlong <- replace(
  previousapplicationcontrantlong,
  is.na(previousapplicationcontrantlong),
  0
)

# Row counts per (SK_ID_CURR, NAME_CONTRACT_STATUS) pair ...
previousapplicationcontractstatus <- summarise(
  group_by(previousapplication, SK_ID_CURR, NAME_CONTRACT_STATUS),
  Countstatus = n()
)

# ... reshaped so each NAME_CONTRACT_STATUS value becomes its own count column.
previousapplicationcontractstatuslong <- spread(
  previousapplicationcontractstatus,
  key = NAME_CONTRACT_STATUS,
  value = Countstatus
)
# Combinations that never occurred come out of spread() as NA; store 0.
previousapplicationcontractstatuslong <- replace(
  previousapplicationcontractstatuslong,
  is.na(previousapplicationcontractstatuslong),
  0
)

# Per SK_ID_CURR: total AMT_CREDIT divided by total AMT_ANNUITY.
Credittoannuityratio <- summarise(
  group_by(previousapplication, SK_ID_CURR),
  Credittoannuityratio = sum(AMT_CREDIT) / sum(AMT_ANNUITY)
)
# A zero annuity total yields +/-Inf; turn those cells into NA.
Credittoannuityratio[sapply(Credittoannuityratio, is.infinite)] <- NA

# Per SK_ID_CURR: total AMT_DOWN_PAYMENT divided by total AMT_CREDIT.
Downpaymenttocreditratio <- summarise(
  group_by(previousapplication, SK_ID_CURR),
  Downpaymenttocreditratio = sum(AMT_DOWN_PAYMENT) / sum(AMT_CREDIT)
)
# A zero credit total yields +/-Inf; turn those cells into NA.
Downpaymenttocreditratio[sapply(Downpaymenttocreditratio, is.infinite)] <- NA

# Interest amount
# Per previous application: (CNT_PAYMENT * AMT_ANNUITY) / AMT_CREDIT.
# transmute() is used rather than summarise() because this produces one value
# per loan, not one per client; summarise() is meant to return a single row
# per group and multi-row results are rejected or deprecated depending on the
# dplyr version. The grouping column SK_ID_CURR is kept alongside the new one.
interestloanperloan <- previousapplication %>%
  group_by(SK_ID_CURR) %>%
  transmute(interestrateperloan = (CNT_PAYMENT * AMT_ANNUITY) / AMT_CREDIT)

# Mean of the per-loan values for each SK_ID_CURR.
averageinterest <- interestloanperloan %>%
  group_by(SK_ID_CURR) %>%
  summarise(averageinterestrate = mean(interestrateperloan))
# Turn +/-Inf (from AMT_CREDIT == 0) into NA. The mask must be built from
# averageinterest itself; it was previously built from
# Downpaymenttocreditratio, so infinite averages were never cleared.
is.na(averageinterest) <- sapply(averageinterest, is.infinite)


**Previous applications dataframe needed complex subsetting for feature engineering, hence 4 resultant dataframes**

In [None]:
# Combine the per-client previous-application feature tables on SK_ID_CURR.
# `all.x = TRUE` has been dropped from every call: it is an argument of base
# merge(), not of left_join(). A left join already keeps every row of `x`, and
# dplyr versions that require `...` to be empty may reject the stray argument.

# Contract-type counts + contract-status counts.
Contracttypeandstatus <- left_join(
  x = previousapplicationcontrantlong,
  y = previousapplicationcontractstatuslong,
  by = "SK_ID_CURR"
)
# Credit-to-annuity ratio + down-payment-to-credit ratio.
Annuityanddownpaymentrations <- left_join(
  x = Credittoannuityratio,
  y = Downpaymenttocreditratio,
  by = "SK_ID_CURR"
)

# Application count, then the count tables, then the ratios, then the
# average interest figure.
previousapplicationsdf <- left_join(
  x = previousapplicationsummary,
  y = Contracttypeandstatus,
  by = "SK_ID_CURR"
)

previousapplicationsfeatureengineered <- left_join(
  x = previousapplicationsdf,
  y = Annuityanddownpaymentrations,
  by = "SK_ID_CURR"
)

previousapplicationsfinaldf <- left_join(
  x = previousapplicationsfeatureengineered,
  y = averageinterest,
  by = "SK_ID_CURR"
)



**Bureau data**

In [None]:

# One row per SK_ID_CURR summarising the bureau table:
#   Countloans               - number of bureau rows
#   Averagedayscredit        - mean of DAYS_CREDIT
#   Averagedayscreditoverdue - mean of CREDIT_DAY_OVERDUE
#   Debtratio                - sum(AMT_CREDIT_SUM_DEBT) / sum(AMT_CREDIT_SUM)
# No na.rm is applied, so a group containing an NA input yields NA.
bureausummary <- summarise(
  group_by(bureau, SK_ID_CURR),
  Countloans = n(),
  Averagedayscredit = mean(DAYS_CREDIT),
  Averagedayscreditoverdue = mean(CREDIT_DAY_OVERDUE),
  Debtratio = sum(AMT_CREDIT_SUM_DEBT) / sum(AMT_CREDIT_SUM)
)
# A zero AMT_CREDIT_SUM total yields +/-Inf; turn those cells into NA.
bureausummary[sapply(bureausummary, is.infinite)] <- NA


**Credit card balance, filtered to the credit card balance over the last 12 months**

In [None]:

# Keep only the rows whose MONTHS_BALANCE is -12 or greater.
creditcardbalance <- creditcardbalance %>%
  filter(MONTHS_BALANCE >= -12)

# Per row: AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL.
# transmute() is used rather than summarise() because this produces one value
# per balance row, not one per client; summarise() is meant to return a single
# row per group and multi-row results are rejected or deprecated depending on
# the dplyr version. The grouping column SK_ID_CURR is kept with the new one.
creditcardblancepermonth <- creditcardbalance %>%
  group_by(SK_ID_CURR) %>%
  transmute(creditcardblancepermonth = AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL)

# Mean of the per-row values for each SK_ID_CURR.
averagecreditcardblancepermonth <- creditcardblancepermonth %>%
  group_by(SK_ID_CURR) %>%
  summarise(averagecreditcardblancepermonth = mean(creditcardblancepermonth))
# A zero AMT_CREDIT_LIMIT_ACTUAL can make the mean +/-Inf; turn those into NA.
is.na(averagecreditcardblancepermonth) <-
  sapply(averagecreditcardblancepermonth, is.infinite)

**Excluded columns with most missing values from train and test data frames**

In [None]:
# Predictor columns kept from both application tables. Defining the list once
# keeps train and test in step. DAYS_EMPLOYED used to be listed twice, which
# made subset() emit a redundant duplicate column (DAYS_EMPLOYED.1); it is now
# listed once.
selectedfeatures <- c(
  "NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
  "CNT_CHILDREN", "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS", "REGION_RATING_CLIENT",
  "AMT_CREDIT", "AMT_GOODS_PRICE", "REGION_POPULATION_RELATIVE", "DAYS_BIRTH",
  "DAYS_EMPLOYED", "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "AMT_ANNUITY"
)

# Train keeps the id, the TARGET label and the predictors.
trainselected <- train[, c("SK_ID_CURR", "TARGET", selectedfeatures), drop = FALSE]
# Test keeps the id and the same predictors; TARGET is not selected from it.
testselected <- test[, c("SK_ID_CURR", selectedfeatures), drop = FALSE]


**Mean imputation of missing values in the main train and test dataframes**

In [None]:
# Replace missing values in every numeric/logical column of trainselected with
# that column's mean (computed ignoring NAs). Non-numeric columns are skipped:
# mean() cannot average them (it warns and returns NA), so they were
# effectively left unchanged before as well.
for (i in seq_len(ncol(trainselected))) {
  if (is.numeric(trainselected[[i]]) || is.logical(trainselected[[i]])) {
    trainselected[is.na(trainselected[[i]]), i] <-
      mean(trainselected[[i]], na.rm = TRUE)
  }
}

# Same treatment for testselected, using testselected's own column means.
for (i in seq_len(ncol(testselected))) {
  if (is.numeric(testselected[[i]]) || is.logical(testselected[[i]])) {
    testselected[is.na(testselected[[i]]), i] <-
      mean(testselected[[i]], na.rm = TRUE)
  }
}

**Combining test and train into one df**

In [None]:
# Stack train and test rows, tagging each row's origin in `data_split`.
maindf <- bind_rows(
  mutate(trainselected, data_split = "train"),
  mutate(testselected, data_split = "test")
)

# Derived application-level features; each one uses only original columns.
maindf <- maindf %>%
  mutate(
    # 1 when income exceeds the credit amount, otherwise 0.
    Incomecreditflag = ifelse(AMT_INCOME_TOTAL > AMT_CREDIT, 1, 0),
    Creditincomepercent = AMT_CREDIT / AMT_INCOME_TOTAL,
    Annuityincomeratio = AMT_ANNUITY / AMT_INCOME_TOTAL,
    Daysemployedtoageratio = DAYS_EMPLOYED / DAYS_BIRTH,
    Numberofpayments = AMT_CREDIT / AMT_ANNUITY,
    Credittogoodsratio = AMT_CREDIT / AMT_GOODS_PRICE
  )

**Feature engineering a column by averaging external sources columns**

In [None]:
# Arithmetic mean of the three EXT_SOURCE_* columns, row by row.
maindf[["Externalsourcesmean"]] <- with(
  maindf,
  (EXT_SOURCE_1 + EXT_SOURCE_2 + EXT_SOURCE_3) / 3
)

**Merging main test and train, previous applications, bureau and previous instalments into one dataframe**

In [None]:
# Attach each per-client feature table to the application rows by SK_ID_CURR.
# `all.x = TRUE` has been dropped from every call: it is an argument of base
# merge(), not of left_join(). A left join already keeps every row of `x`, and
# dplyr versions that require `...` to be empty may reject the stray argument.
maindf1 <- left_join(
  x = maindf,
  y = previousapplicationsfinaldf,
  by = "SK_ID_CURR"
)

maindf2 <- left_join(
  x = maindf1,
  y = installmentspaymentssummary,
  by = "SK_ID_CURR"
)
maindf3 <- left_join(
  x = maindf2,
  y = averagecreditcardblancepermonth,
  by = "SK_ID_CURR"
)
maindffinal <- left_join(
  x = maindf3,
  y = bureausummary,
  by = "SK_ID_CURR"
)
# Every remaining NA (clients with no match in a joined table, undefined
# ratios) becomes 0.
# NOTE(review): this is applied to all columns, so it presumably also sets
# TARGET to 0 on the rows tagged data_split == "test", which have no TARGET of
# their own; filter on data_split before training. Confirm this is intended.
maindffinal[is.na(maindffinal)] <- 0

In [None]:
# Save the combined feature table as CSV in the working directory.
# row.names = TRUE is write.csv()'s default, spelled out here because it adds
# a leading row-index column to the file.
write.csv(
  x = maindffinal,
  file = "maindffinal20210912.csv",
  row.names = TRUE
)