In [None]:
library(ggplot2)
library(survival)
library(plyr)
library(dplyr)
library(stringr)
library(data.table)
library(tidyr)
library(corrplot)
library(Metrics)
library(caret)
library(dummies)
library(rpart)
library(rpart.plot)
library(e1071)
library(randomForest)
library(glmnet)
library(gbm)
library(Matrix)
library(iterators)
library(xgboost)
library(parallel)
library(parallelMap) 
library(caretEnsemble)
library(ensembleR)
library(caTools)
library(mlbench)
library(party)
library(ranger)
library(lars)
library(lightgbm)
library(pROC)
options(scipen = 999)

**Reading in data**

In [None]:
#df1 <- read.csv("../input/last360daysv1/maindffinaltodayinterest3.csv")
# Load the combined feature table used for the rest of the notebook.
df <- read.csv("../input/maindffinal20210912/maindffinal20210912.csv")

# Zero-fill every cell that is missing (NA/NaN) or infinite, one column at a
# time. Columns with nothing to replace are returned untouched.
df[] <- lapply(df, function(column) {
  needs_zero <- is.na(column) | is.infinite(column)
  if (any(needs_zero)) {
    column[needs_zero] <- 0
  }
  column
})

**Encoding data**

In [None]:
# Recode the categorical flags as 0/1 indicators.
df$NAME_CONTRACT_TYPE <- ifelse(df$NAME_CONTRACT_TYPE == "Cash loans", 1, 0)
df$CODE_GENDER <- ifelse(df$CODE_GENDER == "F", 1, 0)
# Polarity: 1 marks the value "N", 0 marks anything else.
df$FLAG_OWN_CAR <- ifelse(df$FLAG_OWN_CAR == "N", 1, 0)
# The realty flag must be derived from its own column: FLAG_OWN_CAR is already
# numeric at this point, so comparing it with "N" would give a constant 0.
df$FLAG_OWN_REALTY <- ifelse(df$FLAG_OWN_REALTY == "N", 1, 0)

**Removing unneeded columns**

In [None]:
# Drop the two columns that are not used downstream.
df <- dplyr::select(df, -c(DAYS_EMPLOYED.1, X))

**Splitting data into training and testing sets**

In [None]:

# Partition the rows by the value of the data_split marker column.
train <- dplyr::filter(df, data_split == "train")
test <- dplyr::filter(df, data_split == "test")

**Removing unneeded column from train data**

In [None]:
# The split marker is no longer needed once the training rows are isolated.
train1 <- dplyr::select(train, -data_split)

**Splitting training data into training and validation**

In [None]:
# 75% of the training rows are used for fitting, the remaining 25% for
# validation.
smp_size <- floor(0.75 * nrow(train1))
## fixed seed so the partition is reproducible
set.seed(123)
trainsplit <- sample.int(nrow(train1), size = smp_size)
# train1 is train without data_split, so indexing it directly yields the
# partitions with that column already removed.
train2 <- train1[trainsplit, , drop = FALSE]
validation <- train1[-trainsplit, , drop = FALSE]



**XGBOOST modelling and validation**

In [None]:

# Separate the TARGET label from the predictor columns.
train2y <- subset(train2, select = c(TARGET))
train2x <- subset(train2, select = -c(TARGET))
validation1 <- subset(validation, select = -c(TARGET))

# Coerce every column with as.numeric and convert to matrices, the input
# format passed to xgboost() below. Any column that cannot be parsed as a
# number becomes NA under as.numeric.
train3x <- data.frame(lapply(train2x, as.numeric))
train3y <- data.frame(lapply(train2y, as.numeric))
validation2 <- data.frame(lapply(validation1, as.numeric))

train4x <- as.matrix(train3x)
train4y <- as.matrix(train3y)
validation3 <- as.matrix(validation2)

# Fit a gradient-boosted tree classifier on the training partition.
# `nrounds` and `max_depth` are spelled out in full instead of relying on
# partial argument matching / the dotted alias.
# NOTE(review): no separate evaluation set is passed, so early stopping
# presumably tracks AUC on the training data itself -- confirm, and consider
# xgb.train() with a watchlist that contains held-out rows.
model1validation <- xgboost(
  data = train4x,
  label = train4y,
  booster = "gbtree",
  objective = "binary:logistic",
  eval_metric = "auc",
  nthread = 4,
  eta = 0.05,
  nrounds = 2000,
  max_depth = 6,
  min_child_weight = 30,
  subsample = 0.85,
  colsample_bytree = 0.7,
  colsample_bylevel = 0.632,
  early_stopping_rounds = 97
)

# Score the validation partition.
pred1validation <- predict(model1validation, validation3)

# Collect id, prediction and actual label side by side. data.frame() rewrites
# the name 'Actual target' to the syntactic name Actual.target.
pred1validationdf <- data.frame(
  'SK_ID_CURR' = validation$SK_ID_CURR,
  'Predictedtarget' = pred1validation,
  'Actual target' = validation$TARGET
)

**XGBOOST ROC curve**

In [None]:
 # ROC curve of the XGBoost validation predictions, with the AUC printed on
 # the plot.
 roc_xgboost <- roc(
   response = pred1validationdf[["Actual.target"]],
   predictor = pred1validationdf[["Predictedtarget"]]
 )
 plot(roc_xgboost, print.auc = TRUE)

**LightGBM modelling and validation**

In [None]:
# Wrap the training matrix and label in a LightGBM dataset.
dtrain <- lgb.Dataset(data = train4x, label = train4y)

# Training parameters. Logging is controlled by `verbose` alone; `silent` is
# not a LightGBM core parameter and is therefore not passed.
train_params <- list(
  nthread = 4,
  objective = "binary",
  # NOTE(review): n_estimators is an alias for the number of boosting
  # iterations and conflicts with nrounds = 500 in the lightgbm() call below;
  # which one wins depends on the installed lightgbm version -- confirm the
  # intended value and keep only one.
  n_estimators = 10000,
  learning_rate = 0.02,
  num_leaves = 32,
  colsample_bytree = 0.9497036,
  # NOTE(review): row subsampling presumably has no effect unless a bagging
  # frequency (subsample_freq) is also set -- verify against the LightGBM
  # parameter documentation.
  subsample = 0.8715623,
  max_depth = 8,
  reg_alpha = 0.04,
  reg_lambda = 0.073,
  min_split_gain = 0.0222415,
  min_child_weight = 40,
  verbose = -1
)

# Fit the model on the training partition.
bst <- lightgbm(
  data = dtrain,
  params = train_params,
  nrounds = 500
)

# Score the validation partition.
predlbm <- predict(bst, validation3)

# Collect id, prediction and actual label side by side. data.frame() rewrites
# the name 'Actual target' to the syntactic name Actual.target.
pred1validationlbm <- data.frame(
  'SK_ID_CURR' = validation$SK_ID_CURR,
  'Predictedtarget' = predlbm,
  'Actual target' = validation$TARGET
)


**LightGBM ROC curve**

In [None]:
 # ROC curve of the LightGBM validation predictions, with the AUC printed on
 # the plot.
 roc_lbm <- roc(
   response = pred1validationlbm[["Actual.target"]],
   predictor = pred1validationlbm[["Predictedtarget"]]
 )
 plot(roc_lbm, print.auc = TRUE)

**XGBoost has better results than LightGBM**