![](https://i.imgur.com/ptJccl9.jpg)

Captain's Log Sun Dec 2nd 2018: Memory Problems

Captain's Log Wed Dec 5th 2018: Going to let this kernel age a bit

In [None]:
library(tidyverse)
library(readr)
library(caret)
library(caretEnsemble)
library(plyr); library(dplyr)  # miltidplyr?
library(cluster)
library(HSAUR)
library(corrplot)
library(DMwR)
library(h2o)
library(pryr)
# library(data.table)
# TODO: library(microbenchmark)

In [None]:
#' Print the memory footprint of every object in the global environment.
#'
#' For each name returned by `ls(".GlobalEnv")` a banner line is printed,
#' followed by that object's size as reported by `object_size()`.
#'
#' @return `NULL`, invisibly; the function is called for its printed output.
printEnv <- function() {
    for (obj_name in ls(".GlobalEnv")) {
        print(paste("---", obj_name, "-----------------------------------"))
        # Look the object up by name directly in the global environment.
        # Evaluating the parsed name inside this function would resolve to a
        # local variable of the same name first, and would fail outright on
        # names that are not syntactically valid R.
        print(object_size(get(obj_name, envir = .GlobalEnv)))
    }
    invisible(NULL)
}

**IMPORT TRAIN & TEST DATA**

In [None]:
# Read the card-level train and test tables.
raw_train <- read_csv("../input/elo-merchant-category-recommendation/train.csv")
raw_test <- read_csv("../input/elo-merchant-category-recommendation/test.csv")
# raw_train <- fread("../input/train.csv", sep = ",")
# raw_test <- fread("../input/test.csv", sep = ",")

# TAG TARGET DATA
# Test rows get the placeholder target "x" so they can be told apart from
# training rows after the two tables are stacked.
raw_test$target <- "x"

# SMASH TEST & TRAIN TOGETHER
data_full <- rbind(raw_train, raw_test)

# SAVE SOME SPACE
rm(raw_train, raw_test); gc()

# GET UNIQUE CARD IDS (FOR WHEN SAMPLING)
# `[[` extracts the column as a plain vector (single-bracket indexing returns
# a one-column table), and unique() drops repeated ids.
full_unique_card_id <- unique(data_full[["card_id"]])

# SAVE MORE SPACE (will bring it back later after transaction formatting)
rm(data_full); gc()
ls() # full_unique_card_id

In [None]:
# Read both transaction tables.
old_trans <- read_csv("../input/elo-merchant-category-recommendation/historical_transactions.csv")
new_trans <- read_csv("../input/elo-merchant-category-recommendation/new_merchant_transactions.csv")

# LABEL NEW & OLD TRANSACTIONS (0 = historical file, 1 = new-merchant file)
old_trans$dataset <- 0
new_trans$dataset <- 1

# PICK OUT RELEVANT COLUMNS
tran_columns <- c(
  "authorized_flag",
  "dataset",
  "card_id",
  #"city_id",
  "category_1",
  #"category_2",
  #"category_3",
  "installments",
  "merchant_category_id",
  "merchant_id",
  "month_lag",
  "purchase_amount"
  #"purchase_date"
  #"state_id",
  #"subsector_id"
)

# SMASH OLD & NEW TOGETHER
# Columns are dropped *before* binding so the stacked copy never holds the
# unused columns; the result is the same table as bind-then-select.
tran_full <- rbind(old_trans[, tran_columns], new_trans[, tran_columns])
# print(class(tran_full))
# tran_full <- data.table(tran_full)
# print(class(tran_full))

# I SHOULD CONVERT PURCHASE DATE TO SOMETHING MORE EFFICIENT HERE
# - YES

# SAVE SOME SPACE
rm(old_trans, new_trans, tran_columns); gc()

# str() prints the structure itself and returns NULL, so it is not wrapped in
# print() (that only adds a stray "NULL" line).
str(tran_full)
print(ls())

In [None]:
# Read the merchant attribute table.
merchants <- read_csv("../input/elo-merchant-category-recommendation/merchants.csv")

# GRAB ALL UNIQUE MERCHANT IDS FROM THE FILTERED TRANSACTION DATA
# `[[` yields a plain vector; unique() removes the repeats.
full_unique_merchant_id <- unique(tran_full[["merchant_id"]])

# I MIGHT IMPUTE THE COLUMNS BELOW RATHER THAN USE 0
#print(mean(merchants$avg_sales_lag3, na.rm = TRUE))
#print(mean(merchants$avg_sales_lag6, na.rm = TRUE))
#print(mean(merchants$avg_sales_lag12, na.rm = TRUE))

# FIX NA VALUES IN MERCHANT DATA
# NOTE: these four columns are not in the selection below, so the fills only
# matter if the columns are re-enabled there.
merchants$avg_sales_lag3[is.na(merchants$avg_sales_lag3)] <- 3 # SKETCHY?
merchants$avg_sales_lag6[is.na(merchants$avg_sales_lag6)] <- 6
merchants$avg_sales_lag12[is.na(merchants$avg_sales_lag12)] <- 12
merchants$category_2[is.na(merchants$category_2)] <- 0

# PICK OUT RELEVANT COLUMNS
merchants <- merchants[, c(
                "merchant_id",
                "numerical_1",
                "numerical_2",
                "category_1",
                # "category_2",
                "category_4",
                # "most_recent_sales_range",
                # "most_recent_purchases_range",
                #"avg_sales_lag3",
                "avg_purchases_lag3",
                "active_months_lag3",
                #"avg_sales_lag6",
                "avg_purchases_lag6",
                "active_months_lag6",
                #"avg_sales_lag12",
                "avg_purchases_lag12",
                "active_months_lag12"
                )]

# KEEP ONE ROW PER MERCHANT
# A merchant_id that appears more than once would duplicate every matching
# transaction in the join below and inflate the per-card counts and sums.
# The first occurrence is kept; this is a no-op when ids are already unique.
merchants <- merchants[!duplicated(merchants$merchant_id), ]

# GIVE THE COLUMNS NEW NAMES (TO AVOID DUPLICATES)
# Every column except the join key gets a "merch_" prefix. The key is found
# by name rather than by position, so reordering the selection above is safe.
colnames(merchants) <- ifelse(
  colnames(merchants) == "merchant_id",
  "merchant_id",
  paste("merch", colnames(merchants), sep = "_")
)

# SMASH MERCHANT DATA INTO THE TRANSACTION DATA
# Inner join: transactions whose merchant_id has no match in `merchants` are
# dropped here.
tran_full <- plyr::join(tran_full, merchants, by = "merchant_id", type = "inner")

# SAVE SOME SPACE
rm(merchants, full_unique_merchant_id); gc()

print(names(tran_full))
# str() prints on its own and returns NULL; wrapping it in print() only adds
# a stray "NULL" line.
str(tran_full)
print(ls())

**FEATURE ENGINEERING**

In [None]:
#' Map a "Y"/"N" flag column to numeric 1/0.
#'
#' @param x Character or factor vector of "Y"/"N" flags.
#' @return Numeric vector the same length as `x`: 1 for "Y", 0 for "N",
#'   `NA` for anything else (including `NA`).
yn_to_binary <- function(x) {
  unname(c(N = 0, Y = 1)[as.character(x)])
}

# CHANGE BINARY FLAGS TO 0 & 1
# The flags are mapped straight from their "Y"/"N" labels. Going through
# as.factor() + as.numeric() would return the factor's level *indices*
# (1 and 2) rather than 0 and 1, shifting every mean/sum built on them.
tran_full$category_1 <- yn_to_binary(tran_full$category_1)
tran_full$authorized_flag <- yn_to_binary(tran_full$authorized_flag)
tran_full$merch_category_1 <- yn_to_binary(tran_full$merch_category_1)
tran_full$merch_category_4 <- yn_to_binary(tran_full$merch_category_4)

# Multi-level categoricals that are currently NOT used (left for later):
# tran_full$category_2 <- as.factor(tran_full$category_2)
# tran_full$category_3 <- as.factor(tran_full$category_3)
# tran_full$merchant_id <- as.factor(tran_full$merchant_id)
# tran_full$merch_category_2 <- as.factor(tran_full$merch_category_2)
# tran_full$merch_most_recent_sales_range <- as.factor(tran_full$merch_most_recent_sales_range)
# tran_full$merch_most_recent_purchases_range <- as.factor(tran_full$merch_most_recent_purchases_range)

# NUMERIZE THE INTEGERS
tran_full$merch_active_months_lag3 <- as.numeric(tran_full$merch_active_months_lag3)
tran_full$merch_active_months_lag6 <- as.numeric(tran_full$merch_active_months_lag6)
tran_full$merch_active_months_lag12 <- as.numeric(tran_full$merch_active_months_lag12)

# REPLACE INFINITE VALUES WITH LAG (NOT SURE ABOUT THIS)
tran_full$merch_avg_purchases_lag3[is.infinite(tran_full$merch_avg_purchases_lag3)] <- 3
tran_full$merch_avg_purchases_lag6[is.infinite(tran_full$merch_avg_purchases_lag6)] <- 6
tran_full$merch_avg_purchases_lag12[is.infinite(tran_full$merch_avg_purchases_lag12)] <- 12

tran_full$merch_avg_purchases_lag3 <- as.numeric(tran_full$merch_avg_purchases_lag3)

# str() prints on its own and returns NULL, so it is not wrapped in print().
str(tran_full)
print("done")
print(ls())
# print(summary(tran_full))

In [None]:
# FACTORS WITH > 2 LEVELS
    # merch_category_2
    # merch_most_recent_sales_range
    # merch_most_recent_purchases_range
    # merchant_category_id

## TODO :
 # > Transaction / Merchant Frequency
 # > Transaction / Time Frequency
 # > number of merchants with > 2 transactions
 # > Modal factor in multi-level categories
 # > $dataset Mean

# GROUP TRANSACTIONS BY CARD_ID AND SUMMARISE
# Collapses the transaction table to one row per card_id. dplyr:: is spelled
# out because plyr is attached after dplyr and masks summarise().
# Each output column's name prefix (n_/mean_/min_/max_/sd_/sum_) states the
# aggregate it holds.
trans_summary_df <- tran_full %>%
  dplyr::group_by(card_id) %>%
  dplyr::summarise(
    n_transactions = dplyr::n(),
    n_unique_merchant = dplyr::n_distinct(merchant_id),
    n_unique_merchant_cat = dplyr::n_distinct(merchant_category_id),
    mean_dataset = mean(dataset),
    sum_dataset = sum(dataset),
    sd_dataset = sd(dataset),
    mean_month_lag = mean(month_lag),
    min_month_lag = min(month_lag),
    max_month_lag = max(month_lag),
    sd_month_lag = sd(month_lag),
    mean_purchase_amount = mean(purchase_amount),
    min_purchase_amount = min(purchase_amount),
    max_purchase_amount = max(purchase_amount),
    sd_purchase_amount = sd(purchase_amount),
    # was sd(): made this column an exact duplicate of sd_purchase_amount
    sum_purchase_amount = sum(purchase_amount),
    mean_category_1 = mean(category_1),
    sum_category_1 = sum(category_1),
    # was sum(): made this column an exact duplicate of sum_category_1
    sd_category_1 = sd(category_1),
    mean_authorized = mean(authorized_flag),
    sum_authorized = sum(authorized_flag),
    mean_installments = mean(installments),
    sum_installments = sum(installments),
    mean_merch_numerical_1 = mean(merch_numerical_1),
    mean_merch_numerical_2 = mean(merch_numerical_2),
    mean_merch_category_1 = mean(merch_category_1),
    mean_merch_category_4 = mean(merch_category_4),
    mean_merch_avg_purchases_lag3 = mean(merch_avg_purchases_lag3),
    mean_merch_avg_purchases_lag6 = mean(merch_avg_purchases_lag6),
    mean_merch_avg_purchases_lag12 = mean(merch_avg_purchases_lag12),
    mean_merch_active_months_lag3 = mean(merch_active_months_lag3),
    mean_merch_active_months_lag6 = mean(merch_active_months_lag6),
    mean_merch_active_months_lag12 = mean(merch_active_months_lag12)
  )

# Aggregates still to port (pandas-style spec kept for reference):
#         'category_2_1.0': ['mean'],
#         'category_2_2.0': ['mean'],
#         'category_2_3.0': ['mean'],
#         'category_2_4.0': ['mean'],
#         'category_2_5.0': ['mean'],
#         'category_3_A': ['mean'],
#         'category_3_B': ['mean'],
#         'category_3_C': ['mean'],
#         'purchase_amount': [ 'max', 'min'],
#         'installments': ['max', 'min', 'std'],
#         'purchase_date': [np.ptp],

# SAVE SOME SPACE
rm(tran_full, full_unique_card_id); gc()

# str() prints on its own and returns NULL, so it is not wrapped in print().
str(trans_summary_df)

In [None]:
# Second pass over the card-level files: they were dropped earlier to free
# memory while the transactions were being processed.
raw_train <- read_csv("../input/elo-merchant-category-recommendation/train.csv")
raw_test <- read_csv("../input/elo-merchant-category-recommendation/test.csv")

# Mark every test row with the placeholder target "x".
raw_test[["target"]] <- "x"

# Stack train on top of test into one table.
data_full <- rbind(raw_train, raw_test)

# The separate tables are no longer needed.
rm(list = c("raw_train", "raw_test"))
gc()

# Attach the per-card transaction summary. Inner join: a card without a row
# in trans_summary_df does not survive this step.
data_full <- plyr::join(
  data_full,
  trans_summary_df,
  by = "card_id",
  type = "inner"
)

In [None]:
# ISOLATE THE ROWS THAT ARE TEST ROWS
# Test rows carry the placeholder target "x". An exact comparison is used
# instead of a substring match, and which() drops any NA comparison results.
data_test_i <- which(data_full$target == "x")

# SETUP TEST DATA DF (all columns kept, including card_id)
data_test <- data_full[data_test_i, ]

# CHOOSE RELEVANT COLUMNS
train_columns <- c("target",
                   "feature_1",
                   "feature_2",
                   "feature_3",
                   "mean_dataset",
                   "n_transactions",
                   "mean_month_lag",
                   "min_month_lag",
                   "max_month_lag",
                   "mean_purchase_amount",
                   "min_purchase_amount",
                   "max_purchase_amount",
                   "sd_purchase_amount",
                   "mean_category_1",
                   "mean_authorized",
                   "mean_installments",
                   "mean_merch_numerical_1",
                   "mean_merch_numerical_2",
                   "mean_merch_category_1",
                   "mean_merch_category_4",
                   "mean_merch_avg_purchases_lag3",
                   "mean_merch_avg_purchases_lag6",
                   "mean_merch_avg_purchases_lag12",
                   "mean_merch_active_months_lag3",
                   "mean_merch_active_months_lag6",
                   "mean_merch_active_months_lag12",
                   "sum_purchase_amount",
                   "sum_installments",
                   "sd_month_lag",
                   "sum_dataset",
                   "sd_dataset",
                   "sum_authorized",
                   "sum_category_1",
                   "sd_category_1",
                   "n_unique_merchant_cat"
                  )

# SETUP TRAIN DATA
# The training rows are selected as the explicit complement of the test rows.
# Negative indexing (`-data_test_i`) would return ZERO rows if data_test_i
# were ever empty.
data_train_i <- setdiff(seq_len(nrow(data_full)), data_test_i)
data_train <- data_full[data_train_i, train_columns]
rm(data_train_i)

# MAKE SURE WE'RE PREDICTING A NUMERIC VALUE
# (target is converted back to numeric after being stacked with the "x" rows)
data_train$target <- as.numeric(data_train$target)

# psych::describe(data_train)
# psych::describe(data_test)
print("done")

In [None]:
# par(mfrow=c(2,2))

# plot(density(data_full$feature_1), main = "feature_1 density")
# plot(density(data_full$feature_2), main = "feature_2 density")
# plot(density(data_full$feature_3), main = "feature_3 density")

# plot(density(data_full$n_transactions), main = "trans_count density")
# plot(density(data_full$mean_purchase_amount), main = "mean_purchase_amount density")
# plot(density(data_full$mean_month_lag), main = "mean_month_lag density")

# plot(density(data_full$mean_category_1), main = "mean_catgory_1 density")
# plot(density(data_full$mean_authorized), main = "mean_authorized density")

In [None]:
# Keep only the numeric columns of the training table.
allNumeric <- data_train %>% dplyr::select_if(is.numeric)

# Rank-based (Spearman) correlation between every pair of numeric columns.
corr <- cor(x = allNumeric, method = "spearman")
mat <- allNumeric

# # https://medium.swirrl.com/@northernjamie Thanks Jamie
#' Pairwise correlation tests between all columns of a table.
#'
#' Runs `cor.test()` on every pair of columns and collects the results into
#' symmetric matrices.
#'
#' @param mat Matrix or data frame of observations (rows) by variables
#'   (columns); coerced with `as.matrix()`.
#' @param conf.level Confidence level passed to `cor.test()`.
#' @param method Correlation method passed to `cor.test()`; defaults to
#'   "pearson", which is also `cor.test()`'s own default.
#' @param ... Further arguments passed to `cor.test()` (e.g. `exact`).
#' @return Unnamed list of three n-by-n matrices, in this order: p-values
#'   (diagonal 0), lower confidence bounds and upper confidence bounds
#'   (diagonals 1). Confidence bounds stay `NA` for pairs where `cor.test()`
#'   reports no interval.
cor.mtest <- function(mat, conf.level = 0.95, method = "pearson", ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
    diag(p.mat) <- 0
    diag(lowCI.mat) <- diag(uppCI.mat) <- 1
    # seq_len() makes the loop a no-op for a single column; `1:(n-1)` would
    # count down to 0 and index a column that does not exist.
    for (i in seq_len(max(n - 1, 0))) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j],
                            conf.level = conf.level, method = method, ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
            # Not every test result carries a confidence interval; leave the
            # bounds NA in that case instead of failing on a NULL assignment.
            ci <- tmp$conf.int
            if (!is.null(ci)) {
                lowCI.mat[i, j] <- lowCI.mat[j, i] <- ci[1]
                uppCI.mat[i, j] <- uppCI.mat[j, i] <- ci[2]
            }
        }
    }
    return(list(p.mat, lowCI.mat, uppCI.mat))
}
# Significance tests must run on the observations themselves. Passing the
# correlation matrix `corr` would test "correlations between columns of
# correlations", which is not what the plots below claim to show.
# NOTE(review): cor.test() defaults to Pearson while `corr` is Spearman, so
# the p-values and the plotted coefficients use different methods — confirm
# whether that is intended.
res1 <- cor.mtest(allNumeric, 0.95)

# par(mfrow=c(2,2))

# Significance thresholds to sweep: one correlation plot is drawn per level.
sig <- c(0.0001, 0.001, 0.0023, 0.00529, 0.01217, 0.02798, 0.06436, 0.14804, 0.34048, 0.49)

for (i in seq_along(sig)) {
  # res1[[1]] is the p-value matrix; pairs with p above sig[i] are marked
  # with the "insignificant" glyph.
  corrplot(corr, method = "square", order = "hclust", tl.col = "black",
                     tl.cex = 0.75, p.mat = res1[[1]], sig.level = sig[i],
                     insig = "pch", pch.cex = 1, main = paste("sig level ", sig[i]),
                     bg = "white", addrect = 3)
}

In [None]:
# CLUSTERING?

# wss <- c()
# totss <- c()
# betweenss <- c()
# tot.withinss <- c()

# for (i in 1:10) {
#   km.out <- kmeans(data_train, centers = i, nstart = 20, iter.max = 50)

#   wss[i] <- km.out$withinss
#   totss[i] <- km.out$totss
#   betweenss[i] <- km.out$betweenss
#   tot.withinss[i] <- km.out$tot.withinss
# }

# par(mfrow = c(2,2))

# plot(1:10, wss, type = "b",
#      xlab = "Number of Clusters",
#      ylab = "Within groups sum of squares")

# plot(1:10, totss, type = "b",
#      xlab = "Number of Clusters",
#      ylab = "Total sum of squares")

# plot(1:10, betweenss, type = "b",
#      xlab = "Number of Clusters",
#      ylab = "Between sum of squares")

# plot(1:10, tot.withinss, type = "b",
#      xlab = "Number of Clusters",
#      ylab = "Total within sum of squares")

In [None]:
# filterCtrl <- caret::sbfControl(functions = rfSBF)
# r <- sbf(target ~ ., data = allNumeric, sbfControl = filterCtrl)
# print(r)

**MODEL CONFIGURATION**

In [None]:
set.seed(42) # DONT PANIC

# print(mean((test_set$y - predict.lm(model, test_set)) ^ 2))
# seq(0.001,0.1,by = 0.001)

# Fixed resampling folds shared by every model so their results are
# comparable. trainControl(index = ...) expects the rows used for TRAINING in
# each iteration, hence returnTrain = TRUE (the default returns the held-out
# rows instead, which silently inverts train/validation once k > 2).
myFolds <- createFolds(data_train$target, k = 2, returnTrain = TRUE)

# "BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", "knnImpute",
# "bagImpute", "medianImpute", "pca", "ica", "spatialSign", "corr", "zv", "nzv", "conditionalX"

myPreProcess <- c("zv", "nzv", "center")
# myPreProcess <- c("conditionalX")

# Because `index` is supplied, the resampling scheme is exactly the folds
# above; method/number only describe it, so they are set to match (plain
# k-fold CV with one iteration per fold) rather than "repeatedcv" 10 x 5.
myControl <- trainControl(
  method = "cv",
  number = length(myFolds),
  search = "grid",
  summaryFunction = defaultSummary,
  #classProbs = TRUE,
  verboseIter = TRUE,
  index = myFolds
)

#formula <- as.formula("target ~ n_transactions + feature_1 + feature_2 + feature_3 + mean_dataset + sum_dataset + sd_dataset + mean_month_lag + min_month_lag + max_month_lag + sd_month_lag + mean_purchase_amount + min_purchase_amount + max_purchase_amount + sum_purchase_amount + sd_purchase_amount + mean_category_1 + mean_authorized + sum_authorized + mean_installments + sum_installments + mean_merch_numerical_1 + mean_merch_numerical_2 + mean_merch_category_1 + mean_merch_category_4 + mean_merch_avg_purchases_lag6 + mean_merch_avg_purchases_lag12 + mean_merch_active_months_lag3 + mean_merch_active_months_lag6 + mean_merch_active_months_lag12 + sum_category_1 + sd_category_1")
# Model formula: `target` regressed on the predictors listed below, in this
# order. Built from a character vector so that a predictor can be toggled by
# editing a single line.
formula <- reformulate(
  termlabels = c(
    "n_transactions",
    "feature_1",
    "feature_2",
    "feature_3",
    "mean_dataset",
    "sum_dataset",
    "sd_dataset",
    "mean_month_lag",
    "min_month_lag",
    "max_month_lag",
    "sd_month_lag",
    "mean_purchase_amount",
    "min_purchase_amount",
    "max_purchase_amount",
    "sum_purchase_amount",
    "sd_purchase_amount",
    "mean_category_1",
    "mean_authorized",
    "sum_authorized",
    "mean_installments",
    "sum_installments",
    "mean_merch_numerical_1",
    "mean_merch_numerical_2",
    "mean_merch_category_1",
    "mean_merch_category_4",
    "mean_merch_avg_purchases_lag6",
    "mean_merch_avg_purchases_lag12",
    "mean_merch_active_months_lag3",
    "mean_merch_active_months_lag6",
    "mean_merch_active_months_lag12",
    "n_unique_merchant_cat"
  ),
  response = "target"
)
# + 
# mean_cat_2_1 + sum_cat_2_1 + mean_cat_2_2 + sum_cat_2_2 + mean_cat_2_3 + sum_cat_2_3 + 
# mean_cat_2_4 + sum_cat_2_4 + mean_cat_2_5 + sum_cat_2_5 + mean_m_cat_2_1 + sum_m_cat_2_1 + 
# mean_m_cat_2_2 + sum_m_cat_2_2 + mean_m_cat_2_3 + sum_m_cat_2_3 + mean_m_cat_2_4 + 
# sum_m_cat_2_4 + mean_m_cat_2_5 + sum_m_cat_2_5 + mean_m_cat_2_6 + sum_m_cat_2_6 + mean_m_mrsr_a + 
# sum_m_mrsr_a + mean_m_mrsr_b + sum_m_mrsr_b + mean_m_mrsr_c + sum_m_mrsr_c + mean_m_mrsr_d + 
# sum_m_mrsr_d + mean_m_mrsr_e + sum_m_mrsr_e + mean_m_mrpr_a + sum_m_mrpr_a + mean_m_mrpr_b + 
# sum_m_mrpr_b + mean_m_mrpr_c + sum_m_mrpr_c + mean_m_mrpr_d + sum_m_mrpr_d + mean_m_mrpr_e + 
# sum_m_mrpr_e")

**M_1 - Linear Model**

In [None]:
# Baseline: ordinary least squares, resampled with the shared control object
# and scored on RMSE.
m_1 <- caret::train(
  formula,
  data = data_train,
  method = "lm",
  preProcess = myPreProcess,
  trControl = myControl,
  metric = "RMSE"
)

# Coefficient table and fit statistics of the final linear model.
print(summary(m_1))
# 3.834

**M_2 - GLM**

In [None]:
# glmGrid = expand.grid(alpha = 1,
#                       lambda = 0.021)

# m_2 <- caret::train(formula,
#                       data = data_train,
#                       method = "glmnet",
#                       metric = "RMSE",
#                       preProcess = myPreProcess,
#                       trControl = myControl,
#                       tuneGrid = glmGrid
# )

# plot(caret::varImp(m_2))
# print(m_2)
# BEST RMSE: 3.839075 

**M_3 - xgbLinear**

In [None]:
# Single-point tuning grid for the linear xgboost booster.
xgbLinearGrid <- expand.grid(nrounds = 10,
                             lambda = 1,
                             alpha = 1,
                             eta = 0.025)

# `savePredictions` is a trainControl() option. Passed directly to train() it
# is forwarded through `...` to the underlying model fit instead of
# controlling what caret stores, so it is set on a per-model copy of the
# shared control object here.
m_3 <- caret::train(formula,
                    data = data_train,
                    method = "xgbLinear",
                    metric = "RMSE",
                    preProcess = myPreProcess,
                    trControl = modifyList(myControl,
                                           list(savePredictions = "final")),
                  # tuneLength = 10
                    tuneGrid = xgbLinearGrid
)

plot(caret::varImp(m_3))
print(m_3)
# BEST RMSE 3.809129

**M_4 - xgbTree**

In [None]:
# xgbTreeGrid <- expand.grid(nrounds = 100,
#                         max_depth = 3,
#                         eta = 0.025,
#                         gamma = 5,
#                         colsample_bytree = 0.09,
#                         min_child_weight = 100,
#                         subsample = 0.8)

# m_4 <- caret::train(formula,
#                     data = data_train,
#                     method = "xgbTree",
#                     metric = "RMSE",
#                     preProcess = myPreProcess,
#                     trControl = myControl,
#                     tuneGrid = xgbTreeGrid
# )

# plot(caret::varImp(m_4))
# print(m_4)
# BEST RMSE: 3.835275

**M_5 - gbm_h2o**

In [None]:
# The h2o backend has to be initialised before caret can train through it.
h2o.init()

# Single-point tuning grid for the h2o gradient boosting machine.
h2oGrid <- expand.grid(
  max_depth = 3,
  ntrees = 150,
  min_rows = 10,
  learn_rate = 0.1,
  col_sample_rate = 1
)

# Same formula, preprocessing and resampling as the other models so the
# results can be compared directly.
m_5 <- caret::train(
  formula,
  data = data_train,
  method = "gbm_h2o",
  preProcess = myPreProcess,
  trControl = myControl,
  tuneGrid = h2oGrid,
  # tuneLength = 10
  metric = "RMSE"
)

# plot(caret::varImp(m_5))
print(m_5)
# BEST RMSE: 3.80203

**M_6 - Boosted Linear Model**

In [None]:
# Grid <- expand.grid()

# m_6 <- caret::train(formula,
#                     data = data_train,
#                     method = "BstLm",
#                     metric = "RMSE",
#                     preProcess = myPreProcess,
#                     trControl = myControl,
#                     tuneLength = 5
#                     #tuneGrid = xgbDARTGrid
# )

# plot(caret::varImp(m_6))
# print(m_6)
# BEST RMSE: 3.838467

In [None]:
# alg_list <- c("rf", "glm", "gbm", "glmboost", "nnet", "treebag", "svmLinear")
# alg_list <- c("glmboost", "svmLinear")

# multi_mod <- caretList(formula, 
#                        data = data_train, 
#                        trControl = myControl, 
#                        methodList = alg_list, 
#                        metric = "RMSE")

# res <- resamples(multi_mod)
# summary(res)

In [None]:
# Stacking Algorithms
# algorithmList <- c('xgbLinear','lm')

# stack_models <- caretList(formula, data = data_train, trControl = myControl, methodList = algorithmList)

# stacking_results <- resamples(stack_models)

# summary(stacking_results)

# dotplot(stacking_results)
# Check correlation between models to ensure the results are uncorrelated and can be ensembled
# modelCor(stacking_results)
# splom(stacking_results)
# stack using Logistic Regression
# stackControl <- trainControl(sampling="rose",method="repeatedcv", number=5, repeats=2, savePredictions=TRUE, classProbs=TRUE)

# stack.glm <- caretStack(stack_models, method="glm", metric=metric, trControl=stackControl)
# print(stack.glm)
# evaluate results on test set

# stack.gbm <- caretStack(stack_models, method="gbm", metric=metric, trControl=stackControl)
# print(stack.gbm)

**COMPARING MODELS**

In [None]:
# Collect the resampling results of every model that was actually trained.
# Disabled models, re-add once their cells are enabled:
#   glmNet = m_2, xgbTree = m_4, BstLm = m_6
resamps <- caret::resamples(
  x = list(
    linear = m_1,
    xgbLinear = m_3,
    h2o = m_5
  ),
  metric = "RMSE"
)

# Numeric summary, then a box-and-whisker plot of the resampled metrics.
print(summary(resamps))
bwplot(resamps)

**SUBMISSION**

In [None]:
print("=== SUBMISSION") ## SUBMISSION

sample_sub <- read.csv("../input/elo-merchant-category-recommendation/sample_submission.csv")

# sample_target <- sample_sub$target
# predictions_1 <- predict(m_1, data_test)
# predictions_2 <- predict(m_2, data_test)
predictions_3 <- predict(m_3, data_test)
# predictions_4 <- predict(m_4, data_test)
predictions_5 <- predict(m_5, data_test)

# predictions <- (predictions_1 + predictions_2 + predictions_3 + predictions_4) / 4
# predictions <- (predictions_3 + predictions_5 + sample_target) / 3
# Simple average of the two model outputs.
predictions <- (predictions_3 + predictions_5) / 2
# predictions <- predictions_5

# One prediction per data_test row is required for the card_id lookup below.
stopifnot(length(predictions) == nrow(data_test))

# Align predictions to the submission template BY card_id, not by position.
# data_test comes out of inner joins, so it may hold fewer rows than the
# template; positional assignment would then fail or mislabel rows.
# NOTE(review): assumes the template has a `card_id` column — checked here.
stopifnot("card_id" %in% names(sample_sub))
pred_row <- match(as.character(sample_sub$card_id),
                  as.character(data_test$card_id))
sub <- sample_sub
sub$target <- as.numeric(predictions)[pred_row]

# Cards without a prediction fall back to the mean prediction, loudly.
n_missing <- sum(is.na(sub$target))
if (n_missing > 0) {
  warning(n_missing, " card_id(s) in the submission template have no ",
          "prediction; filling with the mean prediction", call. = FALSE)
  sub$target[is.na(sub$target)] <- mean(predictions, na.rm = TRUE)
}

write_csv(sub, "submission_1.csv", append = FALSE)

print("=== DONE")