In [3]:
train.data <- read.csv(file.path("..", "data", "training_data.csv"))
test.data <- read.csv(file.path("..", "data", "test_data.csv"))

In [4]:
set.seed(100)
numeric.intensity <- as.numeric(train.data$Intensity)-1
x <- train.data[, -c(2,3)]
x$Intensity <- numeric.intensity

idx.zero.var <- apply(x, 2, var) == 0
x <- x[,!idx.zero.var]
y <- train.data$VALENCE.PLEASANTNESS

## Boosting

One difference between boosting and random forests: in boosting, because the growth of a particular tree takes into account the other trees that have already been grown, smaller trees are typically sufficient (less splits and depth)

In [5]:
library(xgboost)

In [16]:
set.seed(100)
len <- length(x[,1])
idx.train <- sample(1:len, 2*len/3)

train.x <- x[idx.train,]
train.y <- y[idx.train]
val.x <- x[-idx.train,]
val.y <- y[-idx.train]

#Test data : tout sauf les 
test <- test.data[,-c(1,2)]
test$Intensity <- as.numeric(test.data$Intensity)-1
test <- test[,!idx.zero.var]


In [7]:
dtrain = xgb.DMatrix(data =  as.matrix(train.x), label = train.y )
dval = xgb.DMatrix(data =  as.matrix(val.x), label = val.y)

In [11]:
watchlist = list(train=dtrain, val=dval)

Let's tune the algorithm with 3 parameters : 
1) The number of trees 

2) The shrinkage parameter lambda : Typical values are 0.01 or 0.001, and the right choice can depend on the problem. Very small λ can require using a very large value of B in order to achieve good performance.

3) The number of splits in each tree, which controls the complexity of the boosted ensemble (controlled with max.depth)

In [19]:
max.depths = c(6,10,20)
etas = c(0.3,0.1, 0.05)
nrounds = c(100,200,400)

best_params = 0
best_score = 0

count = 1

for( depth in max.depths ) {
    for(num in etas) {
        for(numround in nrounds) {

        bst_grid = xgb.train(data = dtrain, 
                                max.depth = depth, 
                                eta=num,  
                                nround = numround, 
                                watchlist = watchlist, 
                                objective = "reg:squarederror", 
                                early_stopping_rounds = 50, 
                                verbose=0)

        if(count == 1){
            best_params = bst_grid$params
            best_score = bst_grid$best_score
            count = count + 1
            }
        else if( bst_grid$best_score < best_score){
            best_params = bst_grid$params
            best_score = bst_grid$best_score
        }
    }
 }
}
best_params
best_score

We can then use these parameters and get 50 validation errors for boosting (running for 50 seeds). 

## Validation for Boosting on 50 seeds 

In [12]:
library(tidymodels)

boost.validation <- function(seed){
    set.seed(seed)
    len <- length(x[,1])
    idx.train <- sample(1:len, 2*len/3)
    train.x <- x[idx.train,]
    train.y <- y[idx.train]
    val.x <- x[-idx.train,]
    val.y <- y[-idx.train]
    
    dtrain = xgb.DMatrix(data =  as.matrix(train.x), label = train.y )
    dval = xgb.DMatrix(data =  as.matrix(val.x), label = val.y)
    
    boost <- xgb.train(data = dtrain, 
                    objective = "reg:squarederror", 
                    max_depth = 6,
                    eta=0.05,
                    watchlist = watchlist,
                    nround = 100, 
                    verbose=0)
    
    prediction <- predict(boost, dval)
    mean((prediction - val.y)^2)
    
}
seeds <- seq.int(100,150)
MSEs <- sapply(seeds, boost.validation)
mean.MSE <- mean(sqrt(MSEs))
var <- var(sqrt(MSEs))
cat("The mean RMSE is = ", mean.MSE, " and the variance is = ", var)

The mean RMSE is =  23.66608  and the variance is =  0.8775085

## Submission

In [14]:
boost <- xgb.train(data = dtrain, 
                    objective = "reg:squarederror", 
                    max_depth = 6,
                    eta=0.05,
                    watchlist = watchlist,
                    nround = 100, 
                    verbose=0)

In [18]:
test = test[,c(which(colnames(test)=="Intensity"),which(colnames(test)!="Intensity"))]
colnames(test) <- NULL

In [19]:
predictions = predict(boost, xgb.DMatrix(data = as.matrix(test)))
submission <- data.frame(Id = 1:68, VALENCE.PLEASANTNESS = predictions)
write.csv(submission, file = "../Submissions/boosting.csv", row.names = FALSE)
