In [11]:
# Read the training and test tables from ../data (relative to the notebook).
data.files <- file.path("..", "data", c("training_data.csv", "test_data.csv"))
train.data <- read.csv(data.files[1])
test.data <- read.csv(data.files[2])

In [12]:
set.seed(100)
# Encode Intensity as 0/1. Going through factor() makes this work whether
# read.csv() returned a factor (R < 4.0) or a character column (R >= 4.0,
# where as.numeric() on the raw strings would yield NA).
numeric.intensity <- as.numeric(factor(train.data$Intensity)) - 1

# Predictors: every column except columns 2 and 3
# (presumably the response columns -- confirm against the CSV header).
x <- train.data[, -c(2, 3)]
x$Intensity <- numeric.intensity

# Drop constant predictors. vapply() works column by column and so avoids the
# data-frame-to-matrix coercion that apply() performs.
idx.zero.var <- vapply(x, stats::var, numeric(1)) == 0
x <- x[, !idx.zero.var]

# Response used by every model below.
y <- train.data$VALENCE.PLEASANTNESS

## Boosting 1

In [None]:
#Function for Cross-Validation and Predictions
#
# Fits a gradient-boosted regression on the analysis part of one resampling
# fold and returns the mean squared error on its assessment part.
#
# Arguments:
#   fold      an rsample split object (e.g. one element of vfold_cv()$splits)
#   formula   model formula; the left-hand side names the response
#   eta       learning rate passed to xgboost
#   max.depth maximum depth of each tree
#   nrounds   number of boosting iterations
#
# Returns: a single numeric, the assessment-set MSE.
fit_and_evaluate <- function(fold, formula = VALENCE.PLEASANTNESS ~ ., eta = 0.1, max.depth = 7, nrounds = 100  ) {
    # Build the numeric design matrix and response implied by `formula`.
    # Note: model.frame() drops rows containing NA (default na.action).
    design_and_response <- function(df) {
        frame <- stats::model.frame(formula, data = df)
        design <- stats::model.matrix(attr(frame, "terms"), data = frame)
        # xgboost needs no intercept column.
        design <- design[, colnames(design) != "(Intercept)", drop = FALSE]
        list(x = design, y = stats::model.response(frame))
    }

    train.part <- design_and_response(rsample::analysis(fold))
    valid.part <- design_and_response(rsample::assessment(fold))

    dtrain <- xgboost::xgb.DMatrix(data = train.part$x, label = train.part$y)
    boosting <- xgboost::xgb.train(params = list(objective = "reg:squarederror",
                                                 eta = eta,
                                                 max_depth = max.depth),
                                   data = dtrain,
                                   nrounds = nrounds,
                                   verbose = 0)

    mean((valid.part$y - predict(boosting, valid.part$x))^2)
}

In [57]:
set.seed(100)
# Train/validation row indices: a random 2/3 of the rows go to training.
len <- nrow(x)
idx.train <- sample.int(len, 2 * len / 3)

# xgboost does not accept data frames, so the data is converted to (sparse)
# model matrices first.
library(xgboost)
library(Matrix)

# Predictors and response in one frame so the formula interface can be used.
# (The original code indexed an object called `data` that is never created.)
boost.data <- x
boost.data$VALENCE.PLEASANTNESS <- y

train.x <- sparse.model.matrix(VALENCE.PLEASANTNESS ~ . -1, data = boost.data[idx.train, ])
validation.x <- sparse.model.matrix(VALENCE.PLEASANTNESS ~ . -1, data = boost.data[-idx.train, ])
train.y <- boost.data$VALENCE.PLEASANTNESS[idx.train]
validation.y <- boost.data$VALENCE.PLEASANTNESS[-idx.train]

In [59]:
# Boosting cross-validation: data split and candidate hyper-parameters
library(tidymodels)
set.seed(100)

# Predictors together with the response in a single data frame.
full.data <- x
full.data[["VALENCE.PLEASANTNESS"]] <- y

# Random 3/4 of the rows for training, the remaining 1/4 for validation.
len <- nrow(x)
idx.train <- sample.int(len, 3 * len / 4)

# xgboost wants plain matrices rather than data frames.
train.x <- as.matrix(x[idx.train, ])
val.x <- as.matrix(x[-idx.train, ])
train.y <- y[idx.train]
val.y <- y[-idx.train]

# Candidate values for the tuning runs.
eta <- seq(0.1, 0.3, by = 0.05)
seeds <- seq.int(from = 100, to = 120)
max.depth <- seq(5, 9, by = 1)
nrounds <- seq(50, 350, by = 50)

In [67]:
# Fit one boosted model per candidate learning rate and record its hold-out
# MSE, so the candidates can actually be compared (previously each model was
# overwritten without being evaluated, and the loop header did not parse).
# Relies on train.x/train.y/val.x/val.y from the split cell above.
eta.mse <- setNames(numeric(length(eta)), eta)
for (k in seq_along(eta)) {
    boost.heart <- xgboost(train.x, label = train.y,
                           objective = "reg:squarederror",
                           eta = eta[k],
                           max_depth = 5,
                           nrounds = 100)
    eta.mse[k] <- mean((predict(boost.heart, val.x) - val.y)^2)
}
eta.mse

In [53]:
# Slow learner: 500 rounds of depth-2 trees with a learning rate of 0.01.
boost.heart <- xgboost(
    data = train.x,
    label = train.y,
    nrounds = 500,
    max_depth = 2,
    eta = 0.01,
    objective = "reg:squarederror"
)

In [9]:
# Predictions of the fitted booster on the training and validation matrices.
prediction.train <- predict(boost.heart, newdata = train.x)
prediction.validation <- predict(boost.heart, newdata = validation.x)

# Mean squared error on each set.
MSE.train <- mean((train.y - prediction.train)^2)
MSE.validation <- mean((validation.y - prediction.validation)^2)


In [10]:
# Display the training and validation mean squared errors computed above.
MSE.train
MSE.validation

In [None]:
library(xgboost)
library(Matrix)
# Boosting submission
set.seed(100)

# Preparation of training and test data.
# Drop the first three training columns (Intensity is re-added below as a
# numeric code; the other two are presumably response columns -- confirm).
train <- train.data[, -c(1, 2, 3)]

# Constant predictors carry no information; vapply() avoids the matrix
# coercion that apply() performs on a data frame.
idx.zero.var <- vapply(train, stats::var, numeric(1)) == 0
train <- train[, !idx.zero.var]

# Same column filter on the test set (its first two columns are dropped so
# the remaining columns line up with `train` -- confirm against the CSV).
test <- test.data[, -c(1, 2)]
test <- test[, !idx.zero.var]

# Encode Intensity as 0/1. factor() makes this independent of whether
# read.csv() produced a factor (R < 4.0) or character column (R >= 4.0).
# Note: the test set only contains one Intensity level, so it must be encoded
# with the *training* levels; that happens in the test-preparation cell.
train$Intensity <- as.numeric(factor(train.data$Intensity)) - 1

# Quick sanity check instead of printing the whole data frame.
dim(train)

train$VALENCE.PLEASANTNESS <- train.data$VALENCE.PLEASANTNESS

# Design matrix without intercept; the response is excluded by the formula.
train.x <- sparse.model.matrix(VALENCE.PLEASANTNESS ~ . -1, data = train)
train.y <- train$VALENCE.PLEASANTNESS

In [82]:
# Test predictors, filtered with the zero-variance mask computed on the
# training predictors.
test <- test.data[, -c(1, 2)]
test <- test[, !idx.zero.var]

# Encode Intensity with the TRAINING levels. Encoding the test column on its
# own would map whatever single level it contains to 0, which only matches the
# training encoding by coincidence.
intensity.levels <- levels(factor(train.data$Intensity))
test$Intensity <- as.numeric(factor(test.data$Intensity, levels = intensity.levels)) - 1

In [None]:
# Final model for the submission, trained on all training rows:
# squared-error objective, learning rate 0.01, depth-2 trees, 500 rounds.
boost.heart <- xgboost(
    data = train.x,
    label = train.y,
    objective = "reg:squarederror",
    max_depth = 2,
    eta = 0.01,
    nrounds = 500
)

In [79]:
# Predict the test set. as.matrix(test) must have the same columns, in the
# same order, as the matrix the model was trained on.
prediction.boost <- predict(boost.heart, as.matrix(test))

# One row per prediction; Ids are derived from the prediction vector instead
# of a hard-coded 1:68 so the cell keeps working if the test set changes size.
submission <- data.frame(Id = seq_along(prediction.boost),
                         VALENCE.PLEASANTNESS = prediction.boost)
write.csv(submission, file = "../Submissions/boosting2.csv", row.names = FALSE)


## Boosting 2 - Regularized gradient boosting 

One difference between boosting and random forests: in boosting, the growth of each tree takes into account the trees that have already been grown, so smaller trees (fewer splits, lower depth) are typically sufficient.

In [13]:
library(xgboost)

In [14]:
set.seed(100)
# Random split: 2/3 of the rows for training, the rest for validation.
len <- nrow(x)
idx.train <- sample.int(len, 2 * len / 3)

# Training part
train.x <- x[idx.train, ]
train.y <- y[idx.train]

# Validation part (all rows not drawn for training)
val.x <- x[-idx.train, ]
val.y <- y[-idx.train]


In [27]:
# Wrap both splits in xgboost's own matrix type, with labels attached.
dtrain <- xgb.DMatrix(label = train.y, data = as.matrix(train.x))
dval <- xgb.DMatrix(label = val.y, data = as.matrix(val.x))

In [28]:
# Data sets evaluated after every boosting round (training first, validation last).
watchlist <- list(train = dtrain, val = dval)

Let's tune the algorithm with 3 parameters : 
1) The number of trees 

2) The shrinkage parameter λ (the learning rate, called `eta` in xgboost): typical values are 0.01 or 0.001, and the right choice can depend on the problem. A very small λ can require a very large number of trees B in order to achieve good performance.

3) The number of splits in each tree, which controls the complexity of the boosted ensemble (controlled with max.depth)

In [19]:
# Exhaustive grid search over tree depth, learning rate and number of rounds.
# Each candidate is trained on dtrain with early stopping monitored on the
# last watchlist entry (the validation set); the candidate with the lowest
# validation score wins.
# Note: the same validation set drives both early stopping and model
# selection, so best_score is an optimistic estimate.
max.depths <- c(6, 10, 20)
etas <- c(0.3, 0.1, 0.05)
nrounds <- c(100, 200, 400)

best_params <- NULL
best_score <- Inf
# The number of rounds is not part of bst$params, so it is tracked separately.
best_nrounds <- NA_integer_

for (depth in max.depths) {
    for (num in etas) {
        for (numround in nrounds) {
            bst_grid <- xgb.train(data = dtrain,
                                  max.depth = depth,
                                  eta = num,
                                  nrounds = numround,
                                  watchlist = watchlist,
                                  objective = "reg:squarederror",
                                  early_stopping_rounds = 50,
                                  verbose = 0)

            # Starting from Inf makes the first candidate win automatically,
            # so no special "first iteration" counter is needed.
            if (bst_grid$best_score < best_score) {
                best_params <- bst_grid$params
                best_score <- bst_grid$best_score
                best_nrounds <- bst_grid$best_iteration
            }
        }
    }
}
best_params
best_score
best_nrounds

We can then use these parameters to collect validation errors for boosting over repeated random splits, one per seed (seeds 100 to 150, i.e. 51 runs).

In [None]:
library(tidymodels)

# Validation MSE of a boosted model for one random train/validation split.
#
# Arguments:
#   seed      RNG seed that determines the split
#   max.depth maximum tree depth (default: value previously hard-coded)
#   eta       learning rate (default: value previously hard-coded)
#   nrounds   number of boosting rounds (default: value previously hard-coded)
#
# Reads the global predictor frame `x` and response vector `y`.
# Returns: a single numeric, the MSE on the held-out third of the rows.
boost.validation <- function(seed, max.depth = 6, eta = 0.05, nrounds = 1000){
    set.seed(seed)
    len <- nrow(x)
    idx.train <- sample.int(len, 2 * len / 3)

    val.y <- y[-idx.train]
    dtrain <- xgb.DMatrix(data = as.matrix(x[idx.train, ]), label = y[idx.train])
    dval <- xgb.DMatrix(data = as.matrix(x[-idx.train, ]), label = val.y)

    # No watchlist: the previous code passed the *global* watchlist, which was
    # built from a different split, and without early stopping it has no
    # influence on the fitted model anyway.
    boost <- xgb.train(data = dtrain,
                       objective = "reg:squarederror",
                       max_depth = max.depth,
                       eta = eta,
                       nrounds = nrounds,
                       verbose = 0)

    prediction <- predict(boost, dval)
    mean((prediction - val.y)^2)
}
# One validation MSE per seed; vapply() guarantees a numeric vector.
seeds <- seq.int(100, 150)
MSEs <- vapply(seeds, boost.validation, numeric(1))

# Summaries are reported on the RMSE scale. `mean.MSE` keeps its historical
# name although it holds the mean RMSE.
RMSEs <- sqrt(MSEs)
mean.MSE <- mean(RMSEs)
# Named var.RMSE so that the base function var() is not masked.
var.RMSE <- stats::var(RMSEs)
cat("The mean RMSE is = ", mean.MSE, " and the variance is = ", var.RMSE)

In [32]:
# Display the mean validation RMSE over all seeds (the variable is named
# mean.MSE but is computed from sqrt(MSEs)).
mean.MSE