In [None]:
library(caret)
library(randomForest)
library(tidymodels)

In [17]:
train <- read.csv(file = '../../Data/training_data.csv')
train <- subset(train,select = -SWEETORSOUR)
train$Intensity <- as.numeric(train$Intensity)
Y <- train$VALENCE.PLEASANTNESS
train <- subset(train,select = -VALENCE.PLEASANTNESS)

Remove correlated data; as we do not want to overfit, we remove variables that contains the "same informations"

In [19]:
train.Z <- subset(train,select = -nearZeroVar(train))
train.Z
correlation <- cor(train.Z)
id <- sort(findCorrelation(correlation, cutoff=0.5))
data.unC <- train [,-id]

In [21]:
head(data.unC)

Intensity,ZM1,DECC,SMTIV,piPC06,VR1_D,SpMax_Dz.Z.,VR3_Dz.Z.,ATSC3i,ATSC5s,...,Psychotic.80,Psychotic.50,Hypertens.80,Hypertens.50,Hypnotic.80,Hypnotic.50,Neoplastic.80,Neoplastic.50,Infective.80,Infective.50
2,80,1.067,2454,4.19,15.566,47.917,1.788,0.933,23.627,...,0,0,1,0,1,0,1,0,1,0
1,40,0.938,630,3.876,8.678,15.012,0.845,0.353,7.007,...,0,0,0,0,0,0,0,0,0,0
1,86,1.222,4839,4.952,18.526,57.873,2.282,0.683,45.724,...,0,0,1,0,1,1,1,0,1,1
1,22,0.444,254,0.0,4.745,8.595,0.405,0.182,6.0,...,0,0,0,0,0,0,0,0,0,0
2,44,0.76,801,3.766,9.654,20.358,0.984,0.49,15.44,...,0,0,0,0,0,0,0,0,0,0
1,60,1.235,2514,3.797,13.574,46.628,1.586,0.643,17.25,...,0,0,0,0,1,0,1,0,1,0


Now our strategy, as CV did not work so well so far and it is very time consuming, is to have ntree very big, mtry and maxnode relatively small.
The reason for this is that we do not want to overfit data -> we would like low variance. A good way to reduce variance is to do RF with very big ntree.
The choice of mtry and max_node should be given by CV, we imagine it will be low, but not too low (too low mtry/max_node will result in high bias)

In [22]:
MSE.estimation <- function (Data,valid.split,p,ntree,mtry,maxnodes) {
    MSE <- c()
    for (i in 1:p){
        id <- sample(nrow(Data),nrow(Data)*0.8)
        train <- Data[id,]
        valid <- Data[-id,]
        RF <- randomForest(VALENCE.PLEASANTNESS ~ ., train, mtry = mtry, ntree = ntree, maxnodes = maxnodes)
        MSE <- append(MSE,mean((predict(RF,subset(valid,select = -VALENCE.PLEASANTNESS))-valid$VALENCE.PLEASANTNESS)^2))
        }
    mean(MSE)
}


In [25]:
data.unC$VALENCE.PLEASANTNESS <- Y  


In [None]:
MSE.estimation (data.unC, valid.split = 0.95, 15, ntree = 20000, mtry = 20, maxnodes = 40)

Let s start "recursive" CV finding (recursive is explained in the report)

In [None]:
fit_and_evaluate <- function(fold,ntree,mtry,maxnodes) {      
    sapply (iter, function (i) {
        tree <- randomForest(VALENCE.PLEASANTNESS ~ ., analysis(fold), mtry = mtry, ntree = ntree, maxnodes = maxnodes) 
        valid.set <- assessment(fold)
        mean((valid.set$VALENCE.PLEASANTNESS-predict(tree,valid.set))^2)
    })
}

get.error <- function(data, v = 10, ) {
    iter <- seq(1,maxnodes,by = 10)
    validation_data <- vfold_cv(data, v = v) #v = nbr of fold !  ####RELANCER AVEC dATA
    v.errors <- sapply(validation_data$splits, fit_and_evaluate.maxN,maxnodes)
    v.errors.maxnodes <- rowMeans(v.errors)
}

In [None]:
validation_data <- vfold_cv(data, v = 10) #v = nbr of fold !  ####RELANCER AVEC dATA
maxnodes <- 601
v.errors <- sapply(validation_data$splits, fit_and_evaluate.maxN,maxnodes)
v.errors.maxnodes <- rowMeans(v.errors)
plot(seq(1,maxnodes,by = 10),v.errors.maxnodes,xlab='max nodes',ylab = 'MSE')