# In [None]:
require(randomForest)
require(data.table)
require(lubridate)
require(caret)
require(rattle)
require(openxlsx)
library(datasets)
library(caTools)
library(party)
library(dplyr)
library(magrittr)
library(rpart)
library(rpart.plot)

# Load the raw covtype data (comma-separated, no header row, so read.csv
# names the columns V1, V2, ...) and keep a working sample of 1000 rows.
# NOTE(review): the hard-coded absolute path only works on the original
# author's machine; setwd() is kept so the working directory seen by the
# rest of the script is unchanged.
setwd("/Users/merve.oncel/Downloads/")
covtype <- read.csv("covtype.data", header = FALSE, sep = ",")
# head() stops at the number of rows actually available, whereas indexing
# with 1:1000 would pad a shorter file with all-NA rows.
datacov <- head(covtype, 1000)


# Build the classification data set: add a two-level target derived from
# the V1 column (elevation, per the original note), labelling rows with
# V1 > 3000 as "hgh_1" and all others as "hgh_0".
classification_data <- transform(
  datacov,
  is_higher = ifelse(V1 > 3000, "hgh_1", "hgh_0")
)
# Drop V1 itself by keeping columns 2..56 (assumes datacov has 55 columns,
# so the new target is column 56 before this selection).
classification_data <- classification_data[, 2:56]
head(classification_data, n = 10) # show 10 rows

# The target must be a factor for the classification models.
classification_data$is_higher <- as.factor(classification_data$is_higher)


# Hold out 20% of the rows as a test set.
# sample.split() expects the vector of class labels, not the whole data
# frame: given a data frame it returns one flag per column, and that short
# mask is then recycled over the rows. Passing the target column yields one
# flag per row and keeps the class ratio similar in both parts.
set.seed(123) # make the split reproducible
sample_data <- sample.split(classification_data$is_higher, SplitRatio = 0.8)
train_data <- classification_data[sample_data, ]
test_data <- classification_data[!sample_data, ]

# y predicted using x: the target is the is_higher column, the predictors
# are columns 1..54 (assumes is_higher is the 55th and last column).
y_train <- as.factor(train_data$is_higher)
x_train <- train_data[, 1:54]
y_test <- as.factor(test_data$is_higher)
x_test <- test_data[, 1:54]
# Resampling scheme used by caret::train(): 10-fold cross-validation,
# repeated 10 times.
n_folds <- 10
n_repeats <- 10

fitControl <- trainControl(
  method = "repeatedcv",
  number = n_folds,
  repeats = n_repeats,
  classProbs = TRUE,                # class probabilities are required for AUC
  summaryFunction = twoClassSummary # use AUC to pick the best model
)



# Decision tree (rpart) tuned with caret over 5 candidate cp values.
# The model is fit on the training split only: fitting on the full
# classification_data would let the model see the rows of test_data that
# are used to score it afterwards.
set.seed(123) # fix the RNG so the cross-validation folds are reproducible
tree_fit <- train(
  is_higher ~ .,
  data = train_data,
  method = "rpart",
  # twoClassSummary reports ROC/Sens/Spec, so the selection metric has to be
  # named explicitly; the default ("Accuracy") is not among the results.
  metric = "ROC",
  trControl = fitControl,
  tuneLength = 5
)
tree_fit
trellis.par.set(caretTheme())
plot(tree_fit)

# Score the tree on the held-out rows: confusion matrix with the actual
# classes in rows and the predicted classes in columns.
preds <- predict(tree_fit, newdata = x_test)
table_mat <- table(y_test, preds)
table_mat

# Accuracy = correctly classified cases (the diagonal) over all cases.
n_correct <- sum(diag(table_mat))
accuracy_Test <- n_correct / sum(table_mat)
print(paste("Accuracy for test", accuracy_Test))

# Random forest (randomForest package) with 500 trees, fit on the training
# predictors x_train and training labels y_train.
fitrf <- randomForest(x_train, y_train, ntree = 500)

fitrf
# Dimensions of the predictor set the forest was fit on. This used to be
# dim(x), but no object named x is defined anywhere in this script.
dim(x_train)

plot(fitrf)

varImpPlot(fitrf)
# compare class probabilities according to column V2
# NOTE(review): pred.data is the full data set rather than x_train —
# confirm that this is intended.
partialPlot(fitrf, classification_data, x.var = "V2", which.class = "hgh_1")

## Random forest with ranger: tuning grid handed to caret.
# Every combination of mtry 1..5 with both split rules, at a fixed minimum
# node size of 5 (10 candidate settings in total).
rf_grid <- expand.grid(
  mtry = 1:5,
  splitrule = c("gini", "extratrees"),
  min.node.size = 5
)
rf_grid



# Resampling set-up for the ranger model: 10 folds, repeated 10 times.
# This repeats the same values already assigned to n_repeats, n_folds and
# fitControl earlier in the script.
n_repeats <- 10
n_folds <- 10

fitControl <- trainControl(
  method = "repeatedcv",
  number = n_folds,
  repeats = n_repeats,
  summaryFunction = twoClassSummary, # use AUC to pick the best model
  classProbs = TRUE
)

# Random forest (ranger) tuned with caret over the candidates in rf_grid.
# The model is fit on the training split only, so the held-out rows in
# test_data play no part in choosing the tuning parameters.
set.seed(123) # fix the RNG so the cross-validation folds are reproducible
rf_fit <- train(
  is_higher ~ .,
  data = train_data,
  method = "ranger",
  # twoClassSummary reports ROC/Sens/Spec, so the selection metric has to be
  # named explicitly; the default ("Accuracy") is not among the results.
  metric = "ROC",
  trControl = fitControl,
  tuneGrid = rf_grid,
  # Passed through to the ranger fit: forest size and impurity-based
  # variable importance (read back from the final model below).
  num.trees = 500,
  importance = "impurity"
)

rf_fit
plot(rf_fit)

rf_fit$finalModel$variable.importance

# Refit a randomForest model on the training split (500 trees, mtry = 5)
# and score it on the held-out rows.
# NOTE(review): mtry = 5 is hard-coded — presumably taken from the tuning
# results of rf_fit; confirm it still matches rf_fit$bestTune.
rfr_fin_model <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 5)

predictions <- predict(rfr_fin_model, newdata = x_test)

# Confusion matrix: actual classes in rows, predicted classes in columns.
con_mat <- table(y_test, predictions)
con_mat

# Accuracy = correctly classified cases (the diagonal) over all cases.
hits <- sum(diag(con_mat))
accuracy_rfr <- hits / sum(con_mat)
print(paste("Accuracy for test", accuracy_rfr))

# Compare the cross-validated performance of the two caret models.
# resamples() has no `metrics` argument, and models trained with
# twoClassSummary carry ROC, Sens and Spec rather than Accuracy, so the
# stray metrics = "Accuracy" argument (which was silently ignored) is gone.
results <- resamples(list(dtree = tree_fit, rf = rf_fit))
summary(results)
bwplot(results)
densityplot(results)






