# Notebook cell In [11]
# ---- Package setup ----
# Install a package only when it is missing, so that re-running the script
# does not reinstall (and possibly upgrade) packages on every execution.
for (pkg in c("rattle", "randomForest", "tidyverse", "tidyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(ggplot2)       # visualization
library(forecast)      # ma() moving average
library(tseries)

library(rpart)         # regression trees
library(rattle)        # fancyRpartPlot()

# One library() call per line: a trailing `##` comment turns everything after
# it on the same line into a comment, so calls chained behind it never run.
library(textir)        # needed to standardize the data
library(class)         # needed for knn
library(ggthemes)      # visualization
library(scales)        # visualization
library(dplyr)         # data manipulation
library(randomForest)  # random forest
library(corrplot)      # correlation
library(gridExtra)
library(GGally)
library(e1071)
library(dbplyr)
library(sparklyr)
library(caret)

# Development version of tidyr, if it is ever needed:
# devtools::install_github("tidyverse/tidyr")
library(tidyr)         # reshaping

# Read the daily price history and print the first rows as a sanity check.
daily_data <- read.csv("BTC.csv", header = TRUE, stringsAsFactors = FALSE)
head(daily_data)

# Parse the first column as month/day/year dates. as.Date() converts a factor
# through its character labels, so going through character directly yields
# the same dates as the factor round-trip.
daily_data[, 1] <- as.Date(
  as.character(daily_data[, 1]),
  format = "%m/%d/%Y"
)

# Make sure the `date` column is of class Date.
daily_data$date <- as.Date(daily_data$date)

# Reverse the row order so that row i + 1 is the observation following row i.
# NOTE(review): this presumes BTC.csv is stored newest-first -- confirm.
n_obs <- nrow(daily_data)
stopifnot(n_obs > 1)
daily_data <- daily_data[rev(seq_len(n_obs)), ]

# Moving average of order 30 over the close. It is computed AFTER the
# re-ordering so that element i belongs to row i of daily_data; computing it
# on the un-reversed data leaves it mirrored relative to the plotted rows.
cnt_ma30 <- as.numeric(ma(daily_data$close, order = 30))

# close1 is the close of the following row (the regression target). The last
# row has no successor, so it is dropped. The row count is taken from the data
# instead of being hard-coded as 1745.
next_close <- daily_data$close[-1]
daily_data <- daily_data[-n_obs, ]
daily_data$close1 <- next_close

# Keep the moving average the same length as the plotted data; ggplot2
# rejects an aesthetic whose length differs from the number of rows.
cnt_ma30 <- cnt_ma30[-n_obs]

# Close price and its moving average over time. The ends of the moving
# average are NA, so geom_line() warns about removed rows.
ggplot(daily_data, aes(x = date)) +
  geom_line(aes(y = close, colour = "Close Value")) +
  geom_line(aes(y = cnt_ma30, colour = "Monthly Average")) +
  ylab("Close value")

# Reproducible random split: 70% of the rows for training, the rest for
# testing.
set.seed(123)

n_rows <- nrow(daily_data)
index <- sample(n_rows, size = 0.7 * n_rows)

train <- daily_data[index, ]
test <- daily_data[-index, ]

# Report the size of each partition.
nrow(train)
nrow(test)

### Baseline model
# Predict the training-set mean of the target for every test row. The target
# is close1, the same column the regression, tree and forest models are fitted
# to and scored on, so the baseline errors are comparable with theirs.
best.guess <- mean(train$close1)
baseline_error <- best.guess - test$close1

RMSE.baseline <- sqrt(mean(baseline_error^2))
RMSE.baseline

MAE.baseline <- mean(abs(baseline_error))
MAE.baseline

### Multiple linear regression
# Regress log(close1 + 1) on the open, high, low and close columns.
lin.reg <- lm(log(close1 + 1) ~ open + high + low + close, data = train)
summary(lin.reg)

# Undo the log(y + 1) transform so predictions are on the scale of close1.
test.pred.lin <- exp(predict(lin.reg, newdata = test)) - 1

# Test-set error metrics.
lin_resid <- test.pred.lin - test$close1

RMSE.lin.reg <- sqrt(mean(lin_resid^2))
RMSE.lin.reg

MAE.lin.reg <- mean(abs(lin_resid))
MAE.lin.reg

############### Decision tree
# Regression tree for close1 on the four price columns.
rt <- rpart(close1 ~ open + high + low + close, data = train)

# Test-set predictions and error metrics of the unpruned tree.
test.pred.rtree <- predict(rt, newdata = test)
rtree_resid <- test.pred.rtree - test$close1

RMSE.rtree <- sqrt(mean(rtree_resid^2))
RMSE.rtree

MAE.rtree <- mean(abs(rtree_resid))
MAE.rtree

# Complexity-parameter table of the fitted tree.
printcp(rt)

# Despite its name, min.xerror holds the CP value taken from the cptable row
# whose xerror is smallest.
best_cp_row <- which.min(rt$cptable[, "xerror"])
min.xerror <- rt$cptable[best_cp_row, "CP"]
min.xerror

# Prune at that CP value and draw the pruned tree.
rt.pruned <- prune(rt, cp = min.xerror)
fancyRpartPlot(rt.pruned)

# Test-set predictions and error metrics of the pruned tree.
test.pred.rtree.p <- predict(rt.pruned, newdata = test)
pruned_resid <- test.pred.rtree.p - test$close1

RMSE.rtree.pruned <- sqrt(mean(pruned_resid^2))
RMSE.rtree.pruned

MAE.rtree.pruned <- mean(abs(pruned_resid))
MAE.rtree.pruned

######## Random forest
library(randomForest)

# Fixed seed so the forest is reproducible.
set.seed(123)

rf <- randomForest(
  close1 ~ open + high + low + close,
  data = train,
  ntree = 1000,
  importance = TRUE
)

# Position of the smallest entry in the forest's per-tree-count MSE vector.
which.min(rf$mse)

# Test-set predictions and error metrics.
test.pred.forest <- predict(rf, newdata = test)
forest_resid <- test.pred.forest - test$close1

RMSE.forest <- sqrt(mean(forest_resid^2))
RMSE.forest

MAE.forest <- mean(abs(forest_resid))
MAE.forest

# Error summary for every model, rounded to two decimals.
accuracy <- data.frame(
  Method = c(
    "Baseline", "Linear Regression", "Full tree", "Pruned tree",
    "Random forest"
  ),
  RMSE = round(
    c(RMSE.baseline, RMSE.lin.reg, RMSE.rtree, RMSE.rtree.pruned, RMSE.forest),
    2
  ),
  MAE = round(
    c(MAE.baseline, MAE.lin.reg, MAE.rtree, MAE.rtree.pruned, MAE.forest),
    2
  )
)
accuracy

# Test-set predictions of every model next to the observed target. The models
# predict close1, so `actual` must be close1 as well; pairing the predictions
# with `close` would compare them against a different quantity than the one
# the RMSE/MAE figures above are based on.
all.predictions <- data.frame(
  actual = test$close1,
  baseline = best.guess,
  linear.regression = test.pred.lin,
  full.tree = test.pred.rtree,
  pruned.tree = test.pred.rtree.p,
  random.forest = test.pred.forest
)
head(all.predictions)

# tidyr supplies the reshaping used further down. Install it only when it is
# missing, rather than reinstalling tidyverse, tidyr and devtools (plus a
# GitHub development build) every time the script runs.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
# Development version from GitHub, if it is ever needed (kept disabled, as at
# the top of the script):
# install.packages("devtools")
# devtools::install_github("tidyverse/tidyr")
library(tidyr)

####################### Visualization - Random Forest - Predicted vs actual
# Pair each forest prediction with the observed target. The forest predicts
# close1, so that is the column the predictions are compared against.
test_rf <- as.data.frame(test.pred.forest)
test_rf$actual <- test$close1

ggplot(test_rf, aes(x = actual, y = test.pred.forest)) +
  geom_point(colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  # NOTE(review): the reference line at x = 23 has no obvious meaning for
  # this data -- confirm it is wanted.
  geom_vline(xintercept = 23, colour = "green", linetype = "dashed") +
  ggtitle("Predicted vs. Actual, Random Forest")

test_rf_d <- as.data.frame(test_rf)
result_rf <- data.frame(test$close1, test_rf_d)

# Prediction error, actual minus predicted.
error <- test_rf_d$actual - test_rf_d$test.pred.forest

# The prediction column is spelled out in full; `$test` only worked through
# partial name matching on the data frame.
result_rf_2 <- data.frame(
  Actual = test_rf_d$actual,
  Prediction = test_rf_d$test.pred.forest,
  Error = error
)
hist(result_rf_2$Error)

####################### Visualization - Pruned Tree - Predicted vs actual
# Pair each pruned-tree prediction with the observed target (close1, the
# column the tree was fitted to).
test_ft <- as.data.frame(test.pred.rtree.p)
test_ft$actual <- test$close1

ggplot(test_ft, aes(x = actual, y = test.pred.rtree.p)) +
  geom_point(colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  # NOTE(review): the reference line at x = 23 has no obvious meaning for
  # this data -- confirm it is wanted.
  geom_vline(xintercept = 23, colour = "green", linetype = "dashed") +
  ggtitle("Predicted vs. Actual, Pruned Tree")

####################### Visualization - Linear Regression - Predicted vs actual
# Pair each linear-regression prediction with the observed target (close1,
# the column the model was fitted to).
# NOTE: test_ft is reused here and replaces the tree data frame of that name.
test_ft <- as.data.frame(test.pred.lin)
test_ft$actual <- test$close1

ggplot(test_ft, aes(x = actual, y = test.pred.lin)) +
  geom_point(colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  # NOTE(review): the reference line at x = 23 has no obvious meaning for
  # this data -- confirm it is wanted.
  geom_vline(xintercept = 23, colour = "green", linetype = "dashed") +
  ggtitle("Predicted vs. Actual, Linear Regression")

# Reshape to long format: one row per (test observation, model) pair, with the
# model name in `model` and its prediction in `predictions`. Columns 2 to 6
# are the five model columns; `actual` stays as an identifier.
all.predictions <- pivot_longer(
  all.predictions,
  cols = 2:6,
  names_to = "model",
  values_to = "predictions"
)

head(all.predictions)

tail(all.predictions)

# Predicted vs. actual, one panel per model. The plot must not end in a
# dangling `+`: with only comments after it, R would continue the expression
# into the next statement and fail.
ggplot(all.predictions, aes(x = actual, y = predictions)) +
  geom_point(colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  facet_wrap(~ model, ncol = 2) +
  ggtitle("Predicted vs. Actual, by model")
# Layers left disabled -- NOTE(review): their constants (23, 0..70) have no
# obvious meaning for this data:
#   geom_vline(xintercept = 23, colour = "green", linetype = "dashed")
#   coord_cartesian(xlim = c(0, 70), ylim = c(0, 70))

# Convenience frames with test-set predictions next to the observed target.
# The models predict close1, so `actual` is close1 throughout.
random.forest.predictions <- data.frame(
  actual = test$close1,
  random.forest = test.pred.forest
)

test.predictions <- data.frame(
  actual = test$close1,
  baseline = best.guess,
  linear.regression = test.pred.lin,
  full.tree = test.pred.rtree,
  pruned.tree = test.pred.rtree.p,
  random.forest = test.pred.forest
)

actual <- test$close1
Random_Forest <- test.pred.forest
pruned_tree <- test.pred.rtree.p
full_tree <- test.pred.rtree
linear_regression <- test.pred.lin
Baseline_m <- best.guess

# NOTE(review): full_tree is defined above but not part of `result` --
# confirm whether that is intended.
result <- data.frame(
  actual, Random_Forest, pruned_tree, linear_regression, Baseline_m
)

########################## NN
# One library() call per line: a trailing `##` comment turns everything after
# it on the same line into a comment, so calls chained behind it never run.
library(ggplot2)       # visualization
library(forecast)
library(tseries)
library(rpart)
library(rattle)
library(textir)        # needed to standardize the data
library(class)         # needed for knn
library(ggthemes)      # visualization
library(scales)        # visualization
library(dplyr)         # data manipulation
library(randomForest)  # random forest
library(corrplot)      # correlation
library(gridExtra)
library(GGally)
library(e1071)
library(dbplyr)
library(sparklyr)
library(caret)
library(tidyr)

# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists.
daily_data <- read.csv(
  "D:/University/Queens/Term 3/Analytics for Financial markets/Assignments/Project/full.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(daily_data)

# Parse the first column as month/day/year dates, then make sure the `date`
# column is of class Date.
daily_data[, 1] <- as.Date(
  as.character(daily_data[, 1]),
  format = "%m/%d/%Y"
)
daily_data$date <- as.Date(daily_data$date)

# close1 is the close of the following row (the prediction target). The last
# row has no successor, so it is dropped. The row count is taken from the data
# instead of being hard-coded as 1745.
# NOTE(review): the rows are not reversed here, so this presumes full.csv is
# stored oldest-first -- confirm.
n_obs <- nrow(daily_data)
stopifnot(n_obs > 1)
next_close <- daily_data$close[-1]
daily_data <- daily_data[-n_obs, ]
daily_data$close1 <- next_close

# Modelling frame: columns 2 to 6 (everything after the first, date, column).
data <- daily_data[, 2:6]

set.seed(500)
library(MASS)

# Random 70/30 split of the modelling frame.
index <- sample(nrow(data), round(0.70 * nrow(data)))
train <- data[index, ]
test <- data[-index, ]

# Column-wise minimum and maximum, used for min-max scaling.
col_ranges <- apply(data, 2, range)
mins <- col_ranges[1, ]
maxs <- col_ranges[2, ]

# Rescale every column to (x - min) / (max - min).
scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))

train_ <- scaled[index, ]
test_ <- scaled[-index, ]

library(neuralnet)

# Formula: close1 explained by every other column of the scaled data.
n <- names(train_)
predictors <- n[n != "close1"]
f <- as.formula(paste("close1 ~", paste(predictors, collapse = " + ")))

# One hidden layer with three units, linear output.
nn <- neuralnet(f, data = train_, hidden = c(3), linear.output = TRUE)
plot(nn)

################# Predicting
# Network output for the first four columns of the scaled test set. The call
# is namespace-qualified so another attached package's compute() cannot mask
# it.
pr.nn <- neuralnet::compute(nn, test_[, 1:4])

# Map scaled values back to the original scale of close1.
close1_min <- min(data$close1)
close1_span <- max(data$close1) - close1_min

pr.nn_ <- pr.nn$net.result * close1_span + close1_min
test.r <- test_$close1 * close1_span + close1_min

MSE.nn <- sum((test.r - pr.nn_)^2) / nrow(test_)

# Linear-model reference on the same split. MSE.lm was printed below without
# ever being computed, which stops the script with "object not found".
lm.ref <- glm(close1 ~ ., data = train)
pr.lm <- predict(lm.ref, newdata = test)
MSE.lm <- sum((pr.lm - test$close1)^2) / nrow(test)

print(paste(MSE.lm, MSE.nn))

# Observed close1 against the network's prediction, with the identity line.
# par() returns the previous settings; they are restored afterwards so the
# 1 x 2 layout does not leak into every later plot of the script.
old_par <- par(mfrow = c(1, 2))

plot(
  test$close1, pr.nn_,
  col = "red", main = "Real vs predicted NN", pch = 18, cex = 0.7
)
abline(0, 1, lwd = 2)
legend("bottomright", legend = "NN", pch = 18, col = "red", bty = "n")

par(old_par)

# 10-fold cross-validated prediction error of a linear model on the unscaled
# data (first component of cv.glm()'s delta).
library(boot)
set.seed(200)

lm.fit <- glm(close1 ~ ., data = data)
cv_lm <- cv.glm(data, lm.fit, K = 10)
cv_lm$delta[1]

# Repeated random 90/10 hold-out evaluation of the network.
set.seed(450)
k <- 10
cv.error <- numeric(k)  # preallocated instead of grown inside the loop

library(plyr)
pbar <- create_progress_bar("text")
pbar$init(k)

# Constants for mapping scaled close1 back to its original scale.
close1_min <- min(data$close1)
close1_span <- max(data$close1) - close1_min

for (i in seq_len(k)) {
  index <- sample(seq_len(nrow(data)), round(0.9 * nrow(data)))
  train.cv <- scaled[index, ]
  test.cv <- scaled[-index, ]

  nn2 <- neuralnet(f, data = train.cv, hidden = c(3), linear.output = TRUE)

  # Score the network fitted on THIS split (nn2). Using the earlier `nn`
  # would evaluate a model that never saw this split's training data and
  # leave the refit unused.
  pr.nn2 <- neuralnet::compute(nn2, test.cv[, 1:4])
  pr.nn2 <- pr.nn2$net.result * close1_span + close1_min

  test.cv.r <- test.cv$close1 * close1_span + close1_min

  cv.error[i] <- sum((test.cv.r - pr.nn2)^2) / nrow(test.cv)

  pbar$step()
}

mean(cv.error)

boxplot(
  cv.error,
  xlab = "MSE CV", col = "cyan", border = "blue",
  names = "CV error (MSE)", main = "CV error (MSE) for NN",
  horizontal = TRUE
)

# Collect the network's test-set predictions, the observed close1 and their
# difference (actual minus predicted) in one data frame.
pr.nn_d <- as.data.frame(pr.nn_)
error <- test$close1 - pr.nn_d

result_nn <- setNames(
  data.frame(test$close1, pr.nn_d, error),
  c("Actual", "Prediction", "Error")
)
hist(result_nn$Error)

# Error metrics of the network on the test set.
nn_diff <- result_nn$Prediction - result_nn$Actual

RMSE.nn <- sqrt(mean(nn_diff^2))
RMSE.nn

MAE.nn <- mean(abs(nn_diff))
MAE.nn

# Mean signed error.
mean(result_nn$Error)

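# Recorded notebook output: the cell appears to have been run by an IPython
# (Python) kernel rather than an R kernel, which rejected the R code with:
#   SyntaxError: invalid syntax (<ipython-input-11-34f72f058c99>, line 20)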