In [7]:
library(data.table)
library(repr)
library(lubridate)
library(rpart)
library(partykit)
library(ggplot2)
library(Metrics)
library(TSdist)
library(dtw)
library(TSrepr)
library(TunePareto)
library(caret)
library(writexl)
library(forecast)
library(tidyr)
library(randomForest)
library(rattle)
# Set the repr plot width and height options to 10 for the notebook's figures.
options(repr.plot.width=10, repr.plot.height=10)

In [8]:
# Read the raw imbalance file and keep only the columns used downstream,
# renaming them to the short names the rest of the notebook relies on.
dt <- fread("C:/Users/kaan9/OneDrive/Masaüstü/bulk_imbalance_son.csv")
total_vol <- dt[, .(
  date,
  hour,
  yat_vol   = downRegulationZeroCoded,
  yal_vol   = upRegulationZeroCoded,
  net_imb   = net,
  direction = system_direction
)]

In [9]:
# Read the long-format weather file and tag every row with its coordinate pair.
wt <- fread("C:/Users/kaan9/OneDrive/Masaüstü/weather_son.csv")
wt[, loc := paste("loc", as.character(lat), as.character(lon), sep = "_")]

# Spread to one column per (location, variable) pair. Columns are picked by
# position: 1, 2, the new `loc` column (7), then 5 and 6.
wt <- data.table(
  pivot_wider(
    wt[, c(1, 2, 7, 5, 6)],
    names_from = c(loc, variable),
    values_from = value
  )
)

# Calendar features derived from the date.
wt[, day := wday(date)]
wt[, month := month(date)]

# Attach the weather features to every imbalance row and add a running index.
total_vol <- wt[total_vol, on = .(date, hour)]
total_vol[, t := .I]

# Label the direction from the raw net imbalance (+/-50 dead band) first ...
total_vol[, direction := ifelse(
  net_imb > 50,
  "Positive",
  ifelse(net_imb < (-50), "Negative", "Neutral")
)]
# ... and only then clamp the net imbalance to [-5000, 5000].
total_vol[, net_imb := pmin(pmax(net_imb, -5000), 5000)]

In [10]:
# Principal components of every column except the identifiers, the targets
# and the running index; the summary lists the variance captured per component.
pca <- princomp(
  total_vol[, !c("date", "yat_vol", "yal_vol", "net_imb", "direction", "t")]
)
summary(pca)

Importance of components:
                            Comp.1      Comp.2       Comp.3      Comp.4
Standard deviation     704.7270289 122.4732832 101.31160430 93.61372640
Proportion of Variance   0.8477219   0.0256032   0.01751983  0.01495859
Cumulative Proportion    0.8477219   0.8733251   0.89084489  0.90580348
                           Comp.5      Comp.6      Comp.7       Comp.8
Standard deviation     88.2780487 84.05383925 82.47595069 74.944001432
Proportion of Variance  0.0133020  0.01205943  0.01161091  0.009587056
Cumulative Proportion   0.9191055  0.93116491  0.94277582  0.952362872
                             Comp.9      Comp.10      Comp.11      Comp.12
Standard deviation     70.788955716 68.386011879 61.357560060 56.287399268
Proportion of Variance  0.008553474  0.007982631  0.006426103  0.005407965
Cumulative Proportion   0.960916346  0.968898978  0.975325081  0.980733046
                            Comp.13      Comp.14      Comp.15    Comp.16
Standard deviation     52.041

In [11]:
# Keep the score on the first principal component as a feature column.
total_vol[, pca1 := pca$scores[, "Comp.1"]]

In [12]:
# Row index of 2022-01-12, hour 16; everything up to and including that row
# forms the training set.
date_fltr <- total_vol[, which(date == "2022-01-12" & hour == 16)]
total_vol_train <- total_vol[seq_len(date_fltr)]

In [8]:
# Fit a randomForest regression of net_imb on the training rows. The formula
# starts from all columns and removes date, yat_vol, yal_vol, direction, t and
# pca1. `start`/`end` bracket the fit so its duration can be inspected.
start <- Sys.time()
model_rf_all <- randomForest(net_imb ~.-date-yat_vol-yal_vol-direction-t-pca1,total_vol_train)
end <- Sys.time()
# Write the fitted forest to disk.
save(model_rf_all,file="C:/Users/kaan9/OneDrive/Masaüstü/rf_model_all.Rdata")

In [13]:
# Display the fitted forest (call, number of trees, mtry, residual summary).
model_rf_all


Call:
 randomForest(formula = net_imb ~ . - date - yat_vol - yal_vol -      direction - t - pca1, data = total_vol_train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 15

          Mean of squared residuals: 378077.6
                    % Var explained: 62.9

In [None]:
# Tune a ranger random forest with caret using 10-fold cross-validation.
control1 <- trainControl(method="cv",
                        number=10)
start <- Sys.time()
# Same predictor set as the randomForest fit. The grid crosses
# mtry in {15, 25, 35} with min.node.size in {10, 30, 50} under the "variance"
# split rule; the configuration with the smallest RMSE is selected.
# NOTE(review): the printed results below this cell list mtry 5/10/15 and
# min.node.size 3/5/7, so that output appears to come from a different grid
# than the one written here -- confirm which grid was actually run.
model_ranger <-  train(net_imb ~.-date-yat_vol-yal_vol-direction-t-pca1, 
                       data = total_vol_train,
                       method = 'ranger',
                       metric = 'RMSE',
                       trControl = control1,
                       tuneGrid = expand.grid(.mtry=c(15,25,35),.splitrule="variance",.min.node.size=c(10,30,50)))
end <- Sys.time()
# Write the tuned caret model to disk.
save(model_ranger,file="C:/Users/kaan9/OneDrive/Masaüstü/ranger_model.Rdata")

In [19]:
# Display the cross-validation results of the tuning run.
model_ranger

Random Forest 

26585 samples
   51 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 23925, 23927, 23927, 23926, 23927, 23928, ... 
Resampling results across tuning parameters:

  mtry  min.node.size  RMSE      Rsquared   MAE     
   5    3              650.5268  0.6498134  473.3433
   5    5              653.8487  0.6459687  476.2564
   5    7              658.5292  0.6397895  479.8987
  10    3              632.6367  0.6636021  460.0678
  10    5              636.0772  0.6601107  462.6722
  10    7              639.5813  0.6558312  465.6089
  15    3              625.6155  0.6685003  455.0549
  15    5              628.2432  0.6654816  456.8414
  15    7              631.4221  0.6619813  459.5056

Tuning parameter 'splitrule' was held constant at a value of variance
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were mtry = 15, splitrule = variance
 and min.node.size = 3.

In [None]:
# Refit ranger through caret with a single configuration
# (mtry = 15, splitrule = "variance", min.node.size = 3), again scored with
# 10-fold cross-validation.
control1 <- trainControl(method="cv",
                        number=10)
start <- Sys.time()
# num.trees is not one of the tuning-grid columns; presumably train() forwards
# it to ranger through `...` -- confirm.
model_ranger_son <-  train(net_imb ~.-date-yat_vol-yal_vol-direction-t-pca1, 
                       data = total_vol_train,
                       num.trees=500,
                       method = 'ranger',
                       metric = 'RMSE',
                       trControl = control1,
                       tuneGrid = expand.grid(.mtry=15,.splitrule="variance",.min.node.size=3))
end <- Sys.time()
# Write the final caret model to disk.
save(model_ranger_son,file="C:/Users/kaan9/OneDrive/Masaüstü/ranger_model_son.Rdata")

In [21]:
# Display the cross-validated RMSE / R-squared / MAE of the final configuration.
model_ranger_son

Random Forest 

26585 samples
   51 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 23926, 23926, 23926, 23926, 23928, 23926, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  624.4039  0.6715859  454.5149

Tuning parameter 'mtry' was held constant at a value of 15
Tuning
 parameter 'splitrule' was held constant at a value of variance

Tuning parameter 'min.node.size' was held constant at a value of 3

In [13]:
# Restore the objects stored in ranger_model_son.Rdata; the training cell above
# saved `model_ranger_son` to this path.
load("C:/Users/kaan9/OneDrive/Masaüstü/ranger_model_son.Rdata")

In [14]:
# Score every row (training and later rows alike) with the tuned forest and
# store the prediction next to the first principal component.
fc <- predict(model_ranger_son, newdata = total_vol)
total_vol[, rf := fc]

# From here on only the two derived signals and the label are needed.
total_vol <- total_vol[, .(date, hour, pca1, rf, direction)]

In [15]:
# Build one table per (hour, window size, signal): each row is a day, holding
# the date, the direction label and the signal's values over the `w` hours
# ending at that hour (lag 0 = the hour itself).
series <- list()
start_hours_before <- 0
window_sizes <- c(12, 24, 36)
cols <- c("rf", "pca1")

for (signal in cols) {
  for (w in window_sizes) {
    lags <- start_hours_before:(w + start_hours_before - 1)
    lag_names <- paste0(lags, "_hours_before")

    # Work on a copy so the lag columns never leak into total_vol.
    lagged <- data.table(total_vol)
    lagged[, (lag_names) := shift(lagged[[signal]], lags)]
    # Rows without a full window (the first hours of the data) are dropped.
    lagged <- lagged[complete.cases(lagged), ]

    # Column 1 (date), the direction label, then every lag column.
    first_lag <- match(lag_names[1], colnames(lagged))
    label_col <- match("direction", colnames(lagged))
    keep <- c(1, label_col, first_lag:ncol(lagged))

    for (h in 12:23) {
      key <- paste(paste0("hour", h), paste0("window", w), signal, sep = "_")
      series[[key]] <- lagged[hour == h, ..keep]
    }
  }
}

In [16]:
# For every raw series table, derive two piecewise-constant representations of
# each day's window and store them as additional entries of `series`:
#   <name>tree : fitted values of a regression tree on the lag position
#   <name>sax  : mean of the raw values sharing the same SAX symbol
# names(series) is evaluated once, so the entries added inside the loop are not
# revisited.
time_start <- Sys.time()
for (n in names(series)) {
  series_long <- melt(series[[n]], id.vars = c("date", "direction"))

  # Collect the per-day results in a preallocated list and bind them once at
  # the end; growing a table with rbind() inside the loop copies it every time.
  days <- unique(series_long$date)
  per_day <- vector("list", length(days))

  for (i in seq_along(days)) {
    d <- days[i]
    temp_dt <- series_long[date == d, ]

    # Regression tree on the lag label, allowed to split down to single points.
    temp_tree <- rpart(value ~ variable, temp_dt, minbucket = 1, minsplit = 2)
    temp_dt[, tree := predict(temp_tree)]
    temp_dt[, t := seq_len(.N)]

    # SAX with segments of length 2 (q = 2) and a 4-letter alphabet (a = 4).
    temp_sax <- repr_sax(temp_dt$value, q = 2, a = 4)
    # End position of each segment: 2, 4, ..., with the last one pinned to the
    # final row. seq_len() keeps this correct when there is a single segment.
    dummy_time <- c(seq_len(length(temp_sax) - 1L) * 2, nrow(temp_dt))
    dt_sax <- data.table(t = dummy_time, sax_rep_char = temp_sax)

    temp_dt <- merge(temp_dt, dt_sax, by = "t", all.x = TRUE)
    # Carry each segment's symbol backwards over the rows of that segment.
    temp_dt[, sax_rep_char_num := nafill(as.numeric(as.factor(sax_rep_char)), type = "nocb")]
    # Rows are grouped by symbol (not by segment), as in the original analysis.
    temp_dt[, sax_rep := mean(value), by = list(sax_rep_char_num)]

    per_day[[i]] <- temp_dt
  }

  long_dt <- rbindlist(per_day, use.names = TRUE)
  series[[paste(n, "tree", sep = "")]] <- dcast(long_dt, date + direction ~ variable, value.var = "tree")
  series[[paste(n, "sax", sep = "")]] <- dcast(long_dt, date + direction ~ variable, value.var = "sax_rep")
}
time_end <- Sys.time()

In [17]:
# Elapsed time between time_start and time_end.
time_end-time_start

Time difference of 11.40317 mins

In [18]:
# Restore the objects stored in best_test.RData.
# NOTE(review): `best_models`, used by the cells below, is not created anywhere
# in this notebook, so it presumably comes from this file -- confirm.
load("C:/Users/kaan9/OneDrive/Masaüstü/best_test.RData")

In [19]:
# Direction labels per hour (12..23), keyed by the hour as a string.
trainclasses <- lapply(
  setNames(12:23, as.character(12:23)),
  function(h) total_vol[hour == h, direction]
)

In [20]:
#' Predict the imbalance direction of one day with a set of k-NN models.
#'
#' Every element of `bestModels` is a model key. The text before the third
#' underscore names an entry of the global `series` list, the text between the
#' third and fourth underscore is the distance ("edr", "dtw" or "euc"), and the
#' number starting two characters after the fourth underscore is k.
#'
#' For each key, the row of the series table whose date equals `date` is the
#' query. Distances from the query to every row are computed, and the majority
#' direction among the k nearest earlier days is returned.
#'
#' @param bestModels Character vector of model keys.
#' @param date Day to predict; it must be present in every referenced series.
#' @return Character vector of predicted directions, one per model key, in the
#'   order of `bestModels`.
predict_knn <- function(bestModels, date) {
  today <- date
  # Iterate over the argument; the original looped over the global best_models.
  predictions <- character(length(bestModels))

  for (i in seq_along(bestModels)) {
    n <- bestModels[[i]]
    underscores <- gregexpr(pattern = "_", n)[[1]]
    seri <- substr(n, start = 1, stop = underscores[3] - 1)
    dis <- substr(n, start = underscores[3] + 1, stop = underscores[4] - 1)
    k <- as.numeric(substr(n, start = underscores[4] + 2, stop = nchar(n)))

    candidates <- series[[seri]]
    if (is.null(candidates)) {
      stop("No series named '", seri, "' (from model '", n, "')", call. = FALSE)
    }

    is_query <- which(candidates$date == today)
    if (length(is_query) != 1L) {
      stop("Expected exactly one row for the requested date in series '", seri,
           "', found ", length(is_query), call. = FALSE)
    }

    # Columns 1 and 2 are date and direction; the rest are the series values.
    feature_cols <- 3:length(candidates)
    train_x <- candidates[, feature_cols, with = FALSE]
    query_x <- candidates[is_query, feature_cols, with = FALSE]

    if (dis == "edr") {
      # NOTE(review): the key says "edr" but the distance requested from TSdist
      # is 'erp' with g = 0.5; kept as in the original -- confirm the intent.
      dist_matrix <- TSDatabaseDistances(X = train_x, Y = query_x,
                                         distance = 'erp', g = 0.5)
    } else if (dis == "dtw") {
      dist_matrix <- dtwDist(mx = train_x, my = query_x,
                             window.type = 'sakoechiba', window.size = 10)
    } else if (dis == "euc") {
      dist_matrix <- TSDatabaseDistances(X = train_x, Y = query_x,
                                         distance = 'euc')
    } else {
      # Previously an unknown key silently reused the last dist_matrix.
      stop("Unknown distance '", dis, "' in model '", n, "'", call. = FALSE)
    }

    # Only days strictly before the query may vote. When the query is the last
    # row this masks exactly that row, as before. For an earlier date it also
    # removes the query's own zero distance and every later day.
    dist_matrix[which(candidates$date >= today)] <- Inf

    # Labels come from the series table itself so that row i of the distances
    # and label i always describe the same day. The series tables lose their
    # first day(s) to the lag window, so per-hour label vectors taken from
    # total_vol are shifted against them.
    nearest <- order(dist_matrix)[seq_len(k)]
    votes <- table(candidates$direction[nearest])
    predictions[i] <- names(which.max(votes))
  }

  predictions
}

In [21]:
# Predict the direction for 2022-01-20 with every selected model and show the
# resulting vector.
res <- predict_knn(best_models,"2022-01-20")
res

In [26]:
# Join the predictions into one comma-separated string. `sep` only separates
# the arguments of paste(); joining the elements of one vector needs `collapse`.
# The stray argument-less apply() call that followed always raised an error and
# has been removed.
paste(res, collapse = ",")

#0ffdeaa3-0914-4c9d-8d26-363e1c536d2c
https://forms.gle/Z8h5SJpxVguEsUXF6
https://docs.google.com/spreadsheets/d/1wEZjAZrtnalIu4Y7ct2vutBU8HMAy0EQ93cZgRoWDrs
https://docs.google.com/spreadsheets/d/1wes0Ws6-UNUcRFh3IidSABgcjCZDdANKpHQxhr4Ne-o/edit?invite=CO3mossB#gid=0

In [36]:
# Back-test window (inclusive) over the hours 12..23.
test_start <- "2021-12-12"
test_end <- "2022-01-12"

# Distinct days in the window ...
test_dates <- total_vol[
  hour %in% 12:23 & date >= test_start & date <= test_end,
  unique(date)
]
# ... and the realised direction of every hour in it, in row order.
real <- total_vol[
  hour %in% 12:23 & date >= test_start & date <= test_end,
  direction
]

In [37]:
# Predict every test day with all selected models.
# Indexing test_dates keeps the date's class (iterating a Date vector directly
# with for() hands the loop bare numbers), and the per-day results are collected
# in a preallocated list instead of growing `preds` on every iteration.
pred_list <- vector("list", length(test_dates))
for (i in seq_along(test_dates)) {
  pred_list[[i]] <- predict_knn(best_models, test_dates[i])
}
# as.character() keeps the empty case a character(0), as before.
preds <- as.character(unlist(pred_list))

In [38]:
# Share of positions where the predicted direction equals the realised one.
# NOTE(review): the comparison is element-wise, so it assumes `preds` lists, for
# each test day, one prediction per hour in the same order as the rows of
# `real` -- confirm the ordering of best_models.
Metrics::accuracy(actual = real,predicted = preds)