# Data Preparation
- Includes feature extraction and forecast calculation

In [None]:
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, ggfortify, zoo, forecast, tsfeatures, parallel, data.table, furrr, tictoc, janitor, data.table, readr)
library(M4metalearning)

source("helperCode/UserFunctionality.R")

In [None]:
# Read the series table (its id, date and Y columns are used further down).
ts_dataframe <- read_csv(file = "~/data/ts_dataframe.csv")
# load() restores the objects saved in the file into the current workspace.
# NOTE(review): presumably this is where `SPdata` comes from — confirm.
load(file = "~/data/SPdata.RDATA")

### Remove duplicate Time Series

In [3]:
# Reshape to one row per id with one column per date, so that series with
# identical values become identical rows (apart from id).
ts_rows <- ts_dataframe %>%
  pivot_wider(id_cols = "id", values_from = "Y", names_from = "date")

# Keep one id per distinct value pattern (all columns except id are compared).
# TRUE is spelled out because T is an ordinary, reassignable variable.
unique_ts <- ts_rows %>%
  distinct_at(vars(-id), .keep_all = TRUE) %>%
  select(id)

# NOTE(review): SPdata is indexed with the id values themselves — by name if
# id is character, by position if it is numeric; confirm this matches SPdata.
SPdata_reduced <- SPdata[unique_ts$id]

### Rename list elements appropriately

In [4]:
# Apply seriesrenamR() to every series; the list is overwritten in place.
SPdata_reduced <- lapply(X = SPdata_reduced, FUN = seriesrenamR)

### Extract features

In [5]:
# First feature group: one tsfresh_extractor() result per series.
tsfresh_feats <- lapply(X = SPdata_reduced, FUN = tsfresh_extractor)

In [None]:
# Second feature group, computed by THA_features() on the whole series list.
# NOTE(review): THA_features is not defined in this file — presumably it comes
# from M4metalearning; confirm.
Hyndman_Feats <- THA_features(SPdata_reduced)

### Map both feature groups into one df

In [9]:
# Combine the two feature groups element by element with featconnectR().
all_feats <- map2(
  .x = tsfresh_feats,
  .y = Hyndman_Feats,
  .f = featconnectR
)

In [10]:
# Flatten the per-series feature lists into one data frame, one row per series.
# NOTE(review): this relies on every element of all_feats having the same
# number of values in the same order; otherwise rows are silently misaligned.
# TRUE is spelled out because T is an ordinary, reassignable variable.
all_feats_df <- data.frame(
  matrix(unlist(all_feats), nrow = length(all_feats), byrow = TRUE)
)

# Column names are taken from the first element.
names(all_feats_df) <- names(all_feats[[1]])

### Reduce perfectly correlating elements

In [None]:
# Pairwise correlation matrix of all features (cor() defaults are used, so
# missing values are not removed before computing).
cormat <- cor(x = all_feats_df)

### Helper functions for dropping columns and rows with missing values

In [12]:
# Helper: positions of the columns of `df` to keep.
#
# A column is kept unless its number of NA entries equals nrow(df) - 1.
# NOTE(review): presumably the "- 1" accounts for the diagonal of a
# correlation matrix (everything missing except the self-correlation) — confirm.
#
# df:      data frame or matrix.
# returns: unnamed integer vector of column positions.
ColNums_NotAllMissing <- function(df) {
  na_per_column <- colSums(is.na(df))
  dropped_na_count <- nrow(df) - 1
  keep <- na_per_column != dropped_na_count
  as.vector(which(keep))
}

# Keep only the rows of `DF` that contain at most `n` missing values.
#
# DF:      data frame or matrix.
# n:       maximum number of NA entries a row may have and still be kept
#          (default 0, i.e. only complete rows survive).
# returns: `DF` restricted to the qualifying rows, always with the same
#          container type as the input.
delete.na <- function(DF, n = 0) {
  keep_row <- rowSums(is.na(DF)) <= n
  # drop = FALSE: without it a one-column data frame (or a matrix reduced to
  # a single row) would be collapsed to a plain vector.
  DF[keep_row, , drop = FALSE]
}


### Remove elements with all values missing

In [13]:
# Turn the correlation matrix into a data frame and keep only the columns
# selected by ColNums_NotAllMissing().
cordf <- cormat %>%
  data.frame() %>%
  select(ColNums_NotAllMissing(.))

# Drop every row with more than 292 missing correlations.
# NOTE(review): 292 is hard-coded — presumably tied to the current number of
# features; confirm before reusing on other data.
cormat_reduced <- cordf %>%
  delete.na(n = 292) %>%
  as.matrix()

cordf_reduced <- data.frame(cormat_reduced)



In [14]:
# Count the entries of the reduced correlation table that are numerically 1
# (>= 0.999999999999). NA entries are dropped by filter().
# pivot_longer() replaces the superseded gather(); the long table is ordered
# differently, but the count does not depend on row order.
cordf_reduced %>%
  pivot_longer(cols = everything(), names_to = "X", values_to = "value") %>%
  filter(value >= 0.999999999999) %>%
  tally()

n
<int>
527


In [15]:
# Frequency of the value labelled "1" among all entries of the reduced matrix.
table(cormat_reduced)["1"]

### Show number of 1s per column

In [16]:
# For every column X, count how many of its correlations are numerically 1
# (>= 0.999999999999). Columns without any such value do not appear in `out`.
# pivot_longer() replaces the superseded gather(); grouping by X makes the
# result independent of the long table's row order.
out <- cordf_reduced %>%
  pivot_longer(cols = everything(), names_to = "X", values_to = "value") %>%
  filter(value >= 0.999999999999) %>%
  group_by(X) %>%
  tally()

### Keep elements that have exactly one correlation of 1

In [18]:
# Columns with exactly one correlation value counted as 1.
noDups <- out %>%
  filter(n == 1) %>%
  select(X)

# drop = FALSE keeps a data frame even if only one column qualifies; without
# it the result would be a bare vector with no column names.
final_df <- cordf_reduced[, noDups$X, drop = FALSE]

In [19]:
# Names of the retained feature columns.
grouping <- names(final_df)

### Syntactical adjustments

In [20]:
# Replace double quotes, commas, spaces, parentheses and hyphens in the
# feature names with dots, applying the patterns one after another.
# NOTE(review): presumably this mirrors the name mangling data.frame() applied
# to the correlation table, so that `grouping` matches these names — confirm.
names(all_feats_df) <- Reduce(
  function(feature_names, pattern) gsub(pattern, ".", feature_names),
  c('"', ",", " ", "[()]", "-"),
  names(all_feats_df)
)


In [21]:
# Restrict the feature table to the retained columns.
# drop = FALSE keeps a data frame even when `grouping` names a single column.
available_feats <- all_feats_df[, grouping, drop = FALSE]


### Normalization of available features

In [None]:
# Scale the features with py_scaler() and convert the result to a tibble.
# as_tibble() replaces as_data_frame(), which is deprecated in tibble.
# NOTE(review): py_scaler is not defined in this file — presumably it returns
# an unnamed matrix, which is why the names are restored below; confirm.
available_feats_normed <- py_scaler(available_feats) %>% as_tibble()

names(available_feats_normed) <- names(available_feats)

In [23]:
# Add new feats to dataset
# Row i of the normalized feature table, rounded to 7 decimals, becomes the
# `features` entry of series i.
# seq_along() instead of 1:length(): the latter yields c(1, 0) for an empty
# list and would run the body with invalid indices.
for (i in seq_along(SPdata_reduced)) {
  SPdata_reduced[[i]]$features <- round(available_feats_normed[i, ], 7)
}

### Calculate forecasts for SPData

In [24]:

# NOTE(review): only the first 10 series are passed on here — confirm that
# this subset is intended and not a leftover from a trial run.
SPdata_holdout <- temp_holdout(SPdata_reduced[1:10])

# Time the forecast calculation for the listed methods.
# NOTE(review): n.cores is hard-coded to 11.
tic()
SPdata_forecasted <- calc_forecasts(
  SPdata_holdout,
  c(
    "naive_forec",
    "snaive_forec",
    "stlm_ar_forec",
    "ets_forec",
    "rw_drift_forec",
    "thetaf_forec",
    "auto_arima_forec",
    "nnetar_forec"
  ),
  n.cores = 11
)
toc()

#save(SPdata_forecasted, file = '../data/SPdata_forecasted.RData')


13.5 sec elapsed


### Check for NA, Inf, and huge errors

In [None]:
sptrain <- calc_errors(SPdata_forecasted)

# Positions of all series whose mase_err contains NA or an infinite value.
idxlst <- unname(which(vapply(
  sptrain,
  function(series) {
    any(is.na(series$mase_err)) || any(is.infinite(series$mase_err))
  },
  logical(1)
)))

# Remove the flagged series. The guard matters: with nothing flagged, a
# negative index built from an empty vector selects nothing, so the previous
# unconditional `SPdata_forecasted[-idxlst]` dropped every series.
spdat_reduced <- if (length(idxlst) > 0) {
  SPdata_forecasted[-idxlst]
} else {
  SPdata_forecasted
}

#spdat_reduced <- calc_errors(spdat_reduced[-c(23545, 4771, 27102)])

train_data <- create_feat_classif_problem(spdat_reduced)

