Skip to content

Commit

Permalink
Suppress .RData files and update with usethis instead of devtools
Browse files Browse the repository at this point in the history
  • Loading branch information
APierrot committed Jul 2, 2019
1 parent 5cf7551 commit a90e861
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cran-comments.md
Expand Up @@ -2,7 +2,7 @@
* local OS X install, R 3.6.0

## Update
suppress .rda data files, keep only .RData ones
suppress .RData files, keep only .rda ones
(mail from Kurt Hornik, 2019/06/27)

## R CMD check results
Expand Down
165 changes: 165 additions & 0 deletions data-raw/DATASET.R
@@ -0,0 +1,165 @@
## code to prepare `DATASET` dataset goes here

# PACKAGES
library(dplyr)

# DIRECTORY
data_dir <- file.path('/Users', 'amandinepierrot', 'Documents', 'data', 'UK',
'2017_global')

# FUNCTIONS
source(file.path('data-raw', 'trick-functions.R'))


# LOAD DATA --------------------------------------------------------------------

# IMPORT

gb_load <- read.csv(file.path(data_dir, 'DemandData_2011-2016.csv'), sep = ',') %>%
dplyr::select(SETTLEMENT_DATE, SETTLEMENT_PERIOD, ENGLAND_WALES_DEMAND) %>%
dplyr::mutate(SETTLEMENT_DATE = lubridate::dmy(SETTLEMENT_DATE,
tz = 'Europe/London'),
HOURS = (SETTLEMENT_PERIOD - 1) %/% 2,
MINUTES = (SETTLEMENT_PERIOD - 1) %% 2 * 30,
TIMESTAMP = SETTLEMENT_DATE + lubridate::dhours(HOURS) +
lubridate::dminutes(MINUTES)) %>%
dplyr::select(SETTLEMENT_DATE, SETTLEMENT_PERIOD, TIMESTAMP,
ENGLAND_WALES_DEMAND)


# WEATHER DATA -----------------------------------------------------------------

# IMPORT
gb_weather <- read.table(file.path(data_dir, 'meteo', '29657358679dat.txt'),
header = TRUE, fill = TRUE, na.strings = '****') %>%
mutate(code = factor(USAF),
timestamp = lubridate::with_tz(lubridate::ymd_hm(YR..MODAHRMN, tz = 'GMT'),
tzone = 'Europe/London'),
value = weathermetrics::fahrenheit.to.celsius(TEMP)) %>%
select(code, timestamp, value)

# INTERPOLATION
time_index <- seq(lubridate::floor_date(gb_weather$timestamp[1], unit = 'hour'),
lubridate::floor_date(gb_weather$timestamp[nrow(gb_weather)],
unit = 'hour'),
by = '30 min')
data_with_info <- detect_NA(gb_weather)
interpol_data <- data.frame(code = rep(unique(data_with_info[['code']]),
each = length(time_index)),
timestamp = time_index,
value = NA,
mv = NA)
for (c in unique(interpol_data[['code']])) {
interpol_idx <- interpol_data[['code']] == c
original_idx <- data_with_info[['code']] == c
interpol_data[interpol_idx, 'value'] <- approx(
data_with_info[original_idx, 'timestamp'],
data_with_info[original_idx, 'value'],
xout = time_index)$y
interpol_data[interpol_idx, 'mv'] <- approx(
data_with_info[original_idx, 'timestamp'],
data_with_info[original_idx, 'mv'],
xout = time_index)$y > 0
}

# WEIGHTED TEMPERATURE
station_pop <- read.table(file.path(data_dir, 'meteo',
'corr_meteo_town_code.txt'),
sep = '\t', header = TRUE)
# two stations in Liverpool
station_pop[station_pop[['town']] == 'LIVERPOOL',
'pop'] <- station_pop[station_pop[['town']] == 'LIVERPOOL',
'pop'] / 2
station_pop[station_pop[['town']] == 'LIVERPOOL_CROSBY',
'pop'] <- station_pop[station_pop[['town']] == 'LIVERPOOL', 'pop']
station_pop <- station_pop %>%
mutate(weight = pop / sum(pop))
raw_temperatures <- tidyr::spread(interpol_data[, c('code', 'timestamp', 'value')],
code, value)
is_mv <- tidyr::spread(interpol_data[, c('code', 'timestamp', 'mv')],
code, mv)
gb_weather <- data.frame(TIMESTAMP = interpol_data[['timestamp']],
TEMPERATURE = as.matrix(raw_temperatures[, -1]) %*%
station_pop[['weight']],
MV = as.matrix(is_mv[, -1]) %*%
station_pop[['weight']])


# BOTH DATA --------------------------------------------------------------------
gb_load <- dplyr::left_join(gb_load, gb_weather, by = 'TIMESTAMP') %>%
distinct()


# DAY TYPES --------------------------------------------------------------------
bank_hol <- timeDate::holidayLONDON(
unique(lubridate::year(gb_load[['TIMESTAMP']]))
)
bank_hol <- lubridate::with_tz(as.POSIXct(bank_hol), tzone = 'Europe/London')
gb_load <- gb_load %>%
mutate(wday7 = lubridate::wday(TIMESTAMP),
bh = ifelse(SETTLEMENT_DATE %in% bank_hol, 1, 0),
DAY_TYPE = ifelse(bh == 0, wday7, 8)) %>%
select(-wday7, -bh)
rm(bank_hol)

# save(gb_load, file = file.path('data', 'gb_load.RData'))



# CLUSTERS ---------------------------------------------------------------------
cluster_data <- gb_load %>%
select(SETTLEMENT_DATE, DAY_TYPE) %>%
mutate(MONTH = month(SETTLEMENT_DATE),
season_clust = ifelse(MONTH %in% c(1:3, 11:12), 1, 2)) %>%
select(-MONTH) %>%
distinct()

Y_info <- cluster_data[2:nrow(cluster_data), ] %>%
select(-SETTLEMENT_DATE) %>%
rename(Y_dt = DAY_TYPE,
Y_sc = season_clust)
X_info <- cluster_data[1:(nrow(cluster_data) - 1), ] %>%
select(-SETTLEMENT_DATE) %>%
rename(X_dt = DAY_TYPE,
X_sc = season_clust)

clusters <- cbind(Y_info, X_info) %>%
distinct() %>%
filter(Y_dt != 8,
X_dt != 8)
clusters <- data.frame(clust = 1:nrow(clusters),
clusters)

begin_pred <- which(as.character(
cluster_data[2:nrow(cluster_data), 'SETTLEMENT_DATE']) == '2016-01-01')

# clusters on train data
clust_train <- find_id(Y_info = Y_info[1:(begin_pred - 1), ],
X_info = X_info[1:(begin_pred - 1), ],
clusters = clusters)
n_byclust <- sapply(clust_train, length)
clust_train <- clust_train[n_byclust > 5]

# save(clust_train, file = file.path('data', 'clust_train.RData'))


# clusters on test data
clust_test <- find_id(Y_info = Y_info[begin_pred:nrow(Y_info), ],
X_info = X_info[begin_pred:nrow(X_info), ],
clusters = clusters)
clust_test <- clust_test[n_byclust > 5]
# keep only clusters with 5 obs for fitting

# save(clust_test, file = file.path('data', 'clust_test.RData'))


# clusters on all data
clust <- find_id(Y_info = Y_info,
X_info = X_info,
clusters = clusters)
# keep only clusters with 5 obs on fitting period
clust <- clust[n_byclust > 5]

usethis::use_data("DATASET")


16 changes: 8 additions & 8 deletions data-raw/devtools-tricks.R
@@ -1,11 +1,11 @@
devtools::use_data_raw()
usethis::use_data_raw()
usethis::use_data(gb_load)
usethis::use_data(clust_train)
usethis::use_data(clust_test)
devtools::document()
devtools::use_data(gb_load, overwrite = TRUE)
devtools::use_data(clust_train)
devtools::use_data(clust_test)
devtools::use_package('magrittr')
devtools::use_package('lubridate')
devtools::use_package('dplyr')
devtools::use_package('stats')
usethis::use_package('magrittr')
usethis::use_package('lubridate')
usethis::use_package('dplyr')
usethis::use_package('stats')
usethis::use_build_ignore('cran-comments.md')
usethis::use_build_ignore('NEWS.md')
9 changes: 5 additions & 4 deletions data-raw/prepare-data.R
Expand Up @@ -3,7 +3,8 @@
library(dplyr)

# DIRECTORY
data_dir <- file.path('/Users', 'amandinepierrot', 'Documents', 'donnees', 'UK')
data_dir <- file.path('/Users', 'amandinepierrot', 'Documents', 'data', 'UK',
'2017_global')

# FUNCTIONS
source(file.path('data-raw', 'trick-functions.R'))
Expand Down Expand Up @@ -100,7 +101,7 @@ gb_load <- gb_load %>%
select(-wday7, -bh)
rm(bank_hol)

save(gb_load, file = file.path('data', 'gb_load.RData'))
# save(gb_load, file = file.path('data', 'gb_load.RData'))



Expand Down Expand Up @@ -138,7 +139,7 @@ clust_train <- find_id(Y_info = Y_info[1:(begin_pred - 1), ],
n_byclust <- sapply(clust_train, length)
clust_train <- clust_train[n_byclust > 5]

save(clust_train, file = file.path('data', 'clust_train.RData'))
# save(clust_train, file = file.path('data', 'clust_train.RData'))


# clusters on test data
Expand All @@ -148,7 +149,7 @@ clust_test <- find_id(Y_info = Y_info[begin_pred:nrow(Y_info), ],
clust_test <- clust_test[n_byclust > 5]
# keep only clusters with 5 obs for fitting

save(clust_test, file = file.path('data', 'clust_test.RData'))
# save(clust_test, file = file.path('data', 'clust_test.RData'))


# clusters on all data
Expand Down
Binary file removed data/clust_test.RData
Binary file not shown.
Binary file added data/clust_test.rda
Binary file not shown.
Binary file removed data/clust_train.RData
Binary file not shown.
Binary file added data/clust_train.rda
Binary file not shown.
Binary file removed data/gb_load.RData
Binary file not shown.
Binary file added data/gb_load.rda
Binary file not shown.

0 comments on commit a90e861

Please sign in to comment.