Support datatypes: array, map, struct #135
Added initial support in a development branch:
library(DBI)
con <- dbConnect(noctua::athena(), convert_array = TRUE)
query1 <- "SELECT
ARRAY [CAST(4 AS VARCHAR), CAST(5 AS VARCHAR)]
AS items"
query2 <-
"SELECT
ARRAY[CAST(MAP(ARRAY['a1', 'a2', 'a3'], ARRAY[1, 2, 3]) AS JSON)] ||
ARRAY[CAST(MAP(ARRAY['b1', 'b2', 'b3'], ARRAY[4, 5, 6]) AS JSON)]
AS items"
query3 <- "SELECT
CAST(
ROW('Bob', 38) AS ROW(name VARCHAR, age INTEGER)
) AS users"
query4 <- "SELECT ARRAY[
CAST(ROW('Bob', 38) AS ROW(name VARCHAR, age INTEGER)),
CAST(ROW('Alice', 35) AS ROW(name VARCHAR, age INTEGER)),
CAST(ROW('Jane', 27) AS ROW(name VARCHAR, age INTEGER))
] AS users"
query5 <- "SELECT
CAST(
ROW('aws.amazon.com', ROW(true)) AS ROW(hostname VARCHAR, flaggedActivity ROW(isNew BOOLEAN))
) AS sites"
query6 <-
"SELECT
sites.hostname,
sites.flaggedactivity.isnew
FROM (
SELECT
CAST(
ROW('aws.amazon.com', ROW(true)) AS ROW(hostname VARCHAR, flaggedActivity ROW(isNew BOOLEAN))
) AS sites
) as dataset"
query7 <- "SELECT MAP(
ARRAY['first', 'last', 'age'],
ARRAY['Bob', 'Smith', '35']
) AS user"
df1 <- dbGetQuery(con, query1)
# Info: (Data scanned: 0 Bytes)
df2 <- dbGetQuery(con, query2)
# Info: (Data scanned: 0 Bytes)
df3 <- dbGetQuery(con, query3)
# Info: (Data scanned: 0 Bytes)
df4 <- dbGetQuery(con, query4)
# Info: (Data scanned: 0 Bytes)
# Warning message:
# Column `users` was unable to be converted. Returning column `users` as character
df5 <- dbGetQuery(con, query5)
# Info: (Data scanned: 0 Bytes)
df6 <- dbGetQuery(con, query6)
# Info: (Data scanned: 0 Bytes)
df7 <- dbGetQuery(con, query7)
# Info: (Data scanned: 0 Bytes)
(A link to the AWS Athena SQL query documentation and the raw output Athena returns for the above queries were attached in collapsed sections.)
The R formats returned:
df1:
# items
# 1: <list[2]>
df1$items[[1]]
# [[1]]
# [1] 4
#
# [[2]]
# [1] 5
df2:
# items
# 1: <list[2]>
df2$items[[1]]
# [[1]]
# [[1]]$a1
# [1] 1
#
# [[1]]$a2
# [1] 2
#
# [[1]]$a3
# [1] 3
#
#
# [[2]]
# [[2]]$b1
# [1] 4
#
# [[2]]$b2
# [1] 5
#
# [[2]]$b3
# [1] 6
df3:
# users
# 1: {name=Bob, age=38}
df4:
# users
# 1: [{name=Bob, age=38}, {name=Alice, age=35}, {name=Jane, age=27}]
df5:
# sites
# 1: {hostname=aws.amazon.com, flaggedactivity={isnew=true}}
df6:
# hostname isnew
# 1: aws.amazon.com TRUE
df7:
# user
# 1: {last=Smith, first=Bob, age=35}
As JSON can be parsed in different ways, it might be worth letting the user provide a custom parser if they don't like the default. For example:
library(DBI)
# use default parse: jsonlite::parse_json
con1 <- dbConnect(noctua::athena())
# leave arrays as characters
con2 <- dbConnect(noctua::athena(), convert_array = NULL)
# use custom json parser
con3 <- dbConnect(noctua::athena(), convert_array = jsonlite::fromJSON)
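A sketch of what the difference would look like in practice, reusing query1 from the earlier comment (the output shapes here are assumptions based on the comments above, not output captured from the thread):
# con1: arrays come back as list columns, parsed by the default jsonlite::parse_json
df_default <- dbGetQuery(con1, query1)
# con2: arrays are left as character columns for the user to parse themselves
df_character <- dbGetQuery(con2, query1)
# con3: arrays are parsed with the user-supplied jsonlite::fromJSON
df_custom <- dbGetQuery(con3, query1)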
Another question: Should
Added support for more AWS Athena data types: varbinary, ipaddress and json. I am pretty happy with the binary method currently proposed in the branch:
library(DBI)
library(data.table)
# default binary conversion connection
con1 <- dbConnect(noctua::athena())
query <- "SELECT to_utf8('helloworld') as hi"
dt1 <- dbGetQuery(con1, query)
dt1
# hi
# 1: 68,65,6c,6c,6f,77,...
sapply(dt1, class)
# hi
# "list"
con2 <- dbConnect(noctua::athena(), binary = "character")
dt2 <- dbGetQuery(con2, query)
dt2
# hi
# 1: 68 65 6c 6c 6f 77 6f 72 6c 64
sapply(dt2, class)
# hi
# "character" The default binary data type conversion method, returns a list of raw vectors. This makes it really easy for R users to convert it back to what ever. dt1[, string := lapply(hi, rawToChar)]
dt1
# hi string
# 1: 68,65,6c,6c,6f,77,... helloworld
For completeness, here is the dplyr equivalent of the data.table example above:
library(DBI)
library(dplyr)
noctua::noctua_options("vroom")
# default binary conversion connection
con1 <- dbConnect(noctua::athena())
query <- "SELECT to_utf8('helloworld') as hi"
dt1 <- dbGetQuery(con1, query)
dt1
# A tibble: 1 x 1
# hi
# <list>
#1 <raw [10]>
con2 <- dbConnect(noctua::athena(), binary = "character")
dt2 <- dbGetQuery(con2, query)
dt2
# A tibble: 1 x 1
# hi
# <chr>
#1 68 65 6c 6c 6f 77 6f 72 6c 64
For the dplyr approach a nested column of raw vectors is provided. To get the data out, users can do the following:
library(purrr)
dt1 %>% mutate(string = map_chr(hi, rawToChar))
# A tibble: 1 x 2
# hi string
# <list> <chr>
#1 <raw [10]> helloworld
Wow, looks amazing! You sure work fast with these :)
@OssiLehtinen been pondering on it all last night, thinking it over :P I am guessing you would say the same for the other methods, and I am more than happy to have it in.
… and json on the fly without the need to create a new connection. (#135)
Added parameters to noctua_options() so the bigint, binary and json conversion methods can be changed on the fly:
library(DBI)
library(noctua)
# default conversion methods
con <- dbConnect(noctua::athena())
# change json conversion method
noctua_options(json = "character")
noctua:::athena_option_env$json
# [1] "character"
# change json conversion to custom method
noctua_options(json = jsonify::from_json)
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024)
# {
# json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
# <environment: namespace:jsonify>
# change bigint conversion without affecting custom json conversion methods
noctua_options(bigint = "numeric")
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024)
# {
# json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
# <environment: namespace:jsonify>
noctua:::athena_option_env$bigint
# [1] "numeric"
# change binary conversion without affecting bigint or json methods
noctua_options(binary = "character")
noctua:::athena_option_env$json
# function (json, simplify = TRUE, fill_na = FALSE, buffer_size = 1024)
# {
# json_to_r(json, simplify, fill_na, buffer_size)
# }
# <bytecode: 0x7f823b9f6830>
# <environment: namespace:jsonify>
noctua:::athena_option_env$bigint
# [1] "numeric"
noctua:::athena_option_env$binary
# [1] "character" Note |
Looks great after some testing too! It is a shame that Athena returns the weirdly formatted stuff with structs and maps. We tried brainstorming a bit with some colleagues on how to parse those but couldn't come up with anything even remotely elegant. One note on the bigint part:
When I set the bigint conversion to integer and query a table with big bigints (which overflow 32-bit ints), I get the following message and the column remains int64:
So the behaviour and the returned data types depend on the data entries, which makes things a bit unpredictable. Of course it is a silly thing to cast an overflowing int64 to int, but still. With the vroom parser, I get consistent integer columns, but the overflowing entries come out as NAs. I'm not sure which is better, to be honest.
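For illustration, one rough (and decidedly inelegant) way such a struct string could be parsed — a hypothetical helper, not part of noctua, which assumes a flat struct with no nested braces and no commas or equals signs inside the values:
parse_struct <- function(x) {
  inner <- gsub("^\\{|\\}$", "", x)                  # strip the outer braces
  pairs <- strsplit(inner, ", ", fixed = TRUE)[[1]]  # split into key=value pairs
  kv <- strsplit(pairs, "=", fixed = TRUE)           # split each pair on "="
  setNames(lapply(kv, `[`, 2), vapply(kv, `[`, "", 1))
}
parse_struct("{name=Bob, age=38}")
# $name
# [1] "Bob"
#
# $age
# [1] "38"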
Ah thanks @OssiLehtinen. I am currently looking at how to improve performance of the current conversion methods, possibly with some improvements to the binary and json methods 😄. I will have a look at the bigint behaviour as well.
Three different methods for parsing a vector of JSON strings:
# method 1
# collapse vector of json strings and create 1 big json:
method_1 <- function(string) jsonlite::parse_json(paste0("[", paste(string, collapse = ","), "]"))
# method 2
# parse each json object
method_2 <- function(string) lapply(string, jsonlite::parse_json)
# method 3
# chunk up json strings then collapse json chunks before parsing them
method_3 <- function(string, fun=jsonlite::parse_json, min_chunk = 10000L){
if(length(string) < min_chunk){
output <- fun(paste0("[", paste(string, collapse = ","), "]"))
} else {
len <- max(ceiling(length(string)/20), min_chunk)
split_string <- split_vec(string, len)
output <- unlist(
lapply(split_string, function(i) fun(create_json_string(i))),
recursive = FALSE
)
}
return(output)
}
split_vec <- function(vec, len, max_len = length(vec)){
chunks <- seq(1, max_len, len)
ll <- Map(function(i) list(), 1:length(chunks))
for (i in seq_along(chunks))
ll[[i]] <- vec[chunks[i]:min(chunks[i]+(len-1), max_len)]
return(ll)
}
create_json_string <- function(string){paste0("[", paste(string, collapse = ","), "]")}
(Benchmark charts for method 1, method 2 and method 3 were attached as images.)
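A quick sanity check that the chunked method agrees with the plain paste method (a sketch; min_chunk is lowered here purely to force the chunked code path):
s <- rep(jsonlite::toJSON(iris[1, ]), 50)
identical(method_1(s), method_3(s, min_chunk = 10L))
# expect TRUE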
library(microbenchmark)
library(ggplot2)
n <- 1e6 # sizes tested: 10; 10,000; 1,000,000
string <- rep(jsonlite::toJSON(iris[1,]), n)
bench <- microbenchmark(
"paste method" = method_1(string),
"lapply method" = method_2(string),
"chunk paste method" = method_3(string),
times = 100
)
autoplot(bench) +
labs(title = "Speed of parsing json string",
       subtitle = sprintf("Number of json strings: %s", n))
edit: updated charts to reflect the updated method 3
From these benchmarks it looks like the paste method and the chunk paste method are the winners, and the overhead of chunking the vector doesn't seem noticeable. From these results the chunk paste method looks like the one to go with.
Quick benchmark with the json parser and binary parser in place:
library(data.table)
test_data <- function(N = 10000L, seed = 142L){
set.seed(seed)
data.table(
id=1:N,
original=sapply(1:N, function(x) paste(sample(letters, sample(5:10)), collapse = ",")),
json = jsonlite::toJSON(iris[1,]))
}
output_test_data <- function(N = 10000L, temp_file){
dt_list <- test_data(N)
dt_list[, raw := lapply(original, charToRaw)]
dt_list[, raw_string := sapply(raw, function(x) paste(x, collapse = " "))]
fwrite(dt_list[,.(id, original, raw_string, json)],
temp_file,
quote = TRUE)
}
test_file <- tempfile()
size <- 1e6
output_test_data(size, test_file)
data_type <- list(list(Name = c("id", "original", "raw_string", "json"),
Type = c("integer", "string", "varbinary", "json")))
method <- "method"
class(method) <- "athena_data.table"
# noctua::noctua_options(json="auto")
# noctua::noctua_options(binary="raw")
system.time({
dt <- noctua:::athena_read.athena_data.table(
method,
test_file,
data_type)
})
# user system elapsed
# 23.470 0.659 24.166
# noctua::noctua_options(binary="raw")
noctua::noctua_options(json="character")
system.time({
dt <- noctua:::athena_read.athena_data.table(
method,
test_file,
data_type)
})
# user system elapsed
# 14.639 0.232 15.268
noctua::noctua_options(binary="character")
noctua::noctua_options(json="character")
system.time({
dt <- noctua:::athena_read.athena_data.table(
method,
test_file,
data_type)
})
# user system elapsed
# 7.824 0.053 7.899
The slowest parser is converting the "raw" string to actual raw vectors, because the conversion has to be called on each string individually. I don't believe it can be called in chunks, but if it could, that would speed up the conversion. Overall the conversions don't slow down reading that much, and users can always turn them off and implement their own if desired :)
…lises chunks to help with memory limitations (#135)
I think I have thought of a way to improve the speed of the raw conversion:
library(data.table)
N <- 1e6
set.seed(142)
dt_list <- data.table(
id=1:N,
original=sapply(1:N, function(x) paste(sample(letters, sample(5:10)), collapse = ",")))
dt_list[, raw := lapply(original, charToRaw)]
dt_list[, raw_string := sapply(raw, function(x) paste(x, collapse = " "))]
# method 1:
# Takes a string and converts it to raw
hex2raw <- function(string){
split_str = strsplit(string, split = " ", fixed = TRUE)
return(lapply(split_str, function(x) as.raw(as.hexmode(x))))
}
# method 2:
hex2raw_v2 <- function(string){
split_str <- strsplit(string, split = " ", fixed = TRUE)
output <- as.raw(as.hexmode(unlist(split_str)))
split_raw_v1(output, sapply(split_str, length))
}
# helper function to split raw vector back into list format
split_raw_v1 <- function(vec, splits){
ll <- Map(function(i) list(), 1:length(splits))
Slices <- cumsum(c(1, splits))
for (i in seq_along(splits))
ll[[i]] <- vec[Slices[i]:(Slices[i+1]-1)]
return(ll)
}
# method 3:
hex2raw_v3 <- function(string){
split_str <- strsplit(string, split = " ", fixed = TRUE)
output <- as.raw(as.hexmode(unlist(split_str)))
split_raw_v2(output, sapply(split_str, length))
}
split_raw_v2 <- function(vec, splits){
start <- cumsum(c(1, splits))
lapply(seq_along(splits), function(i) vec[start[i]:(start[i+1]-1)])
}
# method 4:
hex2raw_v4 <- function(string){
split_str <- strsplit(string, split = " ", fixed = TRUE)
output <- as.raw(as.hexmode(unlist(split_str)))
split_raw_v3(output, sapply(split_str, length))
}
# helper function to split raw vector back into list format
split_raw_v3 <- function(vec, splits){
start <- cumsum(c(1, splits))
end <- start[-1]-1
lapply(seq_along(splits), function(i) vec[start[i]:end[i]])
}
(Benchmark charts for methods 1-4 were attached as images.)
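A sketch of how the four methods could be compared, mirroring the earlier microbenchmark setup (the subset size is an assumption to keep the run short):
library(microbenchmark)
s <- dt_list$raw_string[1:10000]
# all four methods should agree on the result
identical(hex2raw(s), hex2raw_v4(s))
# expect TRUE
bench <- microbenchmark(
  "method 1" = hex2raw(s),
  "method 2" = hex2raw_v2(s),
  "method 3" = hex2raw_v3(s),
  "method 4" = hex2raw_v4(s),
  times = 10
)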
…to turn off rstudio connection tab view (#136)
Merged PR #138
Not 100% sure what to do with data.table and bigint, as I can't find a method to force integer.
I'll possibly just document this behaviour, so users are aware of what will happen when they cast a big integer to a plain integer and the cast simply fails.
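To illustrate the failure mode outside of Athena (a minimal sketch, assuming the bit64 package):
library(bit64)
big <- as.integer64("3000000000") # larger than .Machine$integer.max (2147483647)
as.integer(big)
# expect NA, with an integer-overflow warning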
Sounds reasonable. It also helps that data.table alerts the user if casting fails.
…for more information
Currently noctua doesn't support the AWS Athena data types array, map and struct.