Skip to content

Commit

Permalink
convert numeric to character value in tribble, swap read_csv for vroo…
Browse files Browse the repository at this point in the history
…m for speed
  • Loading branch information
shanepiesik committed Feb 22, 2020
1 parent 074ceba commit d5900df
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 65 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
^README\.Rmd$
^README-.*\.png$
^\.travis\.yml$
^baseballr\.Rproj$
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ Imports:
XML,
xml2,
pbapply,
tibble
tibble,
vroom
License: MIT + file LICENSE
URL: https://billpetti.github.io/baseballr/
BugReports: https://github.com/BillPetti/baseballr/issues
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,5 @@ importFrom(tibble,tribble)
importFrom(tidyr,gather)
importFrom(tidyr,separate)
importFrom(tidyr,spread)
importFrom(vroom,vroom)
importFrom(xml2,read_html)
113 changes: 58 additions & 55 deletions R/scrape_statcast.R
Original file line number Diff line number Diff line change
@@ -1,37 +1,38 @@
#' Query Statcast and PITCHf/x Data for data from \url{http://baseballsavant.mlb.com}
#'
#' This function allows you to query Statcast and PITCHf/x data as provided on \url{http://baseballsavant.mlb.com} and have that data returned as a \code{\link{data.frame}}.
#' @param start_date Date of first game for which you want data.
#' @param start_date Date of first game for which you want data.
#' Format must be in YYYY-MM-DD format.
#' @param end_date Date of last game for which you want data.
#' @param end_date Date of last game for which you want data.
#' Format must be in YYYY-MM-DD format.
#' @param playerid The MLBAM ID for the player whose data you want to query.
#' @param player_type The player type. Can be \code{batter} or \code{pitcher}.
#' @param player_type The player type. Can be \code{batter} or \code{pitcher}.
#' Default is \code{batter}
#' @param ... currently ignored
#' @keywords MLB, sabermetrics, Statcast
#' @importFrom tibble tribble
#' @importFrom lubridate year
#' @importFrom vroom vroom
#' @export
#' @examples
#' \dontrun{
#' correa <- scrape_statcast_savant(start_date = "2016-04-06",
#' correa <- scrape_statcast_savant(start_date = "2016-04-06",
#' end_date = "2016-04-15", playerid = 621043)
#'
#' noah <- scrape_statcast_savant(start_date = "2016-04-06",
#' noah <- scrape_statcast_savant(start_date = "2016-04-06",
#' end_date = "2016-04-15", playerid = 592789, player_type = 'pitcher')
#'
#' daily <- scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-06")
#' }

scrape_statcast_savant <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
playerid = NULL,
scrape_statcast_savant <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
playerid = NULL,
player_type = "batter", ...) UseMethod("scrape_statcast_savant")

#' @rdname scrape_statcast_savant
#' @export

scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
playerid = NULL, player_type = "batter", ...) {
# Check for other user errors.
if (start_date <= "2015-03-01") { # March 1, 2015 was the first date of Spring Training.
Expand All @@ -48,71 +49,71 @@ scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date =
stop("The start date is later than the end date.")
return(NULL)
}
playerid_var <- ifelse(player_type == "pitcher",

playerid_var <- ifelse(player_type == "pitcher",
"pitchers_lookup%5B%5D", "batters_lookup%5B%5D")

vars <- tibble::tribble(
~var, ~value,
"all", "true",
"hfPT", "",
"hfAB", "",
"hfBBT", "",
"hfPR", "",
"hfZ", "",
"stadium", "",
"hfBBL", "",
"hfPT", "",
"hfAB", "",
"hfBBT", "",
"hfPR", "",
"hfZ", "",
"stadium", "",
"hfBBL", "",
"hfNewZones", "",
"hfGT", "R%7CPO%7CS%7C&hfC",
"hfSea", paste0(lubridate::year(start_date), "%7C"),
"hfSit", "",
"hfOuts", "",
"opponent", "",
"pitcher_throws", "",
"batter_stands", "",
"hfSA", "",
"hfSit", "",
"hfOuts", "",
"opponent", "",
"pitcher_throws", "",
"batter_stands", "",
"hfSA", "",
"player_type", player_type,
"hfInfield", "",
"team", "",
"position", "",
"hfOutfield", "",
"hfRO", "",
"hfInfield", "",
"team", "",
"position", "",
"hfOutfield", "",
"hfRO", "",
"home_road", "",
playerid_var, ifelse(is.null(playerid), "", playerid),
playerid_var, ifelse(is.null(playerid), "", as.character(playerid)),
"game_date_gt", as.character(start_date),
"game_date_lt", as.character(end_date),
"hfFlag", "",
"hfPull", "",
"metric_1", "",
"hfInn", "",
"min_pitches", 0,
"min_results", 0,
"hfFlag", "",
"hfPull", "",
"metric_1", "",
"hfInn", "",
"min_pitches", "0",
"min_results", "0",
"group_by", "name",
"sort_col", "pitches",
"player_event_sort", "h_launch_speed",
"sort_order", "desc",
"min_abs", 0,
"type", "details"
) %>%
dplyr::mutate_(pairs = ~paste(var, "=", value, sep = ""))

"min_abs", "0",
"type", "details") %>%
dplyr::mutate(pairs = paste0(var, "=", value))

if (is.null(playerid)) {
message("No playerid specified. Collecting data for all batters/pitchers.")
vars <- dplyr::filter_(vars, ~!grepl("lookup", var))
vars <- vars %>% dplyr::filter(!grepl("lookup", var))
}

url_vars <- paste0(vars$pairs, collapse = "&")
url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?", url_vars)
message(url)

# Do a try/catch to show errors that the user may encounter while downloading.
tryCatch(
{
message("These data are from BaseballSavant and are property of MLB Advanced Media, L.P. All rights reserved.")
message("Grabbing data, this may take a minute...")
suppressMessages(
suppressWarnings(
payload <- readr::read_csv(url, na = "null")
# use vroom::vroom for significant speed improvment
payload <- vroom::vroom(url, delim = ",")
)
)
},
Expand All @@ -128,7 +129,9 @@ scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date =
message(cond)
}
)
if (ncol(payload) > 1) {
# adjust to handle that vroom::vroom
# returns 0 rows on failure but > 1 columns
if (nrow(payload) > 1) {
message("URL read and payload acquired successfully.")
return(process_statcast_payload(payload))
} else {
Expand All @@ -141,7 +144,7 @@ scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date =
#' @rdname scrape_statcast_savant
#' @export

scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(),
playerid = NULL, player_type = "batter", ...) {
# Check to make sure args are in the correct format.
# if(!is.character(start_date) | !is.character(end_date)) {
Expand All @@ -150,7 +153,7 @@ scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date
# }
message(paste0(start_date, " is not a date. Attempting to coerce..."))
start_Date <- as.Date(start_date)

tryCatch(
{
end_Date <- as.Date(end_date)
Expand All @@ -163,7 +166,7 @@ scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date
}
)

scrape_statcast_savant(start_Date, end_Date,
scrape_statcast_savant(start_Date, end_Date,
playerid, player_type, ...)

}
Expand All @@ -174,20 +177,20 @@ scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date
#' @export
#' @examples
#' \dontrun{
#' correa <- scrape_statcast_savant_batter(start_date = "2016-04-06",
#' correa <- scrape_statcast_savant_batter(start_date = "2016-04-06",
#' end_date = "2016-04-15", batterid = 621043)
#' }

scrape_statcast_savant_batter <- function(start_date, end_date, batterid, ...) {
scrape_statcast_savant(start_date, end_date, playerid = batterid,
scrape_statcast_savant(start_date, end_date, playerid = batterid,
player_type = "batter", ...)
}

#' @rdname scrape_statcast_savant
#' @export
#' @examples
#' \dontrun{
#' daily <- scrape_statcast_savant_batter_all(start_date = "2016-04-06",
#' daily <- scrape_statcast_savant_batter_all(start_date = "2016-04-06",
#' end_date = "2016-04-06")
#' }

Expand All @@ -200,20 +203,20 @@ scrape_statcast_savant_batter_all <- function(start_date, end_date, ...) {
#' @export
#' @examples
#' \dontrun{
#' noah <- scrape_statcast_savant_pitcher(start_date = "2016-04-06",
#' noah <- scrape_statcast_savant_pitcher(start_date = "2016-04-06",
#' end_date = "2016-04-15", pitcherid = 592789)
#' }

scrape_statcast_savant_pitcher <- function(start_date, end_date, pitcherid, ...) {
scrape_statcast_savant(start_date, end_date, playerid = pitcherid,
scrape_statcast_savant(start_date, end_date, playerid = pitcherid,
player_type = "pitcher", ...)
}

#' @rdname scrape_statcast_savant
#' @export
#' @examples
#' \dontrun{
#' daily <- scrape_statcast_savant_pitcher_all(start_date = "2016-04-06",
#' daily <- scrape_statcast_savant_pitcher_all(start_date = "2016-04-06",
#' end_date = "2016-04-06")
#' }

Expand Down
18 changes: 9 additions & 9 deletions man/scrape_statcast_savant.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d5900df

Please sign in to comment.