Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleanup to get closer to CRAN submission #75

Merged
merged 21 commits into from Jun 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -1,2 +1,4 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^README-.*\.png$
16 changes: 13 additions & 3 deletions DESCRIPTION
@@ -1,7 +1,12 @@
Package: baseballr
Title: Functions for acquiring and analyzing baseball data
Version: 0.3.3
Version: 0.3.3.9002
Author: Bill Petti <billpetti@gmail.com>
Authors@R: c(
person("Bill", "Petti", email = "billpetti@gmail.com",
role = c("aut", "cre")),
person("Ben", "Baumer", email = "ben.baumer@gmail.com", role = c("ctb")),
person("Ben", "Dilday", email = "ben.dilday.phd@gmail.com", role = "ctb"))
Maintainer: Bill Petti <billpetti@gmail.com>
Description: Provides numerous functions for acquiring and analyzing baseball
data. Data can be acquired from various online sources from within R.
Expand All @@ -10,22 +15,27 @@ Depends:
R (>= 3.2.0)
Imports:
dplyr,
ggplot2,
stringr,
tidyr,
lubridate,
pitchRx,
readr,
reldist,
rvest,
XML,
xml2,
magrittr,
pbapply,
tibble,
highcharter
License: MIT
License: MIT + file LICENSE
URL: https://billpetti.github.io/baseballr/
BugReports: https://github.com/BillPetti/baseballr/issues
LazyData: true
RoxygenNote: 6.0.1
Suggests:
knitr,
rmarkdown
rmarkdown,
testthat
VignetteBuilder: knitr
Empty file removed EXAMPLES/.Rapp.history
Empty file.
Binary file removed EXAMPLES/betts_angle_speed_year.png
Binary file not shown.
2 changes: 2 additions & 0 deletions LICENSE
@@ -0,0 +1,2 @@
YEAR: 2018
COPYRIGHT HOLDER: Bill Petti
7 changes: 6 additions & 1 deletion NAMESPACE
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(scrape_statcast_savant,Date)
S3method(scrape_statcast_savant,default)
export("%<>%")
export("%>%")
export(batter_boxscore)
Expand All @@ -23,6 +25,7 @@ export(ncaa_scrape)
export(pitcher_boxscore)
export(pitcher_game_logs_fg)
export(playerid_lookup)
export(process_statcast_payload)
export(run_expectancy_code)
export(run_expectancy_table)
export(school_id_lu)
Expand Down Expand Up @@ -88,13 +91,15 @@ importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(pbapply,pbsapply)
importFrom(pitchRx,scrape)
importFrom(readr,read_csv)
importFrom(reldist,gini)
importFrom(rvest,html_node)
importFrom(rvest,html_nodes)
importFrom(rvest,html_table)
importFrom(rvest,html_text)
importFrom(stats,setNames)
importFrom(stringr,str_count)
importFrom(tibble,tribble)
importFrom(tidyr,gather)
importFrom(tidyr,separate)
importFrom(utils,read.csv)
importFrom(xml2,read_html)
697 changes: 697 additions & 0 deletions NEWS.md

Large diffs are not rendered by default.

9 changes: 6 additions & 3 deletions R/batter_boxscore.R
Expand Up @@ -4,12 +4,15 @@
#'
#' @param x A boxscore.xml url for a given game from the MLBAM GameDay app data.
#' @keywords MLB, PITCHf/x, Game Day, boxscore, sabermetrics
#' @importFrom XML xmlToList
#' @importFrom XML xmlParse
#' @importFrom XML xmlToList xmlParse
#' @importFrom dplyr bind_rows
#' @export
#' @examples
#' batter_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml")
#' # batters
#' url_base <- "http://gd2.mlb.com/components/game/mlb/"
#' url <- paste0(url_base,
#' "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml")
#' batter_boxscore(url)


batter_boxscore <- function(x) {
Expand Down
3 changes: 2 additions & 1 deletion R/fg_park_hand.R
Expand Up @@ -2,6 +2,7 @@
#'
#' This function allows you to scrape park factors by handedness from FanGraphs.com for a given single year.
#' @param yr Season for which you want to scrape the park factors.
#' @importFrom stats setNames
#' @keywords MLB, sabermetrics
#' @export
#' @examples
Expand All @@ -11,7 +12,7 @@ fg_park_hand <- function(yr) {
read_html(paste0("http://www.fangraphs.com/guts.aspx?type=pfh&teamid=0&season=", yr)) %>%
html_node(xpath = '//*[(@id = "GutsBoard1_dg1_ctl00")]') %>%
html_table %>%
setNames(c("season", "home_team", "single_as_LHH", "single_as_RHH",
stats::setNames(c("season", "home_team", "single_as_LHH", "single_as_RHH",
"double_as_LHH", "double_as_RHH", "triple_as_LHH", "triple_as_RHH",
"hr_as_LHH", "hr_as_RHH"))
}
31 changes: 17 additions & 14 deletions R/label_statcast_imputed_data.R
Expand Up @@ -6,35 +6,38 @@
#' have been imputed.
#'
#' @param statcast_df A dataframe containing Statcast batted ball data
#' @param impute_file A csv file giving the launch angle, launch speed, bb_type, events fields to label
#' as imputed. if NULL then it's read from the `extdata` folder of the package.
#' @param impute_file A CSV file giving the launch angle, launch speed,
#' \code{bb_type}, events fields to label
#' as imputed. if NULL then it's read from the \code{extdata} folder of the package.
#' @param inverse_precision inverse of how many digits to truncate the launch angle
#' and speed to for comparison. default is 10000, i.e. keep 4 digits of precision.
#' and speed to for comparison. Default is \code{10000}, i.e. keep 4 digits of precision.
#' @keywords MLB, Statcast, sabermetrics
#' @importFrom dplyr bind_rows
#' @importFrom dplyr left_join
#' @return A copy of the input dataframe with a new column "imputed" appended. imputed
#' @importFrom readr read_csv
#' @return A copy of the input dataframe with a new column \code{imputed} appended. imputed
#' is 1 if launch angle and launch speed are likely imputed, 0 otherwise.
#' @export
#' @examples
#' #' \dontrun{
#' statcast_df = scrape_statcast_savant("2017-05-01", "2017-05-02")
#' statcast_df = label_statcast_imputed_data(statcast_df)
#' mean(statcast_df$imputed)
#' \dontrun{
#' statcast_df <- scrape_statcast_savant("2017-05-01", "2017-05-02")
#' sc_df <- label_statcast_imputed_data(statcast_df)
#' mean(sc_df$imputed)
#' }
label_statcast_imputed_data <- function(statcast_df, impute_file=NULL, inverse_precision=10000) {
label_statcast_imputed_data <- function(statcast_df, impute_file = NULL,
inverse_precision = 10000) {

if (is.null(impute_file)) {
impute_file = system.file("extdata/statcast_impute.csv", package = "baseballr")
impute_file <- system.file("extdata/statcast_impute.csv", package = "baseballr")
}

imputed_df = read.csv(impute_file, stringsAsFactors = FALSE)
imputed_df <- suppressMessages(readr::read_csv(impute_file))

imputed_df$imputed <- 1
tmp <- dplyr::left_join(
statcast_df %>% mutate(ila=as.integer(launch_angle * inverse_precision),
ils=as.integer(launch_speed * inverse_precision)),
imputed_df, by=c("ils", "ila", "bb_type", "events"))
statcast_df %>% mutate(ila = as.integer(launch_angle * inverse_precision),
ils = as.integer(launch_speed * inverse_precision)),
imputed_df, by = c("ils", "ila", "bb_type", "events"))
tmp$imputed <- ifelse(is.na(tmp$imputed), 0, 1)
tmp
}
2 changes: 1 addition & 1 deletion R/linear_weights_savant.R
@@ -1,7 +1,7 @@
#' Generate linear weight values for events using Baseball Savant data
#'
#' This function allows a user to generate linear weight values for events using Baseball Savant data. Output includes both linear weights above average and linear weights above outs for home runs, triples, doubles, singles, walks, hit by pitches, and outs.
#' @param df A data frame generated from Baseball Savant that has been run through the baseballr::run_expectancy_code() function.
#' @param df A data frame generated from Baseball Savant that has been run through the \code{\link{run_expectancy_code}} function.
#' @keywords MLB, sabermetrics
#' @importFrom dplyr filter group_by summarise arrange mutate add_row
#' @export
Expand Down
15 changes: 6 additions & 9 deletions R/pitcher_boxscore.R
@@ -1,14 +1,11 @@
#' Retrieve pitcher boxscore data for a single game played
#'
#' This function allows a user to retrieve a boxscore of pitcher statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away pitchers.
#'
#' @param x A boxscore.xml url for a given game from the MLBAM GameDay app data.
#' @keywords MLB, PITCHf/x, Game Day, boxscore, sabermetrics
#' @importFrom XML xmlToList
#' @importFrom XML xmlParse
#' @rdname batter_boxscore
#' @export
#' @examples
#' pitcher_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml")
#' # pitchers
#' url_base <- "http://gd2.mlb.com/components/game/mlb/"
#' url <- paste0(url_base,
#' "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml")
#' pitcher_boxscore(url)

pitcher_boxscore <- function(x) {
url <- x
Expand Down
31 changes: 18 additions & 13 deletions R/playerid_lookup.R
Expand Up @@ -4,24 +4,27 @@
#' @param last_name A text string used to return results for players with that string in their last name.
#' @param first_name A text string used to return results for players with that string in their first name.
#' @keywords MLB, sabermetrics
#' @importFrom readr read_csv
#' @export
#' @examples
#' \dontrun{playerid_lookup("Garcia", "Karim")}
#' \dontrun{
#' playerid_lookup("Garcia", "Karim")
#' }

playerid_lookup <- function(last_name=NULL, first_name=NULL) {
playerid_lookup <- function(last_name = NULL, first_name = NULL) {
if (!exists("chadwick_player_lu_table")) {
print("Be patient, this may take a few seconds...")
print("Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)")
message("Be patient, this may take a few seconds...")
message("Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)")
url <- "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
chadwick_player_lu_table <- read.csv(url)
suppressMessages(
chadwick_player_lu_table <- readr::read_csv(url)
)
assign("chadwick_player_lu_table", chadwick_player_lu_table, envir = .GlobalEnv)

x <- process_player_name(last_name, first_name)

names(x) <- c("first_name", "last_name", "given_name", "name_suffix", "nick_name", "birth_year", "mlb_played_first", "mlbam_id", "retrosheet_id", "bbref_id", "fangraphs_id")

x$fangraphs_id <- as.character(x$fangraphs_id) %>% as.numeric()
x$birth_year <- as.character(x$birth_year) %>% as.numeric()
x
}

Expand All @@ -35,17 +38,19 @@ playerid_lookup <- function(last_name=NULL, first_name=NULL) {
}
}

process_player_name <- function(last_name=NULL, first_name=NULL) {
#' @importFrom dplyr filter select

process_player_name <- function(last_name = NULL, first_name = NULL) {
if (is.null(first_name)) {
x <- chadwick_player_lu_table %>%
filter(grepl(last_name, name_last)) %>%
select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs)
dplyr::filter(grepl(last_name, name_last)) %>%
dplyr::select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs)
}
else {
x <- chadwick_player_lu_table %>%
filter(grepl(last_name, name_last)) %>%
filter(grepl(first_name, name_first)) %>%
select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs)
dplyr::filter(grepl(last_name, name_last)) %>%
dplyr::filter(grepl(first_name, name_first)) %>%
dplyr::select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs)
}
x
}
55 changes: 12 additions & 43 deletions R/process_statcast_payload.R
@@ -1,56 +1,25 @@
#' Process Baseball Savant CSV payload
#'
#' This is a helper function for all scrape_statcast_savant functions. The function processes the initial csv payload acquired from baseballsavant to ensure consistency in formattting across downloads
#' @param payload payload from a Baseball Savant request, e.g. from utils::read.csv
#' This is a helper function for all scrape_statcast_savant functions.
#' The function processes the initial csv payload acquired from
#' baseballsavant to ensure consistency in formattting across downloads
#' @param payload payload from a Baseball Savant request, e.g.
#' from \code{\link[readr]{read_csv}}
#' @keywords MLB, sabermetrics, Statcast
#' @importFrom dplyr mutate_
#' @export
#' @examples
#' \dontrun{
#' process_statcast_payload(payload)
#' }

process_statcast_payload <- function(payload) {

# Clean up formatting.
payload[payload=="null"] <- NA
payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d")
payload$des <- as.character(payload$des)
payload$game_pk <- as.character(payload$game_pk) %>% as.numeric()
payload$hc_x <- as.character(payload$hc_x) %>% as.numeric()
payload$hc_y <- as.character(payload$hc_y) %>% as.numeric()
payload$on_1b <- as.character(payload$on_1b) %>% as.numeric()
payload$on_2b <- as.character(payload$on_2b) %>% as.numeric()
payload$on_3b <- as.character(payload$on_3b) %>% as.numeric()
payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric()
payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric()
payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric()
payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric()
payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric()
payload$pfx_x <- as.character(payload$pfx_x) %>% as.numeric()
payload$pfx_z <- as.character(payload$pfx_z) %>% as.numeric()
payload$plate_x <- as.character(payload$plate_x) %>% as.numeric()
payload$plate_z <- as.character(payload$plate_z) %>% as.numeric()
payload$vx0 <- as.character(payload$vx0) %>% as.numeric()
payload$vy0 <- as.character(payload$vy0) %>% as.numeric()
payload$vz0 <- as.character(payload$vz0) %>% as.numeric()
payload$ax <- as.character(payload$ax) %>% as.numeric()
payload$az <- as.character(payload$az) %>% as.numeric()
payload$ay <- as.character(payload$ay) %>% as.numeric()
payload$sz_bot <- as.character(payload$sz_bot) %>% as.numeric()
payload$sz_top <- as.character(payload$sz_top) %>% as.numeric()
payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric()
payload$release_speed <- as.character(payload$release_speed) %>% as.numeric()
payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric()
payload$release_extension <- as.character(payload$release_extension) %>% as.numeric()
payload$pitch_name <- as.character(payload$pitch_name)
payload$home_score <- as.character(payload$home_score) %>% as.numeric()
payload$away_score <- as.character(payload$away_score) %>% as.numeric()
payload$bat_score <- as.character(payload$bat_score) %>% as.numeric()
payload$fld_score <- as.character(payload$fld_score) %>% as.numeric()
payload$post_away_score <- as.character(payload$post_away_score) %>% as.numeric()
payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric()
payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric()
payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric()
payload$zone <- as.character(payload$zone) %>% as.numeric()
payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0))
payload <- payload %>%
dplyr::mutate_(
barrel = ~ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)
)

return(payload)

Expand Down