diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf2..8453a6f6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^README\.Rmd$ +^README-.*\.png$ diff --git a/DESCRIPTION b/DESCRIPTION index 571dabf5..8de57329 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,12 @@ Package: baseballr Title: Functions for acquiring and analyzing baseball data -Version: 0.3.3 +Version: 0.3.3.9002 Author: Bill Petti +Authors@R: c( + person("Bill", "Petti", email = "billpetti@gmail.com", + role = c("aut", "cre")), + person("Ben", "Baumer", email = "ben.baumer@gmail.com", role = c("ctb")), + person("Ben", "Dilday", email = "ben.dilday.phd@gmail.com", role = "ctb")) Maintainer: Bill Petti Description: Provides numerous functions for acquiring and analyzing baseball data. Data can be acquired from various online sources from within R. @@ -10,22 +15,27 @@ Depends: R (>= 3.2.0) Imports: dplyr, + ggplot2, + stringr, tidyr, lubridate, pitchRx, + readr, reldist, rvest, XML, xml2, magrittr, pbapply, + tibble, highcharter -License: MIT +License: MIT + file LICENSE URL: https://billpetti.github.io/baseballr/ BugReports: https://github.com/BillPetti/baseballr/issues LazyData: true RoxygenNote: 6.0.1 Suggests: knitr, - rmarkdown + rmarkdown, + testthat VignetteBuilder: knitr diff --git a/EXAMPLES/.Rapp.history b/EXAMPLES/.Rapp.history deleted file mode 100644 index e69de29b..00000000 diff --git a/EXAMPLES/betts_angle_speed_year.png b/EXAMPLES/betts_angle_speed_year.png deleted file mode 100644 index 38b108b2..00000000 Binary files a/EXAMPLES/betts_angle_speed_year.png and /dev/null differ diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..0abc9226 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2018 +COPYRIGHT HOLDER: Bill Petti diff --git a/NAMESPACE b/NAMESPACE index 15a506e7..eb0a9d62 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +S3method(scrape_statcast_savant,Date) 
+S3method(scrape_statcast_savant,default) export("%<>%") export("%>%") export(batter_boxscore) @@ -23,6 +25,7 @@ export(ncaa_scrape) export(pitcher_boxscore) export(pitcher_game_logs_fg) export(playerid_lookup) +export(process_statcast_payload) export(run_expectancy_code) export(run_expectancy_table) export(school_id_lu) @@ -88,13 +91,15 @@ importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(pbapply,pbsapply) importFrom(pitchRx,scrape) +importFrom(readr,read_csv) importFrom(reldist,gini) importFrom(rvest,html_node) importFrom(rvest,html_nodes) importFrom(rvest,html_table) importFrom(rvest,html_text) +importFrom(stats,setNames) importFrom(stringr,str_count) +importFrom(tibble,tribble) importFrom(tidyr,gather) importFrom(tidyr,separate) -importFrom(utils,read.csv) importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 00000000..a7710469 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,697 @@ +# baseballr 0.3.4 (2018-05-29) + +* Added a `NEWS.md` file to track changes to the package. + +The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` (0.5) includes a number of enhancements and bug fixes. + +## New Functions + +`run_expectancy_code()` + +This function formats Baseball Savant data so that users can generate the run expectancy for different base-out or count-base-out states. It will also append the data frame with new variables necessary for generating linear weights (see new function below). 
The only argument is a data frame downloaded from Baseball Savant + +Columns created and appended to Baseball Savant data: + +- `final_pitch_game`: whether a pitch was the final one thrown in a game +- `final_pitch_inning`: whether a pitch is the final one thrown in an inning +- `final_pitch_at_bat`: whether a pitch is the final one thrown in an at bat +- `runs_scored_on_pitch`: how many runs scored as a result of the pitch +- `bat_score_start_inning`: the score for the batting team at the beginning of the inning +- `bat_score_end_inning`: the score for the batting team at the end of the inning +- `bat_score_after`: the score for the batting team after the pitch is thrown +- `cum_runs_in_inning`: how many cumulative runs have been scored from the beginning of the inning through the pitch +- `runs_to_end_inning`: how many runs were scored as a result of the pitch through the end of the inning +- `base_out_state` or `count_base_out_state`: the specific combination of base-outs or count-base-outs when the pitch was thrown +- `avg_re`: the average run expectancy of that base-out or count-base-out state +- `next_avg_re`: the average run expectancy of the base-out or count-base-out state that results from the pitch +- `change_re`: the change in run expectancy as a result of the pitch +- `re24`: the total change in run expectancy through the end of the inning resulting from the pitch based on the change in base-out or count-base-out state plus the number of runs scored as a result of the pitch/at bat + +Example: + +```r +> x2016_statcast_re <- run_expectancy_code(x2016_statcast) + +> sample_n(x2016_statcast_re, 10) %>% + select(final_pitch_inning:re24) %>% + glimpse() + +Observations: 10 +Variables: 11 +$ final_pitch_inning 0, 0, 0, 0, 0, 0, 0, 0, 1, 0 +$ bat_score_start_inning 1, 0, 5, 0, 3, 2, 1, 0, 0, 0 +$ bat_score_end_inning 2, 0, 5, 1, 3, 2, 5, 0, 0, 2 +$ cum_runs_in_inning 1, 0, 0, 0, 0, 0, 2, 0, 0, 1 +$ runs_to_end_inning 0, 0, 0, 1, 0, 0, 2, 0, 0, 1 +$ 
base_out_state "2 outs, 1b _ _", "0 outs, _ _ _", "0 outs... +$ avg_re 0.2149885, 0.5057877, 0.5057877, 0.5057877, 0.5... +$ next_base_out_state "2 outs, 1b 2b _", "1 outs, _ _ _", "1 out... +$ next_avg_re 0.4063525, 0.2718802, 0.2718802, 0.8629357, 0.2... +$ change_re 0.1913640, -0.2339075, -0.2339075, 0.3571479, -... +$ re24 0.1913640, -0.2339075, -0.2339075, 0.3571479, -... +``` + +`run_expectancy_table()` + +This functions works with the `run_expectancy_code` function and does the work of generating the run expectancy tables that are automatically exported into the Global Environment + +Example: + +```r +> x2016_statcast_re %>% + run_expectancy_table() %>% + print(n=Inf) + +base_out_state avg_re + +1 0 outs, 1b 2b 3b 2.13 +2 0 outs, _ 2b 3b 1.95 +3 0 outs, 1b _ 3b 1.76 +4 1 outs, 1b 2b 3b 1.55 +5 0 outs, 1b 2b _ 1.42 +6 1 outs, _ 2b 3b 1.36 +7 0 outs, _ _ 3b 1.36 +8 1 outs, 1b _ 3b 1.18 +9 0 outs, _ 2b _ 1.14 +10 1 outs, _ _ 3b 0.951 +11 1 outs, 1b 2b _ 0.906 +12 0 outs, 1b _ _ 0.863 +13 2 outs, 1b 2b 3b 0.689 +14 1 outs, _ 2b _ 0.669 +15 2 outs, _ 2b 3b 0.525 +16 1 outs, 1b _ _ 0.520 +17 0 outs, _ _ _ 0.506 +18 2 outs, 1b _ 3b 0.456 +19 2 outs, 1b 2b _ 0.406 +20 2 outs, _ _ 3b 0.366 +21 2 outs, _ 2b _ 0.299 +22 1 outs, _ _ _ 0.272 +23 2 outs, 1b _ _ 0.215 +24 2 outs, _ _ _ 0.106 +``` + +`linear_weights_savant()` + +This function works in tandem with `run_expectancy_code()` to generate linear weights for offensive events after the Baseball Savant data has been properly formatted. Currently, the function will return linear weights above average and linear weights above outs. It does not apply any scaling to align with league wOBA. Users can do that themselves if they like, or it may be added to a future version of the function. 
+ +Example: + +```r + +> x2016_statcast_re %>% + linear_weights_savant() %>% + print(n=Inf) + +A tibble: 7 x 3 +events linear_weights_above_average linear_weights_above_outs + +1 home_run 1.38 1.63 +2 triple 1.00 1.25 +3 double 0.730 0.980 +4 single 0.440 0.690 +5 hit_by_pitch 0.320 0.570 +6 walk 0.290 0.540 +7 outs -0.250 0. +``` + +I used Baseball Savant data from 2010-2015 and compared the linear weights generated by `baseballr` to those by Tom Tango using retrosheet data. `baseballr`'s weights are generally a little lower than what Tango generated, but that could be due to a number of things, such as the data source, code, etc., but the values appear reasonable enough to be reliable: + +| base_out_state | baseballr_2010_2015 | tango_2010_2015 | diff | %_diff | +|--------------------|---------------------|-----------------|-------|--------| +| 0 outs, 1b 2b 3b | 2.27 | 2.29 | -0.02 | -1% | +| 0 outs, _ 2b 3b | 1.96 | 1.96 | 0 | 0% | +| 0 outs, 1b _ 3b | 1.76 | 1.78 | -0.03 | -1% | +| 1 outs, 1b 2b 3b | 1.51 | 1.54 | -0.03 | -2% | +| 0 outs, 1b 2b _ | 1.42 | 1.44 | -0.02 | -1% | +| 0 outs, _ _ 3b | 1.38 | 1.38 | 0 | 0% | +| 1 outs, _ 2b 3b | 1.35 | 1.35 | 0 | 0% | +| 1 outs, 1b _ 3b | 1.1 | 1.13 | -0.03 | -2% | +| 0 outs, _ 2b _ | 1.09 | 1.1 | -0.01 | -1% | +| 1 outs, _ _ 3b | 0.93 | 0.95 | -0.02 | -2% | +| 1 outs, 1b 2b _ | 0.86 | 0.88 | -0.02 | -3% | +| 0 outs, 1b _ _ | 0.84 | 0.86 | -0.02 | -2% | +| 2 outs, 1b 2b 3b | 0.71 | 0.75 | -0.04 | -5% | +| 1 outs, _ 2b _ | 0.65 | 0.66 | -0.01 | -2% | +| 2 outs, _ 2b 3b | 0.54 | 0.58 | -0.04 | -7% | +| 1 outs, 1b _ _ | 0.5 | 0.51 | -0.01 | -2% | +| 0 outs, _ _ _ | 0.48 | 0.48 | 0 | -1% | +| 2 outs, 1b _ 3b | 0.45 | 0.48 | -0.03 | -7% | +| 2 outs, 1b 2b _ | 0.41 | 0.43 | -0.02 | -4% | +| 2 outs, _ _ 3b | 0.33 | 0.35 | -0.02 | -6% | +| 2 outs, _ 2b _ | 0.31 | 0.32 | -0.01 | -3% | +| 1 outs, _ _ _ | 0.25 | 0.25 | 0 | -1% | +| 2 outs, 1b _ _ | 0.21 | 0.22 | -0.01 | -6% | +| 2 outs, _ _ _ | 0.1 | 0.1 | 0 | -2% | + +We also 
had some great contributions by others that I've added into this release: + +`label_statcast_imputed_data()` + +[Ben Dilday](https://github.com/bdilday) again contributes with a cool experimental function meant to tag batted ball cases where significant imputation may have been used to generate some of the Statcast values by MLBAM, i.e. `launch_speed` and `launch_angle`. You can read more about Ben's function [here](https://github.com/BillPetti/baseballr/pull/71). + +`fg_park()` + +[Sam Boysel](https://github.com/sboysel) updated the park factors function so that it now includes the new columns added by FanGraphs (5-year, 3-year, 1-year park factors) and ensures the column names are correct + +## Updgrades + +`fg_bat_leaders()` + +- `playerid` now returned as part of the data returned. +- Dozens of additional variables are also returned, including aggregate data from Pitch Info as well as contact type. + +## Bug Fixes + +`process_statcast_payload()` +- hc_x, hc_y are now converted to numeric + + +# baseballr 0.3.3 (2017-05-08) + +The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` includes a number of enhancement to acquiring data from [Baseball Savant](http://baseballsavant.com) as well as minor grammatical clean up in the documentation. + +Previous functions `scrape_statcast_savant_batter` and `scrape_statcast_savant_pitcher` allowed for the acquistion of data from baseballsavant.com for a given player over a user-determined time frame. However, this is somewhat inefficient if you want to acquire data on all players over a given time frame. + +Two new functions have been added, `scrape_statcast_savant_batter_all` and `scrape_statcast_savant_pitcher_all`, that allow a user to acquire data for either all pitchers or all hitters over a given time frame. 
+ +Both functions take only two arguments: + +`start_date`: the first date for which the user wants records returned +`end_date`: the final date for which the user wants records returned + +Remember, baseballsavant.com's csv download option allows for about 50,000 records in a single query. That works out to roughly 10-12 days of games. Longer time frames will take longer to download. + +Example: acquire data for all batters from 2017-04-03 through 2017-04-10 + +```r +> head(scrape_statcast_savant_batter_all('2017-04-03', '2017-04-10')) +[1] "These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved." +[1] "Grabbing data, this may take a minute..." +URL read and payload aquired successfully. + pitch_type game_date release_speed release_pos_x release_pos_z player_name +1 FF 2017-04-10 92.7 -1.0367 5.7934 Eric Fryer +2 FF 2017-04-10 93.2 -0.9753 5.6007 Eric Fryer +3 FF 2017-04-10 93.0 -1.1196 5.6958 Eric Fryer +4 FF 2017-04-10 93.1 -0.9952 5.7978 Eric Fryer +5 SL 2017-04-10 83.4 -1.2385 5.8164 Eric Fryer +6 FF 2017-04-10 93.7 -1.0307 5.8740 Aledmys Diaz + batter pitcher events description spin_dir spin_rate_deprecated +1 518700 518875 strikeout swinging_strike NA NA +2 518700 518875 ball NA NA +3 518700 518875 ball NA NA +4 518700 518875 swinging_strike NA NA +5 518700 518875 called_strike NA NA +6 649557 518875 field_out hit_into_play NA NA + break_angle_deprecated break_length_deprecated zone +1 NA NA 5 +2 NA NA 12 +3 NA NA 12 +4 NA NA 3 +5 NA NA 6 +6 NA NA 6 + des game_type stand +1 Eric Fryer strikes out swinging. R R +2 R R +3 R R +4 R R +5 R R +6 Aledmys Diaz flies out to right fielder Bryce Harper. 
R R + p_throws home_team away_team type hit_location bb_type balls strikes +1 R WSH STL S 2 2 +2 R WSH STL B 1 2 +3 R WSH STL B 0 2 +4 R WSH STL S 0 1 +5 R WSH STL S 0 0 +6 R WSH STL X 9 fly_ball 0 1 + game_year pfx_x pfx_z plate_x plate_z on_3b on_2b on_1b outs_when_up +1 2017 -0.4262 1.7261 -0.0042 2.9680 NA NA 594824 2 +2 2017 0.2420 1.3633 1.3747 3.5269 NA NA 594824 2 +3 2017 0.4912 1.6758 0.5389 4.3795 NA NA 594824 2 +4 2017 0.1924 1.7964 0.6868 3.5700 NA NA 594824 2 +5 2017 -0.1604 0.3532 0.6048 2.6308 NA NA 594824 2 +6 2017 0.5956 1.8068 0.4993 3.1386 NA NA 594824 1 + inning inning_topbot hc_x hc_y tfs_deprecated tfs_zulu_deprecated +1 9 Top NA NA +2 9 Top NA NA +3 9 Top NA NA +4 9 Top NA NA +5 9 Top NA NA +6 9 Top 186.56 105.27 NA NA + pos2_person_id umpire sv_id vx0 vy0 vz0 ax ay az sz_top sz_bot +1 446308 NA 170411_025210 NA NA NA NA NA NA 3.8420 1.5890 +2 446308 NA 170411_025153 NA NA NA NA NA NA 3.5602 1.7127 +3 446308 NA 170411_025133 NA NA NA NA NA NA 3.6761 1.6780 +4 446308 NA 170411_025117 NA NA NA NA NA NA 3.6760 1.5040 +5 446308 NA 170411_025104 NA NA NA NA NA NA 3.5139 1.6548 +6 446308 NA 170411_025018 NA NA NA NA NA NA 3.9500 1.6810 + hit_distance_sc launch_speed launch_angle effective_speed release_spin_rate +1 NA NA NA 93.033 2285 +2 NA NA NA 93.301 2323 +3 NA NA NA 92.892 2322 +4 NA NA NA 92.906 2324 +5 NA NA NA 83.371 NA +6 266 87.5 47.444 93.529 2406 + release_extension game_pk pos1_person_id pos2_person_id.1 pos3_person_id +1 6.248 490201 518875 446308 475582 +2 6.265 490201 518875 446308 475582 +3 6.281 490201 518875 446308 475582 +4 6.187 490201 518875 446308 475582 +5 6.155 490201 518875 446308 475582 +6 6.269 490201 518875 446308 475582 + pos4_person_id pos5_person_id pos6_person_id pos7_person_id pos8_person_id +1 502517 543685 452220 594809 572191 +2 502517 543685 452220 594809 572191 +3 502517 543685 452220 594809 572191 +4 502517 543685 452220 594809 572191 +5 502517 543685 452220 594809 572191 +6 502517 543685 452220 594809 572191 
+ pos9_person_id release_pos_y estimated_ba_using_speedangle +1 547180 54.2491 0.000 +2 547180 54.2319 0.000 +3 547180 54.2163 0.000 +4 547180 54.3096 0.000 +5 547180 54.3420 0.000 +6 547180 54.2282 0.007 + estimated_woba_using_speedangle woba_value woba_denom babip_value iso_value +1 0.000 0.00 1 0 0 +2 0.000 +3 0.000 +4 0.000 +5 0.000 +6 0.008 0.00 1 0 0 + barrel +1 NA +2 NA +3 NA +4 NA +5 NA +6 0 +``` + +# baseballr 0.3.2 (2017-09-12) + +The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` includes a number of enhancements and bug fixes. + +In terms of new functions, `statline_from_statcast` allows users to take raw pitch-by-pitch data from Statcast/PITCHf/x and calculate aggregated, statline-like output. Examples include count data such as number of singles, doubles, etc., as well as rate metrics like Slugging and wOBA on swings or contact. + +The function only has two arguments: + +* `df`: a dataframe that includes pitch-by-pitch information. The function assumes the following columns are present: `events`, `description`, `game_date`, and `type`. +* `base`: base indicates what the denomincator should be for the rate stats that are calculated. The function defaults to "swings", but you can also choose to use "contact" + +Here is an example using all data from the week of 2017-09-04. Here, we want to see a statline for all hitters based on swings: + +```r +test <- scrape_statcast_savant_batter_all("2017-09-04", "2017-09-10") + +statline_from_statcast(test) + +year swings batted_balls X1B X2B X3B HR swing_and_miss swinging_strike_percent ba +1 2017 13790 10663 1129 352 37 259 3127 0.227 0.129 + +obp slg ops woba +1 0.129 0.216 0.345 0.144 +``` + +You can also combine the `statline_from_statcast` function with a loop to create statlines for multiple players at once. 
+ +Example: calculate statlines for batters on contact for all games played 2017-09-04 through 2017-09-10: + +```r +test <- scrape_statcast_savant_batter_all("2017-09-04", "2017-09-10") + +output <- data.frame() + +for (i in c("Jose Ramirez", "J.D. Martinez", "Francisco Lindor", "Gary Sanchez", "Rhys Hoskins")) { + reduced_test <- test %>% + filter(player_name == i) + x <- statline_from_statcast(reduced_test, base = "contact") + x$player <- i + x <- x %>% + select(player, everything()) + output <- rbind(output, x) %>% + arrange(desc(woba)) +} + +print(output, width = Inf) + +# A tibble: 5 x 12 + player year batted_balls X1B X2B X3B HR ba obp slg ops woba + +1 J.D. Martinez 2017 17 4 1 0 7 0.706 0.706 2.000 2.706 1.092 +2 Gary Sanchez 2017 11 3 1 0 2 0.545 0.545 1.182 1.727 0.710 +3 Francisco Lindor 2017 27 4 2 1 3 0.370 0.370 0.852 1.222 0.498 +4 Rhys Hoskins 2017 14 2 1 0 2 0.357 0.357 0.857 1.214 0.495 +5 Jose Ramirez 2017 16 0 0 0 3 0.188 0.188 0.750 0.938 0.370 +``` + +# baseballr 0.3.1 (2016-11-22) + +The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) includes a function for acquiring player statistics from the [NCAA's website](http://stats.ncaa.org) for baseball teams across the three major divisions (I, II, III). 
+ +The function, `ncaa_scrape`, requires the user to pass values for three parameters for the function to work: + +`school_id`: numerical code used by the NCAA for each school +`year`: a four-digit year +`type`: whether to pull data for batters or pitchers + +If you want to pull batting statistics for Vanderbilt for the 2013 season, you would use the following: + +```r +> baseballr::ncaa_scrape(736, 2013, "batting") %>% ++ select(year:OBPct) + year school conference division Jersey Player Yr Pos GP GS BA OBPct +1 2013 Vanderbilt Southeastern 1 18 Yastrzemski, Mike Sr OF 66 66 0.312 0.411 +2 2013 Vanderbilt Southeastern 1 20 Harrell, Connor Sr OF 66 66 0.312 0.418 +3 2013 Vanderbilt Southeastern 1 3 Conde, Vince So INF 66 65 0.307 0.380 +4 2013 Vanderbilt Southeastern 1 6 Kemp, Tony Jr OF 66 66 0.391 0.471 +5 2013 Vanderbilt Southeastern 1 55 Gregor, Conrad Jr OF 65 65 0.308 0.440 +6 2013 Vanderbilt Southeastern 1 9 Turner, Xavier Fr INF 59 51 0.324 0.387 +7 2013 Vanderbilt Southeastern 1 5 Navin, Spencer Jr C 57 56 0.302 0.430 +8 2013 Vanderbilt Southeastern 1 51 Lupo, Jack Sr OF 57 51 0.297 0.352 +9 2013 Vanderbilt Southeastern 1 8 Wiseman, Rhett Fr OF 54 11 0.289 0.360 +10 2013 Vanderbilt Southeastern 1 10 Norwood, John So OF 33 9 0.328 0.388 +11 2013 Vanderbilt Southeastern 1 43 Wiel, Zander So INF 33 15 0.305 0.406 +12 2013 Vanderbilt Southeastern 1 44 Harvey, Chris So C 29 13 0.250 0.328 +13 2013 Vanderbilt Southeastern 1 42 McKeithan, Joel Jr INF 25 12 0.220 0.267 +14 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 7 0.250 0.455 +15 2013 Vanderbilt Southeastern 1 17 Harris, Andrew Sr INF 21 0 0.125 0.222 +16 2013 Vanderbilt Southeastern 1 2 Campbell, Tyler Fr INF 12 2 0.312 0.389 +17 2013 Vanderbilt Southeastern 1 7 Swanson, Dansby Fr INF 11 4 0.188 0.435 +18 2013 Vanderbilt Southeastern 1 25 Luna, D.J. 
Jr INF 8 0 0.000 0.333 +19 2013 Vanderbilt Southeastern 1 23 Cooper, Will So OF 4 0 1.000 1.000 +20 2013 Vanderbilt Southeastern 1 - Totals - - - - 0.313 0.407 +21 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - - 0.220 0.320 +``` + +The same can be done for pitching, just by changing the `type` parameter: + +```r +> baseballr::ncaa_scrape(736, 2013, "pitching") %>% ++ select(year:ERA) + year school conference division Jersey Player Yr Pos GP App GS ERA +1 2013 Vanderbilt Southeastern 1 11 Beede, Tyler So P 37 17 17 2.32 +2 2013 Vanderbilt Southeastern 1 33 Miller, Brian So P 32 32 NA 1.58 +3 2013 Vanderbilt Southeastern 1 35 Ziomek, Kevin Jr P 32 17 17 2.12 +4 2013 Vanderbilt Southeastern 1 15 Fulmer, Carson Fr P 26 26 NA 2.39 +5 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 1 NA 0.00 +6 2013 Vanderbilt Southeastern 1 28 Miller, Jared So P 22 22 NA 2.31 +7 2013 Vanderbilt Southeastern 1 19 Rice, Steven Jr P 21 21 NA 2.57 +8 2013 Vanderbilt Southeastern 1 13 Buehler, Walker Fr P 16 16 9 3.14 +9 2013 Vanderbilt Southeastern 1 22 Pfeifer, Philip So P 15 15 12 3.68 +10 2013 Vanderbilt Southeastern 1 12 Ravenelle, Adam So P 11 11 NA 3.18 +11 2013 Vanderbilt Southeastern 1 40 Pecoraro, T.J. Jr P 10 10 7 5.97 +12 2013 Vanderbilt Southeastern 1 45 Ferguson, Tyler Fr P 8 8 4 4.21 +13 2013 Vanderbilt Southeastern 1 27 Kolinsky, Keenan Jr P 2 2 NA 0.00 +14 2013 Vanderbilt Southeastern 1 24 Wilson, Nevin So P 1 1 NA 0.00 +15 2013 Vanderbilt Southeastern 1 - Totals - - - NA NA 2.76 +16 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - NA NA 6.19 +``` + +Now, the function is dependent on the user knowing the `school_id` used by the NCAA website. Given that, I've included a `school_id_lu` function so that users can find the `school_id` they need. 
+ +Just pass a string to the function and it will return possible matches based on the school's name: + +```r +> school_id_lu("Vand") +# A tibble: 4 × 6 + school conference school_id year division conference_id + +1 Vanderbilt Southeastern 736 2013 1 911 +2 Vanderbilt Southeastern 736 2014 1 911 +3 Vanderbilt Southeastern 736 2015 1 911 +4 Vanderbilt Southeastern 736 2016 1 911 +``` + +# baseballr 0.2.1 (2016-10-08) + + +Updates to functions in this release: + +`scrape_statcast_savant_batter`
+`scrape_statcast_savant_pitcher`
+ +New functions in this release: + +`code_barrel`
+ +The research team at Major League Baseball Advanced Media have developed a way to categorize batted balls that on average have a batting average over .500 and slugging over 1.500. The specific coding criteria can be found in comment #2 [here](http://tangotiger.com/index.php/site/comments/statcast-lab-barrels#2). + +Now, whenever a user scrapes Statcast data using either the `scrape_statcast_savant_batter` or `scrape_statcast_savant_pitcher` functions the results will include a column `barrel`, where if the batted ball matches the barrel criteria it will code as 1, otherwise 0. + +Example: + +```r +> scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) %>% ++ filter(type == "X") %>% ++ filter(!is.na(barrel)) %>% ++ select(player_name, game_date, hit_angle, hit_speed, barrel) %>% ++ tail() +[1] "Be patient, this may take a few seconds..." +[1] "Data courtesy of Baseball Savant and MLBAM (baseballsavant.mlb.com)" + player_name game_date hit_angle hit_speed barrel +25 Carlos Correa 2016-04-07 31.10 103.33 1 +26 Carlos Correa 2016-04-07 27.77 87.25 0 +27 Carlos Correa 2016-04-06 29.62 103.97 1 +28 Carlos Correa 2016-04-06 0.11 105.20 0 +29 Carlos Correa 2016-04-06 23.76 113.55 1 +30 Carlos Correa 2016-04-06 -2.18 113.39 0 +``` +If you already have Statcast data--say, in a database that you've been collecting--I've also included a simple function that will take a dataframe and code whether each row contains a barrel or not. All you need to do is pass your dataframe to `code_barrel`. + + +# baseballr 0.2.0 (2016-08-25) + + +Functions added to this release: + +`scrape_statcast_savant_batter`
+`scrape_statcast_savant_pitcher`
+`playerid_lookup` + +The two savant functions allow a user to retrieve PITCHf/x and Statcast data for either a specific batter or pitcher from [Baseball Savant's Statcast Search](https://baseballsavant.mlb.com/statcast_search). The user needs to provide a start date, end date, and the batter or pitcher's MLBAMID. + +Example: + +```r +> scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) %>% + filter(type == "X") %>% + select(3,7,54:56) %>% + tail() +[1] "Be patient, this may take a few seconds..." +[1] "Data courtesy of Baseball Savant and MLBAM (baseballsavant.mlb.com)" + game_date player_name hit_distance_sc hit_speed hit_angle +26 2016-04-07 Carlos Correa 385 103.33 31.10 +27 2016-04-07 Carlos Correa 288 87.25 27.77 +28 2016-04-06 Carlos Correa 392 103.97 29.62 +29 2016-04-06 Carlos Correa 189 105.20 0.11 +30 2016-04-06 Carlos Correa 462 113.55 23.76 +31 2016-04-06 Carlos Correa 228 113.39 -2.18 +``` + +Since the savant functions require users to pass a valid MLBAMID, a lookup function is included that leverages the Chadwick public register. Users provide a text string and only those players with that string present in their last name will be returned. + +Here is an example where the user is looking for players with the last name "Seager": + +```r +> playerid_lookup("Seager") +[1] "Be patient, this may take a few seconds..." 
+[1] "Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)" + first_name last_name given_name name_suffix nick_name birth_year mlb_played_first mlbam_id retrosheet_id bbref_id fangraphs_id +1 Ben Seager Ben NA NA NA NA +2 Corey Seager Corey Drew 1994 2015 608369 seagc001 seageco01 13624 +3 Justin Seager Justin Ryan 1992 NA 643529 NA +4 Kyle Seager Kyle Duerr 1987 2011 572122 seagk001 seageky01 9785 +``` + + +# baseballr 0.1.4 (2016-05-24) + + +Functions added to this release: + +`pitcher_boxscore`: This function allows a user to retrieve a boxscore of pitcher statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away pitchers. + +Example: + +```r +> pitcher_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") %>% select(id:so) +Source: local data frame [9 x 10] + + id name name_display_first_last pos out bf er r h so + (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) +1 605200 Davies Zach Davies P 16 22 4 4 5 5 +2 430641 Boyer Blaine Boyer P 2 4 0 0 2 0 +3 448614 Torres, C Carlos Torres P 3 4 0 0 0 1 +4 592804 Thornburg Tyler Thornburg P 3 3 0 0 0 1 +5 518468 Blazek Michael Blazek P 1 5 1 1 2 0 +6 594798 deGrom Jacob deGrom P 15 23 4 4 5 7 +7 570663 Robles Hansel Robles P 6 7 0 0 0 3 +8 592665 Reed, A Addison Reed P 3 5 0 0 1 2 +9 544727 Familia Jeurys Familia P 3 4 0 0 1 1 +``` + +`batter_boxscore`: This function does the same thing as `pitcher_boxscore`, but for batters. 
+ +Example: + +```r +> batter_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") %>% select(id:bb) +Source: local data frame [29 x 10] + + id name name_display_first_last pos bo ab po r a bb + (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) +1 542340 Villar Jonathan Villar SS 100 5 1 0 0 0 +2 571697 Gennett Scooter Gennett 2B 200 4 2 0 3 1 +3 518960 Lucroy Jonathan Lucroy C 300 5 8 0 1 0 +4 474892 Carter Chris Carter 1B 400 4 10 0 2 0 +5 543590 Nieuwenhuis Kirk Nieuwenhuis CF 500 4 0 0 0 0 +6 431094 Hill, A Aaron Hill 3B 600 1 1 2 4 3 +7 502100 Presley Alex Presley LF 700 3 0 1 0 1 +8 570717 Flores, R Ramon Flores RF 800 3 2 1 0 0 +9 605200 Davies Zach Davies P 900 2 1 0 1 0 +10 430641 Boyer Blaine Boyer P 901 0 0 0 0 0 +``` + + + +# baseballr 0.1.3 (2016-03-25) + + +Functions added to this release: + +`edge_code`: This function allows a user to pass their own dataframe and have individual pitches coded according to the scheme provided by Edge%. The dataframe must contain at least four columns of data: `b_height`, `stand`, `px`, and `pz`. + +Example (based on data from "2015-04-05"): + +```r +> edge_code(df) %>% .[, c(6:7, 27:28, 82)] %>% head(10) + stand b_height px pz location +1 L 6-3 0.416 2.963 Inside Edge +2 L 6-3 -0.191 2.347 Heart +3 L 6-3 -0.518 3.284 Upper Edge +4 L 6-3 -0.641 1.221 Out of Zone +5 L 6-3 -1.821 2.083 Out of Zone +6 L 6-3 0.627 2.397 Inside Edge +7 L 6-5 -1.088 1.610 Out of Zone +8 L 6-5 -0.257 2.047 Lower Edge +9 L 6-5 NA NA +10 L 6-3 -1.539 1.525 Out of Zone +``` + +Functions updated for this release: + +`standings_on_date_bref`: [JonathanBob](https://github.com/JonathanBob) updated this function to allow for records to be returned for the given date or from that date forward. Also, users can input a full date string instead of three separate arguments for the day, month, and year. 
Users can also choose to pull records for the AL and NL overall, not just for a given division. + +Example: + +```r +> standings_on_date_bref("2015-08-01", "NL East", from = FALSE) +$`NL East` + Tm W L W-L% GB RS RA pythW-L% +1 WSN 54 48 0.529 -- 422 391 0.535 +2 NYM 54 50 0.519 1.0 368 373 0.494 +3 ATL 46 58 0.442 9.0 379 449 0.423 +4 MIA 42 62 0.404 13.0 370 408 0.455 +5 PHI 41 64 0.390 14.5 386 511 0.374 + +> standings_on_date_bref("2015-08-01", "NL East", from = TRUE) +$`NL East` + Tm W L W-L% GB RS RA pythW-L% +1 NYM 36 22 0.621 -- 315 240 0.622 +2 MIA 29 29 0.500 7.0 243 270 0.452 +3 WSN 29 31 0.483 8.0 281 244 0.564 +4 PHI 22 35 0.386 13.5 240 298 0.402 +5 ATL 21 37 0.362 15.0 194 311 0.297 +``` + + +# baseballr 0.1.2 (2016-03-07) + + +Functions added to this release: + +`edge_scrape_split()`: This function builds off of `edge_scrape()` and adds the ability to view the data split by batter and pitcher handedness. As with `edge_scrape()`, the function returns a dataframe grouped by either pitchers or batters and the percentage of pitches in each of the various Edge zones, but adds in handedness. 
+ +Example (Edge% splits by batters with handedness): + +```r +> edge_scrape_split("2015-04-05", "2015-04-05", "batter") %>% .[,c(1:5,9:14)] + + batter_name batter p_throws stand All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone + (chr) (dbl) (chr) (chr) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) +1 Matt Holliday 407812 L R 11 0.000 0.182 0.000 0.182 0.182 0.455 +2 Matt Holliday 407812 R R 10 0.000 0.000 0.000 0.200 0.300 0.500 +3 David Ross 424325 R R 8 0.000 0.000 0.000 0.125 0.625 0.250 +4 Jhonny Peralta 425509 L R 9 0.000 0.111 0.444 0.000 0.111 0.333 +5 Jhonny Peralta 425509 R R 6 0.167 0.000 0.000 0.167 0.167 0.500 +6 Adam Wainwright 425794 L R 8 0.000 0.125 0.000 0.000 0.125 0.750 +7 Adam Wainwright 425794 R R 3 0.000 0.000 0.000 0.333 0.667 0.000 +8 Yadier Molina 425877 L R 13 0.077 0.077 0.000 0.000 0.077 0.769 +9 Yadier Molina 425877 R R 7 0.143 0.000 0.143 0.143 0.143 0.429 +10 Jonathan Jay 445055 L L 9 0.000 0.000 0.222 0.000 0.556 0.222 +.. ... ... ... ... ... ... ... ... ... ... ... +``` + + +# baseballr 0.1.0 (2016-03-01) + + +Functions added to this release: + +`fip_plus()`: This function mimics the functionality in the `woba_plus()` function, except that the unit of analysis is pitchers. The function will generate Fielding Indepedent Pitching (FIP) for each pitcher in the data set that is passed to the function, along with wOBA against and wOBA against on contact. 
+ +Example: + +```r +> daily_pitcher_bref("2015-04-05", "2015-04-30") %>% fip_plus() %>% select(season, Name, IP, ERA, SO, uBB, HBP, HR, FIP, wOBA_against, wOBA_CON_against) %>% arrange(desc(IP)) %>% head(10) + season Name IP ERA SO uBB HBP HR FIP wOBA_against wOBA_CON_against +1 2015 Johnny Cueto 37.0 1.95 38 4 2 3 2.62 0.210 0.276 +2 2015 Dallas Keuchel 37.0 0.73 22 11 0 0 2.84 0.169 0.151 +3 2015 Sonny Gray 36.1 1.98 25 6 1 1 2.69 0.218 0.239 +4 2015 Mike Leake 35.2 3.03 25 7 0 5 4.16 0.240 0.281 +5 2015 Felix Hernandez 34.2 1.82 36 6 3 1 2.20 0.225 0.272 +6 2015 Corey Kluber 34.0 4.24 36 5 2 2 2.40 0.295 0.391 +7 2015 Jake Odorizzi 33.2 2.41 26 8 1 0 2.38 0.213 0.228 +8 2015 Josh Collmenter 32.2 2.76 16 3 0 1 2.82 0.290 0.330 +9 2015 Bartolo Colon 32.2 3.31 25 1 0 4 3.29 0.280 0.357 +10 2015 Zack Greinke 32.2 1.93 27 7 1 2 3.01 0.240 0.274 +``` + +`edge_scrape()`: This function allows the user to scrape PITCHf/x data from the GameDay application using Carson Sievert's [pitchRx](https://github.com/cpsievert/pitchRx) package and to calculate metrics associated with [Edge%](https://billpetti.shinyapps.io/edge_shiny/). The function returns a data.frame grouped by either pitchers or batters and the percentge of pitches in each of the various Edge zones. 
+ +Example (pitchers): + +```r +> edge_scrape("2015-04-06", "2015-04-07", "pitcher") %>% .[, c(1:3,7:12)] %>% head(10) + pitcher_name pitcher All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone + (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) +1 Bartolo Colon 112526 86 0.035 0.081 0.058 0.151 0.209 0.465 +2 LaTroy Hawkins 115629 12 0.000 0.333 0.000 0.000 0.083 0.583 +3 Joe Nathan 150274 4 0.000 0.000 0.000 0.000 0.000 1.000 +4 Buddy Carlyle 234194 9 0.000 0.222 0.000 0.000 0.333 0.444 +5 Jason Grilli 276351 14 0.000 0.000 0.214 0.000 0.286 0.500 +6 Kevin Gregg 276514 17 0.000 0.000 0.118 0.176 0.235 0.471 +7 Joaquin Benoit 276542 19 0.053 0.053 0.105 0.000 0.158 0.632 +8 Ryan Vogelsong 285064 99 0.010 0.051 0.141 0.061 0.182 0.556 +9 Jeremy Affeldt 346793 5 0.000 0.000 0.200 0.000 0.000 0.800 +10 Grant Balfour 346797 21 0.095 0.000 0.000 0.048 0.333 0.524 +``` + +Example (batters): + +```r +> edge_scrape("2015-04-06", "2015-04-07", "batter") %>% .[, c(1:3,7:12)] %>% head(10) + batter_name batter All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone + (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) +1 Bartolo Colon 112526 7 0.000 0.000 0.429 0.000 0.143 0.429 +2 Torii Hunter 116338 19 0.000 0.105 0.105 0.105 0.000 0.684 +3 David Ortiz 120074 18 0.056 0.000 0.111 0.056 0.222 0.556 +4 Alex Rodriguez 121347 17 0.000 0.000 0.353 0.000 0.118 0.529 +5 Aramis Ramirez 133380 23 0.000 0.087 0.261 0.000 0.261 0.391 +6 Adrian Beltre 134181 26 0.000 0.038 0.154 0.115 0.231 0.462 +7 Carlos Beltran 136860 22 0.136 0.045 0.136 0.000 0.136 0.545 +8 Michael Cuddyer 150212 14 0.000 0.214 0.214 0.000 0.214 0.357 +9 Jimmy Rollins 276519 41 0.024 0.122 0.049 0.049 0.220 0.537 +10 Ryan Vogelsong 285064 10 0.000 0.200 0.300 0.000 0.200 0.300 +``` diff --git a/R/batter_boxscore.R b/R/batter_boxscore.R index 46238495..a68dd419 100644 --- a/R/batter_boxscore.R +++ b/R/batter_boxscore.R @@ -4,12 +4,15 @@ #' #' @param x A 
boxscore.xml url for a given game from the MLBAM GameDay app data. #' @keywords MLB, PITCHf/x, Game Day, boxscore, sabermetrics -#' @importFrom XML xmlToList -#' @importFrom XML xmlParse +#' @importFrom XML xmlToList xmlParse #' @importFrom dplyr bind_rows #' @export #' @examples -#' batter_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +#' # batters +#' url_base <- "http://gd2.mlb.com/components/game/mlb/" +#' url <- paste0(url_base, +#' "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +#' batter_boxscore(url) batter_boxscore <- function(x) { diff --git a/R/fg_park_hand.R b/R/fg_park_hand.R index 2ceb755a..0ddc1192 100644 --- a/R/fg_park_hand.R +++ b/R/fg_park_hand.R @@ -2,6 +2,7 @@ #' #' This function allows you to scrape park factors by handedness from FanGraphs.com for a given single year. #' @param yr Season for which you want to scrape the park factors. +#' @importFrom stats setNames #' @keywords MLB, sabermetrics #' @export #' @examples @@ -11,7 +12,7 @@ fg_park_hand <- function(yr) { read_html(paste0("http://www.fangraphs.com/guts.aspx?type=pfh&teamid=0&season=", yr)) %>% html_node(xpath = '//*[(@id = "GutsBoard1_dg1_ctl00")]') %>% html_table %>% - setNames(c("season", "home_team", "single_as_LHH", "single_as_RHH", + stats::setNames(c("season", "home_team", "single_as_LHH", "single_as_RHH", "double_as_LHH", "double_as_RHH", "triple_as_LHH", "triple_as_RHH", "hr_as_LHH", "hr_as_RHH")) } \ No newline at end of file diff --git a/R/label_statcast_imputed_data.R b/R/label_statcast_imputed_data.R index 5234e9a3..bac9a3ec 100644 --- a/R/label_statcast_imputed_data.R +++ b/R/label_statcast_imputed_data.R @@ -6,35 +6,38 @@ #' have been imputed. #' #' @param statcast_df A dataframe containing Statcast batted ball data -#' @param impute_file A csv file giving the launch angle, launch speed, bb_type, events fields to label -#' as imputed. 
if NULL then it's read from the `extdata` folder of the package. +#' @param impute_file A CSV file giving the launch angle, launch speed, +#' \code{bb_type}, events fields to label +#' as imputed. if NULL then it's read from the \code{extdata} folder of the package. #' @param inverse_precision inverse of how many digits to truncate the launch angle -#' and speed to for comparison. default is 10000, i.e. keep 4 digits of precision. +#' and speed to for comparison. Default is \code{10000}, i.e. keep 4 digits of precision. #' @keywords MLB, Statcast, sabermetrics #' @importFrom dplyr bind_rows #' @importFrom dplyr left_join -#' @return A copy of the input dataframe with a new column "imputed" appended. imputed +#' @importFrom readr read_csv +#' @return A copy of the input dataframe with a new column \code{imputed} appended. imputed #' is 1 if launch angle and launch speed are likely imputed, 0 otherwise. #' @export #' @examples -#' #' \dontrun{ -#' statcast_df = scrape_statcast_savant("2017-05-01", "2017-05-02") -#' statcast_df = label_statcast_imputed_data(statcast_df) -#' mean(statcast_df$imputed) +#' \dontrun{ +#' statcast_df <- scrape_statcast_savant("2017-05-01", "2017-05-02") +#' sc_df <- label_statcast_imputed_data(statcast_df) +#' mean(sc_df$imputed) #' } -label_statcast_imputed_data <- function(statcast_df, impute_file=NULL, inverse_precision=10000) { +label_statcast_imputed_data <- function(statcast_df, impute_file = NULL, + inverse_precision = 10000) { if (is.null(impute_file)) { - impute_file = system.file("extdata/statcast_impute.csv", package = "baseballr") + impute_file <- system.file("extdata/statcast_impute.csv", package = "baseballr") } - imputed_df = read.csv(impute_file, stringsAsFactors = FALSE) + imputed_df <- suppressMessages(readr::read_csv(impute_file)) imputed_df$imputed <- 1 tmp <- dplyr::left_join( - statcast_df %>% mutate(ila=as.integer(launch_angle * inverse_precision), - ils=as.integer(launch_speed * inverse_precision)), - imputed_df, 
by=c("ils", "ila", "bb_type", "events")) + statcast_df %>% mutate(ila = as.integer(launch_angle * inverse_precision), + ils = as.integer(launch_speed * inverse_precision)), + imputed_df, by = c("ils", "ila", "bb_type", "events")) tmp$imputed <- ifelse(is.na(tmp$imputed), 0, 1) tmp } diff --git a/R/linear_weights_savant.R b/R/linear_weights_savant.R index 09c6bd45..496572fb 100644 --- a/R/linear_weights_savant.R +++ b/R/linear_weights_savant.R @@ -1,7 +1,7 @@ #' Generate linear weight values for events using Baseball Savant data #' #' This function allows a user to generate linear weight values for events using Baseball Savant data. Output includes both linear weights above average and linear weights above outs for home runs, triples, doubles, singles, walks, hit by pitches, and outs. -#' @param df A data frame generated from Baseball Savant that has been run through the baseballr::run_expectancy_code() function. +#' @param df A data frame generated from Baseball Savant that has been run through the \code{\link{run_expectancy_code}} function. #' @keywords MLB, sabermetrics #' @importFrom dplyr filter group_by summarise arrange mutate add_row #' @export diff --git a/R/pitcher_boxscore.R b/R/pitcher_boxscore.R index dfd9b1f9..6df7121d 100644 --- a/R/pitcher_boxscore.R +++ b/R/pitcher_boxscore.R @@ -1,14 +1,11 @@ -#' Retrieve pitcher boxscore data for a single game played -#' -#' This function allows a user to retrieve a boxscore of pitcher statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away pitchers. -#' -#' @param x A boxscore.xml url for a given game from the MLBAM GameDay app data. 
-#' @keywords MLB, PITCHf/x, Game Day, boxscore, sabermetrics -#' @importFrom XML xmlToList -#' @importFrom XML xmlParse +#' @rdname batter_boxscore #' @export #' @examples -#' pitcher_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +#' # pitchers +#' url_base <- "http://gd2.mlb.com/components/game/mlb/" +#' url <- paste0(url_base, +#' "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +#' pitcher_boxscore(url) pitcher_boxscore <- function(x) { url <- x diff --git a/R/playerid_lookup.R b/R/playerid_lookup.R index 9cb8e149..d2d4a274 100644 --- a/R/playerid_lookup.R +++ b/R/playerid_lookup.R @@ -4,24 +4,27 @@ #' @param last_name A text string used to return results for players with that string in their last name. #' @param first_name A text string used to return results for players with that string in their first name. #' @keywords MLB, sabermetrics +#' @importFrom readr read_csv #' @export #' @examples -#' \dontrun{playerid_lookup("Garcia", "Karim")} +#' \dontrun{ +#' playerid_lookup("Garcia", "Karim") +#' } -playerid_lookup <- function(last_name=NULL, first_name=NULL) { +playerid_lookup <- function(last_name = NULL, first_name = NULL) { if (!exists("chadwick_player_lu_table")) { - print("Be patient, this may take a few seconds...") - print("Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)") + message("Be patient, this may take a few seconds...") + message("Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)") url <- "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv" - chadwick_player_lu_table <- read.csv(url) + suppressMessages( + chadwick_player_lu_table <- readr::read_csv(url) + ) assign("chadwick_player_lu_table", chadwick_player_lu_table, envir = .GlobalEnv) x <- process_player_name(last_name, first_name) names(x) <- c("first_name", "last_name", 
"given_name", "name_suffix", "nick_name", "birth_year", "mlb_played_first", "mlbam_id", "retrosheet_id", "bbref_id", "fangraphs_id") - x$fangraphs_id <- as.character(x$fangraphs_id) %>% as.numeric() - x$birth_year <- as.character(x$birth_year) %>% as.numeric() x } @@ -35,17 +38,19 @@ playerid_lookup <- function(last_name=NULL, first_name=NULL) { } } -process_player_name <- function(last_name=NULL, first_name=NULL) { +#' @importFrom dplyr filter select + +process_player_name <- function(last_name = NULL, first_name = NULL) { if (is.null(first_name)) { x <- chadwick_player_lu_table %>% - filter(grepl(last_name, name_last)) %>% - select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs) + dplyr::filter(grepl(last_name, name_last)) %>% + dplyr::select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs) } else { x <- chadwick_player_lu_table %>% - filter(grepl(last_name, name_last)) %>% - filter(grepl(first_name, name_first)) %>% - select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs) + dplyr::filter(grepl(last_name, name_last)) %>% + dplyr::filter(grepl(first_name, name_first)) %>% + dplyr::select(name_first, name_last, name_given, name_suffix, name_nick, birth_year, mlb_played_first, key_mlbam, key_retro, key_bbref, key_fangraphs) } x } \ No newline at end of file diff --git a/R/process_statcast_payload.R b/R/process_statcast_payload.R index 633c5c74..cff397d9 100644 --- a/R/process_statcast_payload.R +++ b/R/process_statcast_payload.R @@ -1,56 +1,25 @@ #' Process Baseball Savant CSV payload #' -#' This is a helper function for all scrape_statcast_savant functions. 
The function processes the initial csv payload acquired from baseballsavant to ensure consistency in formattting across downloads -#' @param payload payload from a Baseball Savant request, e.g. from utils::read.csv +#' This is a helper function for all scrape_statcast_savant functions. +#' The function processes the initial csv payload acquired from +#' baseballsavant to ensure consistency in formatting across downloads +#' @param payload payload from a Baseball Savant request, e.g. +#' from \code{\link[readr]{read_csv}} #' @keywords MLB, sabermetrics, Statcast +#' @importFrom dplyr mutate_ #' @export +#' @examples #' \dontrun{ #' process_statcast_payload(payload) +#' } process_statcast_payload <- function(payload) { # Clean up formatting. - payload[payload=="null"] <- NA - payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d") - payload$des <- as.character(payload$des) - payload$game_pk <- as.character(payload$game_pk) %>% as.numeric() - payload$hc_x <- as.character(payload$hc_x) %>% as.numeric() - payload$hc_y <- as.character(payload$hc_y) %>% as.numeric() - payload$on_1b <- as.character(payload$on_1b) %>% as.numeric() - payload$on_2b <- as.character(payload$on_2b) %>% as.numeric() - payload$on_3b <- as.character(payload$on_3b) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric() - payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric() - payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric() - payload$pfx_x <- as.character(payload$pfx_x) %>% as.numeric() - payload$pfx_z <- as.character(payload$pfx_z) %>% as.numeric() - payload$plate_x <- as.character(payload$plate_x) %>% as.numeric() - payload$plate_z <- as.character(payload$plate_z) %>% as.numeric() - payload$vx0 <- as.character(payload$vx0) %>% as.numeric() -
payload$vy0 <- as.character(payload$vy0) %>% as.numeric() - payload$vz0 <- as.character(payload$vz0) %>% as.numeric() - payload$ax <- as.character(payload$ax) %>% as.numeric() - payload$az <- as.character(payload$az) %>% as.numeric() - payload$ay <- as.character(payload$ay) %>% as.numeric() - payload$sz_bot <- as.character(payload$sz_bot) %>% as.numeric() - payload$sz_top <- as.character(payload$sz_top) %>% as.numeric() - payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric() - payload$release_speed <- as.character(payload$release_speed) %>% as.numeric() - payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric() - payload$release_extension <- as.character(payload$release_extension) %>% as.numeric() - payload$pitch_name <- as.character(payload$pitch_name) - payload$home_score <- as.character(payload$home_score) %>% as.numeric() - payload$away_score <- as.character(payload$away_score) %>% as.numeric() - payload$bat_score <- as.character(payload$bat_score) %>% as.numeric() - payload$fld_score <- as.character(payload$fld_score) %>% as.numeric() - payload$post_away_score <- as.character(payload$post_away_score) %>% as.numeric() - payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric() - payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric() - payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric() - payload$zone <- as.character(payload$zone) %>% as.numeric() - payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)) + payload <- payload %>% + dplyr::mutate_( + barrel = ~ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0) + ) return(payload) diff --git a/R/scrape_statcast.R b/R/scrape_statcast.R index 27468788..e8fddd3a 100644 --- a/R/scrape_statcast.R +++ 
b/R/scrape_statcast.R @@ -1,93 +1,223 @@ -#' Query Statcast and PITCHf/x Data for data from baseballsavant.mlb.com +#' Query Statcast and PITCHf/x Data for data from \url{http://baseballsavant.mlb.com} #' -#' This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. -#' @param start_date Date of first game for which you want data. Format must be in YYYY-MM-DD format. -#' @param end_date Date of last game for which you want data. Format must be in YYYY-MM-DD format. -#' @param playerid The MLBAM ID for the player who's data you want to query. -#' @param player_type The player type. Can be 'batter' or 'pitcher' +#' This function allows you to query Statcast and PITCHf/x data as provided on \url{http://baseballsavant.mlb.com} and have that data returned as a \code{\link{data.frame}}. +#' @param start_date Date of first game for which you want data. +#' Format must be in YYYY-MM-DD format. +#' @param end_date Date of last game for which you want data. +#' Format must be in YYYY-MM-DD format. +#' @param playerid The MLBAM ID for the player whose data you want to query. +#' @param player_type The player type. Can be \code{batter} or \code{pitcher}. +#' Default is \code{batter} +#' @param ... 
currently ignored #' @keywords MLB, sabermetrics, Statcast -#' @importFrom utils read.csv +#' @importFrom tibble tribble +#' @importFrom dplyr mutate_ filter_ +#' @importFrom lubridate year #' @export #' @examples #' \dontrun{ -#' scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-15", playerid = 621043, player_type='batter') +#' correa <- scrape_statcast_savant(start_date = "2016-04-06", +#' end_date = "2016-04-15", playerid = 621043) #' -#' scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-15", playerid = 592789, player_type='pitcher') +#' noah <- scrape_statcast_savant(start_date = "2016-04-06", +#' end_date = "2016-04-15", playerid = 592789, player_type = 'pitcher') #' -#' scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-06") +#' daily <- scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-06") #' } -scrape_statcast_savant <- function(start_date, end_date, playerid=NULL, player_type=NULL) { - # Check to make sure args are in the correct format. - if(!is.character(start_date) | !is.character(end_date)) { - warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") - return(NULL) - } +scrape_statcast_savant <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(), + playerid = NULL, + player_type = "batter", ...) UseMethod("scrape_statcast_savant") + +#' @rdname scrape_statcast_savant +#' @export + +scrape_statcast_savant.Date <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(), + playerid = NULL, player_type = "batter", ...) { # Check for other user errors. - if(as.Date(start_date)<="2015-03-01") { # March 1, 2015 was the first date of Spring Training. + if (start_date <= "2015-03-01") { # March 1, 2015 was the first date of Spring Training. message("Some metrics such as Exit Velocity and Batted Ball Events have only been compiled since 2015.") } - if(as.Date(start_date)<="2008-03-25") { # March 25, 2008 was the first date of Spring Training. 
+ if (start_date <= "2008-03-25") { # March 25, 2008 was the first date of Spring Training. stop("The data are limited to the 2008 MLB season and after.") return(NULL) } - if(as.Date(start_date)==Sys.Date()) { + if (start_date == Sys.Date()) { message("The data are collected daily at 3 a.m. Some of today's games may not be included.") } - if(as.Date(start_date)>as.Date(end_date)) { + if (start_date > as.Date(end_date)) { stop("The start date is later than the end date.") return(NULL) } - - # extract season from start_date - - year <- substr(start_date, 1,4) - - if(is.null(playerid) & is.null(player_type)) { - warning("No player_type specified. Player_type will default to 'batter'.") - warning("No playerid specified. Collecting data for all batters.") - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - -} else if (!is.null(playerid) & is.null(player_type)) { - warning("No player_type specified. 
Player_type will default to 'batter'.") - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&batters_lookup%5B%5D=", playerid, "&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - -} else if (!is.null(playerid) & player_type=='batter') { - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=", player_type, "&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&batters_lookup%5B%5D=", playerid, "&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - -} else if (!is.null(playerid) & player_type=='pitcher') { - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=", player_type, "&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&pitchers_lookup%5B%5D=", playerid, "&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - -} else if (is.null(playerid) & player_type=='pitcher'){ - warning("Collecting data for all pitchers for dates specified.") - url <- 
paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - -} else { - warning("Collecting data for all batters for dates specified.") - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=", start_date, "&game_date_lt=", end_date, "&team=&position=&hfRO=&home_road=&&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") -} - + + playerid_var <- ifelse(player_type == "pitcher", + "pitchers_lookup%5B%5D", "batters_lookup%5B%5D") + + vars <- tibble::tribble( + ~var, ~value, + "all", "true", + "hfPT", "", + "hfAB", "", + "hfBBT", "", + "hfPR", "", + "hfZ", "", + "stadium", "", + "hfBBL", "", + "hfNewZones", "", + "hfGT", "R%7CPO%7CS%7C&hfC", + "hfSea", paste0(lubridate::year(start_date), "%7C"), + "hfSit", "", + "hfOuts", "", + "opponent", "", + "pitcher_throws", "", + "batter_stands", "", + "hfSA", "", + "player_type", player_type, + "hfInfield", "", + "team", "", + "position", "", + "hfOutfield", "", + "hfRO", "", + "home_road", "", + playerid_var, ifelse(is.null(playerid), "", playerid), + "game_date_gt", as.character(start_date), + "game_date_lt", as.character(end_date), + "hfFlag", "", + "hfPull", "", + "metric_1", "", + "hfInn", "", + "min_pitches", 0, + "min_results", 0, + "group_by", "name", + "sort_col", "pitches", + 
"player_event_sort", "h_launch_speed", + "sort_order", "desc", + "min_abs", 0, + "type", "details" + ) %>% + dplyr::mutate_(pairs = ~paste(var, "=", value, sep = "")) + + if (is.null(playerid)) { + message("No playerid specified. Collecting data for all batters/pitchers.") + vars <- dplyr::filter_(vars, ~!grepl("lookup", var)) + } + + url_vars <- paste0(vars$pairs, collapse = "&") + url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?", url_vars) + message(url) + # Do a try/catch to show errors that the user may encounter while downloading. tryCatch( { - print("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved.") - print("Grabbing data, this may take a minute...") - payload <- utils::read.csv(url) - processed_payload <- process_statcast_payload(payload) - message("URL read and payload acquired successfully.") - return(processed_payload) + message("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved.") + message("Grabbing data, this may take a minute...") + suppressMessages( + suppressWarnings( + payload <- readr::read_csv(url, na = "null") + ) + ) }, - error=function(cond) { - message(paste("URL does not seem to exist, please check your Internet connection:")) + error = function(cond) { + message("URL does not seem to exist, please check your Internet connection") message("Original error message:") message(cond) - return(NA) + stop("No payload acquired") }, - warning=function(cond) { - message(paste("URL caused a warning. Make sure your playerid, player_type, and date range are correct:")) + # this will never run?? 
+ warning = function(cond) { message("Original warning message:") message(cond) - return(NULL) } ) + if (ncol(payload) > 1) { + message("URL read and payload acquired successfully.") + return(process_statcast_payload(payload)) + } else { + warning("No valid data found") + message("Make sure your playerid, player_type, and date range are correct") + return(payload) + } +} + +#' @rdname scrape_statcast_savant +#' @export + +scrape_statcast_savant.default <- function(start_date = Sys.Date() - 1, end_date = Sys.Date(), + playerid = NULL, player_type = "batter", ...) { + # Check to make sure args are in the correct format. + # if(!is.character(start_date) | !is.character(end_date)) { + # warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") + # return(NULL) + # } + message(paste0(start_date, " is not a date. Attempting to coerce...")) + start_Date <- as.Date(start_date) + + tryCatch( + { + end_Date <- as.Date(end_date) + }, + warning = function(cond) { + message(paste0(end_date, " was not coercible into a date. Using today.")) + end_Date <- Sys.Date() + message("Original warning message:") + message(cond) + } + ) + + scrape_statcast_savant(start_Date, end_Date, + playerid, player_type, ...) + +} + + +#' @rdname scrape_statcast_savant +#' @param batterid The MLBAM ID for the batter whose data you want to query. +#' @export +#' @examples +#' \dontrun{ +#' correa <- scrape_statcast_savant_batter(start_date = "2016-04-06", +#' end_date = "2016-04-15", batterid = 621043) +#' } + +scrape_statcast_savant_batter <- function(start_date, end_date, batterid, ...) { + scrape_statcast_savant(start_date, end_date, playerid = batterid, + player_type = "batter", ...) +} + +#' @rdname scrape_statcast_savant +#' @export +#' @examples +#' \dontrun{ +#' daily <- scrape_statcast_savant_batter_all(start_date = "2016-04-06", +#' end_date = "2016-04-06") +#' } + +scrape_statcast_savant_batter_all <- function(start_date, end_date, ...) 
{ + scrape_statcast_savant(start_date, end_date, player_type = "batter", ...) +} + +#' @rdname scrape_statcast_savant +#' @param pitcherid The MLBAM ID for the pitcher whose data you want to query. +#' @export +#' @examples +#' \dontrun{ +#' noah <- scrape_statcast_savant_pitcher(start_date = "2016-04-06", +#' end_date = "2016-04-15", pitcherid = 592789) +#' } + +scrape_statcast_savant_pitcher <- function(start_date, end_date, pitcherid, ...) { + scrape_statcast_savant(start_date, end_date, playerid = pitcherid, + player_type = "pitcher", ...) +} + +#' @rdname scrape_statcast_savant +#' @export +#' @examples +#' \dontrun{ +#' daily <- scrape_statcast_savant_pitcher_all(start_date = "2016-04-06", +#' end_date = "2016-04-06") +#' } + +scrape_statcast_savant_pitcher_all <- function(start_date, end_date, ...) { + scrape_statcast_savant(start_date, end_date, player_type = "pitcher", ...) } diff --git a/R/scrape_statcast_savant_batter.R b/R/scrape_statcast_savant_batter.R deleted file mode 100644 index 3b3d30ac..00000000 --- a/R/scrape_statcast_savant_batter.R +++ /dev/null @@ -1,95 +0,0 @@ -#' Query Statcast and PITCHf/x Data for Batters from baseballsavant.mlb.com -#' -#' This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. -#' @param start_date Date of first game for which you want data. Format must be in YYYY-MM-DD format. -#' @param end_date Date of last game for which you want data. Format must be in YYYY-MM-DD format. -#' @param batterid The MLBAM ID for the batter who's data you want to query. -#' @keywords MLB, sabermetrics, Statcast -#' @importFrom utils read.csv -#' @export -#' @examples -#' \dontrun{ -#' scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) -#' } - -scrape_statcast_savant_batter <- function(start_date, end_date, batterid) { - # Check to make sure args are in the correct format. 
- if(!is.character(start_date) | !is.character(end_date)) { - warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") - return(NULL) - } - # Check for other user errors. - if(as.Date(start_date)<="2015-03-01") { # March 1, 2015 was the first date of Spring Training. - message("Some metrics such as Exit Velocity and Batted Ball Events have only been compiled since 2015.") - } - if(as.Date(start_date)<="2008-03-25") { # March 25, 2008 was the first date of Spring Training. - stop("The data are limited to the 2008 MLB season and after.") - return(NULL) - } - if(as.Date(start_date)==Sys.Date()) { - message("The data are collected daily at 3 a.m. Some of today's games may not be included.") - } - if(as.Date(start_date)>as.Date(end_date)) { - stop("The start date is later than the end date.") - return(NULL) - } - - # extract season from start_date - - year <- substr(start_date, 1,4) - - # Base URL. - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=",start_date,"&game_date_lt=",end_date,"&player_lookup%5B%5D=",batterid,"&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - - # Do a try/catch to show errors that the user may encounter while downloading. - tryCatch( - { - print("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. 
All rights reserved.") - print("Grabbing data, this may take a minute...") - payload <- utils::read.csv(url) - - }, - error=function(cond) { - message(paste("URL does not seem to exist, please check your Internet connection:")) - message("Original error message:") - message(cond) - return(NA) - }, - warning=function(cond) { - message(paste("URL caused a warning. Make sure your batterid and date range are correct:")) - message("Original warning message:") - message(cond) - return(NULL) - } - ) - # Clean up formatting. - payload[payload=="null"] <- NA - payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d") - payload$des <- as.character(payload$des) - payload$game_pk <- as.character(payload$game_pk) %>% as.numeric() - payload$on_1b <- as.character(payload$on_1b) %>% as.numeric() - payload$on_2b <- as.character(payload$on_2b) %>% as.numeric() - payload$on_3b <- as.character(payload$on_3b) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric() - payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric() - payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric() - payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric() - payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric() - payload$release_extension <- as.character(payload$release_extension) %>% as.numeric() - payload$pitch_name <- as.character(payload$pitch_name) - payload$home_score <- as.character(payload$home_score) %>% as.numeric() - payload$away_score <- as.character(payload$away_score) %>% as.numeric() - payload$bat_score <- as.character(payload$bat_score) %>% as.numeric() - payload$fld_score <- as.character(payload$fld_score) %>% as.numeric() - payload$post_away_score <- as.character(payload$post_away_score) 
%>% as.numeric() - payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric() - payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric() - payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric() - payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)) - message("URL read and payload aquired successfully.") - - return(payload) - -} diff --git a/R/scrape_statcast_savant_batter_all.R b/R/scrape_statcast_savant_batter_all.R deleted file mode 100644 index 9d2181b0..00000000 --- a/R/scrape_statcast_savant_batter_all.R +++ /dev/null @@ -1,94 +0,0 @@ -#' Query Statcast and PITCHf/x Data for All Batters from baseballsavant.mlb.com -#' -#' This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. Query returns data for all batters over a given time frame. -#' @param start_date Date of first game for which you want data. Format must be in YYYY-MM-DD format. -#' @param end_date Date of last game for which you want data. Format must be in YYYY-MM-DD format. -#' @keywords MLB, sabermetrics, Statcast -#' @importFrom utils read.csv -#' @export -#' @examples -#' \dontrun{ -#' scrape_statcast_savant_batter_all(start_date = "2016-04-06", end_date = "2016-04-15") -#' } - -scrape_statcast_savant_batter_all <- function(start_date, end_date) { - # Check to make sure args are in the correct format. - if(!is.character(start_date) | !is.character(end_date)) { - warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") - return(NULL) - } - # Check for other user errors. - if(as.Date(start_date)<="2015-03-01") { # March 1, 2015 was the first date of Spring Training. 
- message("Some metrics such as Exit Velocity and Batted Ball Events have only been compiled since 2015.") - } - if(as.Date(start_date)<="2008-03-25") { # March 25, 2008 was the first date of Spring Training. - stop("The data are limited to the 2008 MLB season and after.") - return(NULL) - } - if(as.Date(start_date)==Sys.Date()) { - message("The data are collected daily at 3 a.m. Some of today's games may not be included.") - } - if(as.Date(start_date)>as.Date(end_date)) { - stop("The start date is later than the end date.") - return(NULL) - } - - # extract season from start_date - - year <- substr(start_date, 1,4) - - # Base URL. - url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=",start_date,"&game_date_lt=",end_date,"&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - - # Do a try/catch to show errors that the user may encounter while downloading. - tryCatch( - { - print("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved.") - print("Grabbing data, this may take a minute...") - payload <- utils::read.csv(url) - - }, - error=function(cond) { - message(paste("URL does not seem to exist, please check your Internet connection:")) - message("Original error message:") - message(cond) - return(NA) - }, - warning=function(cond) { - message(paste("URL caused a warning. Make sure your date range is correct:")) - message("Original warning message:") - message(cond) - return(NULL) - } - ) - # Clean up formatting. 
- payload[payload=="null"] <- NA - payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d") - payload$des <- as.character(payload$des) - payload$game_pk <- as.character(payload$game_pk) %>% as.numeric() - payload$on_1b <- as.character(payload$on_1b) %>% as.numeric() - payload$on_2b <- as.character(payload$on_2b) %>% as.numeric() - payload$on_3b <- as.character(payload$on_3b) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric() - payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric() - payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric() - payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric() - payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric() - payload$release_extension <- as.character(payload$release_extension) %>% as.numeric() - payload$pitch_name <- as.character(payload$pitch_name) - payload$home_score <- as.character(payload$home_score) %>% as.numeric() - payload$away_score <- as.character(payload$away_score) %>% as.numeric() - payload$bat_score <- as.character(payload$bat_score) %>% as.numeric() - payload$fld_score <- as.character(payload$fld_score) %>% as.numeric() - payload$post_away_score <- as.character(payload$post_away_score) %>% as.numeric() - payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric() - payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric() - payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric() - payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)) - message("URL read and payload aquired successfully.") - - return(payload) - -} diff --git 
a/R/scrape_statcast_savant_pitcher.R b/R/scrape_statcast_savant_pitcher.R deleted file mode 100644 index 02f3f984..00000000 --- a/R/scrape_statcast_savant_pitcher.R +++ /dev/null @@ -1,97 +0,0 @@ -#' Query Statcast and PITCHf/x Data for Pitchers from baseballsavant.mlb.com -#' -#' This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. -#' @param start_date Date of first game for which you want data. Format must be in YYYY-MM-DD format. -#' @param end_date Date of last game for which you want data. Format must be in YYYY-MM-DD format. -#' @param pitcherid The MLBAM ID for the pitcher who's data you want to query. -#' @keywords MLB, sabermetrics, Statcast -#' @importFrom utils read.csv -#' @export -#' @examples -#' \dontrun{ -#' scrape_statcast_savant_pitcher(start_date = "2016-04-06", -#' end_date = "2016-04-15", pitcherid = 592789) -#' } - -scrape_statcast_savant_pitcher <- function(start_date, end_date, pitcherid) { - # Check to make sure args are in the correct format. - if(!is.character(start_date) | !is.character(end_date)) { - warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") - return(NULL) - } - # Check for other user errors. - if(as.Date(start_date)<="2015-03-01") { # March 1, 2015 was the first date of Spring Training. - message("Some metrics such as Exit Velocity and Batted Ball Events have only been compiled since 2015.") - } - if(as.Date(start_date)<="2008-03-25") { # March 25, 2008 was the first date of Spring Training. - stop("The data are limited to the 2008 MLB season and after.") - return(NULL) - } - if(as.Date(start_date)==Sys.Date()) { - message("The data are collected daily at 3 a.m. Some of today's games may not be included.") - } - if(as.Date(start_date)>as.Date(end_date)) { - stop("The start date is later than the end date.") - return(NULL) - } - - # extract season from start_date - - year <- substr(start_date, 1,4) - - # Base URL. 
- url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=",start_date,"&game_date_lt=",end_date,"&player_lookup%5B%5D=",pitcherid,"&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - - # Do a try/catch to show errors that the user may encounter while downloading. - tryCatch( - { - print("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved.") - print("Grabbing data, this may take a minute...") - payload <- utils::read.csv(url) - - }, - error=function(cond) { - message(paste("URL does not seem to exist, please check your Internet connection:")) - message("Original error message:") - message(cond) - return(NA) - }, - warning=function(cond) { - message(paste("URL caused a warning. Make sure your pitcherid and date range are correct:")) - message("Original warning message:") - message(cond) - return(NULL) - } - ) - # Clean up formatting. 
- payload[payload=="null"] <- NA - payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d") - payload$des <- as.character(payload$des) - payload$game_pk <- as.character(payload$game_pk) %>% as.numeric() - payload$on_1b <- as.character(payload$on_1b) %>% as.numeric() - payload$on_2b <- as.character(payload$on_2b) %>% as.numeric() - payload$on_3b <- as.character(payload$on_3b) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric() - payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric() - payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric() - payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric() - payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric() - payload$release_extension <- as.character(payload$release_extension) %>% as.numeric() - payload$pitch_name <- as.character(payload$pitch_name) - payload$home_score <- as.character(payload$home_score) %>% as.numeric() - payload$away_score <- as.character(payload$away_score) %>% as.numeric() - payload$bat_score <- as.character(payload$bat_score) %>% as.numeric() - payload$fld_score <- as.character(payload$fld_score) %>% as.numeric() - payload$post_away_score <- as.character(payload$post_away_score) %>% as.numeric() - payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric() - payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric() - payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric() - payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)) - message("URL read and payload aquired successfully.") - - return(payload) - -} - diff 
--git a/R/scrape_statcast_savant_pitcher_all.R b/R/scrape_statcast_savant_pitcher_all.R deleted file mode 100644 index d441e2ea..00000000 --- a/R/scrape_statcast_savant_pitcher_all.R +++ /dev/null @@ -1,96 +0,0 @@ -#' Query Statcast and PITCHf/x Data for Pitchers from baseballsavant.mlb.com -#' -#' This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. Query returns data for all pitchers over a given time frame. -#' @param start_date Date of first game for which you want data. Format must be in YYYY-MM-DD format. -#' @param end_date Date of last game for which you want data. Format must be in YYYY-MM-DD format. -#' @keywords MLB, sabermetrics, Statcast -#' @importFrom utils read.csv -#' @export -#' @examples -#' \dontrun{ -#' scrape_statcast_savant_pitcher(start_date = "2016-04-06", -#' end_date = "2016-04-15", pitcherid = 592789) -#' } - -scrape_statcast_savant_pitcher_all <- function(start_date, end_date) { - # Check to make sure args are in the correct format. - if(!is.character(start_date) | !is.character(end_date)) { - warning("Please wrap your dates in quotations in 'yyyy-mm-dd' format.") - return(NULL) - } - # Check for other user errors. - if(as.Date(start_date)<="2015-03-01") { # March 1, 2015 was the first date of Spring Training. - message("Some metrics such as Exit Velocity and Batted Ball Events have only been compiled since 2015.") - } - if(as.Date(start_date)<="2008-03-25") { # March 25, 2008 was the first date of Spring Training. - stop("The data are limited to the 2008 MLB season and after.") - return(NULL) - } - if(as.Date(start_date)==Sys.Date()) { - message("The data are collected daily at 3 a.m. Some of today's games may not be included.") - } - if(as.Date(start_date)>as.Date(end_date)) { - stop("The start date is later than the end date.") - return(NULL) - } - - # extract season from start_date - - year <- substr(start_date, 1,4) - - # Base URL. 
- url <- paste0("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C&hfC=&hfSea=", year, "%7C&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=",start_date,"&game_date_lt=",end_date,"&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&") - - # Do a try/catch to show errors that the user may encounter while downloading. - tryCatch( - { - print("These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved.") - print("Grabbing data, this may take a minute...") - payload <- utils::read.csv(url) - - }, - error=function(cond) { - message(paste("URL does not seem to exist, please check your Internet connection:")) - message("Original error message:") - message(cond) - return(NA) - }, - warning=function(cond) { - message(paste("URL caused a warning. Make sure your date range is correct:")) - message("Original warning message:") - message(cond) - return(NULL) - } - ) - # Clean up formatting. 
- payload[payload=="null"] <- NA - payload$game_date <- as.Date(payload$game_date, "%Y-%m-%d") - payload$des <- as.character(payload$des) - payload$game_pk <- as.character(payload$game_pk) %>% as.numeric() - payload$on_1b <- as.character(payload$on_1b) %>% as.numeric() - payload$on_2b <- as.character(payload$on_2b) %>% as.numeric() - payload$on_3b <- as.character(payload$on_3b) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$release_pos_x <- as.character(payload$release_pos_x) %>% as.numeric() - payload$hit_distance_sc <- as.character(payload$hit_distance_sc) %>% as.numeric() - payload$launch_speed <- as.character(payload$launch_speed) %>% as.numeric() - payload$launch_angle <- as.character(payload$launch_angle) %>% as.numeric() - payload$effective_speed <- as.character(payload$effective_speed) %>% as.numeric() - payload$release_spin_rate <- as.character(payload$release_spin_rate) %>% as.numeric() - payload$release_extension <- as.character(payload$release_extension) %>% as.numeric() - payload$pitch_name <- as.character(payload$pitch_name) - payload$home_score <- as.character(payload$home_score) %>% as.numeric() - payload$away_score <- as.character(payload$away_score) %>% as.numeric() - payload$bat_score <- as.character(payload$bat_score) %>% as.numeric() - payload$fld_score <- as.character(payload$fld_score) %>% as.numeric() - payload$post_away_score <- as.character(payload$post_away_score) %>% as.numeric() - payload$post_home_score <- as.character(payload$post_home_score) %>% as.numeric() - payload$post_bat_score <- as.character(payload$post_bat_score) %>% as.numeric() - payload$post_fld_score <- as.character(payload$post_fld_score) %>% as.numeric() - payload$barrel <- with(payload, ifelse(launch_angle <= 50 & launch_speed >= 98 & launch_speed * 1.5 - launch_angle >= 11 & launch_speed + launch_angle >= 124, 1, 0)) - message("URL read and payload acquired successfully.") - - return(payload) - -} - diff 
--git a/README.Rmd b/README.Rmd new file mode 100644 index 00000000..78c3a06a --- /dev/null +++ b/README.Rmd @@ -0,0 +1,107 @@ +--- +output: github_document +--- + + + +```{r, echo = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + warning = FALSE, + message = FALSE, + fig.path = "README-" +) +``` + +# baseballr + +# `baseballr` 0.5 +**(latest version released 2018-05-29)** + +`baseballr` is a package written for R focused on baseball analysis. It includes functions for scraping various data from websites, such as FanGraphs.com, Baseball-Reference.com, and baseballsavant.com. It also includes functions for calculating metrics, such as wOBA, FIP, and team-level consistency over custom time frames. + +You can read more about some of the functions and how to use them at its [official site](http://billpetti.github.io/baseballr/) as well as this [Hardball Times article](http://www.hardballtimes.com/developing-the-baseballr-package-for-r/). + + +## Installation + +You can install `baseballr` from github with: + +```{r gh-installation, eval = FALSE} +# install.packages("devtools") +devtools::install_github("BillPetti/baseballr") +``` + +## Functionality + +The package consists of two main sets of functions: data acquisition and metric calculation. + +For example, if you want to see the standings for a specific MLB division on a given date, you can use the `standings_on_date_bref()` function. Just pass the year, month, day, and division you want: + +```{r} +library(baseballr) +standings_on_date_bref("2015-08-01", "NL East", from = FALSE) +``` + +Right now the function works as far as back as 1994, which is when both leagues split into three divisions. + +You can also pull data for all hitters over a specific date range. 
Here are the results for all hitters from August 1st through October 3rd during the 2015 season: + +```{r} +library(dplyr) +data <- daily_batter_bref("2015-08-01", "2015-10-03") %>% + head() +``` + +In terms of metric calculation, the package allows the user to calculate the consistency of team scoring and run prevention for any year using `team_consistency()`: + +```{r} +team_consistency(2015) +``` + +You can also calculate wOBA per plate appearance and wOBA on contact for any set of data over any date range, provided you have the data available. + +Simply pass the proper data frame to `woba_plus`: + +```{r} +data %>% + filter(PA > 200) %>% + woba_plus %>% + arrange(desc(wOBA)) %>% + select(Name, Team, season, PA, wOBA, wOBA_CON) %>% + head() +``` + + +You can also generate these wOBA-based stats, as well as FIP, for pitchers using the `fip_plus()` function: + + + +```{r} +daily_pitcher_bref("2015-04-05", "2015-04-30") %>% + fip_plus() %>% + select(season, Name, IP, ERA, SO, uBB, HBP, HR, FIP, wOBA_against, wOBA_CON_against) %>% + arrange(desc(IP)) %>% + head(10) +``` + +The `edge_scrape()` function allows the user to scrape PITCHf/x data from the GameDay application using Carson Sievert's [pitchRx](https://github.com/cpsievert/pitchRx) package and to calculate metrics associated with [Edge%](https://billpetti.shinyapps.io/edge_shiny/). The function returns a dataframe grouped by either pitchers or batters and the percentge of pitches in each of the various Edge zones. + +Example (pitchers): + +```{r, message=FALSE} +edge_scrape("2015-04-06", "2015-04-07", "pitcher") %>% + select(-6:-4, -13) %>% + head(10) +``` + +Example (batters): + +```{r, message=FALSE} +edge_scrape("2015-04-06", "2015-04-07", "batter") %>% + select(-6:-4, -13) %>% + head(10) +``` + +More functionality will be added soon. Please leave any suggestions or bugs in the [Issues section](https://github.com/BillPetti/baseballr/issues). 
\ No newline at end of file diff --git a/README.md b/README.md index 2a93b54b..e83f3978 100644 --- a/README.md +++ b/README.md @@ -1,142 +1,216 @@ -# baseballr 0.5
+ + +baseballr +========= + +`baseballr` 0.5 +=============== + **(latest version released 2018-05-29)** `baseballr` is a package written for R focused on baseball analysis. It includes functions for scraping various data from websites, such as FanGraphs.com, Baseball-Reference.com, and baseballsavant.com. It also includes functions for calculating metrics, such as wOBA, FIP, and team-level consistency over custom time frames. You can read more about some of the functions and how to use them at its [official site](http://billpetti.github.io/baseballr/) as well as this [Hardball Times article](http://www.hardballtimes.com/developing-the-baseballr-package-for-r/). -It can be installed by using [`devtools`](https://github.com/hadley/devtools): +Installation +------------ + +You can install `baseballr` from github with: -```R -require(devtools) -install_github("BillPetti/baseballr") -require(baseballr) +``` r +# install.packages("devtools") +devtools::install_github("BillPetti/baseballr") ``` + +Functionality +------------- + The package consists of two main sets of functions: data acquisition and metric calculation. For example, if you want to see the standings for a specific MLB division on a given date, you can use the `standings_on_date_bref()` function. 
Just pass the year, month, day, and division you want: -```R -> standings_on_date_bref("2015-08-01", "NL East", from = FALSE) -$`NL East` - Tm W L W-L% GB RS RA pythW-L% -1 WSN 54 48 0.529 -- 422 391 0.535 -2 NYM 54 50 0.519 1.0 368 373 0.494 -3 ATL 46 58 0.442 9.0 379 449 0.423 -4 MIA 42 62 0.404 13.0 370 408 0.455 -5 PHI 41 64 0.390 14.5 386 511 0.374 +``` r +library(baseballr) +standings_on_date_bref("2015-08-01", "NL East", from = FALSE) +#> $`NL East_up to_2015-08-01` +#> Tm W L W-L% GB RS RA pythW-L% +#> 1 WSN 54 48 0.529 -- 422 391 0.535 +#> 2 NYM 54 50 0.519 1.0 368 373 0.494 +#> 3 ATL 46 58 0.442 9.0 379 449 0.423 +#> 4 MIA 42 62 0.404 13.0 370 408 0.455 +#> 5 PHI 41 64 0.390 14.5 386 511 0.374 ``` + Right now the function works as far as back as 1994, which is when both leagues split into three divisions. You can also pull data for all hitters over a specific date range. Here are the results for all hitters from August 1st through October 3rd during the 2015 season: -```R -> head(daily_batter_bref("2015-08-01", "2015-10-03")) - season Name Age Level Team G PA AB R H X1B X2B X3B HR RBI BB IBB uBB SO HBP SH SF GDP SB CS BA OBP -1 2015 Manny Machado 22 MLB-AL Baltimore 59 266 237 36 66 43 10 0 13 32 26 1 25 42 2 0 1 5 6 4 0.278 0.353 -2 2015 Matt Duffy 24 MLB-NL San Francisco 59 264 248 33 71 54 12 2 3 30 15 0 15 35 0 0 1 9 8 0 0.286 0.326 -3 2015 Jose Altuve 25 MLB-AL Houston 57 262 244 30 81 53 19 3 6 18 10 1 9 28 4 1 3 6 11 4 0.332 0.364 -4 2015 Adam Eaton 26 MLB-AL Chicago 58 262 230 37 74 56 12 1 5 31 23 1 22 55 5 2 2 1 9 4 0.322 0.392 -5 2015 Shin-Soo Choo 32 MLB-AL Texas 58 260 211 48 71 47 14 1 9 34 39 1 38 51 8 1 1 1 2 0 0.336 0.456 -6 2015 Francisco Lindor 21 MLB-AL Cleveland 58 259 224 35 79 51 17 4 7 32 18 0 18 38 1 11 5 4 10 2 0.353 0.395 - SLG OPS -1 0.485 0.839 -2 0.387 0.713 -3 0.508 0.872 -4 0.448 0.840 -5 0.540 0.996 -6 0.558 0.953 +``` r +library(dplyr) +data <- daily_batter_bref("2015-08-01", "2015-10-03") %>% + head() ``` In terms of 
metric calculation, the package allows the user to calculate the consistency of team scoring and run prevention for any year using `team_consistency()`: -```R -> team_consistency(2015) -Source: local data frame [30 x 5] - - Team Con_R Con_RA Con_R_Ptile Con_RA_Ptile - (chr) (dbl) (dbl) (dbl) (dbl) -1 ARI 0.37 0.36 22 15 -2 ATL 0.41 0.40 87 67 -3 BAL 0.40 0.38 70 42 -4 BOS 0.39 0.40 52 67 -5 CHC 0.38 0.41 33 88 -6 CHW 0.39 0.40 52 67 -7 CIN 0.41 0.36 87 15 -8 CLE 0.41 0.40 87 67 -9 COL 0.35 0.34 7 3 -10 DET 0.39 0.38 52 42 -.. ... ... ... ... ... +``` r +team_consistency(2015) +#> # A tibble: 30 x 5 +#> Team Con_R Con_RA Con_R_Ptile Con_RA_Ptile +#> +#> 1 ARI 0.37 0.39 43 80 +#> 2 ATL 0.38 0.36 65 25 +#> 3 BAL 0.4 0.36 88 25 +#> 4 BOS 0.37 0.39 43 80 +#> 5 CHC 0.41 0.37 97 50 +#> 6 CHW 0.38 0.36 65 25 +#> 7 CIN 0.36 0.38 22 63 +#> 8 CLE 0.38 0.42 65 100 +#> 9 COL 0.38 0.38 65 63 +#> 10 DET 0.39 0.35 82 3 +#> # ... with 20 more rows ``` You can also calculate wOBA per plate appearance and wOBA on contact for any set of data over any date range, provided you have the data available. 
Simply pass the proper data frame to `woba_plus`: -```R -> x <- woba_plus(df) -> head(x)[,c(1,2,24,26,27)] - Name Team season wOBA wOBA_CON -1 Bryce Harper Nationals 2015 0.464 0.554 -2 Joey Votto Reds 2015 0.428 0.485 -3 Paul Goldschmidt Diamondbacks 2015 0.422 0.517 -4 Mike Trout Angels 2015 0.418 0.519 -5 Miguel Cabrera Tigers 2015 0.415 0.462 -6 Josh Donaldson Blue Jays 2015 0.404 0.467 +``` r +data %>% + filter(PA > 200) %>% + woba_plus %>% + arrange(desc(wOBA)) %>% + select(Name, Team, season, PA, wOBA, wOBA_CON) %>% + head() +#> Name Team season PA wOBA wOBA_CON +#> 1 Shin-Soo Choo Texas 2015 260 0.430 0.495 +#> 2 Francisco Lindor Cleveland 2015 259 0.404 0.468 +#> 3 Jose Altuve Houston 2015 262 0.372 0.406 +#> 4 Adam Eaton Chicago 2015 262 0.367 0.436 +#> 5 Manny Machado Baltimore 2015 266 0.362 0.396 +#> 6 Matt Duffy San Francisco 2015 264 0.312 0.338 ``` + You can also generate these wOBA-based stats, as well as FIP, for pitchers using the `fip_plus()` function: -```r -> daily_pitcher_bref("2015-04-05", "2015-04-30") %>% fip_plus() %>% select(season, Name, IP, ERA, SO, uBB, HBP, HR, FIP, wOBA_against, wOBA_CON_against) %>% arrange(desc(IP)) %>% head(10) - season Name IP ERA SO uBB HBP HR FIP wOBA_against wOBA_CON_against -1 2015 Johnny Cueto 37.0 1.95 38 4 2 3 2.62 0.210 0.276 -2 2015 Dallas Keuchel 37.0 0.73 22 11 0 0 2.84 0.169 0.151 -3 2015 Sonny Gray 36.1 1.98 25 6 1 1 2.69 0.218 0.239 -4 2015 Mike Leake 35.2 3.03 25 7 0 5 4.16 0.240 0.281 -5 2015 Felix Hernandez 34.2 1.82 36 6 3 1 2.20 0.225 0.272 -6 2015 Corey Kluber 34.0 4.24 36 5 2 2 2.40 0.295 0.391 -7 2015 Jake Odorizzi 33.2 2.41 26 8 1 0 2.38 0.213 0.228 -8 2015 Josh Collmenter 32.2 2.76 16 3 0 1 2.82 0.290 0.330 -9 2015 Bartolo Colon 32.2 3.31 25 1 0 4 3.29 0.280 0.357 -10 2015 Zack Greinke 32.2 1.93 27 7 1 2 3.01 0.240 0.274 +``` r +daily_pitcher_bref("2015-04-05", "2015-04-30") %>% + fip_plus() %>% + select(season, Name, IP, ERA, SO, uBB, HBP, HR, FIP, wOBA_against, wOBA_CON_against) %>% + 
arrange(desc(IP)) %>% + head(10) +#> season Name IP ERA SO uBB HBP HR FIP wOBA_against +#> 1 2015 Johnny Cueto 37.0 1.95 38 4 2 3 2.62 0.210 +#> 2 2015 Dallas Keuchel 37.0 0.73 22 11 0 0 2.84 0.169 +#> 3 2015 Sonny Gray 36.1 1.98 25 6 1 1 2.69 0.218 +#> 4 2015 Mike Leake 35.2 3.03 25 7 0 5 4.16 0.240 +#> 5 2015 Felix Hernandez 34.2 1.82 36 6 3 1 2.20 0.225 +#> 6 2015 Corey Kluber 34.0 4.24 36 5 2 2 2.40 0.295 +#> 7 2015 Jake Odorizzi 33.2 2.41 26 8 1 0 2.38 0.213 +#> 8 2015 Josh Collmenter 32.2 2.76 16 3 0 1 2.82 0.290 +#> 9 2015 Bartolo Colon 32.2 3.31 25 1 0 4 3.29 0.280 +#> 10 2015 Zack Greinke 32.2 1.93 27 7 1 2 3.01 0.240 +#> wOBA_CON_against +#> 1 0.276 +#> 2 0.151 +#> 3 0.239 +#> 4 0.281 +#> 5 0.272 +#> 6 0.391 +#> 7 0.228 +#> 8 0.330 +#> 9 0.357 +#> 10 0.274 ``` The `edge_scrape()` function allows the user to scrape PITCHf/x data from the GameDay application using Carson Sievert's [pitchRx](https://github.com/cpsievert/pitchRx) package and to calculate metrics associated with [Edge%](https://billpetti.shinyapps.io/edge_shiny/). The function returns a dataframe grouped by either pitchers or batters and the percentge of pitches in each of the various Edge zones. 
Example (pitchers): -```r -> edge_scrape("2015-04-06", "2015-04-07", "pitcher") %>% .[, c(1:3,7:12)] %>% head(10) - pitcher_name pitcher All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone - (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) -1 Bartolo Colon 112526 86 0.035 0.081 0.058 0.151 0.209 0.465 -2 LaTroy Hawkins 115629 12 0.000 0.333 0.000 0.000 0.083 0.583 -3 Joe Nathan 150274 4 0.000 0.000 0.000 0.000 0.000 1.000 -4 Buddy Carlyle 234194 9 0.000 0.222 0.000 0.000 0.333 0.444 -5 Jason Grilli 276351 14 0.000 0.000 0.214 0.000 0.286 0.500 -6 Kevin Gregg 276514 17 0.000 0.000 0.118 0.176 0.235 0.471 -7 Joaquin Benoit 276542 19 0.053 0.053 0.105 0.000 0.158 0.632 -8 Ryan Vogelsong 285064 99 0.010 0.051 0.141 0.061 0.182 0.556 -9 Jeremy Affeldt 346793 5 0.000 0.000 0.200 0.000 0.000 0.800 -10 Grant Balfour 346797 21 0.095 0.000 0.000 0.048 0.333 0.524 +``` r +edge_scrape("2015-04-06", "2015-04-07", "pitcher") %>% + select(-6:-4, -13) %>% + head(10) +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_tormlb_nyamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_minmlb_detmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_colmlb_milmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_bosmlb_phimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_balmlb_tbamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_nynmlb_wasmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_atlmlb_miamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_chamlb_kcamlb_1/inning/inning_all.xml +#> 
http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_anamlb_seamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_pitmlb_cinmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_sdnmlb_lanmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_clemlb_houmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_texmlb_oakmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_sfnmlb_arimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_atlmlb_miamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_balmlb_tbamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_colmlb_milmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_sfnmlb_arimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_texmlb_oakmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_anamlb_seamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_sdnmlb_lanmlb_1/inning/inning_all.xml +#> # A tibble: 10 x 9 +#> pitcher_name pitcher All_pitches Upper_Edge Lower_Edge Inside_Edge +#> +#> 1 Bartolo Colon 112526 86 0.035 0.081 0.058 +#> 2 LaTroy Hawkins 115629 12 0.083 0.333 0 +#> 3 Joe Nathan 150274 4 0 0 0 +#> 4 Buddy Carlyle 234194 9 0 0.222 0 +#> 5 Jason Grilli 276351 14 0 0 0.214 +#> 6 Kevin Gregg 276514 17 0 0.059 0.118 +#> 7 Joaquin Benoit 276542 19 0 0 0.158 +#> 8 Ryan Vogelsong 285064 99 
0.01 0.071 0.141 +#> 9 Jeremy Affeldt 346793 5 0 0 0.4 +#> 10 Grant Balfour 346797 21 0.095 0 0 +#> # ... with 3 more variables: Outside_Edge , Heart , +#> # Out_of_Zone ``` Example (batters): -```r -> edge_scrape("2015-04-06", "2015-04-07", "batter") %>% .[, c(1:3,7:12)] %>% head(10) - batter_name batter All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone - (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) -1 Bartolo Colon 112526 7 0.000 0.000 0.429 0.000 0.143 0.429 -2 Torii Hunter 116338 19 0.000 0.105 0.105 0.105 0.000 0.684 -3 David Ortiz 120074 18 0.056 0.000 0.111 0.056 0.222 0.556 -4 Alex Rodriguez 121347 17 0.000 0.000 0.353 0.000 0.118 0.529 -5 Aramis Ramirez 133380 23 0.000 0.087 0.261 0.000 0.261 0.391 -6 Adrian Beltre 134181 26 0.000 0.038 0.154 0.115 0.231 0.462 -7 Carlos Beltran 136860 22 0.136 0.045 0.136 0.000 0.136 0.545 -8 Michael Cuddyer 150212 14 0.000 0.214 0.214 0.000 0.214 0.357 -9 Jimmy Rollins 276519 41 0.024 0.122 0.049 0.049 0.220 0.537 -10 Ryan Vogelsong 285064 10 0.000 0.200 0.300 0.000 0.200 0.300 +``` r +edge_scrape("2015-04-06", "2015-04-07", "batter") %>% + select(-6:-4, -13) %>% + head(10) +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_tormlb_nyamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_minmlb_detmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_colmlb_milmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_bosmlb_phimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_balmlb_tbamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_nynmlb_wasmlb_1/inning/inning_all.xml +#> 
http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_atlmlb_miamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_chamlb_kcamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_anamlb_seamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_pitmlb_cinmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_sdnmlb_lanmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_clemlb_houmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_texmlb_oakmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_06/gid_2015_04_06_sfnmlb_arimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_atlmlb_miamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_balmlb_tbamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_colmlb_milmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_sfnmlb_arimlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_texmlb_oakmlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_anamlb_seamlb_1/inning/inning_all.xml +#> http://gd2.mlb.com/components/game/mlb/year_2015/month_04/day_07/gid_2015_04_07_sdnmlb_lanmlb_1/inning/inning_all.xml +#> # A tibble: 10 x 9 +#> batter_name batter All_pitches Upper_Edge Lower_Edge Inside_Edge +#> +#> 1 Bartolo Colon 112526 7 0 0 0.429 +#> 2 Torii Hunter 116338 19 0 0.158 
0.105 +#> 3 David Ortiz 120074 18 0 0 0.111 +#> 4 Alex Rodriguez 121347 17 0 0 0.353 +#> 5 Aramis Ramirez 133380 23 0 0.087 0.217 +#> 6 Adrian Beltre 134181 26 0 0.038 0.154 +#> 7 Carlos Beltran 136860 22 0.091 0 0.136 +#> 8 Michael Cuddyer 150212 14 0 0.143 0.143 +#> 9 Jimmy Rollins 276519 41 0.024 0.146 0.049 +#> 10 Ryan Vogelsong 285064 10 0 0.1 0.3 +#> # ... with 3 more variables: Outside_Edge <dbl>, Heart <dbl>, +#> # Out_of_Zone <dbl> ``` More functionality will be added soon. Please leave any suggestions or bugs in the [Issues section](https://github.com/BillPetti/baseballr/issues). diff --git a/_posts/2016-11-22-baseballr-0.3.1-ncaa-functions.md b/_posts/2016-11-22-baseballr-0.3.1-ncaa-functions.md deleted file mode 100644 index d15c3381..00000000 --- a/_posts/2016-11-22-baseballr-0.3.1-ncaa-functions.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -layout: post -title: baseballr 0.3.1 is out with new NCAA functions -tags: rstats, web-scraping, baseballr, NCAA ---- - -The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package includes a function for acquiring player statistics from the [NCAA's website](http://stats.ncaa.org) for baseball teams across the three major divisions (I, II, III).
- -The function, `ncaa_scrape`, requires the user to pass values for three parameters for the function to work: - -`school_id`: numerical code used by the NCAA for each school -`year`: a four-digit year -`type`: whether to pull data for batters or pitchers - -If you want to pull batting statistics for Vanderbilt for the 2013 season, you would use the following: - -```r -> baseballr::ncaa_scrape(736, 2013, "batting") %>% -+ select(year:OBPct) - year school conference division Jersey Player Yr Pos GP GS BA OBPct -1 2013 Vanderbilt Southeastern 1 18 Yastrzemski, Mike Sr OF 66 66 0.312 0.411 -2 2013 Vanderbilt Southeastern 1 20 Harrell, Connor Sr OF 66 66 0.312 0.418 -3 2013 Vanderbilt Southeastern 1 3 Conde, Vince So INF 66 65 0.307 0.380 -4 2013 Vanderbilt Southeastern 1 6 Kemp, Tony Jr OF 66 66 0.391 0.471 -5 2013 Vanderbilt Southeastern 1 55 Gregor, Conrad Jr OF 65 65 0.308 0.440 -6 2013 Vanderbilt Southeastern 1 9 Turner, Xavier Fr INF 59 51 0.324 0.387 -7 2013 Vanderbilt Southeastern 1 5 Navin, Spencer Jr C 57 56 0.302 0.430 -8 2013 Vanderbilt Southeastern 1 51 Lupo, Jack Sr OF 57 51 0.297 0.352 -9 2013 Vanderbilt Southeastern 1 8 Wiseman, Rhett Fr OF 54 11 0.289 0.360 -10 2013 Vanderbilt Southeastern 1 10 Norwood, John So OF 33 9 0.328 0.388 -11 2013 Vanderbilt Southeastern 1 43 Wiel, Zander So INF 33 15 0.305 0.406 -12 2013 Vanderbilt Southeastern 1 44 Harvey, Chris So C 29 13 0.250 0.328 -13 2013 Vanderbilt Southeastern 1 42 McKeithan, Joel Jr INF 25 12 0.220 0.267 -14 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 7 0.250 0.455 -15 2013 Vanderbilt Southeastern 1 17 Harris, Andrew Sr INF 21 0 0.125 0.222 -16 2013 Vanderbilt Southeastern 1 2 Campbell, Tyler Fr INF 12 2 0.312 0.389 -17 2013 Vanderbilt Southeastern 1 7 Swanson, Dansby Fr INF 11 4 0.188 0.435 -18 2013 Vanderbilt Southeastern 1 25 Luna, D.J. 
Jr INF 8 0 0.000 0.333 -19 2013 Vanderbilt Southeastern 1 23 Cooper, Will So OF 4 0 1.000 1.000 -20 2013 Vanderbilt Southeastern 1 - Totals - - - - 0.313 0.407 -21 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - - 0.220 0.320 -``` - -The same can be done for pitching, just by changing the `type` parameter: - -```r -> baseballr::ncaa_scrape(736, 2013, "pitching") %>% -+ select(year:ERA) - year school conference division Jersey Player Yr Pos GP App GS ERA -1 2013 Vanderbilt Southeastern 1 11 Beede, Tyler So P 37 17 17 2.32 -2 2013 Vanderbilt Southeastern 1 33 Miller, Brian So P 32 32 NA 1.58 -3 2013 Vanderbilt Southeastern 1 35 Ziomek, Kevin Jr P 32 17 17 2.12 -4 2013 Vanderbilt Southeastern 1 15 Fulmer, Carson Fr P 26 26 NA 2.39 -5 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 1 NA 0.00 -6 2013 Vanderbilt Southeastern 1 28 Miller, Jared So P 22 22 NA 2.31 -7 2013 Vanderbilt Southeastern 1 19 Rice, Steven Jr P 21 21 NA 2.57 -8 2013 Vanderbilt Southeastern 1 13 Buehler, Walker Fr P 16 16 9 3.14 -9 2013 Vanderbilt Southeastern 1 22 Pfeifer, Philip So P 15 15 12 3.68 -10 2013 Vanderbilt Southeastern 1 12 Ravenelle, Adam So P 11 11 NA 3.18 -11 2013 Vanderbilt Southeastern 1 40 Pecoraro, T.J. Jr P 10 10 7 5.97 -12 2013 Vanderbilt Southeastern 1 45 Ferguson, Tyler Fr P 8 8 4 4.21 -13 2013 Vanderbilt Southeastern 1 27 Kolinsky, Keenan Jr P 2 2 NA 0.00 -14 2013 Vanderbilt Southeastern 1 24 Wilson, Nevin So P 1 1 NA 0.00 -15 2013 Vanderbilt Southeastern 1 - Totals - - - NA NA 2.76 -16 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - NA NA 6.19 -``` - -Now, the function is dependent on the user knowing the `school_id` used by the NCAA website. Given that, I've included a `school_id_lu` function so that users can find the `school_id` they need. 
- -Just pass a string to the function and it will return possible matches based on the school's name: - -```r -> school_id_lu("Vand") -# A tibble: 4 × 6 - school conference school_id year division conference_id - -1 Vanderbilt Southeastern 736 2013 1 911 -2 Vanderbilt Southeastern 736 2014 1 911 -3 Vanderbilt Southeastern 736 2015 1 911 -4 Vanderbilt Southeastern 736 2016 1 911 -``` - diff --git a/baseballr_Updates/baseballr_Updates_10_8_2016.md b/baseballr_Updates/baseballr_Updates_10_8_2016.md deleted file mode 100644 index f0c198c2..00000000 --- a/baseballr_Updates/baseballr_Updates_10_8_2016.md +++ /dev/null @@ -1,36 +0,0 @@ -## Update Notes for baseballr package: Version 0.2.1 -### October 8, 2016 - -Updates to functions in this release: - -`scrape_statcast_savant_batter`
-`scrape_statcast_savant_pitcher`
- -New functions in this release: - -`code_barrel`
- -The research team at Major League Baseball Advanced Media have developed a way to categorize batted balls that on average have a batting average over .500 and slugging over 1.500. The specific coding criteria can be found in comment #2 [here](http://tangotiger.com/index.php/site/comments/statcast-lab-barrels#2). - -Now, whenever a user scrapes Statcast data using either the `scrape_statcast_savant_batter` or `scrape_statcast_savant_pitcher` functions, the results will include a column `barrel`, where if the batted ball matches the barrel criteria it will code as 1, otherwise 0. - -Example: - -```r -> scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) %>% -+ filter(type == "X") %>% -+ filter(!is.na(barrel)) %>% -+ select(player_name, game_date, hit_angle, hit_speed, barrel) %>% -+ tail() -[1] "Be patient, this may take a few seconds..." -[1] "Data courtesy of Baseball Savant and MLBAM (baseballsavant.mlb.com)" - player_name game_date hit_angle hit_speed barrel -25 Carlos Correa 2016-04-07 31.10 103.33 1 -26 Carlos Correa 2016-04-07 27.77 87.25 0 -27 Carlos Correa 2016-04-06 29.62 103.97 1 -28 Carlos Correa 2016-04-06 0.11 105.20 0 -29 Carlos Correa 2016-04-06 23.76 113.55 1 -30 Carlos Correa 2016-04-06 -2.18 113.39 0 -``` -If you already have Statcast data--say, in a database that you've been collecting--I've also included a simple function that will take a dataframe and code whether each row contains a barrel or not. All you need to do is pass your dataframe to `code_barrel`.
- diff --git a/baseballr_Updates/baseballr_Updates_3_1_2016.md b/baseballr_Updates/baseballr_Updates_3_1_2016.md deleted file mode 100644 index db79f001..00000000 --- a/baseballr_Updates/baseballr_Updates_3_1_2016.md +++ /dev/null @@ -1,61 +0,0 @@ -## Update Notes for baseballr package: Version 0.1.0 -### March 1, 2016 - -Functions added to this release: - -`fip_plus()`: This function mimics the functionality in the `woba_plus()` function, except that the unit of analysis is pitchers. The function will generate Fielding Indepedent Pitching (FIP) for each pitcher in the data set that is passed to the function, along with wOBA against and wOBA against on contact. - -Example: - -```r -> daily_pitcher_bref("2015-04-05", "2015-04-30") %>% fip_plus() %>% select(season, Name, IP, ERA, SO, uBB, HBP, HR, FIP, wOBA_against, wOBA_CON_against) %>% arrange(desc(IP)) %>% head(10) - season Name IP ERA SO uBB HBP HR FIP wOBA_against wOBA_CON_against -1 2015 Johnny Cueto 37.0 1.95 38 4 2 3 2.62 0.210 0.276 -2 2015 Dallas Keuchel 37.0 0.73 22 11 0 0 2.84 0.169 0.151 -3 2015 Sonny Gray 36.1 1.98 25 6 1 1 2.69 0.218 0.239 -4 2015 Mike Leake 35.2 3.03 25 7 0 5 4.16 0.240 0.281 -5 2015 Felix Hernandez 34.2 1.82 36 6 3 1 2.20 0.225 0.272 -6 2015 Corey Kluber 34.0 4.24 36 5 2 2 2.40 0.295 0.391 -7 2015 Jake Odorizzi 33.2 2.41 26 8 1 0 2.38 0.213 0.228 -8 2015 Josh Collmenter 32.2 2.76 16 3 0 1 2.82 0.290 0.330 -9 2015 Bartolo Colon 32.2 3.31 25 1 0 4 3.29 0.280 0.357 -10 2015 Zack Greinke 32.2 1.93 27 7 1 2 3.01 0.240 0.274 -``` - -`edge_scrape()`: This function allows the user to scrape PITCHf/x data from the GameDay application using Carson Sievert's [pitchRx](https://github.com/cpsievert/pitchRx) package and to calculate metrics associated with [Edge%](https://billpetti.shinyapps.io/edge_shiny/). The function returns a data.frame grouped by either pitchers or batters and the percentge of pitches in each of the various Edge zones. 
- -Example (pitchers): - -```r -> edge_scrape("2015-04-06", "2015-04-07", "pitcher") %>% .[, c(1:3,7:12)] %>% head(10) - pitcher_name pitcher All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone - (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) -1 Bartolo Colon 112526 86 0.035 0.081 0.058 0.151 0.209 0.465 -2 LaTroy Hawkins 115629 12 0.000 0.333 0.000 0.000 0.083 0.583 -3 Joe Nathan 150274 4 0.000 0.000 0.000 0.000 0.000 1.000 -4 Buddy Carlyle 234194 9 0.000 0.222 0.000 0.000 0.333 0.444 -5 Jason Grilli 276351 14 0.000 0.000 0.214 0.000 0.286 0.500 -6 Kevin Gregg 276514 17 0.000 0.000 0.118 0.176 0.235 0.471 -7 Joaquin Benoit 276542 19 0.053 0.053 0.105 0.000 0.158 0.632 -8 Ryan Vogelsong 285064 99 0.010 0.051 0.141 0.061 0.182 0.556 -9 Jeremy Affeldt 346793 5 0.000 0.000 0.200 0.000 0.000 0.800 -10 Grant Balfour 346797 21 0.095 0.000 0.000 0.048 0.333 0.524 -``` - -Example (batters): - -```r -> edge_scrape("2015-04-06", "2015-04-07", "batter") %>% .[, c(1:3,7:12)] %>% head(10) - batter_name batter All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone - (chr) (dbl) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) -1 Bartolo Colon 112526 7 0.000 0.000 0.429 0.000 0.143 0.429 -2 Torii Hunter 116338 19 0.000 0.105 0.105 0.105 0.000 0.684 -3 David Ortiz 120074 18 0.056 0.000 0.111 0.056 0.222 0.556 -4 Alex Rodriguez 121347 17 0.000 0.000 0.353 0.000 0.118 0.529 -5 Aramis Ramirez 133380 23 0.000 0.087 0.261 0.000 0.261 0.391 -6 Adrian Beltre 134181 26 0.000 0.038 0.154 0.115 0.231 0.462 -7 Carlos Beltran 136860 22 0.136 0.045 0.136 0.000 0.136 0.545 -8 Michael Cuddyer 150212 14 0.000 0.214 0.214 0.000 0.214 0.357 -9 Jimmy Rollins 276519 41 0.024 0.122 0.049 0.049 0.220 0.537 -10 Ryan Vogelsong 285064 10 0.000 0.200 0.300 0.000 0.200 0.300 -``` diff --git a/baseballr_Updates/baseballr_Updates_3_25_2016.md b/baseballr_Updates/baseballr_Updates_3_25_2016.md deleted file mode 100644 index 1e3a00c8..00000000 --- 
a/baseballr_Updates/baseballr_Updates_3_25_2016.md +++ /dev/null @@ -1,49 +0,0 @@ -## Update Notes for baseballr package: Version 0.1.3 -### March 25, 2016 - -Functions added to this release: - -`edge_code`: This function allows a user to pass their own dataframe and have individual pitches coded according to the scheme provided by Edge%. The dataframe must contain at least four columns of data: `b_height`, `stand`, `px`, and `pz`. - -Example (based on data from "2015-04-05"): - -```r -> edge_code(df) %>% .[, c(6:7, 27:28, 82)] %>% head(10) - stand b_height px pz location -1 L 6-3 0.416 2.963 Inside Edge -2 L 6-3 -0.191 2.347 Heart -3 L 6-3 -0.518 3.284 Upper Edge -4 L 6-3 -0.641 1.221 Out of Zone -5 L 6-3 -1.821 2.083 Out of Zone -6 L 6-3 0.627 2.397 Inside Edge -7 L 6-5 -1.088 1.610 Out of Zone -8 L 6-5 -0.257 2.047 Lower Edge -9 L 6-5 NA NA <NA> -10 L 6-3 -1.539 1.525 Out of Zone -``` - -Functions updated for this release: - -`standings_on_date_bref`: [JonathanBob](https://github.com/JonathanBob) updated this function to allow for records to be returned for the given date or from that date forward. Also, users can input a full date string instead of three separate arguments for the day, month, and year. Users can also choose to pull records for the AL and NL overall, not just for a given division.
- -Example: - -```r -> standings_on_date_bref("2015-08-01", "NL East", from = FALSE) -$`NL East` - Tm W L W-L% GB RS RA pythW-L% -1 WSN 54 48 0.529 -- 422 391 0.535 -2 NYM 54 50 0.519 1.0 368 373 0.494 -3 ATL 46 58 0.442 9.0 379 449 0.423 -4 MIA 42 62 0.404 13.0 370 408 0.455 -5 PHI 41 64 0.390 14.5 386 511 0.374 - -> standings_on_date_bref("2015-08-01", "NL East", from = TRUE) -$`NL East` - Tm W L W-L% GB RS RA pythW-L% -1 NYM 36 22 0.621 -- 315 240 0.622 -2 MIA 29 29 0.500 7.0 243 270 0.452 -3 WSN 29 31 0.483 8.0 281 244 0.564 -4 PHI 22 35 0.386 13.5 240 298 0.402 -5 ATL 21 37 0.362 15.0 194 311 0.297 -``` diff --git a/baseballr_Updates/baseballr_Updates_3_7_2016.md b/baseballr_Updates/baseballr_Updates_3_7_2016.md deleted file mode 100644 index 2475e708..00000000 --- a/baseballr_Updates/baseballr_Updates_3_7_2016.md +++ /dev/null @@ -1,27 +0,0 @@ -## Update Notes for baseballr package: Version 0.1.2 -### March 7, 2016 - -Functions added to this release: - -`edge_scrape_split()`: This function builds off of `edge_scrape()` and adds the ability to view the data split by batter and pitcher handedness. As with `edge_scrape()`, the function returns a dataframe grouped by either pitchers or batters and the percentage of pitches in each of the various Edge zones, but adds in handedness.
- -Example (Edge% splits by batters with handedness): - -```r -> edge_scrape_split("2015-04-05", "2015-04-05", "batter") %>% .[,c(1:5,9:14)] - - batter_name batter p_throws stand All_pitches Upper_Edge Lower_Edge Inside_Edge Outside_Edge Heart Out_of_Zone - (chr) (dbl) (chr) (chr) (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) -1 Matt Holliday 407812 L R 11 0.000 0.182 0.000 0.182 0.182 0.455 -2 Matt Holliday 407812 R R 10 0.000 0.000 0.000 0.200 0.300 0.500 -3 David Ross 424325 R R 8 0.000 0.000 0.000 0.125 0.625 0.250 -4 Jhonny Peralta 425509 L R 9 0.000 0.111 0.444 0.000 0.111 0.333 -5 Jhonny Peralta 425509 R R 6 0.167 0.000 0.000 0.167 0.167 0.500 -6 Adam Wainwright 425794 L R 8 0.000 0.125 0.000 0.000 0.125 0.750 -7 Adam Wainwright 425794 R R 3 0.000 0.000 0.000 0.333 0.667 0.000 -8 Yadier Molina 425877 L R 13 0.077 0.077 0.000 0.000 0.077 0.769 -9 Yadier Molina 425877 R R 7 0.143 0.000 0.143 0.143 0.143 0.429 -10 Jonathan Jay 445055 L L 9 0.000 0.000 0.222 0.000 0.556 0.222 -.. ... ... ... ... ... ... ... ... ... ... ... -``` - diff --git a/baseballr_Updates/baseballr_Updates_5_24_2016.md b/baseballr_Updates/baseballr_Updates_5_24_2016.md deleted file mode 100644 index ac9f8d78..00000000 --- a/baseballr_Updates/baseballr_Updates_5_24_2016.md +++ /dev/null @@ -1,47 +0,0 @@ -## Update Notes for baseballr package: Version 0.1.4 -### May 24, 2016 - -Functions added to this release: - -`pitcher_boxscore`: This function allows a user to retrieve a boxscore of pitcher statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away pitchers. 
- -Example: - -```r -> pitcher_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") %>% select(id:so) -Source: local data frame [9 x 10] - - id name name_display_first_last pos out bf er r h so - (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) -1 605200 Davies Zach Davies P 16 22 4 4 5 5 -2 430641 Boyer Blaine Boyer P 2 4 0 0 2 0 -3 448614 Torres, C Carlos Torres P 3 4 0 0 0 1 -4 592804 Thornburg Tyler Thornburg P 3 3 0 0 0 1 -5 518468 Blazek Michael Blazek P 1 5 1 1 2 0 -6 594798 deGrom Jacob deGrom P 15 23 4 4 5 7 -7 570663 Robles Hansel Robles P 6 7 0 0 0 3 -8 592665 Reed, A Addison Reed P 3 5 0 0 1 2 -9 544727 Familia Jeurys Familia P 3 4 0 0 1 1 -``` - -`batter_boxscore`: This function does the same thing as `pitcher_boxscore`, but for batters. - -Example: - -```r -> batter_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") %>% select(id:bb) -Source: local data frame [29 x 10] - - id name name_display_first_last pos bo ab po r a bb - (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) -1 542340 Villar Jonathan Villar SS 100 5 1 0 0 0 -2 571697 Gennett Scooter Gennett 2B 200 4 2 0 3 1 -3 518960 Lucroy Jonathan Lucroy C 300 5 8 0 1 0 -4 474892 Carter Chris Carter 1B 400 4 10 0 2 0 -5 543590 Nieuwenhuis Kirk Nieuwenhuis CF 500 4 0 0 0 0 -6 431094 Hill, A Aaron Hill 3B 600 1 1 2 4 3 -7 502100 Presley Alex Presley LF 700 3 0 1 0 1 -8 570717 Flores, R Ramon Flores RF 800 3 2 1 0 0 -9 605200 Davies Zach Davies P 900 2 1 0 1 0 -10 430641 Boyer Blaine Boyer P 901 0 0 0 0 0 -``` \ No newline at end of file diff --git a/baseballr_Updates/baseballr_Updates_8_25_2016.md b/baseballr_Updates/baseballr_Updates_8_25_2016.md deleted file mode 100644 index 45a6094c..00000000 --- a/baseballr_Updates/baseballr_Updates_8_25_2016.md +++ /dev/null @@ -1,43 +0,0 @@ -## Update Notes for baseballr package: Version 0.2.0 -### August 
25, 2016 - -Functions added to this release: - -`scrape_statcast_savant_batter`
-`scrape_statcast_savant_pitcher`
-`playerid_lookup` - -The two savant functions allow a user to retrieve PITCHf/x and Statcast data for either a specific batter or pitcher from [Baseball Savant's Statcast Search](https://baseballsavant.mlb.com/statcast_search). The user needs to provide a start date, end date, and the batter or pitcher's MLBAMID. - -Example: - -```r -> scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) %>% - filter(type == "X") %>% - select(3,7,54:56) %>% - tail() -[1] "Be patient, this may take a few seconds..." -[1] "Data courtesy of Baseball Savant and MLBAM (baseballsavant.mlb.com)" - game_date player_name hit_distance_sc hit_speed hit_angle -26 2016-04-07 Carlos Correa 385 103.33 31.10 -27 2016-04-07 Carlos Correa 288 87.25 27.77 -28 2016-04-06 Carlos Correa 392 103.97 29.62 -29 2016-04-06 Carlos Correa 189 105.20 0.11 -30 2016-04-06 Carlos Correa 462 113.55 23.76 -31 2016-04-06 Carlos Correa 228 113.39 -2.18 -``` - -Since the savant functions require users to pass a valid MLBAMID, a lookup function is included that leverages the Chadwick public register. Users provide a text string and only those players with that string present in their last name will be returned. - -Here is an example where the user is looking for players with the last name "Seager": - -```r -> playerid_lookup("Seager") -[1] "Be patient, this may take a few seconds..."
-[1] "Data courtesy of the Chadwick Bureau Register (https://github.com/chadwickbureau/register)" - first_name last_name given_name name_suffix nick_name birth_year mlb_played_first mlbam_id retrosheet_id bbref_id fangraphs_id -1 Ben Seager Ben NA NA NA NA -2 Corey Seager Corey Drew 1994 2015 608369 seagc001 seageco01 13624 -3 Justin Seager Justin Ryan 1992 NA 643529 NA -4 Kyle Seager Kyle Duerr 1987 2011 572122 seagk001 seageky01 9785 -``` diff --git a/baseballr_Updates/baseballr_updates_05_08_2017.md b/baseballr_Updates/baseballr_updates_05_08_2017.md deleted file mode 100644 index 90d8d0ef..00000000 --- a/baseballr_Updates/baseballr_updates_05_08_2017.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -layout: page -title: Updated for baseballr 0.3.2 -tags: rstats, web-scraping, baseballr, statcast ---- - -The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` includes a number of enhancement to acquiring data from [Baseball Savant](http://baseballsavant.com) as well as minor grammatical clean up in the documentation. - -Previous functions `scrape_statcast_savant_batter` and `scrape_statcast_savant_pitcher` allowed for the acquistion of data from baseballsavant.com for a given player over a user-determined time frame. However, this is somewhat inefficient if you want to acquire data on all players over a given time frame. - -Two new functions have been added, `scrape_statcast_savant_batter_all` and `scrape_statcast_savant_pitcher_all`, that allow a user to acquire data for either all pitchers or all hitters over a given time frame. - -Both functions take only two arguments: - -`start_date`: the first date for which the user wants records returned -`end_date`: the final date for which the user wants records returned - -Remember, baseballsavant.com's csv download option allows for about 50,000 records in a single query. That works out to roughly 10-12 days of games. Longer time frames will take longer to download. 
- -Example: acquire data for all batters from 2017-04-03 through 2017-04-10 - -```r -> head(scrape_statcast_savant_batter_all('2017-04-03', '2017-04-10')) -[1] "These data are from BaseballSevant and are property of MLB Advanced Media, L.P. All rights reserved." -[1] "Grabbing data, this may take a minute..." -URL read and payload aquired successfully. - pitch_type game_date release_speed release_pos_x release_pos_z player_name -1 FF 2017-04-10 92.7 -1.0367 5.7934 Eric Fryer -2 FF 2017-04-10 93.2 -0.9753 5.6007 Eric Fryer -3 FF 2017-04-10 93.0 -1.1196 5.6958 Eric Fryer -4 FF 2017-04-10 93.1 -0.9952 5.7978 Eric Fryer -5 SL 2017-04-10 83.4 -1.2385 5.8164 Eric Fryer -6 FF 2017-04-10 93.7 -1.0307 5.8740 Aledmys Diaz - batter pitcher events description spin_dir spin_rate_deprecated -1 518700 518875 strikeout swinging_strike NA NA -2 518700 518875 ball NA NA -3 518700 518875 ball NA NA -4 518700 518875 swinging_strike NA NA -5 518700 518875 called_strike NA NA -6 649557 518875 field_out hit_into_play NA NA - break_angle_deprecated break_length_deprecated zone -1 NA NA 5 -2 NA NA 12 -3 NA NA 12 -4 NA NA 3 -5 NA NA 6 -6 NA NA 6 - des game_type stand -1 Eric Fryer strikes out swinging. R R -2 R R -3 R R -4 R R -5 R R -6 Aledmys Diaz flies out to right fielder Bryce Harper. 
R R - p_throws home_team away_team type hit_location bb_type balls strikes -1 R WSH STL S 2 2 -2 R WSH STL B 1 2 -3 R WSH STL B 0 2 -4 R WSH STL S 0 1 -5 R WSH STL S 0 0 -6 R WSH STL X 9 fly_ball 0 1 - game_year pfx_x pfx_z plate_x plate_z on_3b on_2b on_1b outs_when_up -1 2017 -0.4262 1.7261 -0.0042 2.9680 NA NA 594824 2 -2 2017 0.2420 1.3633 1.3747 3.5269 NA NA 594824 2 -3 2017 0.4912 1.6758 0.5389 4.3795 NA NA 594824 2 -4 2017 0.1924 1.7964 0.6868 3.5700 NA NA 594824 2 -5 2017 -0.1604 0.3532 0.6048 2.6308 NA NA 594824 2 -6 2017 0.5956 1.8068 0.4993 3.1386 NA NA 594824 1 - inning inning_topbot hc_x hc_y tfs_deprecated tfs_zulu_deprecated -1 9 Top NA NA -2 9 Top NA NA -3 9 Top NA NA -4 9 Top NA NA -5 9 Top NA NA -6 9 Top 186.56 105.27 NA NA - pos2_person_id umpire sv_id vx0 vy0 vz0 ax ay az sz_top sz_bot -1 446308 NA 170411_025210 NA NA NA NA NA NA 3.8420 1.5890 -2 446308 NA 170411_025153 NA NA NA NA NA NA 3.5602 1.7127 -3 446308 NA 170411_025133 NA NA NA NA NA NA 3.6761 1.6780 -4 446308 NA 170411_025117 NA NA NA NA NA NA 3.6760 1.5040 -5 446308 NA 170411_025104 NA NA NA NA NA NA 3.5139 1.6548 -6 446308 NA 170411_025018 NA NA NA NA NA NA 3.9500 1.6810 - hit_distance_sc launch_speed launch_angle effective_speed release_spin_rate -1 NA NA NA 93.033 2285 -2 NA NA NA 93.301 2323 -3 NA NA NA 92.892 2322 -4 NA NA NA 92.906 2324 -5 NA NA NA 83.371 NA -6 266 87.5 47.444 93.529 2406 - release_extension game_pk pos1_person_id pos2_person_id.1 pos3_person_id -1 6.248 490201 518875 446308 475582 -2 6.265 490201 518875 446308 475582 -3 6.281 490201 518875 446308 475582 -4 6.187 490201 518875 446308 475582 -5 6.155 490201 518875 446308 475582 -6 6.269 490201 518875 446308 475582 - pos4_person_id pos5_person_id pos6_person_id pos7_person_id pos8_person_id -1 502517 543685 452220 594809 572191 -2 502517 543685 452220 594809 572191 -3 502517 543685 452220 594809 572191 -4 502517 543685 452220 594809 572191 -5 502517 543685 452220 594809 572191 -6 502517 543685 452220 594809 572191 
- pos9_person_id release_pos_y estimated_ba_using_speedangle -1 547180 54.2491 0.000 -2 547180 54.2319 0.000 -3 547180 54.2163 0.000 -4 547180 54.3096 0.000 -5 547180 54.3420 0.000 -6 547180 54.2282 0.007 - estimated_woba_using_speedangle woba_value woba_denom babip_value iso_value -1 0.000 0.00 1 0 0 -2 0.000 -3 0.000 -4 0.000 -5 0.000 -6 0.008 0.00 1 0 0 - barrel -1 NA -2 NA -3 NA -4 NA -5 NA -6 0 -``` \ No newline at end of file diff --git a/baseballr_Updates/baseballr_updates_09_12_2017.md b/baseballr_Updates/baseballr_updates_09_12_2017.md deleted file mode 100644 index 2b470a1e..00000000 --- a/baseballr_Updates/baseballr_updates_09_12_2017.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -layout: page -title: Updated for baseballr 0.3.3 -tags: rstats, web-scraping, baseballr, statcast ---- - -The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` includes a number of enhancements and bug fixes. - -In terms of new functions, `statline_from_statcast` allows users to take raw pitch-by-pitch data from Statcast/PITCHf/x and calculate aggregated, statline-like output. Examples include count data such as number of singles, doubles, etc., as well as rate metrics like Slugging and wOBA on swings or contact. - -The function only has two arguments: - -* `df`: a dataframe that includes pitch-by-pitch information. The function assumes the following columns are present: `events`, `description`, `game_date`, and `type`. -* `base`: base indicates what the denomincator should be for the rate stats that are calculated. The function defaults to "swings", but you can also choose to use "contact" - -Here is an example using all data from the week of 2017-09-04. 
Here, we want to see a statline for all hitters based on swings: - -```r -test <- scrape_statcast_savant_batter_all("2017-09-04", "2017-09-10") - -statline_from_statcast(test) - -year swings batted_balls X1B X2B X3B HR swing_and_miss swinging_strike_percent ba -1 2017 13790 10663 1129 352 37 259 3127 0.227 0.129 - -obp slg ops woba -1 0.129 0.216 0.345 0.144 -``` - -You can also combine the `statline_from_statcast` function with a loop to create statlines for multiple players at once. - -Example: calculate statlines for batters on contact for all games played 2017-09-04 through 2017-09-10: - -```r -test <- scrape_statcast_savant_batter_all("2017-09-04", "2017-09-10") - -output <- data.frame() - -for (i in c("Jose Ramirez", "J.D. Martinez", "Francisco Lindor", "Gary Sanchez", "Rhys Hoskins")) { - reduced_test <- test %>% - filter(player_name == i) - x <- statline_from_statcast(reduced_test, base = "contact") - x$player <- i - x <- x %>% - select(player, everything()) - output <- rbind(output, x) %>% - arrange(desc(woba)) -} - -print(output, width = Inf) - -# A tibble: 5 x 12 - player year batted_balls X1B X2B X3B HR ba obp slg ops woba - -1 J.D. 
Martinez 2017 17 4 1 0 7 0.706 0.706 2.000 2.706 1.092 -2 Gary Sanchez 2017 11 3 1 0 2 0.545 0.545 1.182 1.727 0.710 -3 Francisco Lindor 2017 27 4 2 1 3 0.370 0.370 0.852 1.222 0.498 -4 Rhys Hoskins 2017 14 2 1 0 2 0.357 0.357 0.857 1.214 0.495 -5 Jose Ramirez 2017 16 0 0 0 3 0.188 0.188 0.750 0.938 0.370 -``` diff --git a/baseballr_Updates/baseballr_updates_11_22_2016.md b/baseballr_Updates/baseballr_updates_11_22_2016.md deleted file mode 100644 index 74f88578..00000000 --- a/baseballr_Updates/baseballr_updates_11_22_2016.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -layout: page -title: Updated for baseballr 0.3.1 -tags: rstats, web-scraping, baseballr, NCAA ---- - -The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) includes a function for acquiring player statistics from the [NCAA's website](http://stats.ncaa.org) for baseball teams across the three major divisions (I, II, III). - -The function, `ncaa_scrape`, requires the user to pass values for three parameters for the function to work: - -`school_id`: numerical code used by the NCAA for each school -`year`: a four-digit year -`type`: whether to pull data for batters or pitchers - -If you want to pull batting statistics for Vanderbilt for the 2013 season, you would use the following: - -```r -> baseballr::ncaa_scrape(736, 2013, "batting") %>% -+ select(year:OBPct) - year school conference division Jersey Player Yr Pos GP GS BA OBPct -1 2013 Vanderbilt Southeastern 1 18 Yastrzemski, Mike Sr OF 66 66 0.312 0.411 -2 2013 Vanderbilt Southeastern 1 20 Harrell, Connor Sr OF 66 66 0.312 0.418 -3 2013 Vanderbilt Southeastern 1 3 Conde, Vince So INF 66 65 0.307 0.380 -4 2013 Vanderbilt Southeastern 1 6 Kemp, Tony Jr OF 66 66 0.391 0.471 -5 2013 Vanderbilt Southeastern 1 55 Gregor, Conrad Jr OF 65 65 0.308 0.440 -6 2013 Vanderbilt Southeastern 1 9 Turner, Xavier Fr INF 59 51 0.324 0.387 -7 2013 Vanderbilt Southeastern 1 5 Navin, Spencer Jr C 57 56 0.302 0.430 -8 2013 Vanderbilt Southeastern 1 51 
Lupo, Jack Sr OF 57 51 0.297 0.352 -9 2013 Vanderbilt Southeastern 1 8 Wiseman, Rhett Fr OF 54 11 0.289 0.360 -10 2013 Vanderbilt Southeastern 1 10 Norwood, John So OF 33 9 0.328 0.388 -11 2013 Vanderbilt Southeastern 1 43 Wiel, Zander So INF 33 15 0.305 0.406 -12 2013 Vanderbilt Southeastern 1 44 Harvey, Chris So C 29 13 0.250 0.328 -13 2013 Vanderbilt Southeastern 1 42 McKeithan, Joel Jr INF 25 12 0.220 0.267 -14 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 7 0.250 0.455 -15 2013 Vanderbilt Southeastern 1 17 Harris, Andrew Sr INF 21 0 0.125 0.222 -16 2013 Vanderbilt Southeastern 1 2 Campbell, Tyler Fr INF 12 2 0.312 0.389 -17 2013 Vanderbilt Southeastern 1 7 Swanson, Dansby Fr INF 11 4 0.188 0.435 -18 2013 Vanderbilt Southeastern 1 25 Luna, D.J. Jr INF 8 0 0.000 0.333 -19 2013 Vanderbilt Southeastern 1 23 Cooper, Will So OF 4 0 1.000 1.000 -20 2013 Vanderbilt Southeastern 1 - Totals - - - - 0.313 0.407 -21 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - - 0.220 0.320 -``` - -The same can be done for pitching, just by changing the `type` parameter: - -```r -> baseballr::ncaa_scrape(736, 2013, "pitching") %>% -+ select(year:ERA) - year school conference division Jersey Player Yr Pos GP App GS ERA -1 2013 Vanderbilt Southeastern 1 11 Beede, Tyler So P 37 17 17 2.32 -2 2013 Vanderbilt Southeastern 1 33 Miller, Brian So P 32 32 NA 1.58 -3 2013 Vanderbilt Southeastern 1 35 Ziomek, Kevin Jr P 32 17 17 2.12 -4 2013 Vanderbilt Southeastern 1 15 Fulmer, Carson Fr P 26 26 NA 2.39 -5 2013 Vanderbilt Southeastern 1 39 Smith, Kyle Fr INF 23 1 NA 0.00 -6 2013 Vanderbilt Southeastern 1 28 Miller, Jared So P 22 22 NA 2.31 -7 2013 Vanderbilt Southeastern 1 19 Rice, Steven Jr P 21 21 NA 2.57 -8 2013 Vanderbilt Southeastern 1 13 Buehler, Walker Fr P 16 16 9 3.14 -9 2013 Vanderbilt Southeastern 1 22 Pfeifer, Philip So P 15 15 12 3.68 -10 2013 Vanderbilt Southeastern 1 12 Ravenelle, Adam So P 11 11 NA 3.18 -11 2013 Vanderbilt Southeastern 1 40 Pecoraro, T.J. 
Jr P 10 10 7 5.97 -12 2013 Vanderbilt Southeastern 1 45 Ferguson, Tyler Fr P 8 8 4 4.21 -13 2013 Vanderbilt Southeastern 1 27 Kolinsky, Keenan Jr P 2 2 NA 0.00 -14 2013 Vanderbilt Southeastern 1 24 Wilson, Nevin So P 1 1 NA 0.00 -15 2013 Vanderbilt Southeastern 1 - Totals - - - NA NA 2.76 -16 2013 Vanderbilt Southeastern 1 - Opponent Totals - - - NA NA 6.19 -``` - -Now, the function is dependent on the user knowing the `school_id` used by the NCAA website. Given that, I've included a `school_id_lu` function so that users can find the `school_id` they need. - -Just pass a string to the function and it will return possible matches based on the school's name: - -```r -> school_id_lu("Vand") -# A tibble: 4 × 6 - school conference school_id year division conference_id - -1 Vanderbilt Southeastern 736 2013 1 911 -2 Vanderbilt Southeastern 736 2014 1 911 -3 Vanderbilt Southeastern 736 2015 1 911 -4 Vanderbilt Southeastern 736 2016 1 911 -``` - diff --git a/baseballr_Updates/current-release-notes.md b/baseballr_Updates/current-release-notes.md deleted file mode 100644 index f82b2f6c..00000000 --- a/baseballr_Updates/current-release-notes.md +++ /dev/null @@ -1,169 +0,0 @@ ---- -layout: page -title: baseballr current release notes -tags: rstats, baseballr ---- -## May 29, 2018 - -The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) package for `R` (0.5) includes a number of enhancements and bug fixes. - -## New Functions - -`run_expectancy_code()` - -This function formats Baseball Savant data so that users can generate the run expectancy for different base-out or count-base-out states. It will also append the data frame with new variables necessary for generating linear weights (see new function below). 
The only argument is a data frame downloaded from Baseball Savant - -Columns created and appended to Baseball Savant data: - -- `final_pitch_game`: whether a pitch was the final one thrown in a game -- `final_pitch_inning`: whether a pitch is the final one thrown in an inning -- `final_pitch_at_bat`: whether a pitch is the final one thrown in an at bat -- `runs_scored_on_pitch`: how many runs scored as a result of the pitch -- `bat_score_start_inning`: the score for the batting team at the beginning of the inning -- `bat_score_end_inning`: the score for the batting team at the end of the inning -- `bat_score_after`: the score for the batting team after the pitch is thrown -- `cum_runs_in_inning`: how many cumulative runs have been scored from the beginning of the inning through the pitch -- `runs_to_end_inning`: how many runs were scored as a result of the pitch through the end of the inning -- `base_out_state` or `count_base_out_state`: the specific combination of base-outs or count-base-outs when the pitch was thrown -- `avg_re`: the average run expectancy of that base-out or count-base-out state -- `next_avg_re`: the average run expectancy of the base-out or count-base-out state that results from the pitch -- `change_re`: the change in run expectancy as a result of the pitch -- `re24`: the total change in run expectancy through the end of the inning resulting from the pitch based on the change in base-out or count-base-out state plus the number of runs scored as a result of the pitch/at bat - -Example: - -```r -> x2016_statcast_re <- run_expectancy_code(x2016_statcast) - -> sample_n(x2016_statcast_re, 10) %>% - select(final_pitch_inning:re24) %>% - glimpse() - -Observations: 10 -Variables: 11 -$ final_pitch_inning 0, 0, 0, 0, 0, 0, 0, 0, 1, 0 -$ bat_score_start_inning 1, 0, 5, 0, 3, 2, 1, 0, 0, 0 -$ bat_score_end_inning 2, 0, 5, 1, 3, 2, 5, 0, 0, 2 -$ cum_runs_in_inning 1, 0, 0, 0, 0, 0, 2, 0, 0, 1 -$ runs_to_end_inning 0, 0, 0, 1, 0, 0, 2, 0, 0, 1 -$ 
base_out_state "2 outs, 1b _ _", "0 outs, _ _ _", "0 outs... -$ avg_re 0.2149885, 0.5057877, 0.5057877, 0.5057877, 0.5... -$ next_base_out_state "2 outs, 1b 2b _", "1 outs, _ _ _", "1 out... -$ next_avg_re 0.4063525, 0.2718802, 0.2718802, 0.8629357, 0.2... -$ change_re 0.1913640, -0.2339075, -0.2339075, 0.3571479, -... -$ re24 0.1913640, -0.2339075, -0.2339075, 0.3571479, -... -``` - -`run_expectancy_table()` - -This functions works with the `run_expectancy_code` function and does the work of generating the run expectancy tables that are automatically exported into the Global Environment - -Example: - -```r -> x2016_statcast_re %>% - run_expectancy_table() %>% - print(n=Inf) - -base_out_state avg_re - -1 0 outs, 1b 2b 3b 2.13 -2 0 outs, _ 2b 3b 1.95 -3 0 outs, 1b _ 3b 1.76 -4 1 outs, 1b 2b 3b 1.55 -5 0 outs, 1b 2b _ 1.42 -6 1 outs, _ 2b 3b 1.36 -7 0 outs, _ _ 3b 1.36 -8 1 outs, 1b _ 3b 1.18 -9 0 outs, _ 2b _ 1.14 -10 1 outs, _ _ 3b 0.951 -11 1 outs, 1b 2b _ 0.906 -12 0 outs, 1b _ _ 0.863 -13 2 outs, 1b 2b 3b 0.689 -14 1 outs, _ 2b _ 0.669 -15 2 outs, _ 2b 3b 0.525 -16 1 outs, 1b _ _ 0.520 -17 0 outs, _ _ _ 0.506 -18 2 outs, 1b _ 3b 0.456 -19 2 outs, 1b 2b _ 0.406 -20 2 outs, _ _ 3b 0.366 -21 2 outs, _ 2b _ 0.299 -22 1 outs, _ _ _ 0.272 -23 2 outs, 1b _ _ 0.215 -24 2 outs, _ _ _ 0.106 -``` - -`linear_weights_savant()` - -This function works in tandem with `run_expectancy_code()` to generate linear weights for offensive events after the Baseball Savant data has been properly formatted. Currently, the function will return linear weights above average and linear weights above outs. It does not apply any scaling to align with league wOBA. Users can do that themselves if they like, or it may be added to a future version of the function. 
- -Example: - -```r - -> x2016_statcast_re %>% - linear_weights_savant() %>% - print(n=Inf) - -A tibble: 7 x 3 -events linear_weights_above_average linear_weights_above_outs - -1 home_run 1.38 1.63 -2 triple 1.00 1.25 -3 double 0.730 0.980 -4 single 0.440 0.690 -5 hit_by_pitch 0.320 0.570 -6 walk 0.290 0.540 -7 outs -0.250 0. -``` - -I used Baseball Savant data from 2010-2015 and compared the linear weights generated by `baseballr` to those by Tom Tango using retrosheet data. `baseballr`'s weights are generally a little lower than what Tango generated, but that could be due to a number of things, such as the data source, code, etc., but the values appear reasonable enough to be reliable: - -| base_out_state | baseballr_2010_2015 | tango_2010_2015 | diff | %_diff | -|--------------------|---------------------|-----------------|-------|--------| -| 0 outs, 1b 2b 3b | 2.27 | 2.29 | -0.02 | -1% | -| 0 outs, _ 2b 3b | 1.96 | 1.96 | 0 | 0% | -| 0 outs, 1b _ 3b | 1.76 | 1.78 | -0.03 | -1% | -| 1 outs, 1b 2b 3b | 1.51 | 1.54 | -0.03 | -2% | -| 0 outs, 1b 2b _ | 1.42 | 1.44 | -0.02 | -1% | -| 0 outs, _ _ 3b | 1.38 | 1.38 | 0 | 0% | -| 1 outs, _ 2b 3b | 1.35 | 1.35 | 0 | 0% | -| 1 outs, 1b _ 3b | 1.1 | 1.13 | -0.03 | -2% | -| 0 outs, _ 2b _ | 1.09 | 1.1 | -0.01 | -1% | -| 1 outs, _ _ 3b | 0.93 | 0.95 | -0.02 | -2% | -| 1 outs, 1b 2b _ | 0.86 | 0.88 | -0.02 | -3% | -| 0 outs, 1b _ _ | 0.84 | 0.86 | -0.02 | -2% | -| 2 outs, 1b 2b 3b | 0.71 | 0.75 | -0.04 | -5% | -| 1 outs, _ 2b _ | 0.65 | 0.66 | -0.01 | -2% | -| 2 outs, _ 2b 3b | 0.54 | 0.58 | -0.04 | -7% | -| 1 outs, 1b _ _ | 0.5 | 0.51 | -0.01 | -2% | -| 0 outs, _ _ _ | 0.48 | 0.48 | 0 | -1% | -| 2 outs, 1b _ 3b | 0.45 | 0.48 | -0.03 | -7% | -| 2 outs, 1b 2b _ | 0.41 | 0.43 | -0.02 | -4% | -| 2 outs, _ _ 3b | 0.33 | 0.35 | -0.02 | -6% | -| 2 outs, _ 2b _ | 0.31 | 0.32 | -0.01 | -3% | -| 1 outs, _ _ _ | 0.25 | 0.25 | 0 | -1% | -| 2 outs, 1b _ _ | 0.21 | 0.22 | -0.01 | -6% | -| 2 outs, _ _ _ | 0.1 | 0.1 | 0 | -2% | - -We also 
had some great contributions by others that I've added into this release: - -`label_statcast_imputed_data()` - -[Ben Dilday](https://github.com/bdilday) again contributes with a cool experimental function meant to tag batted ball cases where significant imputation may have been used to generate some of the Statcast values by MLBAM, i.e. `launch_speed` and `launch_angle`. You can read more about Ben's function [here](https://github.com/BillPetti/baseballr/pull/71). - -`fg_park()` - -[Sam Boysel](https://github.com/sboysel) updated the park factors function so that it now includes the new columns added by FanGraphs (5-year, 3-year, 1-year park factors) and ensures the column names are correct - -## Updgrades - -`fg_bat_leaders()` - -- `playerid` now returned as part of the data returned. -- Dozens of additional variables are also returned, including aggregate data from Pitch Info as well as contact type. - -## Bug Fixes - -`process_statcast_payload()` -- hc_x, hc_y are now converted to numeric \ No newline at end of file diff --git a/man/batter_boxscore.Rd b/man/batter_boxscore.Rd index 79e6ba2e..8cbd8f86 100644 --- a/man/batter_boxscore.Rd +++ b/man/batter_boxscore.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/batter_boxscore.R +% Please edit documentation in R/batter_boxscore.R, R/pitcher_boxscore.R \name{batter_boxscore} \alias{batter_boxscore} +\alias{pitcher_boxscore} \title{Retrieve batter boxscore data for a single game played} \usage{ batter_boxscore(x) + +pitcher_boxscore(x) } \arguments{ \item{x}{A boxscore.xml url for a given game from the MLBAM GameDay app data.} @@ -13,7 +16,16 @@ batter_boxscore(x) This function allows a user to retrieve a boxscore of batter statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away batters. 
} \examples{ -batter_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +# batters +url_base <- "http://gd2.mlb.com/components/game/mlb/" +url <- paste0(url_base, + "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +batter_boxscore(url) +# pitchers +url_base <- "http://gd2.mlb.com/components/game/mlb/" +url <- paste0(url_base, + "year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") +pitcher_boxscore(url) } \keyword{Day,} \keyword{Game} diff --git a/man/label_statcast_imputed_data.Rd b/man/label_statcast_imputed_data.Rd index 0061ac9b..21828129 100644 --- a/man/label_statcast_imputed_data.Rd +++ b/man/label_statcast_imputed_data.Rd @@ -10,14 +10,15 @@ label_statcast_imputed_data(statcast_df, impute_file = NULL, \arguments{ \item{statcast_df}{A dataframe containing Statcast batted ball data} -\item{impute_file}{A csv file giving the launch angle, launch speed, bb_type, events fields to label -as imputed. if NULL then it's read from the `extdata` folder of the package.} +\item{impute_file}{A CSV file giving the launch angle, launch speed, +\code{bb_type}, events fields to label +as imputed. if NULL then it's read from the \code{extdata} folder of the package.} \item{inverse_precision}{inverse of how many digits to truncate the launch angle -and speed to for comparison. default is 10000, i.e. keep 4 digits of precision.} +and speed to for comparison. Default is \code{10000}, i.e. keep 4 digits of precision.} } \value{ -A copy of the input dataframe with a new column "imputed" appended. imputed +A copy of the input dataframe with a new column \code{imputed} appended. imputed is 1 if launch angle and launch speed are likely imputed, 0 otherwise. } \description{ @@ -26,10 +27,10 @@ label Statcast data for which the launch angle and speed have been imputed. 
} \examples{ -#' \dontrun{ -statcast_df = scrape_statcast_savant("2017-05-01", "2017-05-02") -statcast_df = label_statcast_imputed_data(statcast_df) -mean(statcast_df$imputed) +\dontrun{ +statcast_df <- scrape_statcast_savant("2017-05-01", "2017-05-02") +sc_df <- label_statcast_imputed_data(statcast_df) +mean(sc_df$imputed) } } \keyword{MLB,} diff --git a/man/linear_weights_savant.Rd b/man/linear_weights_savant.Rd index b77fb23b..bfe58229 100644 --- a/man/linear_weights_savant.Rd +++ b/man/linear_weights_savant.Rd @@ -7,7 +7,7 @@ linear_weights_savant(df) } \arguments{ -\item{df}{A data frame generated from Baseball Savant that has been run through the baseballr::run_expectancy_code() function.} +\item{df}{A data frame generated from Baseball Savant that has been run through the \code{\link{run_expectancy_code}} function.} } \description{ This function allows a user to generate linear weight values for events using Baseball Savant data. Output includes both linear weights above average and linear weights above outs for home runs, triples, doubles, singles, walks, hit by pitches, and outs. diff --git a/man/pitcher_boxscore.Rd b/man/pitcher_boxscore.Rd deleted file mode 100644 index 4fd31e22..00000000 --- a/man/pitcher_boxscore.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pitcher_boxscore.R -\name{pitcher_boxscore} -\alias{pitcher_boxscore} -\title{Retrieve pitcher boxscore data for a single game played} -\usage{ -pitcher_boxscore(x) -} -\arguments{ -\item{x}{A boxscore.xml url for a given game from the MLBAM GameDay app data.} -} -\description{ -This function allows a user to retrieve a boxscore of pitcher statistics for any game played in the PITCHf/x era (2008-current). The function takes a boxscore.xml url as it's only argument and returns boxscore data for both the home and away pitchers. 
-} -\examples{ -pitcher_boxscore("http://gd2.mlb.com/components/game/mlb/year_2016/month_05/day_21/gid_2016_05_21_milmlb_nynmlb_1/boxscore.xml") -} -\keyword{Day,} -\keyword{Game} -\keyword{MLB,} -\keyword{PITCHf/x,} -\keyword{boxscore,} -\keyword{sabermetrics} diff --git a/man/playerid_lookup.Rd b/man/playerid_lookup.Rd index 0778ef73..082c971f 100644 --- a/man/playerid_lookup.Rd +++ b/man/playerid_lookup.Rd @@ -15,7 +15,9 @@ playerid_lookup(last_name = NULL, first_name = NULL) This function allows you to query the Chadwick Bureau's public register of baseball players and the various IDs associated with them in different systems of record. } \examples{ -\dontrun{playerid_lookup("Garcia", "Karim")} +\dontrun{ +playerid_lookup("Garcia", "Karim") +} } \keyword{MLB,} \keyword{sabermetrics} diff --git a/man/process_statcast_payload.Rd b/man/process_statcast_payload.Rd index dba2eaa9..b997c461 100644 --- a/man/process_statcast_payload.Rd +++ b/man/process_statcast_payload.Rd @@ -7,10 +7,18 @@ process_statcast_payload(payload) } \arguments{ -\item{payload}{payload from a Baseball Savant request, e.g. from utils::read.csv} +\item{payload}{payload from a Baseball Savant request, e.g. +from \code{\link[readr]{read_csv}}} } \description{ -This is a helper function for all scrape_statcast_savant functions. The function processes the initial csv payload acquired from baseballsavant to ensure consistency in formattting across downloads +This is a helper function for all scrape_statcast_savant functions. 
+The function processes the initial csv payload acquired from +baseballsavant to ensure consistency in formatting across downloads +} +\examples{ +\dontrun{ +process_statcast_payload(payload) +} } \keyword{MLB,} \keyword{Statcast} diff --git a/man/scrape_statcast_savant.Rd b/man/scrape_statcast_savant.Rd index 488c9454..78c69748 100644 --- a/man/scrape_statcast_savant.Rd +++ b/man/scrape_statcast_savant.Rd @@ -2,30 +2,77 @@ % Please edit documentation in R/scrape_statcast.R \name{scrape_statcast_savant} \alias{scrape_statcast_savant} -\title{Query Statcast and PITCHf/x Data for data from baseballsavant.mlb.com} +\alias{scrape_statcast_savant.Date} +\alias{scrape_statcast_savant.default} +\alias{scrape_statcast_savant_batter} +\alias{scrape_statcast_savant_batter_all} +\alias{scrape_statcast_savant_pitcher} +\alias{scrape_statcast_savant_pitcher_all} +\title{Query Statcast and PITCHf/x Data for data from \url{http://baseballsavant.mlb.com}} \usage{ -scrape_statcast_savant(start_date, end_date, playerid = NULL, - player_type = NULL) +scrape_statcast_savant(start_date = Sys.Date() - 1, end_date = Sys.Date(), + playerid = NULL, player_type = "batter", ...) + +\method{scrape_statcast_savant}{Date}(start_date = Sys.Date() - 1, + end_date = Sys.Date(), playerid = NULL, player_type = "batter", ...) + +\method{scrape_statcast_savant}{default}(start_date = Sys.Date() - 1, + end_date = Sys.Date(), playerid = NULL, player_type = "batter", ...) + +scrape_statcast_savant_batter(start_date, end_date, batterid, ...) + +scrape_statcast_savant_batter_all(start_date, end_date, ...) + +scrape_statcast_savant_pitcher(start_date, end_date, pitcherid, ...) + +scrape_statcast_savant_pitcher_all(start_date, end_date, ...) } \arguments{ -\item{start_date}{Date of first game for which you want data. Format must be in YYYY-MM-DD format.} +\item{start_date}{Date of first game for which you want data. 
+Format must be in YYYY-MM-DD format.} + +\item{end_date}{Date of last game for which you want data. +Format must be in YYYY-MM-DD format.} + +\item{playerid}{The MLBAM ID for the player whose data you want to query.} + +\item{player_type}{The player type. Can be \code{batter} or \code{pitcher}. +Default is \code{batter}} -\item{end_date}{Date of last game for which you want data. Format must be in YYYY-MM-DD format.} +\item{...}{currently ignored} -\item{playerid}{The MLBAM ID for the player who's data you want to query.} +\item{batterid}{The MLBAM ID for the batter whose data you want to query.} -\item{player_type}{The player type. Can be 'batter' or 'pitcher'} +\item{pitcherid}{The MLBAM ID for the pitcher whose data you want to query.} } \description{ -This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. +This function allows you to query Statcast and PITCHf/x data as provided on \url{http://baseballsavant.mlb.com} and have that data returned as a \code{\link{data.frame}}. 
} \examples{ \dontrun{ -scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-15", playerid = 621043, player_type='batter') +correa <- scrape_statcast_savant(start_date = "2016-04-06", + end_date = "2016-04-15", playerid = 621043) -scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-15", playerid = 592789, player_type='pitcher') +noah <- scrape_statcast_savant(start_date = "2016-04-06", + end_date = "2016-04-15", playerid = 592789, player_type = 'pitcher') -scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-06") +daily <- scrape_statcast_savant(start_date = "2016-04-06", end_date = "2016-04-06") +} +\dontrun{ +correa <- scrape_statcast_savant_batter(start_date = "2016-04-06", + end_date = "2016-04-15", batterid = 621043) +} +\dontrun{ +daily <- scrape_statcast_savant_batter_all(start_date = "2016-04-06", + end_date = "2016-04-06") +} +\dontrun{ +noah <- scrape_statcast_savant_pitcher(start_date = "2016-04-06", + end_date = "2016-04-15", pitcherid = 592789) +} +\dontrun{ +daily <- scrape_statcast_savant_pitcher_all(start_date = "2016-04-06", + end_date = "2016-04-06") } } \keyword{MLB,} diff --git a/man/scrape_statcast_savant_batter.Rd b/man/scrape_statcast_savant_batter.Rd deleted file mode 100644 index 87de7121..00000000 --- a/man/scrape_statcast_savant_batter.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scrape_statcast_savant_batter.R -\name{scrape_statcast_savant_batter} -\alias{scrape_statcast_savant_batter} -\title{Query Statcast and PITCHf/x Data for Batters from baseballsavant.mlb.com} -\usage{ -scrape_statcast_savant_batter(start_date, end_date, batterid) -} -\arguments{ -\item{start_date}{Date of first game for which you want data. Format must be in YYYY-MM-DD format.} - -\item{end_date}{Date of last game for which you want data. 
Format must be in YYYY-MM-DD format.} - -\item{batterid}{The MLBAM ID for the batter who's data you want to query.} -} -\description{ -This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. -} -\examples{ -\dontrun{ -scrape_statcast_savant_batter(start_date = "2016-04-06", end_date = "2016-04-15", batterid = 621043) -} -} -\keyword{MLB,} -\keyword{Statcast} -\keyword{sabermetrics,} diff --git a/man/scrape_statcast_savant_batter_all.Rd b/man/scrape_statcast_savant_batter_all.Rd deleted file mode 100644 index a99350b3..00000000 --- a/man/scrape_statcast_savant_batter_all.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scrape_statcast_savant_batter_all.R -\name{scrape_statcast_savant_batter_all} -\alias{scrape_statcast_savant_batter_all} -\title{Query Statcast and PITCHf/x Data for All Batters from baseballsavant.mlb.com} -\usage{ -scrape_statcast_savant_batter_all(start_date, end_date) -} -\arguments{ -\item{start_date}{Date of first game for which you want data. Format must be in YYYY-MM-DD format.} - -\item{end_date}{Date of last game for which you want data. Format must be in YYYY-MM-DD format.} -} -\description{ -This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. Query returns data for all batters over a given time frame. 
-} -\examples{ -\dontrun{ -scrape_statcast_savant_batter_all(start_date = "2016-04-06", end_date = "2016-04-15") -} -} -\keyword{MLB,} -\keyword{Statcast} -\keyword{sabermetrics,} diff --git a/man/scrape_statcast_savant_pitcher.Rd b/man/scrape_statcast_savant_pitcher.Rd deleted file mode 100644 index 82f8f30a..00000000 --- a/man/scrape_statcast_savant_pitcher.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scrape_statcast_savant_pitcher.R -\name{scrape_statcast_savant_pitcher} -\alias{scrape_statcast_savant_pitcher} -\title{Query Statcast and PITCHf/x Data for Pitchers from baseballsavant.mlb.com} -\usage{ -scrape_statcast_savant_pitcher(start_date, end_date, pitcherid) -} -\arguments{ -\item{start_date}{Date of first game for which you want data. Format must be in YYYY-MM-DD format.} - -\item{end_date}{Date of last game for which you want data. Format must be in YYYY-MM-DD format.} - -\item{pitcherid}{The MLBAM ID for the pitcher who's data you want to query.} -} -\description{ -This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. 
-} -\examples{ -\dontrun{ -scrape_statcast_savant_pitcher(start_date = "2016-04-06", -end_date = "2016-04-15", pitcherid = 592789) -} -} -\keyword{MLB,} -\keyword{Statcast} -\keyword{sabermetrics,} diff --git a/man/scrape_statcast_savant_pitcher_all.Rd b/man/scrape_statcast_savant_pitcher_all.Rd deleted file mode 100644 index 4f3c280e..00000000 --- a/man/scrape_statcast_savant_pitcher_all.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/scrape_statcast_savant_pitcher_all.R -\name{scrape_statcast_savant_pitcher_all} -\alias{scrape_statcast_savant_pitcher_all} -\title{Query Statcast and PITCHf/x Data for Pitchers from baseballsavant.mlb.com} -\usage{ -scrape_statcast_savant_pitcher_all(start_date, end_date) -} -\arguments{ -\item{start_date}{Date of first game for which you want data. Format must be in YYYY-MM-DD format.} - -\item{end_date}{Date of last game for which you want data. Format must be in YYYY-MM-DD format.} -} -\description{ -This function allows you to query Statcast and PITCHf/x data as provided on baseballsavant.mlb.com and have that data returned as a dataframe. Query returns data for all pitchers over a given time frame. 
-} -\examples{ -\dontrun{ -scrape_statcast_savant_pitcher(start_date = "2016-04-06", -end_date = "2016-04-15", pitcherid = 592789) -} -} -\keyword{MLB,} -\keyword{Statcast} -\keyword{sabermetrics,} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 00000000..773a0671 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(baseballr) + +test_check("baseballr") diff --git a/tests/testthat/test-baseballr.R b/tests/testthat/test-baseballr.R new file mode 100644 index 00000000..85813813 --- /dev/null +++ b/tests/testthat/test-baseballr.R @@ -0,0 +1,31 @@ +context("baseballr") + +test_that("scraper works", { + correa <- scrape_statcast_savant(start_date = "2016-04-15", + end_date = "2016-04-15", + playerid = 621043) + expect_equal(nrow(correa), 18) + expect_equal(ncol(correa), 90) + + expect_identical(correa, scrape_statcast_savant_batter(start_date = "2016-04-15", + end_date = "2016-04-15", + batterid = 621043)) + + noah <- scrape_statcast_savant(start_date = "2016-04-06", + end_date = "2016-04-15", playerid = 592789, + player_type = 'pitcher') + expect_equal(nrow(noah), 99) + expect_equal(ncol(noah), 90) + + daily <- scrape_statcast_savant(start_date = "2016-04-06", + end_date = "2016-04-06") + + expect_equal(nrow(daily), 3846) + expect_equal(ncol(daily), 90) + + expect_error(scrape_statcast_savant(start_date = "1970-01-01"), "limited to the 2008") + expect_warning(scrape_statcast_savant(playerid = "nonsense"), "No valid data found") + + expect_equal(nrow(playerid_lookup("Garcia", "Karim")), 1) + expect_equal(nrow(playerid_lookup("Baumer", "Ben")), 0) +}) diff --git a/vignettes/ncaa_scraping.Rmd b/vignettes/ncaa_scraping.Rmd new file mode 100644 index 00000000..e6831e91 --- /dev/null +++ b/vignettes/ncaa_scraping.Rmd @@ -0,0 +1,52 @@ +--- +title: "NCAA Scraping" +author: "Bill Petti" +date: "2016-11-22" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{NCAA Scraping} + 
%\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The latest release of the [`baseballr`](https://billpetti.github.io/baseballr/) includes a function for acquiring player statistics from the [NCAA's website](http://stats.ncaa.org) for baseball teams across the three major divisions (I, II, III). + +The function, `ncaa_scrape`, requires the user to pass values for three parameters for the function to work: + +`school_id`: numerical code used by the NCAA for each school +`year`: a four-digit year +`type`: whether to pull data for batters or pitchers + +If you want to pull batting statistics for Vanderbilt for the 2013 season, you would use the following: + +```{r} +library(baseballr) +library(dplyr) +ncaa_scrape(736, 2013, "batting") %>% + select(year:OBPct) +``` + + +The same can be done for pitching, just by changing the `type` parameter: + +```{r} +ncaa_scrape(736, 2013, "pitching") %>% + select(year:ERA) +``` + + +Now, the function is dependent on the user knowing the `school_id` used by the NCAA website. Given that, I've included a `school_id_lu` function so that users can find the `school_id` they need. 
+ +Just pass a string to the function and it will return possible matches based on the school's name: + +```{r} +school_id_lu("Vand") +``` + diff --git a/EXAMPLES/scraping_plotting_statcast.R b/vignettes/plotting_statcast.Rmd similarity index 70% rename from EXAMPLES/scraping_plotting_statcast.R rename to vignettes/plotting_statcast.Rmd index 62c5ab48..fac639cc 100644 --- a/EXAMPLES/scraping_plotting_statcast.R +++ b/vignettes/plotting_statcast.Rmd @@ -1,52 +1,80 @@ -##### Bill Petti -##### 8-27-2016 -##### In this example, the baseballr package is used to acquire Statcast data for Mookie Betts from 2015-2016 -##### The data is then processed and plotted to show how his launch angle and batted ball speed have changed from year to year - -# load required packages - -require(devtools) -install_github("BillPetti/baseballr") -require(baseballr) -require(dplyr) -require(ggplot2) -require(reshape2) -require(zoo) +--- +title: "Plotting Statcast data" +author: "Bill Petti" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Plotting Statcast data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +In this example, the `baseballr` package is used to acquire Statcast data for Mookie Betts from 2015-2016. 
+ +The data is then processed and plotted to show how his launch angle and batted ball speed have changed from year to year + +```{r} +library(baseballr) +library(dplyr) +library(ggplot2) +library(reshape2) +library(zoo) +``` # load custom ggplot2 theme +```{r} source("https://raw.githubusercontent.com/BillPetti/R-Plotting-Resources/master/theme_bp_grey") +``` + # find Mookie Betts' MLBAMID +```{r} betts_id <- playerid_lookup("Betts") %>% filter(first_name == "Mookie") %>% select(mlbam_id, first_name, last_name) +``` # scrape Betts' Statcast data, by pitch removing those with a batted ball speed of 0 +```{r} betts <- scrape_statcast_savant_batter("2015-03-31", "2016-08-26", betts_id[1,1]) %>% mutate(Year = as.factor(substr(game_date,1,4))) %>% filter(type == "X") %>% filter(hit_speed != 0) +``` # calculate average launch angles and batted ball speeds by game +```{r} betts_grpd <- betts %>% group_by(game_date) %>% - summarise(`Average Launch Angle` = mean(hit_angle, na.rm = TRUE), `Average Batted Ball Speed` = mean(hit_speed, na.rm = TRUE)) %>% + summarise(`Average Launch Angle` = mean(hit_angle, na.rm = TRUE), + `Average Batted Ball Speed` = mean(hit_speed, na.rm = TRUE)) %>% ungroup() %>% melt(id=c("game_date")) %>% mutate(Year = as.factor(substr(game_date,1,4))) - +``` # calculate Betts' average launch angle and batted ball speed by year +```{r} betts_avg_speed_yr <- betts %>% group_by(Year) %>% summarise(speed = round(mean(hit_speed, na.rm = TRUE),1), angle = round(mean(hit_angle, na.rm = TRUE),1)) +``` # plot the data +```{r} betts_grpd %>% ggplot(aes(game_date, value)) + geom_point() + @@ -60,7 +88,10 @@ betts_grpd %>% theme_bp_grey() + theme(legend.position = "bottom", strip.text.x = element_text(face = "bold", size = 14), plot.subtitle = element_text(hjust=-.12)) + scale_color_manual(values = c("#5F9ED1", "#FF800E")) +``` # export plot to your working directory +```{r, eval=FALSE} ggsave("betts_angle_speed_year.png", scale = 1.2, width = 14, height = 8.5, 
units = "in") +``` \ No newline at end of file