In [1]:
#loading libraries
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
tennis_data <- read_csv("tennis_data.csv", show_col_types = FALSE)

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [3]:
#prepping data
set.seed(4673)

winner_data <- tennis_data |> select(winner_rank_points, winner_ht, w_df, w_bpSaved, w_bpFaced) |> 
    filter(w_bpSaved != 0) |> filter(w_bpFaced != 0) |> mutate(bpratio = w_bpSaved/w_bpFaced) |> na.omit() |>
    rename(rank_points = winner_rank_points, height = winner_ht, double_faults = w_df) |>
    select(-w_bpSaved, -w_bpFaced)
loser_data <- tennis_data |> select(loser_rank_points, loser_ht, l_df, l_bpSaved, l_bpFaced) |> 
    filter(l_bpSaved != 0) |> filter(l_bpFaced != 0) |> mutate(bpratio = l_bpSaved/l_bpFaced) |> na.omit() |>
    rename(rank_points = loser_rank_points, height = loser_ht, double_faults = l_df) |>
    select(-l_bpSaved, -l_bpFaced)

player_data <- rbind(winner_data, loser_data)
player_data

rank_points,height,double_faults,bpratio
<dbl>,<dbl>,<dbl>,<dbl>
3590,178,3,0.5
3590,178,2,1.0
200,188,2,0.8
⋮,⋮,⋮,⋮
667,178,2,0.4545455
2320,188,3,0.7000000
667,178,0,0.3636364


In [35]:
#adjusting data
mean_height <- summarize(player_data, mean_ht = mean(height))
player_data_a <- player_data |>
    mutate(bp_percent = bpratio*100) |>
    mutate(mean_height, height_difference = height - mean_ht) |>
    select(-mean_ht)
player_data_a

rank_points,height,double_faults,bpratio,bp_percent,height_difference
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
3590,178,3,0.5,50,-8.156029
3590,178,2,1.0,100,-8.156029
200,188,2,0.8,80,1.843971
⋮,⋮,⋮,⋮,⋮,⋮
667,178,2,0.4545455,45.45455,-8.156029
2320,188,3,0.7000000,70.00000,1.843971
667,178,0,0.3636364,36.36364,-8.156029


In [16]:
#splitting data
set.seed(7893)
player_split <- initial_split(player_data_a, prop = .75, strata = rank_points)  
player_train <- training(player_split)
player_test <- testing(player_split)

In [31]:
#model
lm_spec <- linear_reg() |>
    set_engine("lm") |>
    set_mode("regression")

#recipe
player_recipe <- recipe(rank_points ~ double_faults + height_difference + bp_percent, data = player_train)


workflow <- workflow() |>
    add_recipe(player_recipe) |>
    add_model(lm_spec) |>
    fit(player_train)
workflow

══ Workflow [trained] ══════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
      (Intercept)      double_faults  height_difference         bp_percent  
         1333.633            -42.542             25.229              8.158  


In [29]:
#RMSE
#stats: w/abs diff: 1782.172, wo/abs diff: 1779.149, df+bp: 1789.681, ht+bp: 1782.217, ht+df: 1787.156, 

lm_test_results <- workflow |>
    predict(player_train) |>
    bind_cols(player_train) |>
    metrics(truth = rank_points, estimate = .pred) |>
    filter(.metric == "rmse")
lm_test_results

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,1787.156
