In [None]:
# Install any package this notebook loads that is not already present.
# The list has to cover every library() call in the notebook (including
# scatterplot3d, which is loaded in a later cell); otherwise a fresh
# environment stops at the first library() whose package is missing.
list.of.packages <- c(
    "tidyverse", "data.table", "dtplyr", "lme4", "lmerTest", "pROC",
    "caTools", "matrixStats", "glmnet", "broom", "glue", "scatterplot3d"
)
new.packages <- setdiff(list.of.packages, rownames(installed.packages()))
if (length(new.packages) > 0) install.packages(new.packages)

library(tidyverse)
library(data.table)
library(dtplyr) # Use dplyr syntax, but with a datatable backend - faster.
library(lme4)
library(lmerTest)
library(pROC)
library(caTools)
library(matrixStats)
library(glmnet)
library(broom)
library(glue)

In [None]:
# Load every input table and give each one a PLAYER_NAME column to join on.

# Shot-level data: the raw log and the cleaned example file.
raw_shots <- read_csv("../Data/NBA_Shots_Raw.csv")
clean_shots <- read_csv("../Data/NBA_Shots_Clean_Example.csv")

# Player info: PLAYER_NAME is built as "<First_Name> <Surname>".
player_info <- mutate(
    read_csv("../Data/Player_Info.csv"),
    PLAYER_NAME = paste(First_Name, Surname)
)

# Salaries: the name column is called Name in the file.
player_salary <- rename(
    read_csv("../Data/Player_Salary.csv"),
    PLAYER_NAME = Name
)

# Second source of player details: PLAYER_NAME is "<first_name> <last_name>".
common_player_info <- mutate(
    read_csv("../Data/wyatt_basketball/csv/common_player_info.csv"),
    PLAYER_NAME = paste(first_name, last_name)
)

# 2014 heights, from https://github.com/simonwarchol/NBA-Height-Weight
height_2014 <- rename(
    read_csv("../Data/NBA-Height-Weight/CSVs/Yearly/2014.csv"),
    PLAYER_NAME = Name
)


# Manual name corrections. Each element name is the text to look for and its
# value is the replacement. The keys are written in the form clean_name()
# produces (title case, no periods), because replace_strings() is applied
# after clean_name() further down. replace_strings() passes the whole vector
# to str_replace_all(), so the keys act as (unanchored) patterns.
typos <- c(
    "Time Hardaway Jr" = "Tim Hardaway Jr",
    "Steve Adams" = "Steven Adams",
    "Jose Juan Barea" = "Jj Barea",
    "Glen Rice Jr" = "Glen Rice",
    "Charles Hayes" = "Chuck Hayes", # "Charles" is technically correct, but more sources use "Chuck"
    "Ishmael Smith" = "Ish Smith", # same reasoning: the short form is the common one
    "Patrick Mills" = "Patty Mills", # same reasoning again
    "Na Nene" = "Nene",
    "Jose Barea" = "Jj Barea"
)

# Apply a set of find -> replace corrections to every character column of `df`.
#
# df:           a data frame / tibble.
# replacements: named vector or list; the names are the patterns to search for
#               and the values are what they are replaced with (the flattened
#               vector is handed to str_replace_all()).
# Returns `df` with all of its character columns rewritten.
replace_strings <- function(df, replacements) {
    lookup <- unlist(replacements, use.names = TRUE)
    fix_column <- function(column) str_replace_all(column, lookup)

    mutate(df, across(where(is.character), fix_column))
}


# Standardise player names so the different sources can be joined on them.
# Works in almost every case; the leftovers are handled by the `typos` table.
#
# name: character vector of names.
# Returns the names with hyphens turned into spaces, apostrophes and periods
# removed, and the result converted to title case.
clean_name <- function(name) {
  without_hyphens <- str_replace_all(name, "-", " ")
  without_apostrophes <- str_replace_all(without_hyphens, "'", "")
  without_periods <- str_remove_all(without_apostrophes, "\\.")
  str_to_title(without_periods)
}

# Step 1: put the name columns of every table into the same canonical form.
player_info <- player_info %>%
    mutate(PLAYER_NAME = clean_name(PLAYER_NAME))

clean_shots <- clean_shots %>%
    mutate(PLAYER_NAME = clean_name(PLAYER_NAME),
          CLOSEST_DEFENDER = clean_name(CLOSEST_DEFENDER))

common_player_info <- common_player_info %>%
    mutate(PLAYER_NAME = clean_name(PLAYER_NAME)) %>%
    filter(person_id != 779) # filtering out glen rice sr. by hand

height_2014 <- height_2014 %>%
    mutate(PLAYER_NAME = clean_name(PLAYER_NAME))

# Step 2: apply the manual corrections to every character column of each table.
clean_shots <- replace_strings(clean_shots, typos)
player_info <- replace_strings(player_info, typos)
common_player_info <- replace_strings(common_player_info, typos)
height_2014 <- replace_strings(height_2014, typos)

# Step 3: Luc Mbah a Moute is an explicit case, applied to player_info only.
# A plain "Luc Mbah" pattern also matches inside the full name, so it cannot go
# into `typos` (that would append the suffix to names that are already
# complete). The negative lookahead only matches "Luc Mbah" when it is NOT
# already followed by " A Moute", which makes the fix idempotent: re-running
# this cell, or a value that already holds the full name, no longer produces
# "Luc Mbah A Moute A Moute".
player_info <- player_info %>%
    mutate(across(
        where(is.character),
        ~ str_replace_all(., "Luc Mbah(?! A Moute)", "Luc Mbah A Moute")
    ))

In [None]:
# Print the column names of each table to see which fields (and join keys)
# are available in every source.
colnames(clean_shots)
colnames(player_info)
colnames(player_salary)
colnames(height_2014)
colnames(common_player_info)

In [None]:
# Plot size for the notebook output (repr.* options are read by the Jupyter R kernel).
options(repr.plot.width = 10, repr.plot.height = 7)

# Attach each shooter's position to the raw shots and keep only shots whose
# shooter has a known position.
# player_info is reduced to distinct (player, position) pairs first: if a
# player is listed more than once with the same position, joining the full
# table would repeat every one of his shots and inflate the histogram.
# relationship = "many-to-many" matches the other player_info joins in this
# notebook and acknowledges that a name may still map to several positions.
s <- raw_shots %>%
    left_join(
        player_info %>% distinct(PLAYER_NAME, Pos),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    ) %>%
    filter(!is.na(Pos)) %>%
    select(SHOT_DIST, Pos)

# One histogram of shot distance per position.
ggplot(s, aes(x = SHOT_DIST)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.5) +
    facet_wrap(~ Pos) +
    labs(title = "Shot Distance Distribution by Position",
       x = "Shot Distance (feet)",
       y = "Frequency") +
    theme_minimal()

In [None]:
# Listings of top 2pt/3pt scorers. Can be deleted, but ideas important for later.

# l1: total made field goals per (player, position), highest first.
# player_info is reduced to distinct (player, position) pairs before joining so
# that a player listed several times with the same position does not have his
# shots, and therefore tot_FGM, counted more than once.
l1 <- raw_shots %>%
    left_join(
        player_info %>% distinct(PLAYER_NAME, Pos),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    ) %>%
    filter(!is.na(Pos)) %>%
    group_by(PLAYER_NAME, Pos) %>%
    summarise(tot_FGM = sum(FGM), .groups = "drop") %>%
    arrange(desc(tot_FGM))

# l2: per player, the number of made shots and the share of them that were
# 3-pointers / 2-pointers, with position, team and age attached.
# pct_3pt / pct_2pt are proportions in [0, 1] (same convention as the
# home_pct_win columns used later); previously they held raw counts despite
# their names.
l2 <- raw_shots %>%
    filter(FGM == 1) %>%
    group_by(PLAYER_NAME) %>%
    summarise(
        total_FGM = n(),
        pct_3pt = sum(PTS_TYPE == 3) / n(),
        pct_2pt = sum(PTS_TYPE == 2) / n(),
        .groups = "drop"
    ) %>%
    left_join(
        player_info %>% select(Pos, PLAYER_NAME, Team, Age),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    )

In [None]:
# Count the distinct player names in each of the three course datasets.

clean_shots %>% distinct(PLAYER_NAME) %>% summarise(unique_players = n())
player_info %>% distinct(PLAYER_NAME) %>% summarise(unique_players = n())
player_salary %>% distinct(PLAYER_NAME) %>% summarise(unique_players = n())

# Why is the first number significantly different from the others?

In [None]:
# If the points are close at the end of the third quarter, is the home team more likely to take the win in subsequent period(s)?
# Note this approach doesn't include every point as the data we have is incomplete (pointed out during lab session)

# One row per game, taken from the home side's shots, so WIN_LOSE and
# FINAL_MARGIN are from the home team's point of view.
home_games <- clean_shots %>%
    filter(LOCATION == "H") %>%
    select(GAME_ID, HOME_TEAM, AWAY_TEAM, WIN_LOSE, FINAL_MARGIN) %>%
    distinct(GAME_ID, .keep_all = TRUE)

# We test the home win percentage directly first
home_record <- home_games %>%
    summarise(total = n(), home_wins = sum(WIN_LOSE == "W"))

home_record

# One-sided binomial test of "home win probability > 0.5".
# The counts come from home_record instead of being typed in by hand, so the
# test always matches the data that is actually loaded.
binom.test(x = home_record$home_wins, n = home_record$total, p = 0.5, alternative = "greater")

# Home record in games that were within 3 points after three periods.
# Points are rebuilt from the shot log (PTS_TYPE * SUCCESS), summed per game
# and side, spread into pts_H / pts_A, and then matched with the home result.
home_advantage_p3 <- clean_shots %>%
    filter(PERIOD <= 3) %>%
    count(GAME_ID, LOCATION, wt = PTS_TYPE * SUCCESS, name = "total_pts") %>%
    pivot_wider(names_from = LOCATION, values_from = total_pts, names_prefix = "pts_") %>%
    left_join(rename(home_games, HOME_RESULT = WIN_LOSE), by = "GAME_ID") %>%
    filter(abs(pts_H - pts_A) <= 3) %>%
    summarise(
        home_pct_win = sum(HOME_RESULT == "W") / n(),
        home_wins = sum(HOME_RESULT == "W"),
        total_games = n()
    )

home_advantage_p3

# Is the home win rate in these close games above 50%?
binom.test(
    x = home_advantage_p3$home_wins,
    n = home_advantage_p3$total_games,
    p = 0.5,
    alternative = "greater"
)

# The effect above is not significant. Next attempt: use FINAL_MARGIN (the
# point difference at the end of the game) to pick out the close games.

home_advantage_fm <- summarise(
    filter(home_games, FINAL_MARGIN >= -3 & FINAL_MARGIN <= 3),
    home_pct_win = sum(WIN_LOSE == "W") / n(),
    home_wins = sum(WIN_LOSE == "W"),
    total_games = n()
)

home_advantage_fm

# Even worse! Perhaps being the home team is actually a disadvantage in close
# games? Test the opposite direction.

binom.test(
    x = home_advantage_fm$home_wins,
    n = home_advantage_fm$total_games,
    p = 0.5,
    alternative = "less"
)

# Unfortunately we don't have significance here either!

In [None]:
# Heights like 6-4 are very annoying, convert them to cm here!
#
# feet_inches: character vector of heights written as "feet-inches", e.g. "6-4".
#              NA entries are allowed.
# Returns an unnamed numeric vector of the same length with the heights in
# centimetres. An NA input, or a value missing its feet or inches part, gives NA.
#
# The function is vectorised by construction (strsplit + vapply), so it is no
# longer wrapped in Vectorize(): that wrapper called it once per element,
# attached the input strings as names, and returned list() for empty input.
convert_to_cm <- function(feet_inches) {
  parts <- strsplit(feet_inches, "-", fixed = TRUE)

  # vapply (instead of sapply) guarantees a numeric vector even for empty input
  feet <- vapply(parts, function(p) as.numeric(p[1]), numeric(1))
  inches <- vapply(parts, function(p) as.numeric(p[2]), numeric(1))

  # 1 ft = 30.48 cm, 1 in = 2.54 cm
  (feet * 30.48) + (inches * 2.54)
}

In [None]:
# Columns of clean_shots that are carried into the modelling dataset.
predictors <- c(
    "GAME_ID", "PLAYER_NAME", "CLOSEST_DEFENDER", "SHOT_DIST", "PTS_TYPE",
    "CLOSE_DEF_DIST", "SHOT_CLOCK", "TOUCH_TIME", "PERIOD", "DRIBBLES",
    "SUCCESS"
)

# Height information is attached one source at a time.
# The shot data is the left side of every join, so start from the set of
# names that appear in it either as a shooter or as the closest defender.

distinct_players_and_defenders <- union(
    distinct(clean_shots, PLAYER_NAME),
    distinct(clean_shots, PLAYER_NAME = CLOSEST_DEFENDER)
)

# H1 = height from player_info, H2 = height from common_player_info,
# H3 = height from the 2014 height file. Pos / position come along as well.
player_height_pos <- distinct_players_and_defenders %>%
    left_join(
        select(player_info, PLAYER_NAME, H1 = Height, Pos),
        by = "PLAYER_NAME"
    ) %>%
    left_join(
        select(common_player_info, PLAYER_NAME, H2 = height, position),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    ) %>%
    left_join(
        select(height_2014, PLAYER_NAME, H3 = "Height(Feet-Inches)"),
        by = "PLAYER_NAME"
    )
# Check the overlap between the height sources before choosing how to combine
# them: per source, how many players have a missing height?
# H_all_na_count is the important number: those players have no height in any
# of the three sources.

player_height_pos %>%
  group_by(PLAYER_NAME) %>%
  summarise(
    # per player: is the height missing in any of the rows for this source?
    across(c(H1, H2, H3), ~ any(is.na(.x)), .names = "{.col}_na"),
    # per player: is the height missing from all three sources in every row?
    H_all_na = all(is.na(H1) & is.na(H2) & is.na(H3))
  ) %>%
  # number of players flagged in each column
  summarise(across(-PLAYER_NAME, sum, .names = "{.col}_count"))


# Rule used to get a single HEIGHT: take the course height (H1) when it exists,
# otherwise the mean of the two external heights (H2, H3).
# rowMeans2(..., na.rm = TRUE) uses whichever of H2 / H3 is available when only
# one of them is present.

player_height_pos <- player_height_pos %>%
    mutate(
        # H2 and H3 arrive as "feet-inches" strings; convert them to cm
        H2 = ifelse(is.na(H2), NA, convert_to_cm(H2)),
        H3 = ifelse(is.na(H3), NA, convert_to_cm(H3)),
        HEIGHT = case_when(
            is.na(H1) ~ rowMeans2(cbind(H2, H3), na.rm = TRUE),
            .default = H1
        )
    )

# Finally join the "more complete" height data onto the shot data.
# The height table is joined twice, once on the shooter and once on the
# closest defender, renaming its columns for each role.
# After we have a complete model dataset here, we can split it up and only then we can convert to factors.
# Ignoring overtime for now.

# player_height_pos can contain several rows for one name (the joins that built
# it allow multiple matches per player). Joining it as-is would repeat every
# shot of such a player once per row and silently inflate the model data, so
# collapse it to exactly one height per name first. Rows for the same name are
# averaged; a name with no height at all ends up as NA.
player_heights <- player_height_pos %>%
    group_by(PLAYER_NAME) %>%
    summarise(HEIGHT = mean(HEIGHT, na.rm = TRUE), .groups = "drop") %>%
    mutate(HEIGHT = if_else(is.nan(HEIGHT), NA_real_, HEIGHT))

# relationship = "many-to-one" makes dplyr fail loudly if a name ever maps to
# more than one height row again, instead of duplicating shots.
model_data_clean <- clean_shots %>%
    select(all_of(predictors)) %>%
    left_join(
        player_heights %>% rename(SHOOTER_HEIGHT = HEIGHT),
        by = "PLAYER_NAME",
        relationship = "many-to-one"
    ) %>%
    left_join(
        player_heights %>%
            rename(CLOSEST_DEFENDER = PLAYER_NAME, DEFENDER_HEIGHT = HEIGHT),
        by = "CLOSEST_DEFENDER",
        relationship = "many-to-one"
    ) %>%
    mutate(SHOOTER_HEIGHT_ADV = SHOOTER_HEIGHT - DEFENDER_HEIGHT) %>%
    filter(PERIOD <= 4) # periods 5+ are overtime


# Height data is patchy: some players are in none of the three height sources.
# List them in two steps, shooters first and then defenders.

shooters_na <- model_data_clean %>%
    filter(is.na(SHOOTER_HEIGHT)) %>%
    transmute(name = PLAYER_NAME) %>%
    distinct()

defenders_na <- model_data_clean %>%
    filter(is.na(DEFENDER_HEIGHT)) %>%
    transmute(name = CLOSEST_DEFENDER) %>%
    distinct()

# Everyone without a height, whichever role they appeared in.
union(shooters_na, defenders_na) %>%
    rename(missing_players = name)

# Drop every shot that has a missing value in any column.
model_data_clean <- na.omit(model_data_clean)

# If we restrict ourselves to looking at the height data from this module we can identify a few players with problems. 
# For example, John Salmons played for NOP and was traded to PHX midseason.
# If he were in the dataset, what would his team be? Brandan Wright is another example BOS->PHX (trade)

In [None]:
# Scale everything so we can compare the weight sizes, also scaling is required for lasso regression.
# Make an all_shots dataset, which is the only one we do the test/train split for because we want to examine the AUC.

# Identifier and category columns become factors; the remaining columns stay numeric.
model_data <- model_data_clean %>%
    mutate(across(c("GAME_ID", "PLAYER_NAME", "CLOSEST_DEFENDER", "PERIOD", "SUCCESS", "PTS_TYPE"), as.factor))

# scale() returns a one-column matrix (with centring/scaling attributes).
# as.numeric() flattens it so every scaled column is an ordinary numeric
# vector rather than a matrix column inside the tibble.
# NOTE(review): the scaling statistics are computed on all rows, i.e. before
# the train/test split below, so the test rows influence them. Confirm this is
# acceptable for the reported AUC.
model_data_scaled <- model_data %>%
    mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))

set.seed(0)

# Base r approach for model test/train split. Avoids using caret
# seq_len() is safe for an empty table and floor() makes the 75% size an
# explicit whole number.
n_rows <- nrow(model_data)
train_indices <- sample(seq_len(n_rows), size = floor(0.75 * n_rows))
train_data <- model_data_scaled[train_indices, ]
test_data <- model_data_scaled[-train_indices, ]

# Guard against missing values introduced by scaling (e.g. a constant column).
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# A note on model specification. We're not putting PTS_TYPE here initially because of how R deals with factors.
# If you do add PTS_TYPE, it will show PTS_TYPE3 as being a significant effect, and say nothing about PTS_TYPE2.
# This is because 2 is chosen as the baseline for the model. Instead we deal with this afterwards

# Plain logistic regression on the training split.
log_model <- glm(SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + TOUCH_TIME + SHOOTER_HEIGHT_ADV + PERIOD + SHOT_CLOCK + DRIBBLES,
                 data=train_data, family=binomial(link="logit"))

# Mixed model tries to account for variance between games and players. Doesn't come to very much variance it turns out.
# Could be because of under-specification. No need to include mixed model in report.

#log_mixed_model <- glmer(SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + TOUCH_TIME + SHOOTER_HEIGHT_ADV + PERIOD + SHOT_CLOCK
#                         + (1 | GAME_ID) + (1 | PLAYER_NAME),
#                    data = train_data,
#                   family=binomial(link = "logit"),
#                        nAGQ = 0)

# Lasso (alpha = 1) logistic regression.
# The formula is defined once so the train and test design matrices cannot
# drift apart.
## the output will explode if you put GAME_ID or PLAYER_NAME in here
# NOTE(review): DRIBBLES is in log_model but not in this formula - confirm
# whether that is intentional before comparing the two sets of coefficients.
lasso_formula <- SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + TOUCH_TIME +
    SHOOTER_HEIGHT_ADV + PERIOD + SHOT_CLOCK

# [, -1] drops the intercept column; glmnet adds its own intercept.
x_train <- model.matrix(lasso_formula, data = train_data)[, -1]
y_train <- train_data$SUCCESS

# standardize = FALSE here as well: lambda has to be tuned under the same
# setting as the final fit below. With cv.glmnet's default (standardize = TRUE)
# the penalty scale differs from the final model's, so the selected lambda
# would not correspond to the model it is used for.
cv.out <- cv.glmnet(x_train, y_train, family = "binomial", alpha = 1,
                    type.measure = "auc", nfolds = 10, standardize = FALSE)
plot(cv.out)
# lambda.1se: largest lambda whose CV score is within one standard error of the best
lambda_opt <- cv.out$lambda.1se

x_test <- model.matrix(lasso_formula, data = test_data)[, -1]
y_test <- test_data$SUCCESS

log_lasso_model <- glmnet(x_train, y_train, family = "binomial", alpha=1, lambda=lambda_opt, standardize=FALSE)

In [None]:
# Investigating the coefficients and if they are significant.
# At this point there is no splitting the model for only 2 pointers/ only superstars etc. This comes after.
# summary() prints the coefficient table of the logistic model; $beta holds the
# coefficients of the lasso fit at the chosen lambda.
summary(log_model)
log_lasso_model$beta

In [None]:
# Dealing with just the regular log model, let's see what happens to the weights if we make a model for 2 and 3 pointers separately.
# Fit with the whole dataset this time, we only need a train/test split for lasso and computing ROC.
# Finally we fit models for superstar players, and I've excluded them from the normal data but maybe this is pedantic.

# This is arbitrary, so this is just a team of good players.
# The names must be spelled exactly as they appear in PLAYER_NAME after
# clean_name() (title case, no punctuation). Three entries were misspelled or
# used a nickname ("Steph Curry", "Russel Westbrook", "Dwayne Wade") and so
# could never match a row.
# NOTE(review): "Stephen Curry" assumes the data uses his full first name; the
# check below reports any entry that still matches nothing.
superstars <- c("Stephen Curry", "Anthony Davis", "Lebron James", "James Harden", "Russell Westbrook", "Kyrie Irving",
                "Demarcus Cousins", "Klay Thompson", "Dwyane Wade", "Damian Lillard")

# A name that matches no shooter would silently stay in the "normal" data, so
# make that visible.
unmatched_superstars <- setdiff(superstars, as.character(model_data_scaled$PLAYER_NAME))
if (length(unmatched_superstars) > 0) {
    warning(
        "Superstars not found in model data: ",
        paste(unmatched_superstars, collapse = ", "),
        call. = FALSE
    )
}

# Four disjoint subsets: {normal, superstar} x {2-point, 3-point}.
model_data_2pt <- model_data_scaled %>% filter(PTS_TYPE == 2 & !PLAYER_NAME %in% superstars)
model_data_3pt <- model_data_scaled %>% filter(PTS_TYPE == 3 & !PLAYER_NAME %in% superstars)
model_data_super <- model_data_scaled %>% filter(PLAYER_NAME %in% superstars)
model_data_super_2pt <- model_data_super %>% filter(PTS_TYPE == 2)
model_data_super_3pt <- model_data_super %>% filter(PTS_TYPE == 3)

# All four models use the same specification, so it is written once.
# PERIOD is not part of these per-shot-type models.
shot_formula <- SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + TOUCH_TIME +
    SHOOTER_HEIGHT_ADV + DRIBBLES + SHOT_CLOCK

log_model_2pt <- glm(shot_formula, data = model_data_2pt, family = binomial(link = "logit"))
log_model_3pt <- glm(shot_formula, data = model_data_3pt, family = binomial(link = "logit"))
#log_model_super <- glm(shot_formula, data = model_data_super, family = binomial(link = "logit"))

log_model_super_2pt <- glm(shot_formula, data = model_data_super_2pt, family = binomial(link = "logit"))
log_model_super_3pt <- glm(shot_formula, data = model_data_super_3pt, family = binomial(link = "logit"))


In [None]:
# Compare the coefficients of the four models side by side, with Wald
# confidence intervals (the simplest option).
coefs_plot <- bind_rows(
    mutate(tidy(log_model_2pt), model = glue("2PT, n={nrow(model_data_2pt)}")),
    mutate(tidy(log_model_3pt), model = glue("3PT, n={nrow(model_data_3pt)}")),
    mutate(tidy(log_model_super_2pt), model = glue("Superstar 2PT, n={nrow(model_data_super_2pt)}")),
    mutate(tidy(log_model_super_3pt), model = glue("Superstar 3PT, n={nrow(model_data_super_3pt)}"))
  ) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    # Wald interval: estimate +/- 1.96 standard errors (1.96 is the normal
    # quantile for a 95% interval)
    lower = estimate - 1.96 * std.error,
    upper = estimate + 1.96 * std.error,
    # flag used for the point shape in the plot
    sig   = p.value <= 0.05
  )

# Wider canvas for this figure.
options(repr.plot.width=15, repr.plot.height=8)

# One row per term; the models are dodged vertically within each row.
# Filled points are significant at the 5% level, hollow ones are not.
ggplot(coefs_plot, aes(x = estimate, y = term, colour = model, shape = sig)) +
  # reference line at "no effect"
  geom_vline(xintercept = 0, colour = "grey50", linetype = "dashed") +
  geom_errorbarh(
    aes(xmin = lower, xmax = upper),
    position = position_dodge(width = 0.7),
    linewidth = 1.2,
    height = 0.2
  ) +
  geom_point(size = 4, position = position_dodge(width = 0.7)) +
  scale_color_brewer(palette = "Dark2") +
  scale_shape_manual(
    labels = c(`FALSE` = "Not significant", `TRUE` = "Significant"),
    values = c(`FALSE` = 1, `TRUE` = 16)
  ) +
  labs(
    title  = "Model Coefficients",
    x      = "Log-Odds Estimate",
    y      = NULL,
    colour = "Model",
    shape  = "Significance"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position    = "bottom",
    plot.title = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"),
    panel.grid.major.y = element_blank()
  )

# The large CI on the superstar 3PT model is strange. We expect the sample sizes to differ, so let's check. (Sample sizes are now shown in the plot legend.)

In [None]:
# Residual deviance and ROC plots. Probably don't include deviance in report but AUROC could work.
# Phrasing would be: AUROC low so we know these features don't fully explain if a shot will go in or not.
# (Deviance tells the same story)

# Build the ROC curve of a fitted model on held-out data.
#
# model:     fitted model that supports predict(..., type = "response").
# test_data: data frame with the model's predictors and the observed SUCCESS.
# Returns the pROC roc object (with a confidence interval for the AUC).
model_roc <- function(model, test_data) {
    # predicted probability of a made shot for every test row
    pred_prob <- predict(model, newdata = test_data, type="response")

    roc(test_data$SUCCESS, as.vector(pred_prob), ci = TRUE, quiet = TRUE)
}

# Deviance residuals against fitted probabilities for a fitted model.
#
# model:     fitted model supporting fitted() and residuals(type = "deviance").
# title_str: text placed in front of ": Residuals vs Fitted" in the plot title.
# Returns the ggplot object.
residual_deviance_plot <- function(model, title_str) {
    plot_data <- data.frame(
        fitted = fitted(model),
        residuals = residuals(model, type = "deviance")
    )

    plot_labels <- labs(
        title = paste0(title_str, ": Residuals vs Fitted"),
        subtitle = "Deviance residuals plotted against predicted probabilities",
        x = "Fitted values (predicted probabilities)",
        y = "Deviance residuals"
    )

    plot_theme <- theme_bw() +
        theme(
            panel.grid.minor = element_blank(),
            panel.grid.major = element_line(color = "grey90"),
            plot.title = element_text(hjust = 0.5, face = "bold"),
            plot.subtitle = element_text(hjust = 0.5, color = "darkgrey")
        )

    ggplot(plot_data, aes(x = fitted, y = residuals)) +
        geom_point(alpha = 0.3, size = 0.8) +
        # zero line for reference
        geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
        # explicit method/formula: avoids the message geom_smooth prints when
        # it has to choose them itself
        geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
        plot_labels +
        plot_theme
}

# ROC summary and residual plot for the plain logistic model.
model_roc(log_model, test_data)
residual_deviance_plot(log_model, "Logistic model") # title typo ("Logistc") fixed


# Same ROC curve again, this time drawn with its AUC in the legend.
pred_prob <- predict(log_model, newdata = test_data, type="response")

# quiet = TRUE matches model_roc() and suppresses pROC's informational
# messages about the level/direction it picked.
roc_obj <- roc(test_data$SUCCESS, as.vector(pred_prob), ci=TRUE, quiet = TRUE)
plot(roc_obj, col = "#2C3E50", lwd = 2, main = "ROC Curve for Logistic Model")
auc_text <- paste0("AUC = ", round(auc(roc_obj), 3))
legend("bottomright", legend = auc_text, col = "#2C3E50", lwd = 2)

In [None]:
# Get the mean touch time in each period per game, then convert to factors for plotting/etc.

# .groups = "drop" returns an ungrouped table. Without it the result stays
# grouped by GAME_ID, and the factor conversion below would then be selecting a
# grouping column explicitly, which the *_at verbs document as an error.
period_mtt <- clean_shots %>%
    group_by(GAME_ID, PERIOD) %>%
    summarise(mean_tt = mean(TOUCH_TIME), .groups = "drop")

# Regulation periods only; across() replaces the superseded mutate_at().
period_mtt <- period_mtt %>%
    filter(PERIOD <= 4) %>%
    mutate(across(c(PERIOD, GAME_ID), as.factor))


ggplot(period_mtt, aes(x = PERIOD, y = mean_tt)) +
    geom_boxplot() +
    labs(
    x = "Period",
    y = "Mean Touch Time",
    title = "Distribution of Mean Touch Time by Period"
    )

# How many overtime games did we exclude from this analysis?
clean_shots %>%
    filter(PERIOD >= 5) %>%
    summarise(overtime_games = n_distinct(GAME_ID))

# Do a simple anova to see if we have a significant difference
model <- aov(mean_tt ~ PERIOD, data = period_mtt)
summary(model)

# since we have a significant difference, we now ask which period is significantly different from the rest? This is a post-hoc test.
TukeyHSD(model)

In [None]:
# Does touch time differ when the shot is contested?
# A shot counts as "Contested" when the closest defender is within 4 (in the
# units of CLOSE_DEF_DIST); everything else is "Not contested".
# Result: mean touch time per game and contested status.
contested_shots <- model_data_clean %>%
    mutate(
        contested = if_else(
            CLOSE_DEF_DIST <= 4,
            "Contested",
            "Not contested",
            missing = "Not contested"
        )
    ) %>%
    group_by(GAME_ID, contested) %>%
    summarise(mean_tt = mean(TOUCH_TIME), .groups = "drop")


# Side-by-side boxplots of the per-game means; outliers are not drawn.
ggplot(contested_shots, aes(x = contested, y = mean_tt, fill = contested)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  scale_fill_manual(values = c("Contested" = "#E69F00", "Not contested" = "#56B4E9")) +
  labs(
    title = "Distribution of Mean TT by Contested Status",
    x     = "Contested?",
    y     = "Mean TT"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# There is definitely more to this than meets the eye...
# Next question: how often does a contested shot go in?

# One-row summary: number of shots, number of contested shots (closest
# defender within 4), how many of those were made, and the resulting share.
contested_success_rate <- model_data_clean %>%
    mutate(is_contested = !is.na(CLOSE_DEF_DIST) & CLOSE_DEF_DIST <= 4) %>%
    summarise(
        total_shots = n(),
        contested_shots = sum(is_contested),
        contested_successful = sum(SUCCESS == 1 & is_contested),
        prop = contested_successful / contested_shots
    )

contested_success_rate


# One-sided test: is the success probability of contested shots below 0.5?
binom.test(
    x = contested_success_rate$contested_successful,
    n = contested_success_rate$contested_shots,
    p = 0.5,
    alternative = "less"
)
# Finally a significant result, even though the conclusion isn't surprising. 

In [None]:
# Experimenting here, but doesn't look too impressive.
# 3D scatter of every shot: touch time on x, closest-defender distance on y,
# shot distance on z.
library(scatterplot3d)
scatterplot3d(
  y = model_data_clean$CLOSE_DEF_DIST,
  x = model_data_clean$TOUCH_TIME,
  z = model_data_clean$SHOT_DIST,
  color = "darkred",
  pch = 19)

In [None]:
# Lebron isn't the best player? Can it be?
# The data only has 51 games for him, while official stats (ESPN / Basketball
# Reference) list 69 regular-season games. James Harden actually played 81!
# Per player: points from the shots in the data, the number of games they come
# from, and points per game - highest average first.
model_data_clean %>%
    group_by(PLAYER_NAME) %>%
    summarise(
        total_pts = sum(PTS_TYPE * SUCCESS),
        games_played = n_distinct(GAME_ID),
        avg_pts_game = total_pts / games_played,
        .groups = "drop"
    ) %>%
    arrange(desc(avg_pts_game))


# Number of distinct games covered by the modelling data.
summarise(model_data_clean, count = n_distinct(GAME_ID))