In [None]:
# Packages this notebook relies on. Any that are not present in the local
# library are installed before the library() calls are reached.
list.of.packages <- c(
    "tidyverse", "data.table", "dtplyr", "arrow", "dbplyr",
    "duckdb", "lme4", "dbscan", "pROC", "caTools"
)
new.packages <- setdiff(list.of.packages, installed.packages()[, "Package"])
if (length(new.packages)) {
    install.packages(new.packages)
}

library(tidyverse)
library(data.table)
#library(dtplyr)
#library(dplyr, warn.conflicts = FALSE)
library(arrow)
library(duckdb)
library(lme4)
library(dbscan)
library(pROC)
library(caTools)

In [None]:
# ---- Load input tables ------------------------------------------------------
raw_shots <- read_csv("../Data/NBA_Shots_Raw.csv")

# Roster table: PLAYER_NAME is built from its two name columns so it can be
# used as a join key against the shot tables.
player_info <- read_csv("../Data/Player_Info.csv") %>%
    mutate(PLAYER_NAME = paste(First_Name, Surname))

# Salary table: its Name column is exposed under the shared key name.
player_salary <- read_csv("../Data/Player_Salary.csv") %>%
    rename(PLAYER_NAME = Name)

clean_shots <- read_csv("../Data/NBA_Shots_Clean_Example.csv")

common_player_info <- read_csv("../Data/wyatt_basketball/csv/common_player_info.csv") %>%
    mutate(PLAYER_NAME = paste(first_name, last_name))


# Earlier per-player spelling fixes for clean_shots, kept for reference:
#clean_shots <- clean_shots %>% mutate(across(where(is.character), ~ str_replace_all(., "Kyle Oquinn", "Kyle O'Quinn")))
#clean_shots <- clean_shots %>% mutate(across(where(is.character), ~ str_replace_all(., "Al Farouq Aminu", "Al-Farouq Aminu")))
#clean_shots <- clean_shots %>% mutate(across(where(is.character), ~ str_replace_all(., "Lamarcus Aldridge", "LaMarcus Aldridge")))
#clean_shots <- clean_shots %>% mutate(across(where(is.character), ~ str_replace_all(., "Oj Mayo", "O.J. Mayo")))
#clean_shots <- clean_shots %>% mutate(across(where(is.character), ~ str_replace_all(., "Cj Miles", "C.J. Miles")))

# Expand "Luc Mbah" to the full name in every character column of player_info.
# NOTE(review): a value that already reads "Luc Mbah a Moute" would gain a
# second " a Moute" here -- confirm the source data only has the short form.
player_info <- player_info %>%
    mutate(across(where(is.character), ~ str_replace_all(., "Luc Mbah", "Luc Mbah a Moute")))


# ---- Normalise name columns -------------------------------------------------
# PLAYER_NAME and Surname (the latter is kept for now) get the same treatment:
# title case, dots removed, hyphens turned into spaces.
player_info <- player_info %>%
    mutate(across(
        c(PLAYER_NAME, Surname),
        function(nm) {
            nm %>%
                str_to_title() %>%
                str_remove_all("\\.") %>%
                str_replace_all("-", " ")
        }
    ))

# Shooter names in clean_shots only need the hyphen replacement.
clean_shots <- clean_shots %>%
    mutate(across(PLAYER_NAME, function(nm) str_replace_all(nm, "-", " ")))


In [None]:
# Print the column names of every loaded table.
names(raw_shots)
names(clean_shots)
names(player_info)
names(player_salary)
names(common_player_info)
# Further ad-hoc inspection, currently disabled:
#player_statistics %>% count() %>% collect()
#head(raw_shots)
#head(player_info)
#head(player_salary)

In [None]:
# Figure size for plots rendered by the notebook kernel.
options(repr.plot.width = 10, repr.plot.height = 7)

# Shot distance paired with the shooter's position.
# The position lookup is reduced to one row per PLAYER_NAME before joining:
# player_info appears to hold several rows for some players (TODO confirm,
# e.g. one per team), and joining it as-is would repeat each of their shots
# once per roster row and over-weight them in the histogram. When a player has
# several rows the first Pos encountered is used.
# relationship = "many-to-one" makes the join fail loudly if that guarantee is
# ever broken.
s <- raw_shots %>%
    left_join(
        player_info %>%
            select(Pos, PLAYER_NAME) %>%
            distinct(PLAYER_NAME, .keep_all = TRUE),
        by = "PLAYER_NAME",
        relationship = "many-to-one"
    ) %>%
    filter(!is.na(Pos)) %>%   # drop shots whose shooter has no roster match
    select(SHOT_DIST, Pos)

# One histogram panel per position.
ggplot(s, aes(x = SHOT_DIST)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.5) +
    facet_wrap(~ Pos) +
    labs(title = "Shot Distance Distribution by Position",
       x = "Shot Distance (feet)",
       y = "Frequency") +
    theme_minimal()

In [None]:
# l1: total made field goals per player (with position), highest first.
# The position lookup is collapsed to one row per PLAYER_NAME first. Joining
# a roster table with repeated names would repeat each shot once per roster
# row and inflate sum(FGM) for those players. When a player has several rows
# the first Pos encountered is used.
l1 <- raw_shots %>%
    left_join(
        player_info %>%
            select(Pos, PLAYER_NAME) %>%
            distinct(PLAYER_NAME, .keep_all = TRUE),
        by = "PLAYER_NAME",
        relationship = "many-to-one"
    ) %>%
    filter(!is.na(Pos)) %>%
    group_by(PLAYER_NAME, Pos) %>%
    summarize(tot_FGM = sum(FGM), .groups = "drop") %>%
    arrange(desc(tot_FGM))

# l2: per player, number of made shots and the share of them that were
# 3-pointers / 2-pointers, expressed as percentages (0-100).
# Previously the pct_* columns held raw counts despite their names.
# The roster join is left un-deduplicated on purpose: it carries Team and Age,
# so a player with several roster rows yields one output row per roster row.
l2 <- raw_shots %>%
    filter(FGM == 1) %>%
    group_by(PLAYER_NAME) %>%
    summarise(
        total_FGM = n(),
        pct_3pt = 100 * mean(PTS_TYPE == 3),
        pct_2pt = 100 * mean(PTS_TYPE == 2)
    ) %>%
    left_join(
        player_info %>% select(Pos, PLAYER_NAME, Team, Age),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    )

In [None]:
# Number of distinct PLAYER_NAME values in each table (one-row tibble each).
raw_shots %>%
    distinct(PLAYER_NAME) %>%
    summarise(unique_players = n())
player_info %>%
    distinct(PLAYER_NAME) %>%
    summarise(unique_players = n())
player_salary %>%
    distinct(PLAYER_NAME) %>%
    summarise(unique_players = n())

In [None]:
#clean_shots %>% 
#    filter(PERIOD <= 3) %>%
#    select(HOME_TEAM, AWAY_TEAM, WIN_LOSE)
#    group_by(

#clean_shots %>% 
#    select(GAME_ID, FINAL_MARGIN, WIN_LOSE, HOME_TEAM, AWAY_TEAM, LOCATION) %>%
#    filter(abs(FINAL_MARGIN) <= 3) %>% #no need to filter for w/l
 #   distinct(GAME_ID, .keep_all=TRUE) %>%
#    summarise(home_win_pct = 100*sum(FINAL_MARGIN > 0 & LOCATION == "H")/sum(LOCATION == "H"))


#clean_shots %>%
#    select(GAME_ID, WIN_LOSE, HOME_TEAM, AWAY_TEAM, LOCATION, PERIOD, PTS_TYPE, SUCCESS) %>%
#    mutate(pts = PTS_TYPE * SUCCESS) %>%
#    filter(PERIOD <= 3) %>%
#    group_by(GAME_ID, LOCATION) %>%
#    summarise(total_pts = sum(pts), .groups = "drop") %>%
#    pivot_wider(names_from = LOCATION, values_from = total_pts, names_prefix = "pts_") %>%
#    summarise(pts_diff_H = pts_H-pts_A)

# One row per game carrying the WIN_LOSE value found on the LOCATION == "H"
# rows; it is used below as the home side's result.
home_win <- clean_shots %>%
    filter(LOCATION == "H") %>%
    select(GAME_ID, HOME_TEAM, AWAY_TEAM, WIN_LOSE) %>%
    distinct(GAME_ID, .keep_all=TRUE)

# Games that were within 3 points after the first three periods, using points
# reconstructed from the shot rows (PTS_TYPE * SUCCESS) per GAME_ID and
# LOCATION. The summary is stored so the significance test below always uses
# the numbers computed from the current data.
close_after_three <- clean_shots %>%
    filter(PERIOD <= 3) %>%
    mutate(pts = PTS_TYPE * SUCCESS) %>%
    group_by(GAME_ID, LOCATION) %>%
    summarise(total_pts = sum(pts), .groups = "drop") %>%
    pivot_wider(names_from = LOCATION, values_from = total_pts, names_prefix = "pts_") %>%
    left_join(home_win, by="GAME_ID") %>%
    rename(HOME_RESULT = WIN_LOSE) %>%
    mutate(pts_diff = abs(pts_H - pts_A)) %>%
    filter(pts_diff <= 3) %>%
    summarise(
        home_pct_win = sum(HOME_RESULT == "W")/n(),
        home_wins = sum(HOME_RESULT == "W"),
        total_games = n()
    )

# Display the summary, as the unassigned pipeline did before.
close_after_three

# One-sided exact binomial test: is the home win share in these games above
# 0.5? The counts come from the summary above; they were previously
# hard-coded as x = 86, n = 161.
binom.test(
    x = close_after_three$home_wins,
    n = close_after_three$total_games,
    p = 0.5,
    alternative = "greater"
)

In [None]:
# Open the play-by-play CSV as an Arrow dataset.
play_by_play <- arrow::open_dataset(
    sources = "../Data/wyatt_basketball/csv/play_by_play.csv",
    format = "csv"
)

In [None]:
# DuckDB connection backed by the on-disk database file ".tmp.duckdb".
con <- DBI::dbConnect(drv = duckdb::duckdb(), dbdir = ".tmp.duckdb")

In [None]:
# Load the play-by-play CSV into the DuckDB table "play_by_play".
# The connection uses an on-disk database file, so the table survives between
# runs. The load is skipped when the table is already there, so re-running
# this cell (or the whole notebook) does not import the same CSV a second
# time into the existing table.
if (!DBI::dbExistsTable(con, "play_by_play")) {
    duckdb_read_csv(
        con,
        name = "play_by_play",
        files = "../Data/wyatt_basketball/csv/play_by_play.csv"
    )
}

In [None]:
# Column names of the DuckDB play_by_play table.
tbl(con, "play_by_play") %>%
    head(1) %>%
    colnames()

In [None]:
# Number of rows per eventmsgtype value, collected into R.
tbl(con, "play_by_play") %>%
    count(eventmsgtype) %>%
    collect()

# Up to ten sample rows with eventmsgtype == 1, collected into R.
tbl(con, "play_by_play") %>%
    filter(eventmsgtype == 1) %>%
    head(10) %>%
    collect()

In [None]:
# Close the DuckDB connection and shut the database instance down.
DBI::dbDisconnect(conn = con, shutdown = TRUE)

In [None]:
# Columns carried from clean_shots into the modelling table (SUCCESS is the
# response; the two name columns are join keys).
predictors <- c("PLAYER_NAME", "CLOSEST_DEFENDER" ,"SHOT_DIST", "PTS_TYPE", "CLOSE_DEF_DIST", "SHOT_CLOCK", "TOUCH_TIME", "PERIOD", "SUCCESS")
colnames(player_info)
colnames(clean_shots)
head(common_player_info %>% filter(first_name == "Tim"))

# Height / position lookup with exactly one row per PLAYER_NAME.
# Without the distinct() a player with several player_info rows would have
# each of their shots repeated once per row in model_data. When a player has
# several rows the first one encountered is kept.
lookup_height_pos <- player_info %>%
    select(PLAYER_NAME, Height, Pos) %>%
    distinct(PLAYER_NAME, .keep_all = TRUE)

# Shot-level table with shooter and closest-defender height / position.
# CLOSEST_DEFENDER gets the same hyphen -> space replacement that the
# PLAYER_NAME keys on both sides already received; otherwise hyphenated
# defender names can never match the lookup.
# NOTE(review): assumes CLOSEST_DEFENDER uses the same "First Last" form as
# PLAYER_NAME -- confirm against the data.
# relationship = "many-to-one" asserts that neither join can add rows.
model_data <- clean_shots %>%
    select(all_of(predictors)) %>%
    mutate(CLOSEST_DEFENDER = str_replace_all(CLOSEST_DEFENDER, "-", " ")) %>%
    left_join(lookup_height_pos, by = "PLAYER_NAME", relationship = "many-to-one") %>%
    rename(SHOOTER_POS = Pos, SHOOTER_HEIGHT = Height) %>%
    left_join(
        lookup_height_pos %>% rename(CLOSEST_DEFENDER = PLAYER_NAME),
        by = "CLOSEST_DEFENDER",
        relationship = "many-to-one"
    ) %>%
    rename(DEFENDER_POS = Pos, DEFENDER_HEIGHT = Height)



# Reproducible 70 / 30 row split. seq_len() stays correct for an empty table
# and floor() makes the integer sample size explicit.
set.seed(0)
train_indices <- sample(seq_len(nrow(model_data)), size = floor(0.70 * nrow(model_data)))
train <- model_data[train_indices, ]
test <- model_data[-train_indices, ]

#lookup_height_pos %>% head(10)


#dplyr::filter(clean_shots, grepl("Kyle",PLAYER_NAME)) ## found issues with apostrophe in name of Kyle O'Quinn (was Oquinn)

# Shooters for whom the lookup produced no height or no position.
bad_names <- model_data %>% filter(is.na(SHOOTER_HEIGHT) | is.na(SHOOTER_POS)) %>% distinct(PLAYER_NAME, .keep_all=FALSE)

dplyr::filter(player_info, grepl("Hardaway",PLAYER_NAME))

#player_info %>% filter(Team == "LAL") %>% distinct(PLAYER_NAME)

#Robert Sacre not in player_info. Plays for LAL
#Jason Terry not in either. Plays for HOU
#John Salmons not in player_info. Played for NOP and got traded to PHX midseason
#Brandan Wright not in player_info. Played for BOS and got traded to PHX at some point
#Cody Zeller ... Played for Charlotte Bobcats

# Number of shot rows without a shooter height, then the unmatched names.
model_data %>% filter(is.na(SHOOTER_HEIGHT)) %>% count()
bad_names


In [None]:
# Fallback lookup for the unmatched shooters: take the last
# whitespace-delimited token of PLAYER_NAME as the surname and pull every
# player_info row with that Surname. The join key is spelled out rather than
# inferred from shared column names. Several players can share a surname on
# either side, so the many-to-many relationship is declared explicitly.
bad_names %>%
    mutate(Surname = str_extract(PLAYER_NAME, "\\S+$")) %>%
    left_join(
        player_info %>% select(Surname, Age),
        by = "Surname",
        relationship = "many-to-many"
    )
player_info %>% head(10)
dplyr::filter(player_info, grepl("Williams",PLAYER_NAME))

In [None]:
# Baseline logistic regression on all of clean_shots (no hold-out).
log_model <- glm(SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + SHOT_CLOCK + TOUCH_TIME, data=clean_shots, family="binomial")

# The 70 / 30 split of model_data is stored as `train` / `test`. The names
# used below were never assigned, so bind them here.
train_data <- train
test_data <- test

# Mixed-effects logistic regression with a per-shooter random intercept and
# random SHOT_DIST slope. The grouping factor is PLAYER_NAME because the
# split tables come from model_data, which carries PLAYER_NAME but no
# PLAYER_ID column.
log_mm_model <- glmer(SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + SHOT_CLOCK + TOUCH_TIME + PERIOD +
                      (SHOT_DIST | PLAYER_NAME),
                    data = train_data,
                    family=binomial(link = "logit"))

summary(log_mm_model)

# Predicted make probability on the hold-out rows. allow.new.levels = TRUE
# lets shooters that ended up only in the hold-out rows be scored instead of
# raising an error.
test_data$pred_prob <- predict(
    log_mm_model,
    newdata = test_data,
    type = "response",
    allow.new.levels = TRUE
)

# ROC curve / AUC of the hold-out predictions.
roc(SUCCESS ~ pred_prob, data = test_data)

In [None]:
# Scatter plot of touch time against shot distance for every row in clean_shots.
ggplot(data = clean_shots, mapping = aes(x = SHOT_DIST, y = TOUCH_TIME)) +
    geom_point()