In [None]:
# Packages this notebook depends on; anything not yet installed is fetched
# before the library() calls below run.
list.of.packages <- c(
  "tidyverse", "data.table", "dtplyr", "arrow", "dbplyr",
  "duckdb", "lme4", "dbscan", "pROC"
)
already_installed <- list.of.packages %in% installed.packages()[, "Package"]
new.packages <- list.of.packages[!already_installed]
if (length(new.packages)) {
  install.packages(new.packages)
}

library(tidyverse)
library(data.table)
#library(dtplyr)
#library(dplyr, warn.conflicts = FALSE)
library(arrow)
library(duckdb)
library(lme4)
library(dbscan)
library(pROC)

In [None]:
# Load the raw shot log, the player reference tables and the cleaned example
# shot log. Both player tables get a PLAYER_NAME column so they can be joined
# to the shot data by name.
raw_shots <- read_csv("../Data/NBA_Shots_Raw.csv")

player_info <- read_csv("../Data/Player_Info.csv") %>%
  mutate(PLAYER_NAME = paste(First_Name, Surname))

player_salary <- read_csv("../Data/Player_Salary.csv") %>%
  rename(PLAYER_NAME = Name)

clean_shots <- read_csv("../Data/NBA_Shots_Clean_Example.csv")

In [None]:
# Show the column names of every loaded table.
names(raw_shots)
names(clean_shots)
names(player_info)
names(player_salary)
# Optional previews, disabled by default:
# player_statistics %>% count() %>% collect()
# head(raw_shots)
# head(player_info)
# head(player_salary)


In [None]:
# Histogram of shot distance, one facet per player position.
options(repr.plot.width = 10, repr.plot.height = 7)

# player_info can list the same player on several rows; joining those rows
# directly would copy each of that player's shots once per row and inflate the
# histogram. Collapse the lookup to unique (PLAYER_NAME, Pos) pairs first.
# A player recorded under more than one position still matches several rows,
# so the relationship is declared explicitly.
s <- raw_shots %>%
    left_join(
        player_info %>% select(Pos, PLAYER_NAME) %>% distinct(),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    ) %>%
    filter(!is.na(Pos)) %>%   # drop shots by players with no known position
    select(SHOT_DIST, Pos)

ggplot(s, aes(x = SHOT_DIST)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.5) +
    facet_wrap(~ Pos) +
    labs(title = "Shot Distance Distribution by Position",
       x = "Shot Distance (feet)",
       y = "Frequency") +
    theme_minimal()

In [None]:
# l1: total field goals made per player and position, highest first.
# The position lookup is reduced to unique (PLAYER_NAME, Pos) pairs so that a
# player listed on several player_info rows does not have every shot counted
# once per row. A player with more than one recorded position still yields one
# output row per position, hence the explicit many-to-many relationship.
l1 <- raw_shots %>%
    left_join(
        player_info %>% select(Pos, PLAYER_NAME) %>% distinct(),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    ) %>%
    filter(!is.na(Pos)) %>%
    group_by(PLAYER_NAME, Pos) %>%
    summarize(tot_FGM = sum(FGM), .groups = "drop") %>%
    arrange(desc(tot_FGM))

# l2: per-player breakdown of made shots, joined to position, team and age.
# pct_3pt / pct_2pt are the share (0-1) of a player's made field goals that
# were three- or two-pointers; the raw counts can be recovered by multiplying
# by total_FGM. The join keeps one row per matching player_info entry.
l2 <- raw_shots %>%
    filter(FGM == 1) %>%
    group_by(PLAYER_NAME) %>%
    summarise(
        total_FGM = n(),
        pct_3pt = mean(PTS_TYPE == 3),
        pct_2pt = mean(PTS_TYPE == 2)
    ) %>%
    left_join(
        player_info %>% select(Pos, PLAYER_NAME, Team, Age),
        by = "PLAYER_NAME",
        relationship = "many-to-many"
    )

In [None]:
# Count distinct player names in each table to see how well they overlap.
summarise(raw_shots, unique_players = n_distinct(PLAYER_NAME))
summarise(player_info, unique_players = n_distinct(PLAYER_NAME))
summarise(player_salary, unique_players = n_distinct(PLAYER_NAME))

In [None]:
#clean_shots %>% 
#    filter(PERIOD <= 3) %>%
#    select(HOME_TEAM, AWAY_TEAM, WIN_LOSE)
#    group_by(

#clean_shots %>% 
#    select(GAME_ID, FINAL_MARGIN, WIN_LOSE, HOME_TEAM, AWAY_TEAM, LOCATION) %>%
#    filter(abs(FINAL_MARGIN) <= 3) %>% #no need to filter for w/l
 #   distinct(GAME_ID, .keep_all=TRUE) %>%
#    summarise(home_win_pct = 100*sum(FINAL_MARGIN > 0 & LOCATION == "H")/sum(LOCATION == "H"))


#clean_shots %>%
#    select(GAME_ID, WIN_LOSE, HOME_TEAM, AWAY_TEAM, LOCATION, PERIOD, PTS_TYPE, SUCCESS) %>%
#    mutate(pts = PTS_TYPE * SUCCESS) %>%
#    filter(PERIOD <= 3) %>%
#    group_by(GAME_ID, LOCATION) %>%
#    summarise(total_pts = sum(pts), .groups = "drop") %>%
#    pivot_wider(names_from = LOCATION, values_from = total_pts, names_prefix = "pts_") %>%
#    summarise(pts_diff_H = pts_H-pts_A)

# One row per game holding the home side's final result (WIN_LOSE as recorded
# on rows where LOCATION == "H").
home_win <- clean_shots %>%
    filter(LOCATION == "H") %>%
    select(GAME_ID, HOME_TEAM, AWAY_TEAM, WIN_LOSE) %>%
    distinct(GAME_ID, .keep_all = TRUE)

# For games where the points scored from logged shots through period 3 are
# within 3 of each other, count how often the home side went on to win.
close_game_summary <- clean_shots %>%
    filter(PERIOD <= 3) %>%
    mutate(pts = PTS_TYPE * SUCCESS) %>%
    group_by(GAME_ID, LOCATION) %>%
    summarise(total_pts = sum(pts), .groups = "drop") %>%
    pivot_wider(names_from = LOCATION, values_from = total_pts, names_prefix = "pts_") %>%
    left_join(home_win, by = "GAME_ID") %>%
    rename(HOME_RESULT = WIN_LOSE) %>%
    mutate(pts_diff = abs(pts_H - pts_A)) %>%
    filter(pts_diff <= 3) %>%
    summarise(
        home_pct_win = sum(HOME_RESULT == "W") / n(),
        home_wins = sum(HOME_RESULT == "W"),
        total_games = n()
    )

# Display the summary table.
close_game_summary

# One-sided binomial test: do home teams win these close games more often than
# 50% of the time? The counts come from the summary above rather than being
# typed in by hand, so the test stays in sync with the data and the filter.
binom.test(
    x = close_game_summary$home_wins,
    n = close_game_summary$total_games,
    p = 0.5,
    alternative = "greater"
)

In [None]:
# Open the play-by-play CSV as an Arrow dataset.
play_by_play <- open_dataset(
  "../Data/wyatt_basketball/csv/play_by_play.csv",
  format = "csv"
)

In [None]:
# List the play-by-play column names, taken from a 10-row head of the dataset.
play_by_play %>%
  head(10) %>%
  colnames()

In [None]:
# Open the historical player box-score CSV as an Arrow dataset.
player_statistics_od <- open_dataset(
  "../Data/historical-nba-data-and-player-box-scores/PlayerStatistics.csv",
  format = "csv"
)

In [None]:
# Collect the first rows of the box-score dataset into a local table to preview.
collect(head(player_statistics_od))

In [None]:
# Connect to a DuckDB database stored in the file ".tmp.duckdb".
con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ".tmp.duckdb")

In [None]:
# Load the play-by-play CSV into the DuckDB table "play_by_play".
# The database is file-backed, so the table outlives the session; only import
# when it is missing so that re-running this cell does not load the CSV into
# the existing table a second time.
if (!DBI::dbExistsTable(con, "play_by_play")) {
  duckdb_read_csv(
    con,
    name = "play_by_play",
    files = "../Data/wyatt_basketball/csv/play_by_play.csv"
  )
}

In [None]:
# Column names of the DuckDB play_by_play table, read from a one-row head.
tbl(con, "play_by_play") %>%
  head(1) %>%
  colnames()

In [None]:
# Row count per event message type.
collect(count(tbl(con, "play_by_play"), eventmsgtype))

# First ten rows whose event message type is 1.
collect(head(filter(tbl(con, "play_by_play"), eventmsgtype == 1), 10))

In [None]:
# Close the DuckDB connection and shut the database instance down.
dbDisconnect(
  con,
  shutdown = TRUE
)

In [None]:
# Quick look at the cleaned shot data before modelling.
colnames(clean_shots)
head(clean_shots)

# Standardise the numeric predictors. scale() returns a one-column matrix, so
# it is converted back to a plain numeric vector; otherwise the tibble ends up
# with matrix columns.
clean_shots_scaled <- clean_shots %>%
  mutate(across(
    c(SHOT_DIST, CLOSE_DEF_DIST, SHOT_CLOCK, TOUCH_TIME, PERIOD),
    function(x) as.numeric(scale(x))
  ))

# Reproducible 70/30 train/test split.
set.seed(123)
n_shots <- nrow(clean_shots_scaled)
train_idx <- sample(seq_len(n_shots), floor(0.7 * n_shots))
train_data <- clean_shots_scaled[train_idx, ]
test_data <- clean_shots_scaled[-train_idx, ]

# Baseline logistic regression of shot success on the shot characteristics.
# Fitted on the same scaled training rows as the mixed model below so the two
# are comparable and the test rows stay unseen by both.
log_model <- glm(
  SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + SHOT_CLOCK + TOUCH_TIME,
  data = train_data,
  family = "binomial"
)

# Mixed-effects logistic regression: same fixed effects plus PERIOD, with a
# random intercept and a random SHOT_DIST slope for each player.
log_mm_model <- glmer(SUCCESS ~ SHOT_DIST + CLOSE_DEF_DIST + SHOT_CLOCK + TOUCH_TIME + PERIOD +
                      (SHOT_DIST | PLAYER_ID),
                    data = train_data,
                    family = binomial(link = "logit"))


In [None]:
# Summarise the fitted mixed model.
summary(log_mm_model)

# Predicted success probabilities for the held-out shots. With a random split,
# a player may appear only in the test rows; allow.new.levels = TRUE lets
# predict() handle such players instead of stopping with an error.
test_data$pred_prob <- predict(
  log_mm_model,
  newdata = test_data,
  type = "response",
  allow.new.levels = TRUE
)

# ROC curve of the predictions against the observed outcome.
roc(SUCCESS ~ pred_prob, data = test_data)

In [None]:
# Scatter plot of touch time against shot distance for every shot.
ggplot(data = clean_shots) +
  geom_point(mapping = aes(x = SHOT_DIST, y = TOUCH_TIME))

In [None]:
library(dbscan)
library(ggplot2)

# Cluster shots on two features: shot distance and touch time.
cluster_data <- clean_shots[, c("SHOT_DIST", "TOUCH_TIME")]

# Standardize the features (important for distance-based algorithms like DBSCAN).
scaled_data <- scale(cluster_data)

# k-NN distance plot to help choose eps.
kNNdistplot(scaled_data, k = 5)
# NOTE(review): this reference line (0.5) differs from the eps actually passed
# to dbscan() below (1.5) -- confirm which value is intended.
abline(h = 0.5, col = "red")  # Example threshold line - adjust based on plot

# Run DBSCAN (example parameters - adjust based on the data).
set.seed(123)
dbscan_result <- dbscan(scaled_data, eps = 1.5, minPts = 5)

# Attach the cluster id of every shot to the original data; the plot labels
# id 0 as "Noise".
clean_shots$cluster <- as.factor(dbscan_result$cluster)

# Plot the clusters. Legend labels are derived from the cluster ids that are
# actually present, so the legend stays correct however many clusters DBSCAN
# finds (a fixed label vector fails or mislabels when the count differs).
ggplot(clean_shots, aes(x = SHOT_DIST, y = TOUCH_TIME, color = cluster)) +
  geom_point(alpha = 0.6) +
  scale_color_discrete(
    name = "Cluster",
    labels = function(ids) ifelse(ids == "0", "Noise", paste("Cluster", ids))
  ) +
  labs(title = "DBSCAN Clustering of Basketball Shots",
       x = "Shot Distance",
       y = "Touch Time") +
  theme_minimal()