In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [None]:
## Data Preprocessing + First Model ##

In [None]:
import json
# Read the preprocessed game logs. JSON text is UTF-8, so the encoding is
# pinned explicitly instead of relying on the platform's locale default.
with open('../data/preprocessed.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Wrap the parsed JSON payload in a DataFrame. Note that `data` is rebound here:
# it held the raw json.load() result before this cell and a DataFrame afterwards.
data = pd.DataFrame(data)
data

In [None]:
# Interpret the numeric `date` values as milliseconds and store them back as
# pandas datetimes.
parsed_dates = pd.to_datetime(data['date'], unit='ms')
data['date'] = parsed_dates
data

In [None]:
# Build one display name out of the two name parts.
full_names = data['first_name'] + ' ' + data['last_name']
data['player_name'] = full_names

# Discard the id, the now-redundant name parts, and the turnover/pf stats.
unused_columns = ['player_id', 'first_name', 'last_name', 'turnover', 'pf']
data = data.drop(columns=unused_columns)

# Put player_name first; every other column keeps its existing relative order.
remaining_columns = [column for column in data.columns if column != 'player_name']
data = data[['player_name'] + remaining_columns]
data

In [None]:
# Check for missing values
# Displays the number of NaN entries per column; the trailing remark records
# that the author saw none when this was last run.
data.isna().sum() # None, Yay!

In [None]:
# Load the cleaned PrizePicks projections and discard every row whose `name`
# contains a literal "+" (presumably multi-player combo entries — confirm).
projections = pd.read_json('../data/prizepicks_cleaned.json')
name_has_plus = projections['name'].str.contains(r'\+')
projections = projections.loc[~name_has_plus]
projections.head()

In [None]:
# Keep only the "Points" lines, reduced to the player name and the line value,
# with a fresh 0..n-1 index.
is_points_line = projections['stat_type'] == 'Points'
point_projection = (
    projections.loc[is_points_line, ['name', 'line_score']]
    .reset_index(drop=True)
)
point_projection.head()

In [None]:
# Attach each player's points line to every one of their game rows. The inner
# join discards games for players without a line and lines for players without
# games.
# NOTE(review): a player with more than one "Points" line would have each game
# row repeated once per line — confirm names are unique in point_projection.
data_and_proj = pd.merge(
    data,
    point_projection,
    how='inner',
    left_on='player_name',
    right_on='name',
)
data_and_proj.head()

In [None]:
# Spot-check the merged rows for a single player.
data_and_proj[data_and_proj['player_name'] == 'Michael Porter Jr.']

In [None]:
# Order rows by player and then by game date, so each player's games form one
# contiguous, date-ordered run.
sort_keys = ['player_name', 'date']
data_and_proj = data_and_proj.sort_values(sort_keys)
data_and_proj

In [None]:
# Binary label: 1 when the points scored are strictly greater than the line,
# 0 otherwise (a game that exactly matches the line is labelled 0).
beat_line = data_and_proj['pts'].gt(data_and_proj['line_score'])
data_and_proj['above_threshold'] = beat_line.astype(int)
data_and_proj

In [None]:
# Box-score columns used as model inputs.
features = [
    "fgm", "fga",
    "reb", "ast", "stl", "blk",
    "fg3m", "fg3a",
    "fta",
    "oreb", "dreb",
]

# Flat (non-sequential) feature matrix and label vector.
X = data_and_proj.loc[:, features]
y = data_and_proj["above_threshold"]

In [None]:
def create_sequences_with_names(data, target, sequence_length, feature_columns=None):
    """Build fixed-length sliding windows of feature rows, one player at a time.

    Rows are grouped by ``player_name`` and windows never cross a player
    boundary. Within each group the rows are used in the order they appear in
    ``data``, so the caller is responsible for sorting them beforehand.

    Each window holds ``sequence_length`` consecutive rows of the feature
    columns; its label is the ``target`` value of the row that immediately
    follows the window. A group with ``sequence_length`` rows or fewer
    contributes nothing.

    Args:
        data: DataFrame with a ``player_name`` column, the feature columns and
            the ``target`` column.
        target: Name of the label column.
        sequence_length: Number of consecutive rows per window (must be >= 1).
        feature_columns: Columns to place in each window. Defaults to the
            module-level ``features`` list, which was the only option before
            this parameter existed.

    Returns:
        A tuple ``(sequences, labels, player_names)`` where ``sequences`` has
        shape ``(n, sequence_length, n_features)``, ``labels`` has shape
        ``(n,)`` and ``player_names`` is a list of length ``n`` giving the
        player each window belongs to.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    if feature_columns is None:
        # Backward-compatible default: fall back to the notebook-level list.
        feature_columns = features
    feature_columns = list(feature_columns)

    sequences = []
    labels = []
    player_names = []

    for player, group in data.groupby("player_name"):
        group_features = group[feature_columns].values
        group_labels = group[target].values

        for start in range(len(group) - sequence_length):
            end = start + sequence_length
            sequences.append(group_features[start:end])
            labels.append(group_labels[end])  # label = the row right after the window
            player_names.append(player)

    if not sequences:
        # np.array([]) would have shape (0,); keep the 3-D layout so that
        # downstream code reading .shape[2] still works on an empty result.
        empty_sequences = np.empty((0, sequence_length, len(feature_columns)))
        empty_labels = np.empty((0,), dtype=data[target].dtype)
        return empty_sequences, empty_labels, player_names

    return np.array(sequences), np.array(labels), player_names

# Window size: each sample is 5 consecutive games, labelled by the game after.
sequence_length = 5
X_seq, y_seq, player_names = create_sequences_with_names(
    data_and_proj,
    "above_threshold",
    sequence_length,
)

# Quick sanity output on what was produced.
print(f"Number of sequences: {len(X_seq)}")
print(f"First sequence shape: {X_seq[0].shape}")
print(f"First player name: {player_names[0]}")

In [None]:
# Peek at the first ten feature windows.
X_seq[:10]

In [None]:
# Fraction of each player's windows that goes to the training set.
TRAIN_FRACTION = 0.8

# Convert the name list once; the original rebuilt this array for every player.
player_name_array = np.asarray(player_names)
unique_players = np.unique(player_name_array)

# Per-player split: the first TRAIN_FRACTION of a player's windows (in the order
# they were generated) is used for training and the remainder for testing.
train_indices = []
test_indices = []

for player in unique_players:
    player_indices = np.where(player_name_array == player)[0]
    split_point = int(len(player_indices) * TRAIN_FRACTION)
    train_indices.extend(player_indices[:split_point])
    test_indices.extend(player_indices[split_point:])

# Subset data for training and testing
X_train, X_test = X_seq[train_indices], X_seq[test_indices]
y_train, y_test = y_seq[train_indices], y_seq[test_indices]
train_names = [player_names[i] for i in train_indices]
test_names = [player_names[i] for i in test_indices]

# Convert to PyTorch tensors. Labels are float32 as well so they can be fed
# directly to a float-valued loss.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


In [None]:
# Make ../prediction_model importable by adding it to sys.path rather than
# changing the working directory with %cd: if the import raised, the `%cd -`
# that followed never ran, and every later relative path ('../data/...') then
# resolved against the wrong directory.
MODEL_DIR = Path('../prediction_model').resolve()
if str(MODEL_DIR) not in sys.path:
    sys.path.insert(0, str(MODEL_DIR))

from lstm import LSTM

# One input per feature column (last axis of the training tensor), a single
# output value per sample.
input_size = X_train_tensor.shape[2]
hidden_size = 64
output_size = 1

model = LSTM(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

In [None]:
import torch.optim as optim
import torch.nn as nn

# Adam over every model parameter, binary cross-entropy as the objective.
# NOTE(review): BCELoss expects inputs that are already probabilities in [0, 1];
# confirm that the LSTM's forward pass ends in a sigmoid.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [None]:
import torch

num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    # Visit the training samples in a fresh random order every epoch.
    permutation = torch.randperm(X_train_tensor.size(0))
    for i in range(0, X_train_tensor.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        batch_X, batch_y = X_train_tensor[indices], y_train_tensor[indices]

        optimizer.zero_grad()

        # Flatten to one value per sample so the shape always matches batch_y.
        # A bare .squeeze() also removes the batch dimension when the final
        # batch holds a single sample, and the loss then rejects the mismatch
        # between a 0-d input and a 1-element target.
        # Assumes the model emits exactly one value per sample — confirm against LSTM.
        predictions = model(batch_X).reshape(-1)
        loss = criterion(predictions, batch_y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Reported value is the sum of the per-batch losses for this epoch.
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")


In [None]:
# Evaluation: score the held-out windows without tracking gradients.
model.eval()
with torch.no_grad():
    raw_outputs = model(X_test_tensor).squeeze()

    # Outputs at or above 0.5 count as class 1, everything else as class 0.
    predictions = (raw_outputs >= 0.5).float()

    # Accuracy = share of test samples whose predicted class equals the label.
    matches = predictions == y_test_tensor
    accuracy = matches.float().mean()
    print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Scale to normalize features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Only scale columns that are still present in `data`: 'turnover' and 'pf' are
# dropped earlier in the notebook, so indexing with the full list would raise a
# KeyError.
numerical_cols = [
    col
    for col in ['fgm', 'fga', 'oreb', 'dreb', 'ast', 'stl', 'blk', 'turnover', 'pf', 'pts']
    if col in data.columns
]

# The original read from an undefined `df`; the frame being scaled is `data`.
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [None]:
# Rolling mean over up to 5 consecutive rows per player, for each stat column.
# min_periods=1 means a player's earliest rows average over fewer than 5 games.
# NOTE(review): the window includes the current row, and rows are taken in the
# order they appear in `data` — confirm they are date-ordered within each player.
average_estimates = ['fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'turnover', 'pf', 'pts']

# 'turnover' and 'pf' are dropped earlier in the notebook; skip any column that
# is no longer in `data` instead of failing with a KeyError.
available_estimates = [col for col in average_estimates if col in data.columns]

for col in available_estimates:
    data[f'{col}_avg_last_5'] = data.groupby('player_name')[col].transform(
        lambda x: x.rolling(window=5, min_periods=1).mean()
    )

In [None]:
# "on_hotstreak_*" features: flag rows where a player's rolling 5-game average
# is above that same player's overall average for the stat.

# Per-row copy of each player's overall mean (one value per player, broadcast
# back to every row of that player).
player_avg_reb = data.groupby('player_name')['reb'].transform('mean')
player_avg_asts = data.groupby('player_name')['ast'].transform('mean')
player_avg_pts = data.groupby('player_name')['pts'].transform('mean')

# 1 when the rolling average is strictly above the overall average, else 0.
data['on_hotstreak_pts'] = data['pts_avg_last_5'].gt(player_avg_pts).astype(int)
data['on_hotstreak_asts'] = data['ast_avg_last_5'].gt(player_avg_asts).astype(int)
data['on_hotstreak_reb'] = data['reb_avg_last_5'].gt(player_avg_reb).astype(int)
data

In [None]:
def predictor(player_name, type_of_prediction, baseline):
    """Unfinished stub for a per-player over/under predictor.

    The original cell had empty ``if``/``elif`` bodies, which is a syntax
    error, so the cell could not run at all. The skeleton is kept, and each
    branch now raises explicitly until its logic is written.

    Args:
        player_name: Value matched against the ``player_name`` column of the
            notebook-level ``data`` frame.
        type_of_prediction: One of ``'points'``, ``'rebounds'``, ``'assists'``
            or ``'pra'``.
        baseline: Currently unused. Presumably the line to compare the
            prediction against — confirm once implemented.

    Raises:
        NotImplementedError: For every recognised ``type_of_prediction``.
        ValueError: For any other ``type_of_prediction``.
    """
    # Rows for the requested player (not used yet).
    player_data = data[data['player_name'] == player_name]

    # Candidate input columns per prediction type (not used yet).
    # NOTE(review): there is no entry for 'pra' — decide which columns it needs.
    relevant_features = {
        "points": ["fg3_pct", "fg_pct",'ft_pct', "pts_avg_last_5", "on_hotstreak_pts"],
        "rebounds": ["oreb", "dreb", "reb_avg_last_5", "on_hotstreak_reb"],
        "assists": ["ast", "ast_avg_last_5", "on_hotstreak_asts"]
    }

    if type_of_prediction == 'points':
        raise NotImplementedError("'points' prediction is not implemented yet")
    elif type_of_prediction == 'rebounds':
        raise NotImplementedError("'rebounds' prediction is not implemented yet")
    elif type_of_prediction == 'assists':
        raise NotImplementedError("'assists' prediction is not implemented yet")
    elif type_of_prediction == 'pra':
        raise NotImplementedError("'pra' prediction is not implemented yet")
    else:
        raise ValueError(
            f"Unknown type_of_prediction {type_of_prediction!r}; "
            "expected 'points', 'rebounds', 'assists' or 'pra'"
        )
