In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import glob
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [7]:
DATA_DIR = "./data"

input_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/input_2023_w*.csv")))
output_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/output_2023_w*.csv")))

df_in = pd.concat((pd.read_csv(p) for p in tqdm(input_files, desc="loading inputs")), ignore_index=True)
df_out = pd.concat((pd.read_csv(p) for p in tqdm(output_files, desc="loading outputs")), ignore_index=True)

test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print("Inputs:", df_in.shape, "Outputs:", df_out.shape, "Test input:", test_in.shape)

loading inputs:   0%|          | 0/18 [00:00<?, ?it/s]

loading outputs:   0%|          | 0/18 [00:00<?, ?it/s]

Inputs: (4880579, 23) Outputs: (562936, 6) Test input: (49753, 23)


In [9]:
frame_len = 0.1

In [None]:
def feature_engineer(df_prethrow, gcol: list = []):
    '''Function taking in the input features / prethrow frames and computing relevant features up to the final prethrow frame'''
    df = df_prethrow.copy()
    df = df[df['player_to_predict']]

    df['dx_land'] = np.abs(df['ball_land_x'] - df['x'])
    df['dy_land'] = np.abs(df['ball_land_y'] - df['y'])
    df['dist_to_ball'] = np.sqrt((df['ball_land_x'] - df['x'])**2 + (df['ball_land_y'] - df['y'])**2)

    df['velo_x'] = df['s']*np.cos(df['dir'])
    df['velo_y'] = df['s']*np.sin(df['dir'])

    
    df["acc_x"] = df.groupby(gcol)['velo_x'].diff() / frame_len
    df["acc_y"] = df.groupby(gcol)['velo_y'].diff() / frame_len

   # a_T is the derivative of the scalar speed 's' (Tangential Acceleration)
    # This is the rate of change of speed (along the path).
    df["accel_tangential"] = df.groupby(gcol)['s'].diff() / frame_len

    # a_N is calculated using the given total magnitude 'a' and a_T
    # The clip(lower=0) handles floating-point errors.
    df["accel_normal"] = np.sqrt(
        (df["a"]**2 - df["accel_tangential"]**2).clip(lower=0)
    )

    # 2b. Calculate Instantaneous Jerk (1-frame lag)
    df["jerk_x"] = df.groupby(gcol)['acc_x'].diff() / frame_len
    df["jerk_y"] = df.groupby(gcol)['acc_y'].diff() / frame_len
    df["jerk"] = np.sqrt(df["jerk_x"]**2 + df["jerk_y"]**2) # Instantaneous Jerk Magnitude


    # variable_to_flatten = 'jerk'
    # num_lags = 5

    # for i in range(1, num_lags + 1):
    #     # This creates a new column for each time step in the past
    #     # Example: 'jerk_lag_1' holds the value of 'jerk' from the previous frame.
    #     #          'jerk_lag_5' holds the value of 'jerk' from 5 frames ago.
    #     df[f"{variable_to_flatten}_lag_{i}"] = df.groupby(gcol)[variable_to_flatten].shift(i)

    position_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    encoded_position = position_encoder.fit_transform(df[['player_position']])
    
    direction_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    encoded_direction = direction_encoder.fit_transform(df[['play_direction']])

    side_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    encoded_side = side_encoder.fit_transform(df[['player_side']])

    df_tot = pd.concat([df, encoded_position, encoded_direction, encoded_side], axis=1)
    df_snap = df_tot[df_tot['final_frame_prediction'] == 1]
    df_snap.drop(columns='frame_id',inplace=True)

    rename_map = {
        "x": "x_last",
        "y": "y_last",
        "s": "s_last",
        "a": "a_last",
        "dir": "dir_last",
        "o": "o_last",
    }

    df_snap.rename(columns=rename_map,inplace=True)
    print(df_snap.columns)
    return df_snap

def merge_tables(df_prethrow, df_postthrow ,gcol):
    '''merge snapshots with df_out for training'''
    df = pd.merge(df_postthrow, df_prethrow, on = gcol, how='left')
    df['dx'] = df['x'] - df['x_last']
    df['dy'] = df['y'] - df['y_last']
    df["play_key"] = df["game_id"].astype(str) + "_" + df["play_id"].astype(str)
    return df



In [None]:
def feature_process(df_prethrow, df_postthrow ):
    df_prethrow = df_prethrow.copy()  # avoid SettingWithCopyWarning
    df_postthrow = df_postthrow.copy()

    gcol = ['game_id','play_id','nfl_id']

    final_frame = df_prethrow.groupby(gcol).tail(1)
    
    player_predict_final = final_frame[final_frame['player_to_predict']]
    player_predict_final = player_predict_final[gcol + ['frame_id']]

    is_final_frame = pd.MultiIndex.from_frame(df_prethrow[['game_id','play_id','nfl_id','frame_id']]).isin(
        pd.MultiIndex.from_frame(player_predict_final)
    )

    df_prethrow['final_frame_prediction'] = np.where(is_final_frame, 1, 0)

    df_altered = feature_engineer(df_prethrow, gcol=gcol)

    df_model = merge_tables(df_prethrow = df_altered ,df_postthrow = df_postthrow,gcol = gcol)

    return df_model
    


In [None]:
df_model = merge_tables(df_prethrow = df_in, df_postthrow = df_out)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_snap.drop(columns='frame_id',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_snap.rename(columns=rename_map,inplace=True)


Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'play_direction',
       'absolute_yardline_number', 'player_name', 'player_height',
       'player_weight', 'player_birth_date', 'player_position', 'player_side',
       'player_role', 'x_last', 'y_last', 's_last', 'a_last', 'dir_last',
       'o_last', 'num_frames_output', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction', 'dx_land', 'dy_land', 'dist_to_ball',
       'velo_x', 'velo_y', 'acc_x', 'acc_y', 'accel_tangential',
       'accel_normal', 'jerk_x', 'jerk_y', 'jerk', 'player_position_CB',
       'player_position_DE', 'player_position_DT', 'player_position_FB',
       'player_position_FS', 'player_position_ILB', 'player_position_LB',
       'player_position_MLB', 'player_position_NT', 'player_position_OLB',
       'player_position_QB', 'player_position_RB', 'player_position_S',
       'player_position_SS', 'player_position_T', 'player_position_TE',
       'player_position_WR', 'play_direction_left', 'play_d

In [None]:
numeric_features = ['x_last', 'y_last', 's_last', 'a_last', 'dir_last',
       'o_last', 'num_frames_output', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction', 'dx_land', 'dy_land', 'dist_to_ball',
       'velo_x', 'velo_y', 'acc_x', 'acc_y', 'accel_tangential',
       'accel_normal', 'jerk_x', 'jerk_y', 'jerk']

cat_features = ['player_position_CB',
       'player_position_DE', 'player_position_DT', 'player_position_FB',
       'player_position_FS', 'player_position_ILB', 'player_position_LB',
       'player_position_MLB', 'player_position_NT', 'player_position_OLB',
       'player_position_QB', 'player_position_RB', 'player_position_S',
       'player_position_SS', 'player_position_T', 'player_position_TE',
       'player_position_WR', 'play_direction_left', 'play_direction_right',
       'player_side_Defense', 'player_side_Offense']

In [94]:
X = df_model[numeric_features + cat_features]
y_dx = df_model['dx']
y_dy = df_model['dy']
groups = df_model["play_key"]

In [None]:
def model_train():
    n_splits = 5
    gkf = GroupKFold(n_splits=n_splits)


    model_y = LinearRegression()
    model_x = LinearRegression()
    fold_scores = np.zeros((2,n_splits))

    # Loop through the folds generated by GroupKFold
    # X is features, y is target, and groups is the multi-key column
    for fold_idx, (train_index, val_index) in enumerate(gkf.split(X, groups=groups)):
        print(f"--- Fold {fold_idx + 1} ---")

        # Split the data
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_dy_train, y_dy_val = y_dy.iloc[train_index], y_dy.iloc[val_index]
        y_dx_train, y_dx_val = y_dx.iloc[train_index], y_dx.iloc[val_index]
        # Optional: Verify group separation (should print unique groups in each set)
        # print(f"Train Groups: {groups.iloc[train_index].unique()}")
        # print(f"Validation Groups: {groups.iloc[val_index].unique()}")

        # Train the model
        model_y.fit(X_train, y_dy_train)

        # Predict and evaluate
        dy_y_pred = model_y.predict(X_val)
        rmse_y = np.sqrt(mean_squared_error(y_dy_val, dy_y_pred))
        fold_scores[0,fold_idx] = rmse_y

        # Train the model
        model_x.fit(X_train, y_dx_train)

        # Predict and evaluate
        dy_x_pred = model_x.predict(X_val)
        rmse_x = np.sqrt(mean_squared_error(y_dx_val, dy_x_pred))
        fold_scores[1,fold_idx] = rmse_x
        
        print(f"Validation RMSEY: {rmse_y:.4f}")
        print(f"Validation RMSEX: {rmse_x:.4f}")

--- Fold 1 ---
Validation RMSEY: 2.8670
Validation RMSEX: 3.1124
--- Fold 2 ---
Validation RMSEY: 2.9073
Validation RMSEX: 3.0708
--- Fold 3 ---
Validation RMSEY: 2.8046
Validation RMSEX: 3.1441
--- Fold 4 ---
Validation RMSEY: 2.7789
Validation RMSEX: 3.0387
--- Fold 5 ---
Validation RMSEY: 2.8640
Validation RMSEX: 3.0946


In [60]:
x = df_in.loc[(df_in['game_id'] ==2023090700) & (df_in['play_id'] == 101) & (df_in['nfl_id'] == 46137), :]
y = df_out.loc[(df_out['game_id'] ==2023090700) & (df_out['play_id'] == 101) & (df_out['nfl_id'] == 46137),:]

In [None]:
#if in a comfortable spot in the project try to seperate model files from data cleaning/ transform to readibility and conciseness of the file 