In [145]:
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import glob
import seaborn as sns
from sklearn.model_selection import GroupKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# Root folder holding the competition CSVs (weekly training files plus the test files).
DATA_DIR = "./data"


def _read_weekly(paths, desc):
    """Read every CSV in ``paths`` behind a progress bar and stack them row-wise."""
    frames = [pd.read_csv(path) for path in tqdm(paths, desc=desc)]
    return pd.concat(frames, ignore_index=True)


def _data_path(relative):
    """Build a path below ``DATA_DIR``."""
    return os.path.join(DATA_DIR, relative)


# Sorted so the weekly files are always concatenated in the same order.
input_files = sorted(glob.glob(_data_path("train/input_2023_w*.csv")))
output_files = sorted(glob.glob(_data_path("train/output_2023_w*.csv")))

df_in = _read_weekly(input_files, "loading inputs")
df_out = _read_weekly(output_files, "loading outputs")

# Single-file tables shipped next to the training data.
test_in = pd.read_csv(_data_path("test_input.csv"))
test_template = pd.read_csv(_data_path("test.csv"))
sample_submission = pd.read_csv(_data_path("sample_submission.csv"))

print("Inputs:", df_in.shape, "Outputs:", df_out.shape, "Test input:", test_in.shape)

loading inputs:   0%|          | 0/18 [00:00<?, ?it/s]

loading outputs:   0%|          | 0/18 [00:00<?, ?it/s]

Inputs: (4880579, 23) Outputs: (562936, 6) Test input: (49753, 23)


In [3]:
# Time step between consecutive tracking frames; per-frame differences are divided by it
# to obtain acceleration and jerk estimates.
# NOTE(review): presumably seconds (10 Hz tracking) — confirm against the data dictionary.
frame_len = 0.1

In [141]:
def feature_engineer(df_prethrow, gcol: list = []):
    '''Function taking in the input features / prethrow frames and computing relevant features up to the final prethrow frame'''
    df = df_prethrow.copy()
    df = df[df['player_to_predict']]

    df['dx_land'] = np.abs(df['ball_land_x'] - df['x'])
    df['dy_land'] = np.abs(df['ball_land_y'] - df['y'])
    df['dist_to_ball'] = np.sqrt((df['ball_land_x'] - df['x'])**2 + (df['ball_land_y'] - df['y'])**2)

    df['velo_x'] = df['s']*np.cos(df['dir'])
    df['velo_y'] = df['s']*np.sin(df['dir'])

    
    df["acc_x"] = df.groupby(gcol)['velo_x'].diff() / frame_len
    df["acc_y"] = df.groupby(gcol)['velo_y'].diff() / frame_len

   # a_T is the derivative of the scalar speed 's' (Tangential Acceleration)
    # This is the rate of change of speed (along the path).
    df["accel_tangential"] = df.groupby(gcol)['s'].diff() / frame_len

    # a_N is calculated using the given total magnitude 'a' and a_T
    # The clip(lower=0) handles floating-point errors.
    df["accel_normal"] = np.sqrt(
        (df["a"]**2 - df["accel_tangential"]**2).clip(lower=0)
    )

    # 2b. Calculate Instantaneous Jerk (1-frame lag)
    df["jerk_x"] = df.groupby(gcol)['acc_x'].diff() / frame_len
    df["jerk_y"] = df.groupby(gcol)['acc_y'].diff() / frame_len
    df["jerk"] = np.sqrt(df["jerk_x"]**2 + df["jerk_y"]**2) # Instantaneous Jerk Magnitude


    # variable_to_flatten = 'jerk'
    # num_lags = 5

    # for i in range(1, num_lags + 1):
    #     # This creates a new column for each time step in the past
    #     # Example: 'jerk_lag_1' holds the value of 'jerk' from the previous frame.
    #     #          'jerk_lag_5' holds the value of 'jerk' from 5 frames ago.
    #     df[f"{variable_to_flatten}_lag_{i}"] = df.groupby(gcol)[variable_to_flatten].shift(i)

    df['player_position'] = df['player_position'].astype('category')
    df['play_direction'] = df['play_direction'].astype('category')
    df['player_side'] = df['player_side'].astype('category')    

    df_copy = df.copy()
    df_copy = df_copy[df_copy['final_frame_prediction'] == 1]
    df_copy.drop(columns='frame_id',inplace=True)

    rename_map = {
        "x": "x_last",
        "y": "y_last",
        "s": "s_last",
        "a": "a_last",
        "dir": "dir_last",
        "o": "o_last",
    }

    df_copy.rename(columns=rename_map,inplace=True)
    
    return df_copy

def merge_tables(df_prethrow, df_postthrow, gcol, train=True):
    """Attach the pre-throw snapshot to every post-throw frame.

    Parameters
    ----------
    df_prethrow : pd.DataFrame
        One snapshot row per ``gcol`` key, containing ``x_last`` and ``y_last``.
    df_postthrow : pd.DataFrame
        Frames to predict, with ``game_id``, ``play_id``, ``nfl_id`` and ``frame_id``;
        for training it also carries the observed ``x`` and ``y``.
    gcol : list
        Join keys shared by both tables.
    train : bool, default True
        When true, add ``play_key`` ("<game_id>_<play_id>"). When false, add ``id``
        ("<game_id>_<play_id>_<nfl_id>_<frame_id>").

    Returns
    -------
    pd.DataFrame
        Left join of ``df_postthrow`` with ``df_prethrow``. ``dx``/``dy`` (displacement
        from the snapshot position) are added whenever ``x``/``y`` are available; in
        training mode they are always computed, so missing targets raise ``KeyError``.
    """
    df = pd.merge(df_postthrow, df_prethrow, on=gcol, how='left')

    # At inference time the post-throw table may not carry the true positions, so the
    # displacement targets are only derived there when both columns are present.
    has_targets = {'x', 'y'}.issubset(df.columns)
    if train or has_targets:
        df['dx'] = df['x'] - df['x_last']
        df['dy'] = df['y'] - df['y_last']

    game_play = df["game_id"].astype(str) + "_" + df["play_id"].astype(str)
    if train:
        df["play_key"] = game_play
    else:
        df["id"] = game_play + "_" + df["nfl_id"].astype(str) + "_" + df["frame_id"].astype(str)

    return df






In [142]:
def feature_process(df_prethrow, df_postthrow, train=True):
    """Turn raw pre-/post-throw tables into the modelling frame.

    Flags the last pre-throw frame of every player to predict, derives the snapshot
    features from it via ``feature_engineer`` and joins them onto the post-throw
    frames via ``merge_tables``.

    Parameters
    ----------
    df_prethrow : pd.DataFrame
        Pre-throw tracking frames; needs ``game_id``, ``play_id``, ``nfl_id``,
        ``frame_id`` and a boolean ``player_to_predict`` column. Not modified.
    df_postthrow : pd.DataFrame
        Post-throw frames to merge the snapshot onto. Not modified.
    train : bool, default True
        Forwarded to ``merge_tables``.

    Returns
    -------
    pd.DataFrame
        The merged frame produced by ``merge_tables``.
    """
    gcol = ['game_id', 'play_id', 'nfl_id']

    # The final pre-throw frame is the one with the largest frame_id in its group;
    # comparing against the group maximum does not depend on the row order.
    last_frame_id = df_prethrow.groupby(gcol)['frame_id'].transform('max')
    is_final_frame = df_prethrow['frame_id'].eq(last_frame_id) & df_prethrow['player_to_predict']

    # assign() returns a new frame, so the caller's table stays untouched.
    df_flagged = df_prethrow.assign(final_frame_prediction=is_final_frame.astype(int))

    df_altered = feature_engineer(df_flagged, gcol=gcol)

    return merge_tables(df_prethrow=df_altered, df_postthrow=df_postthrow, gcol=gcol, train=train)
    


In [102]:
# Training frame: post-throw targets joined with the pre-throw snapshot features.
# `train=True` is passed explicitly so the frame gets the `play_key` grouping column.
train_df = feature_process(df_prethrow=df_in, df_postthrow=df_out, train=True)

In [115]:
# Numeric model inputs. The list order is the column order handed to the models.
numeric_features = [
    # frame to predict
    "frame_id",
    # player state at the snapshot frame
    "x_last", "y_last", "s_last", "a_last", "dir_last", "o_last",
    # ball landing spot and the player's offset from it
    "ball_land_x", "ball_land_y",
    "dx_land", "dy_land", "dist_to_ball",
    # derived kinematics
    "velo_x", "velo_y",
    "acc_x", "acc_y",
    "accel_tangential", "accel_normal",
    "jerk_x", "jerk_y", "jerk",
]

# Columns cast to pandas `category` dtype during feature engineering.
cat_features = ["play_direction", "player_position", "player_side"]

In [116]:
# Design matrix: numeric columns first, categoricals last.
X = train_df.loc[:, numeric_features + cat_features]

# Regression targets: displacement from the snapshot position along each axis.
y_dx, y_dy = train_df['dx'], train_df['dy']

# One group label per play, used to keep a play's rows inside a single CV fold.
groups = train_df["play_key"]

In [None]:
def model_train(args):
    """Cross-validate one XGBoost regressor per displacement axis.

    Runs 5-fold ``GroupKFold`` over the module-level ``X``, ``y_dx``, ``y_dy`` and
    ``groups``. In every fold a fresh ``XGBRegressor(**args)`` is fitted for ``dy`` and
    another for ``dx``; their validation RMSEs are printed and recorded. The fitted
    models are local to each fold and are discarded.

    Parameters
    ----------
    args : dict
        Keyword arguments forwarded to ``XGBRegressor``.

    Returns
    -------
    np.ndarray
        Array of shape ``(2, n_splits)``: row 0 holds the ``dy`` RMSE per fold,
        row 1 the ``dx`` RMSE per fold.
    """
    n_splits = 5
    gkf = GroupKFold(n_splits=n_splits)
    fold_scores = np.zeros((2, n_splits))

    # (row in fold_scores, label used in the log line, target series)
    targets = ((0, "y", y_dy), (1, "x", y_dx))

    for fold_idx, (train_index, val_index) in enumerate(gkf.split(X, groups=groups)):
        print(f"--- Fold {fold_idx + 1} ---")

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]

        for row, axis, target in targets:
            # GroupKFold yields positional indices, so select with .iloc; plain
            # `target[idx]` is label-based and only works on a default RangeIndex.
            y_train, y_val = target.iloc[train_index], target.iloc[val_index]

            model = XGBRegressor(**args)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
            fold_scores[row, fold_idx] = rmse
            print(f"Validation RMSE_{axis}: {rmse:.4f}")

    return fold_scores

In [None]:
def parameter_tune():
    """Return the XGBoost parameter set to train with.

    No search is implemented yet: the function returns the single baseline
    configuration below (a new dict on every call).

    Returns
    -------
    dict
        Keyword arguments suitable for ``XGBRegressor(**params)``.
    """
    xgb_params = {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": 12,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0,
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": 42,
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "enable_categorical": True,
    }

    # TODO: search over candidate setups and return the best one instead of the baseline.
    return xgb_params

    

In [None]:
def predict(test: pl.DataFrame, test_input: pl.DataFrame) -> pl.DataFrame | pd.DataFrame:
    """Predict absolute (x, y) positions for every row of ``test``.

    Parameters
    ----------
    test : pl.DataFrame
        Rows to predict (one per player and post-throw frame).
    test_input : pl.DataFrame
        Pre-throw tracking frames for the same plays.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``x``, ``y`` with one row per row of ``test``.

    Notes
    -----
    Relies on the module-level names ``numeric_features``, ``cat_features``,
    ``model_x`` and ``model_y``; the two fitted models must exist in the kernel
    before this function is called.
    """
    predict_df = feature_process(
        test_input.to_pandas(),
        test.to_pandas(),
        train=False
    )

    # NOTE(review): the categorical columns are re-cast to `category` from the test data
    # alone, so their category sets may differ from the ones seen in training — confirm
    # the models handle that.
    features = predict_df[numeric_features + cat_features]

    dx_pred = model_x.predict(features)
    dy_pred = model_y.predict(features)

    # The models predict displacement from the last pre-throw position; add it back
    # to obtain absolute field coordinates.
    x_out = dx_pred + predict_df["x_last"].to_numpy()
    y_out = dy_pred + predict_df["y_last"].to_numpy()

    # predict_df is a pandas frame, so its columns have no `.to_pandas()` method.
    ids = predict_df["id"].to_numpy()
    predictions = pd.DataFrame({"id": ids, "x": x_out, "y": y_out})

    assert isinstance(predictions, (pd.DataFrame, pl.DataFrame))
    assert len(predictions) == len(test)

    return predictions


# When your notebook is run on the hidden test set, inference_server.serve must be called within 10 minutes of the notebook starting
# or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very
# first `predict` call, which does not have the usual 5 minute response deadline.
import kaggle_evaluation.nfl_inference_server

# Hand `predict` to the competition's inference server.
inference_server = kaggle_evaluation.nfl_inference_server.NFLInferenceServer(predict)

# Without the KAGGLE_IS_COMPETITION_RERUN environment variable, exercise `predict`
# through the local gateway on the given input folder; with it, start serving.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/nfl-big-data-bowl-2026-prediction/',))
else:
    inference_server.serve()

NameError: name 'kaggle_evaluation' is not defined

In [None]:
# Once the project is in a comfortable spot, try to separate the model code from the data cleaning/transform code, for readability and conciseness of the file.

# hyper param tune
# more features, spatial and lagged terms, better notion for frame_id?
# figure out submission, i.e. saving the model


