In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import glob
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
DATA_DIR = "./data"

input_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/input_2023_w*.csv")))
output_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/output_2023_w*.csv")))

df_in = pd.concat((pd.read_csv(p) for p in tqdm(input_files, desc="loading inputs")), ignore_index=True)
df_out = pd.concat((pd.read_csv(p) for p in tqdm(output_files, desc="loading outputs")), ignore_index=True)

test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print("Inputs:", df_in.shape, "Outputs:", df_out.shape, "Test input:", test_in.shape)

loading inputs:   0%|          | 0/18 [00:00<?, ?it/s]

loading outputs:   0%|          | 0/18 [00:00<?, ?it/s]

Inputs: (4880579, 23) Outputs: (562936, 6) Test input: (49753, 23)


In [43]:
df_in.columns

Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction'],
      dtype='object')

In [73]:
def feature_engineer(df_prethrow, alpha = 0.4, gcol: list = []):
    '''Function taking in the input features / prethrow frames and computing relevant features up to the final prethrow frame'''
    df = df_prethrow.copy()
    df = df[df['player_to_predict']]
    df['rolling_s_mean'] = (df.groupby(gcol)['s'].transform(lambda s: s.ewm(alpha=alpha, min_periods = 8).mean()))
    df['rolling_a_mean'] = (df.groupby(gcol)['a'].transform(lambda s: s.ewm(alpha=alpha, min_periods = 8).mean()))
    df['dx'] = np.abs(df['ball_land_x'] - df['x'])
    df['dy'] = np.abs(df['ball_land_y'] - df['y'])
    for i in range(1,9):
        df[f"dx_lag_{i}"] = df.groupby(gcol)['dx'].shift(i)
        df[f"dy_lag_{i}"] = df.groupby(gcol)['dy'].shift(i)

    position_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    encoded_data = position_encoder.fit_transform(df[['player_position']])
    
    df_tot = pd.concat([df, encoded_data], axis=1)
    df_snap = df_tot[df_tot['final_frame_prediction'] == 1]

    df_snap.rename(columns={'x': 'prethrow_x', 'y': 'prethrow_y'})

    return df_snap

def merge_features(df_prethrow, df_postthrow ,gcol = []):
    '''merge snapshots with df_out for training'''
    df = pd.merge(df_postthrow, df_prethrow, on = gcol, how='left')
    features = [
        's', 'a',
       'dir', 'o', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction', 'rolling_s_mean', 'rolling_a_mean', 'dx',
       'dy', 'dx_lag_1', 'dy_lag_1', 'dx_lag_2', 'dy_lag_2', 'dx_lag_3',
       'dy_lag_3', 'dx_lag_4', 'dy_lag_4', 'dx_lag_5', 'dy_lag_5', 'dx_lag_6',
       'dy_lag_6', 'dx_lag_7', 'dy_lag_7', 'dx_lag_8', 'dy_lag_8','player_position_SS'
    #    'player_position_CB', 'player_position_DE', 'player_position_DT',
    #    'player_position_FB', 'player_position_FS', 'player_position_ILB',
    #    'player_position_LB', 'player_position_MLB', 'player_position_NT',
    #    'player_position_OLB', 'player_position_QB', 'player_position_RB',
    #    'player_position_S', 'player_position_SS', 'player_position_T',
    #    'player_position_TE', 'player_position_WR'
    ]
    cols_to_keep = list(df_postthrow.columns) + features

    return df.columns


In [74]:
def main(df_prethrow, df_postthrow ):
    df_prethrow = df_prethrow.copy()  # avoid SettingWithCopyWarning
    df_postthrow = df_postthrow.copy()

    gcol = ['game_id','play_id','nfl_id']

    final_frame = df_prethrow.groupby(gcol).tail(1)
    
    player_predict_final = final_frame[final_frame['player_to_predict']]
    player_predict_final = player_predict_final[gcol + ['frame_id']]

    is_final_frame = pd.MultiIndex.from_frame(df_prethrow[['game_id','play_id','nfl_id','frame_id']]).isin(
        pd.MultiIndex.from_frame(player_predict_final)
    )

    df_prethrow['final_frame_prediction'] = np.where(is_final_frame, 1, 0)

    df_altered = feature_engineer(df_prethrow, alpha = 0.4, gcol=gcol)

    df_model = merge_features(df_prethrow = df_altered ,df_postthrow = df_postthrow,gcol = gcol)

    return df_model
    


In [75]:
x = df_in.loc[(df_in['game_id'] ==2023090700) & (df_in['play_id'] == 101) & (df_in['nfl_id'] == 46137), :]
y = df_out.loc[(df_out['game_id'] ==2023090700) & (df_out['play_id'] == 101) & (df_out['nfl_id'] == 46137),:]

In [76]:
z = main(df_prethrow = x, df_postthrow = y)

In [77]:
z

Index(['game_id', 'play_id', 'nfl_id', 'frame_id_x', 'x_x', 'y_x',
       'player_to_predict', 'frame_id_y', 'play_direction',
       'absolute_yardline_number', 'player_name', 'player_height',
       'player_weight', 'player_birth_date', 'player_position', 'player_side',
       'player_role', 'x_y', 'y_y', 's', 'a', 'dir', 'o', 'num_frames_output',
       'ball_land_x', 'ball_land_y', 'final_frame_prediction',
       'rolling_s_mean', 'rolling_a_mean', 'dx', 'dy', 'dx_lag_1', 'dy_lag_1',
       'dx_lag_2', 'dy_lag_2', 'dx_lag_3', 'dy_lag_3', 'dx_lag_4', 'dy_lag_4',
       'dx_lag_5', 'dy_lag_5', 'dx_lag_6', 'dy_lag_6', 'dx_lag_7', 'dy_lag_7',
       'dx_lag_8', 'dy_lag_8', 'player_position_SS'],
      dtype='object')

In [67]:
df_out

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
0,2023090700,101,46137,1,56.22,17.28
1,2023090700,101,46137,2,56.63,16.88
2,2023090700,101,46137,3,57.06,16.46
3,2023090700,101,46137,4,57.48,16.02
4,2023090700,101,46137,5,57.91,15.56
...,...,...,...,...,...,...
562931,2024010713,4018,52457,14,30.99,10.51
562932,2024010713,4018,52457,15,30.78,9.73
562933,2024010713,4018,52457,16,30.63,8.93
562934,2024010713,4018,52457,17,30.52,8.12


In [61]:
z

Index(['game_id', 'play_id', 'nfl_id', 'frame_id_x', 'x_x', 'y_x',
       'player_to_predict', 'frame_id_y', 'play_direction',
       'absolute_yardline_number', 'player_name', 'player_height',
       'player_weight', 'player_birth_date', 'player_position', 'player_side',
       'player_role', 'x_y', 'y_y', 's', 'a', 'dir', 'o', 'num_frames_output',
       'ball_land_x', 'ball_land_y', 'final_frame_prediction',
       'rolling_s_mean', 'rolling_a_mean', 'dx', 'dy', 'dx_lag_1', 'dy_lag_1',
       'dx_lag_2', 'dy_lag_2', 'dx_lag_3', 'dy_lag_3', 'dx_lag_4', 'dy_lag_4',
       'dx_lag_5', 'dy_lag_5', 'dx_lag_6', 'dy_lag_6', 'dx_lag_7', 'dy_lag_7',
       'dx_lag_8', 'dy_lag_8', 'player_position_SS'],
      dtype='object')

In [70]:
data = pd.DataFrame({'color': ['red', 'blue', 'green', 'red']})
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
encoded_data = encoder.fit_transform(data[['color']])
print(encoded_data)

   color_blue  color_green  color_red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         0.0          0.0        1.0


In [None]:
#if in a comfortable spot in the project try to seperate model files from data cleaning/ transform to readibility and conciseness of the file 