In [61]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
import os
import glob
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
DATA_DIR = "./data"

input_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/input_2023_w*.csv")))
output_files = sorted(glob.glob(os.path.join(DATA_DIR, "train/output_2023_w*.csv")))

df_in = pd.concat((pd.read_csv(p) for p in tqdm(input_files, desc="loading inputs")), ignore_index=True)
df_out = pd.concat((pd.read_csv(p) for p in tqdm(output_files, desc="loading outputs")), ignore_index=True)

test_in = pd.read_csv(os.path.join(DATA_DIR, "test_input.csv"))
test_template = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

print("Inputs:", df_in.shape, "Outputs:", df_out.shape, "Test input:", test_in.shape)

loading inputs:   0%|          | 0/18 [00:00<?, ?it/s]

loading outputs:   0%|          | 0/18 [00:00<?, ?it/s]

Inputs: (4880579, 23) Outputs: (562936, 6) Test input: (49753, 23)


In [4]:
gcol = ['game_id','play_id','nfl_id']

In [64]:
df_in.iloc[:,:15].head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x
0,2023090700,101,False,54527,1,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33
1,2023090700,101,False,54527,2,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33
2,2023090700,101,False,54527,3,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33
3,2023090700,101,False,54527,4,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.35
4,2023090700,101,False,54527,5,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.37


In [43]:
df_in.columns

Index(['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id',
       'play_direction', 'absolute_yardline_number', 'player_name',
       'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a',
       'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction'],
      dtype='object')

In [145]:
def feature_engineer(df_in, alpha = 0.4, gcol=gcol):
    '''Function taking in the input features / prethrow frames and computing relevant features up to the final prethrow frame'''
    df = df_in.copy()
    df = df[df['player_to_predict']]
    df['rolling_s_mean'] = (df.groupby(gcol)['s'].transform(lambda s: s.ewm(alpha=alpha, min_periods = 8).mean()))
    df['rolling_a_mean'] = (df.groupby(gcol)['a'].transform(lambda s: s.ewm(alpha=alpha, min_periods = 8).mean()))
    df['dx'] = np.abs(df['ball_land_x'] - df['x'])
    df['dy'] = np.abs(df['ball_land_y'] - df['y'])
    for i in range(1,9):
        df[f"dx_lag_{i}"] = df.groupby(gcol)['dx'].shift(i)
        df[f"dy_lag_{i}"] = df.groupby(gcol)['dy'].shift(i)

    position_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
    encoded_data = position_encoder.fit_transform(df[['player_position']])
    
    df_tot = pd.concat([df, encoded_data], axis=1)
    df_snap = df_tot[df_tot['final_frame_prediction'] == 1]

    df_snap.rename(columns={'x': 'prethrow_x', 'y': 'prethrow_y'})

    return df_tot[df_tot['final_frame_prediction'] == 1]

def merge_features(df_prethrow, df_afterthrow ,gcol = gcol):
    '''merge snapshots with df_out for training'''
    df = pd.merge(df_afterthrow, df_prethrow, on = gcol, how='left')
    features = [
        's', 'a',
       'dir', 'o', 'ball_land_x', 'ball_land_y',
       'final_frame_prediction', 'rolling_s_mean', 'rolling_a_mean', 'dx',
       'dy', 'dx_lag_1', 'dy_lag_1', 'dx_lag_2', 'dy_lag_2', 'dx_lag_3',
       'dy_lag_3', 'dx_lag_4', 'dy_lag_4', 'dx_lag_5', 'dy_lag_5', 'dx_lag_6',
       'dy_lag_6', 'dx_lag_7', 'dy_lag_7', 'dx_lag_8', 'dy_lag_8','player_position_SS'
    #    'player_position_CB', 'player_position_DE', 'player_position_DT',
    #    'player_position_FB', 'player_position_FS', 'player_position_ILB',
    #    'player_position_LB', 'player_position_MLB', 'player_position_NT',
    #    'player_position_OLB', 'player_position_QB', 'player_position_RB',
    #    'player_position_S', 'player_position_SS', 'player_position_T',
    #    'player_position_TE', 'player_position_WR'
    ]
    cols_to_keep = list(df_out.columns) + features

    return df.columns



In [149]:
def main(df_in,df_out):
    df_in = df_in.copy()  # avoid SettingWithCopyWarning
    df_out = df_out.copy()
    gcol = ['game_id','play_id','nfl_id']
    final_frame = df_in.groupby(gcol).tail(1)
    player_predict_final = final_frame[final_frame['player_to_predict']]
    player_predict_final = player_predict_final[gcol + ['frame_id']]

    is_final_frame = pd.MultiIndex.from_frame(df_in[['game_id','play_id','nfl_id','frame_id']]).isin(
        pd.MultiIndex.from_frame(player_predict_final)
    )

    df_in['final_frame_prediction'] = np.where(is_final_frame, 1, 0)


    df_altered = feature_engineer(df_in, alpha = 0.4, gcol=gcol)

    df_model = merge_features(df_prethrow = df_altered ,df_afterthrow = df_out,gcol = gcol)

    print(df_model)
    


In [150]:
x = df_in.loc[(df_in['game_id'] ==2023090700) & (df_in['play_id'] == 101) & (df_in['nfl_id'] == 46137)]
y = df_out.loc[(df_out['game_id'] ==2023090700) & (df_out['play_id'] == 101) & (df_out['nfl_id'] == 46137),:]
y

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
0,2023090700,101,46137,1,56.22,17.28
1,2023090700,101,46137,2,56.63,16.88
2,2023090700,101,46137,3,57.06,16.46
3,2023090700,101,46137,4,57.48,16.02
4,2023090700,101,46137,5,57.91,15.56
5,2023090700,101,46137,6,58.34,15.1
6,2023090700,101,46137,7,58.75,14.57
7,2023090700,101,46137,8,59.14,14.01
8,2023090700,101,46137,9,59.51,13.41
9,2023090700,101,46137,10,59.86,12.8


In [70]:
data = pd.DataFrame({'color': ['red', 'blue', 'green', 'red']})
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
encoded_data = encoder.fit_transform(data[['color']])
print(encoded_data)

   color_blue  color_green  color_red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         0.0          0.0        1.0


In [None]:
columns = ['Sex', 'Housing', 'Saving accounts', 'Checking account',
                                'Purpose']
# this is to generate a random dataframe with values
input_df = pd.DataFrame(np.random.randint(0,100,size=(100, len(columns))), columns=columns)

encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

output_df = encoder.fit_transform(data[columns])

KeyError: "None of [Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object')] are in the [columns]"