In [1]:
from xmlrpc.client import DateTime

import pandas as pd
import numpy as np
import glob

from fontTools.misc.bezierTools import namedtuple

input_files = glob.glob('data/train/input*.csv')
output_files = glob.glob('data/train/output*.csv')

input_df = pd.concat((pd.read_csv(f) for f in input_files), ignore_index=True)
output_df = pd.concat((pd.read_csv(f) for f in output_files), ignore_index=True)

pd.set_option('display.max_columns', None)

In [2]:
## Redefining columns

one_hot_columsns = ["play_direction", "player_position", "player_side", "player_role"]
input_df = pd.get_dummies(input_df, columns=one_hot_columsns)
input_df = input_df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
output_df = output_df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])

In [3]:
## Transforming input data



## Age

year = 2025
input_df["player_birth_date"] = pd.to_datetime(input_df["player_birth_date"])
input_df["age"] = 2025 - input_df["player_birth_date"].dt.year

## Height to meters

def foot_to_meters(x:str):
    x = x.replace("-",".")
    meters = float(x) * 0.3048
    return meters

input_df["player_height"] = input_df["player_height"].apply(foot_to_meters)
input_df["player_height"] = pd.to_numeric(input_df["player_height"])

In [4]:
## Scaling Inputs

from sklearn.preprocessing import StandardScaler

scaled_columns= ["absolute_yardline_number", "player_height", "player_weight", "age", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]
scaler = StandardScaler()
input_df[scaled_columns] = scaler.fit_transform(input_df[scaled_columns])

In [5]:
input_df.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,play_direction_left,play_direction_right,player_position_CB,player_position_DE,player_position_DT,player_position_FB,player_position_FS,player_position_ILB,player_position_K,player_position_LB,player_position_MLB,player_position_NT,player_position_OLB,player_position_P,player_position_QB,player_position_RB,player_position_S,player_position_SS,player_position_T,player_position_TE,player_position_WR,player_side_Defense,player_side_Offense,player_role_Defensive Coverage,player_role_Other Route Runner,player_role_Passer,player_role_Targeted Receiver,age
182,2023090700,101,False,43290,1,-0.804465,Jared Goff,0.941695,0.528542,1994-10-14,37.36,30.07,-1.355459,-1.496217,-1.142589,-0.872946,21,0.108481,-1.739696,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,0.930666
183,2023090700,101,False,43290,2,-0.804465,Jared Goff,0.941695,0.528542,1994-10-14,37.36,30.07,-1.355459,-1.496217,-1.157582,-0.872946,21,0.108481,-1.739696,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,0.930666
184,2023090700,101,False,43290,3,-0.804465,Jared Goff,0.941695,0.528542,1994-10-14,37.35,30.07,-1.355459,-1.496217,-1.257665,-0.872946,21,0.108481,-1.739696,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,0.930666
185,2023090700,101,False,43290,4,-0.804465,Jared Goff,0.941695,0.528542,1994-10-14,37.34,30.07,-1.355459,-1.496217,1.293664,-0.872946,21,0.108481,-1.739696,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,0.930666
186,2023090700,101,False,43290,5,-0.804465,Jared Goff,0.941695,0.528542,1994-10-14,37.33,30.07,-1.328528,-0.528562,0.906635,-0.864579,21,0.108481,-1.739696,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,0.930666


In [6]:
## Defining feature columns

feature_columns = []
for c in input_df.columns:
    for columns in one_hot_columsns:
        if c.startswith(columns) and c not in one_hot_columsns:
            feature_columns.append(c)
feature_columns.append("absolute_yardline_number")
feature_columns.append("player_height")
feature_columns.append("player_weight")
feature_columns.append("age")
feature_columns.append("x")
feature_columns.append("y")
feature_columns.append("s")
feature_columns.append("a")
feature_columns.append("dir")
feature_columns.append("o")
feature_columns.append("ball_land_x")
feature_columns.append("ball_land_y")

label_columns = ["x", "y"]

In [59]:
sequence_groups = ["game_id", "play_id"]
groups_input = input_df.groupby(sequence_groups)
groups_output = output_df.groupby(sequence_groups)
max_input_sequence = groups_input.size().max()
max_output_sequence = groups_output.size().max()

In [107]:
## Extracting sequences in shape of [total_sequences, len_sequences, vector_length]


feature_dim = len(feature_columns)
out_dim = len(label_columns)

input_sequences = []
queries = []
output_sequences = []

i = 0
for (game_id, play_id), dataframe in input_df.groupby(sequence_groups):
    i += 1
    if i > 100:
        break
    frames = []
    player_to_predict = []
    for frame_id, frame_frame in dataframe.groupby(["frame_id"]):
        players = frame_frame[feature_columns].to_numpy(dtype=np.float32)
        frames.append(players)
        player_idx = frame_frame.reset_index(drop=True).index[frame_frame["player_to_predict"]].unique()

        player_to_predict.append(player_idx)

    player_to_predict = np.array(player_to_predict)
    assert np.all(player_to_predict[0]==player_to_predict)
    for idx in player_to_predict[0]:
        input_sequences.append(frames)
        queries.append(idx)
    out_group = groups_output.get_group((game_id, play_id))
    for nfl_id, frame in out_group.groupby("nfl_id"):

        out_seq = frame[label_columns].to_numpy(dtype=np.float32)
        output_sequences.append(out_seq)



In [150]:
from torch.utils.data import DataLoader, Dataset
from collections import namedtuple
from typing import NamedTuple

class DataSetEntry(NamedTuple):
    input_sequence: any
    query_ids: any
    output_sequences: any

class SequenceDataset(Dataset):
    def __init__(self, input_sequences, idxs, output_sequences):
        self.sequences = input_sequences
        self.idxs = idxs
        self.labels = output_sequences

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, idx):
        return DataSetEntry(self.sequences[idx], self.idxs[idx], self.labels[idx])

In [149]:
def collate_fn(batch):
    max_len_seq = 0
    for entry in batch:
        inputs = entry.input_sequence
        outputs = entry.output_sequences
        idxs = entry.idxs
        max_len_seq = max(len(inputs), max_len_seq)
    for seq in inputs:
        while len(seq) < max_len_seq:
            seq.append(np.zeros_like(seq[0]))
    idxs = np.array(idxs)

    return inputs, idxs, outputs

IndentationError: expected an indented block after 'for' statement on line 5 (499428377.py, line 7)

In [135]:
dataset = SequenceDataset(input_sequences, queries, output_sequences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [145]:
for batch in dataloader:
    print(len(batch[0][0]), len(batch[0][0][0]), len(batch[0][0][0][0]))
    print(len(batch[0][1]), len(batch[0][1][0]), len(batch[0][1][0][0]))
    break


67 12 39
67 10 39
