In [1]:
from xmlrpc.client import DateTime

import pandas as pd
import numpy as np
import glob


input_files = glob.glob('data/train/input*.csv')
output_files = glob.glob('data/train/output*.csv')

input_df = pd.concat((pd.read_csv(f) for f in input_files), ignore_index=True)
output_df = pd.concat((pd.read_csv(f) for f in output_files), ignore_index=True)

pd.set_option('display.max_columns', None)



In [2]:
## Redefining columns

one_hot_columsns = ["play_direction", "player_position", "player_side", "player_role"]
input_df = pd.get_dummies(input_df, columns=one_hot_columsns)
input_df = input_df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
output_df = output_df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])

In [3]:
## Transforming input data



## Age

year = 2025
input_df["player_birth_date"] = pd.to_datetime(input_df["player_birth_date"])
input_df["age"] = 2025 - input_df["player_birth_date"].dt.year

## Height to meters

def foot_to_meters(x:str):
    x = x.replace("-",".")
    meters = float(x) * 0.3048
    return meters

input_df["player_height"] = input_df["player_height"].apply(foot_to_meters)
input_df["player_height"] = pd.to_numeric(input_df["player_height"])


## Scaling Inputs

from sklearn.preprocessing import StandardScaler

scaled_columns= ["absolute_yardline_number", "player_height", "player_weight", "age", "s", "a", "dir", "o", "ball_land_x", "ball_land_y"]
scaler = StandardScaler()
input_df[scaled_columns] = scaler.fit_transform(input_df[scaled_columns])

## Defining feature columns

feature_columns = []
for c in input_df.columns:
    for columns in one_hot_columsns:
        if c.startswith(columns) and c not in one_hot_columsns:
            feature_columns.append(c)
feature_columns.append("frame_id")
feature_columns.append("absolute_yardline_number")
feature_columns.append("player_height")
feature_columns.append("player_weight")
feature_columns.append("age")
feature_columns.append("x")
feature_columns.append("y")
feature_columns.append("s")
feature_columns.append("a")
feature_columns.append("dir")
feature_columns.append("o")
feature_columns.append("ball_land_x")
feature_columns.append("ball_land_y")

label_columns = ["x", "y"]

In [4]:
input_df.sort_values(["frame_id", "player_to_predict", "nfl_id"], ascending=[True, False, True], inplace=True)
output_df.sort_values(["frame_id", "nfl_id"], ascending=[True, True], inplace=True)

In [5]:
sequence_groups = ["game_id", "play_id"]
groups_input = input_df.groupby(sequence_groups, sort=False)
groups_output = output_df.groupby(sequence_groups, sort=False)

In [6]:
## Extracting sequences in shape of [total_sequences, len_sequences, vector_length]


feature_dim = len(feature_columns)
out_dim = len(label_columns)

input_sequences = []
output_sequences = []
player_to_predict = []

i = 0
for group, frame in groups_input:
    i += 1
    if i % 1000 == 0:
        print(i)
    input_frames = []

    for _, frame_frame in frame.groupby(["frame_id"]):
        framed = frame_frame[feature_columns].to_numpy(dtype=np.float32)
        input_frames.append(framed)
    input_sequences.append(input_frames)
    output = groups_output.get_group(group)
    out_frames = []
    player_to_predict.append(len(output["nfl_id"].unique()))
    for _, frame_frame in output.groupby(["frame_id"]):
        framed = frame_frame[label_columns].to_numpy(dtype=np.float32)
        out_frames.append(framed)
    output_sequences.append(out_frames)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


In [7]:
sample = input_sequences[100][-1]
pd.DataFrame(sample)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,0.274693,-0.373275,2.272104,94.839996,39.240002,0.269362,-0.846405,-1.536171,0.22532,1.344229,0.459404
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,23.0,1.27712,0.052359,-0.824183,1.601385,92.330002,34.740002,0.260385,2.904141,-0.970621,-0.510632,1.344229,0.459404
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,0.274693,0.212906,0.595307,97.279999,30.530001,-0.013411,0.650988,-1.741004,1.507445,1.344229,0.459404
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,0.274693,0.708905,0.930666,95.269997,18.190001,-1.216317,-0.01295,-0.369625,0.900971,1.344229,0.459404
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,-1.926415,-0.959455,1.266026,106.220001,16.120001,0.269362,-0.754584,-0.813049,0.323066,1.344229,0.459404
5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,0.274693,-0.192911,0.259948,104.370003,30.99,-0.516118,-0.860531,-1.422286,-0.800605,1.344229,0.459404
6,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,23.0,1.27712,0.052359,-1.455454,0.595307,100.080002,41.130001,0.484808,-1.157185,-1.280502,-0.08659,1.344229,0.459404
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,23.0,1.27712,-0.614644,-0.373275,-0.075412,96.269997,6.07,0.879792,0.679241,-0.470999,-1.031094,1.344229,0.459404
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,23.0,1.27712,0.941695,0.483451,-0.74613,98.489998,33.389999,1.983951,-0.705142,-0.876594,-0.936919,1.344229,0.459404
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,23.0,1.27712,0.719361,0.303088,-1.416849,83.32,23.73,-1.059221,-0.32373,1.152475,-0.992526,1.344229,0.459404


In [8]:
output_sequences[100][0]

array([[95.  , 39.56],
       [92.68, 34.74],
       [97.28, 30.82]], dtype=float32)

In [9]:

player_to_predict[0]

3

In [10]:

from torch.utils.data import DataLoader

from torch.utils.data import Dataset

class TrainingDataset(Dataset):
    def __init__(self, input_sequences, output_sequences, num_frames):
        self.input_sequences = input_sequences
        self.output_sequences = output_sequences
        self.num_frames = num_frames

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return self.input_sequences[idx], self.output_sequences[idx], self.num_frames[idx]

In [11]:
output_sequences_max = max(len(sequence) for sequence in output_sequences)
max_len_players = max(max(len(players) for players in sequence) for sequence in output_sequences)

In [31]:
N_DIM = 40
import torch
def collate_fn(batch):
    i,o, n = zip(*batch)
    max_i = max(len(s) for s in i)
    max_len_players = max(max(len(players) for players in sequence) for sequence in i)

    sequences_in = []
    lengths_in = []
    lengths_o = []
    key_masks = []
    sequences_out = []
    for k, sequence in enumerate(i):
        T = len(sequence)
        N = len(sequence[0])
        pad = max_i - T
        pad_players = max_len_players - N
        key_mask = np.ones(shape=(max_i, max_len_players), dtype=np.bool)
        if pad > 0:
            sequence = np.concatenate([sequence, np.zeros((pad, N, N_DIM))], axis=0)
            key_mask[:,N:max_len_players] = True
        if pad_players > 0:
            sequence = np.concatenate([sequence, np.zeros((max_i, pad_players,N_DIM))], axis=1)
        sequences_in.append(sequence)
        lengths_in.append(T)
        key_masks.append(key_mask)

    for k, sequence in enumerate(o):
        T = len(sequence)
        N = len(sequence[0])
        pad = output_sequences_max - T
        pad_players = max_len_players - N
        if pad > 0:
            sequence = np.concatenate([sequence, np.zeros((pad,N,2))], axis=0)
        if pad_players > 0:
            sequence = np.concatenate([sequence, np.zeros((output_sequences_max, pad_players, 2))], axis=1)
        sequences_out.append(sequence)
        lengths_o.append(T)

    sequences_in = torch.tensor(np.array(sequences_in, dtype=np.float32))
    lengths_in = torch.tensor(np.array(lengths_in, dtype=np.float32))
    key_masks = torch.tensor(np.array(key_masks, dtype=np.long))
    lengths_o = torch.tensor(np.array(lengths_o, dtype=np.long))
    player_to_predict = torch.tensor(np.array(n), dtype=torch.long)
    sequences_out = torch.tensor(np.array(sequences_out), dtype=torch.float32)
    return sequences_in, sequences_out, lengths_in, key_masks, lengths_o, player_to_predict

In [32]:
dataset = TrainingDataset(input_sequences, output_sequences, player_to_predict)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [33]:
batch = next(iter(dataloader))

In [34]:
len(input_sequences[0][0][0])

40

In [35]:
batch[0][0][-1]

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0000e+00,  3.3000e+01,  1.3205e+00,  1.3864e+00,
          2.5125e+00,  4.2843e+00,  9.4180e+01,  1.2500e+01,  1.7461e+00,
         -8.1109e-01, -1.3222e-01,  5.7325e-01,  1.3612e+00, -1.3154e+00],
        [ 0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3

In [36]:
seqs_in,seqs_out, lens, key_mask, lens_o, no_out_frames = batch

In [37]:
print(seqs_in.shape)

torch.Size([1, 33, 11, 40])


In [38]:
    print(seqs_out.shape)

torch.Size([1, 94, 11, 2])
