In [12]:
from xmlrpc.client import DateTime

import pandas as pd
import numpy as np
import glob

from fontTools.misc.bezierTools import namedtuple
from torch.cuda import device

# Locate every training shard; inputs and targets are stored in separate CSV files.
input_files = glob.glob('data/train/input*.csv')
output_files = glob.glob('data/train/output*.csv')

# Read each shard and stack the shards into one frame with a fresh 0..n-1 index.
input_df = pd.concat(map(pd.read_csv, input_files), ignore_index=True)
output_df = pd.concat(map(pd.read_csv, output_files), ignore_index=True)

# Never truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)



In [13]:
## Redefining columns

# Categorical columns that get replaced by one indicator column per category.
one_hot_columsns = ["play_direction", "player_position", "player_side", "player_role"]

# Encode the categoricals, then order both frames by game, play, player and frame.
input_df = (
    pd.get_dummies(input_df, columns=one_hot_columsns)
    .sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
)
output_df = output_df.sort_values(["game_id", "play_id", "nfl_id", "frame_id"])

In [14]:
## Transforming input data



## Age

# Reference year for the age feature. Age is the difference in calendar years
# only; the month and day of birth are ignored.
year = 2025
input_df["player_birth_date"] = pd.to_datetime(input_df["player_birth_date"])
# Use the `year` constant instead of repeating the literal, so changing the
# reference year in one place is enough.
input_df["age"] = year - input_df["player_birth_date"].dt.year

## Height to meters

def foot_to_meters(x: str):
    """Convert a height string to meters.

    Accepts the ``"feet-inches"`` notation (``"6-1"`` means 6 ft 1 in) as well
    as a plain number of feet (``"6"`` or ``"6.5"``).

    The dash separates feet from inches; it is not a decimal point. Treating
    ``"6-1"`` as 6.1 ft would under-report the height and would make ``"6-1"``
    and ``"6-10"`` indistinguishable, so the inches are converted explicitly
    (12 inches per foot).

    Args:
        x: Height as ``"<feet>-<inches>"`` or as a number of feet.

    Returns:
        The height in meters as a float.

    Raises:
        ValueError: If the feet or inches part is not a valid number.
    """
    feet, _, inches = str(x).strip().partition("-")
    # A missing inches part (no dash, or a trailing dash) counts as zero inches.
    total_feet = float(feet) + (float(inches) / 12.0 if inches else 0.0)
    return total_feet * 0.3048

# Replace the raw height strings with their value in meters, stored as a numeric column.
input_df["player_height"] = pd.to_numeric(
    input_df["player_height"].apply(foot_to_meters)
)

## Defining feature columns

# Indicator columns first, in the frame's column order: every column whose name
# starts with one of the one-hot prefixes (and is not itself one of the
# original categorical column names).
feature_columns = [
    column
    for column in input_df.columns
    for prefix in one_hot_columsns
    if column.startswith(prefix) and column not in one_hot_columsns
]

# Then the numeric per-row features.
feature_columns.extend([
    "absolute_yardline_number",
    "player_height",
    "player_weight",
    "age",
    "x",
    "y",
    "s",
    "a",
    "dir",
    "o",
    "ball_land_x",
    "ball_land_y",
])

# Columns of the output frame used as prediction targets.
label_columns = ["x", "y"]

In [95]:
# Columns that identify one sequence: a single play of a single game.
sequence_groups = ["game_id", "play_id"]

# Group the input rows and record the row count of the largest group.
groups_input = input_df.groupby(by=sequence_groups)
max_input_sequence = groups_input.size().max()

# Same for the output (target) rows.
groups_output = output_df.groupby(by=sequence_groups)
max_output_sequence = groups_output.size().max()

In [None]:
## Extracting sequences in shape of [total_sequences, len_sequences, vector_length]
#
# One entry is appended per group of `groups_input`:
#   input_sequences[k]  -> float32 array of shape [rows_in_group, feature_dim]
#   output_sequences[k] -> float32 array of shape [rows_in_group, out_dim]
# Groups have different row counts, so the results are kept as lists of arrays.

feature_dim = len(feature_columns)
out_dim = len(label_columns)

input_sequences = []
output_sequences = []

for i, (group, frame) in enumerate(groups_input, start=1):
    # Progress indicator: print the running count every 1000 groups.
    if i % 1000 == 0:
        print(i)

    # sort_values returns a new frame, so the result must be kept. A stable
    # sort preserves the existing row order among rows sharing a frame_id.
    frame = frame.sort_values("frame_id", kind="stable")

    # `group` is only the group key (a tuple), not a DataFrame, so the target
    # rows have to be looked up in the output grouping. get_group raises
    # KeyError if the output data has no rows for this key. The targets get
    # the same frame ordering as the inputs.
    output_frame = groups_output.get_group(group).sort_values("frame_id", kind="stable")

    input_sequences.append(frame[feature_columns].to_numpy(dtype=np.float32))
    output_sequences.append(output_frame[label_columns].to_numpy(dtype=np.float32))

In [91]:
# Pick the first group key of the input grouping as a worked example.
example = list(groups_input.groups.keys())[0]

In [92]:
# Display the selected example group key.
example

(2023090700, 101, 1)

In [93]:
# Rows of the example group ordered by frame and, within each frame, by
# player_to_predict in descending order.
# One multi-key sort is used on purpose: chaining two sort_values calls does not
# work, because the second call uses a non-stable sort by default and is free to
# discard the ordering produced by the first.
player_traj = groups_input.get_group(example).sort_values(
    ["frame_id", "player_to_predict"], ascending=[True, False]
)

In [100]:
# Same grouping keys as before, but keep groups in order of first appearance
# instead of sorting them by key.
groups = input_df.groupby(by=sequence_groups, sort=False)

In [101]:
# One NumPy array per group of `groups_input`, holding all columns of that group.
# NOTE(review): this iterates `groups_input`, not the `groups` object built in
# the previous cell — confirm which one is intended.
player_traj = [group_frame.to_numpy() for _key, group_frame in groups_input]

In [104]:
# Shape (rows, columns) of the first group's array.
player_traj[0].shape

(234, 47)

In [78]:
# Show the target rows of the example group. The trailing `.v` was removed:
# it is neither a DataFrame attribute nor a column, so it raised AttributeError
# on re-run.
groups_output.get_group(example)

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
42,2023090700,101,44930,1,53.2,13.98
0,2023090700,101,46137,1,56.22,17.28
21,2023090700,101,52546,1,47.94,12.12
