In [1]:
import numpy as np
import pandas as pd 
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nfl-big-data-bowl-2026-prediction/sample_submission.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w17.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w05.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w10.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w03.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w18.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w05.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w11.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w12.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w16.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w06.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w18.csv


In [2]:
# 1. Load & Concatenate Input and Output CSV's
base_uri = '/kaggle/input/nfl-big-data-bowl-2026-prediction/train'

input_df = pd.DataFrame()
output_df = pd.DataFrame()

for i in range(1, 19):
    week_code = i if i >= 10 else f'0{i}'
    
    input_csv = pd.read_csv(f'{base_uri}/input_2023_w{week_code}.csv')
    output_csv = pd.read_csv(f'{base_uri}/output_2023_w{week_code}.csv')

    input_df = pd.concat([input_df, input_csv])
    output_df = pd.concat([output_df, output_csv])

print('DataFrames loaded successfully')
print(f'\tInput: {len(input_df)} rows')
print(f'\tOutput: {len(output_df)} rows')

DataFrames loaded successfully
	Input: 4880579 rows
	Output: 562936 rows


In [3]:
# 2. Create Gold Table for Training
def row_id(row):
    return f"{row['game_id']}_{row['play_id']}_{row['nfl_id']}_{row['frame_id']}"

input_df['row_id'] = input_df.apply(row_id, axis=1)
output_df['row_id'] = output_df.apply(row_id, axis=1)

predictable_input_df = input_df[input_df['player_to_predict'] == True].copy()

gold_table = pd.merge(predictable_input_df, output_df, on='row_id')
gold_table.head()

Unnamed: 0,game_id_x,play_id_x,player_to_predict,nfl_id_x,frame_id_x,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,num_frames_output,ball_land_x,ball_land_y,row_id,game_id_y,play_id_y,nfl_id_y,frame_id_y,x_y,y_y


In [4]:
# 3. Data Cleanup

# post-join column name cleanup
duplicate_cols = [
    'game_id',
    'play_id',
    'nfl_id',
    'frame_id',
    'x',
    'y'
]

for col in duplicate_cols:
    gold_table[col] = gold_table[f'{col}_x']
    gold_table = gold_table.drop([f'{col}_x', f'{col}_y'], axis=1)

# cleanup steps from previous winning submission
# source: https://github.com/mpchang/uncovering-missed-tackle-opportunities/blob/main/code/data_cleaning.py
# rotate direction & orientation
gold_table['dir_clean'] = (-(gold_table['dir'] - 90)) % 360
gold_table['o_clean'] = (-(gold_table['o'] - 90)) % 360

# make all plays left-to-right
gold_table['x_clean'] = np.where(
    gold_table['play_direction'] == 'left', 
    120 - gold_table['x'], 
    gold_table['x']
)

gold_table['ball_land_x_clean'] = np.where(
    gold_table['play_direction'] == 'left',
    120 - gold_table['ball_land_x'], 
    gold_table['ball_land_x']
)

gold_table['o_clean'] = np.where(
    gold_table['play_direction'] == 'left',
    (540 - gold_table['o_clean']) % 360,
    gold_table['o_clean']
)

gold_table['dir_clean'] = np.where(
    gold_table['play_direction'] == 'left',
    (540 - gold_table['dir_clean']) % 360,
    gold_table['dir_clean']
)

# drop unused "dirty" columns
dirty_cols = [
    'x',
    'ball_land_x',
    'dir',
    'o'
]

gold_table = gold_table.drop(dirty_cols, axis=1)

In [5]:
gold_table.columns

Index(['player_to_predict', 'play_direction', 'absolute_yardline_number',
       'player_name', 'player_height', 'player_weight', 'player_birth_date',
       'player_position', 'player_side', 'player_role', 's', 'a',
       'num_frames_output', 'ball_land_y', 'row_id', 'game_id', 'play_id',
       'nfl_id', 'frame_id', 'y', 'dir_clean', 'o_clean', 'x_clean',
       'ball_land_x_clean'],
      dtype='object')