# Deep Learning Preprocessing v3 

__Author:__ Jack Friedman <br>
__Date:__ 11/10/2023 <br>
__Purpose:__ Updated and condensed frame representation

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Preprocessing_v3 import *
from DataLoader import load_data
from math import cos, sin, radians
import torch

## Step 0: Load and preprocess data

In [69]:
# Load data
[games_df, players_df, plays_df, tracking_df] = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...
loaded tracking frames
shape: (12187398, 17)
returning 4 frames


In [70]:
# Preprocess data
tracking_df_clean = preprocess_all_df(plays_df, games_df, players_df, tracking_df)

cleaning plays_df
final plays data shape: (6840, 289)
-----

cleaning games_df
-----

cleaning players_df
-----

cleaning tracking_df
original tracking df shape: (12187398, 17)
unique play and game id combos: (6840, 2)
filtered df shape: (8458178, 17)
number of merge errors: 0
joined plays and tracking dataframes
original tracking shape: (8458178, 17)
merged data shape: (8458178, 304)
-------
joined players and tracking dataframes
original tracking shape: (8458178, 304)
merged data shape: (8458178, 306)
-------
joined games and tracking dataframes
original tracking shape: (8458178, 306)
merged data shape: (8458178, 307)
-------
Old df shape:(8458178, 346)
New df shape:(8458178, 346)
-----



## Step 1: Filter observations by frame cutoff at handoff/run event

### 1A) Get Frame cutoffs

In [71]:
# Filter for pre-handoff events 

# Step 1: Get the frames where handoff or run occurs 
frame_cutoffs = tracking_df_clean[(tracking_df_clean['event'] == 'run') | (tracking_df_clean['event'] == 'handoff')][['gameId', 'playId', 'frameId']].drop_duplicates()

# Step 2: Handle duplicate handoffs 

# # Option A: Keep the later handoff event + drop the first one
# frame_cutoffs = frame_cutoffs.loc[frame_cutoffs.groupby(['gameId', 'playId'])['frameId'].idxmax()]  # keeps the max frame with a duplicate

# Option B: Drop all duplicate handoff plays
frame_cutoffs = frame_cutoffs.drop_duplicates(subset=['gameId', 'playId'], keep=False)

# Step 3: Rename cutoff column
frame_cutoffs = frame_cutoffs.rename(columns = {'frameId':'frame_cutoff'})
frame_cutoffs.head()

Unnamed: 0,gameId,playId,frame_cutoff
5,2022090800,80,6
708,2022090800,101,19
1838,2022090800,146,22
3122,2022090800,191,18
4296,2022090800,299,19


### 1B) Filter tracking_df at cutoffs

In [72]:
# Step 4: Merge cutoffs with the original dataframe 
tracking_df_clean = pd.merge(tracking_df_clean, frame_cutoffs, on=['gameId', 'playId'])

# Step 5: Filter tracking data before cutoff
print("shape before filter: " + str(tracking_df_clean.shape))
tracking_df_clean = tracking_df_clean[tracking_df_clean['frameId'] <= tracking_df_clean['frame_cutoff']]
print("shape after filter: " + str(tracking_df_clean.shape))

shape before filter: (8392214, 337)
shape after filter: (2802745, 337)


## Step 2: Build tensors for single frame

### 2A) Set up df and helper functions

In [73]:
tensor_df = pd.DataFrame(columns = ['gameId', 'playId', 'frameId', 'frame_cutoff', 'field_tensor'])

In [74]:
def is_out_of_bounds(x, y):
    i = int(x)
    j = int(y)
    return i < 0 or i > 119 or j < 0 or j > 53

In [75]:
# Get max s, a, height, weight for normalization
max_a = tracking_df_clean['a'].max()
max_s = tracking_df_clean['s'].max()
max_height = tracking_df_clean['heightInches'].max()   
max_weight = tracking_df_clean['weight'].max()  

In [90]:
# NEW SCHEMA
# Channel 0: Record position of this player
# Channe; 1: Ball present in this cell?
# Channel 2: % offensive players in this cell
# Channel 3: Record defensive player ratio
# Channel 4: Net velocity vector (x component, standardized 0-1)
# Channel 5: Net velocity vector (y component, standardized 0-1)
# Channel 6: Net acceleration vector (x component, standardized 0-1)
# Channel 7: Net acceleration vector (y component, standardized 0-1)

# Other channels to consider removing
# Channel 8: Height (average, standardized 0-1)
# Channel 9: Height (variance, standardized 0-1)
# Channel 10: % home team in this cell
# Channel 11: % away team in this cell

# Helper function taht builds a tensor for a frame of tracking data
def build_tensor(single_frame_data, max_s = max_s, max_a = max_a, max_height = max_height, max_weight = max_weight):
    # STEP 0: Create a blank matrix
    image = np.zeros((120, 54, 12))

    # STEP 1: Record ball location before looping through players in Channel 1
    i = int(single_frame_data[single_frame_data['nflId'].isnull()]['X_std'])
    j = int(single_frame_data[single_frame_data['nflId'].isnull()]['Y_std'])
    # Make sure ball is inbounds 
    if not is_out_of_bounds(i, j):
        image[i, j, 1] = 1

    # STEP 2: Drop football from dataframe
    single_frame_data = single_frame_data.dropna(subset=['nflId'])

    # STEP 3: Populate player channels, person by person
    total_num_players_checked_on_field = 0
    num_players = np.zeros((120, 54, 4))  # Depth 0 is offense channel Depth 1 is defense, depth 2 is home, depth 3 is away
    temp_values = {}   # holds temporary values for calculating variances (0 = s, 1 = a, 2 = h, 3 = w)
    
    for _, row in single_frame_data.iterrows():
        # Keep track of x and y locations of player (matrix indices)
        i = int(row['X_std'])
        j = int(row['Y_std'])
        
        # Make sure player is in the frame 
        if is_out_of_bounds(i, j):
            continue

        # Update total number of players checked
        total_num_players_checked_on_field += 1

        # Channel 0: Record position of this player
        image[i, j, 0] = (image[i, j, 0] * 22 + 1) / 22

        # Keep track of people on offense and defense
        if row['isOnOffense'] == 1:
            num_players[i, j, 0] += 1  # record offensive player
        else: 
            num_players[i, j, 1] += 1  # record defensive player
        total_players_in_cell = num_players[i, j, 0] + num_players[i, j, 1]
        
        # Channel 2: % offensive players in this cell
        # Channel 3: Record defensive player ratio
        image[i, j, 2] = num_players[i, j, 0] / (num_players[i, j, 0] + num_players[i, j, 1])  # calculate % offensive players at this cell
        image[i, j, 3] = num_players[i, j, 1] / (num_players[i, j, 0] + num_players[i, j, 1])  # calculate % offensive players at this cell

        # Channel 4: Net velocity vector (x component, standardized 0-1)
        image[i, j, 4] += (row['s'] / max_s) * cos(radians(row['Dir_std']))

        # Channel 5: Net velocity vector (y component, standardized 0-1)
        image[i, j, 5] += (row['s'] / max_s) * sin(radians(row['Dir_std']))

        # Channel 6: Net force vector (x component, standardized 0-1)
        image[i, j, 6] += ((row['weight'] * row['a']) / (max_weight * max_a)) * cos(radians(row['O_std']))

        # Channel 7: Net force vector (y component, standardized 0-1)
        image[i, j, 7] += ((row['weight'] * row['a']) / (max_weight * max_a)) * sin(radians(row['O_std']))

        # Keep track of height
        if (i, j) not in temp_values.keys():
            temp_values[(i, j)] = {'heightInches':[],
                                   'vx': 0,
                                   'vy': 0,
                                   'fx': 0,
                                   'fy': 0}
        temp_values[(i, j)]['heightInches'] += [row['heightInches'] / max_height]  # Channel 9: Height (variance, standardized 0-1)

        # Keep track of people home and away
        if row['isHomeTeam'] == 1:
            num_players[i, j, 2] += 1  # record home team player
        else: 
            num_players[i, j, 3] += 1  # record away team player
        # Update channels
        # Channel 10: % home team in this cell
        # Channel 11: % away team in this cell
        image[i, j, 10] = num_players[i, j, 2] / total_players_in_cell  # calculate % home team players at this cell
        image[i, j, 11] = num_players[i, j, 3] / total_players_in_cell  # calculate % away team players at this cell

    # Compute variances 
    for (i, j) in temp_values.keys():
        # Get sample size:
        n = len(temp_values[(i,j)]['heightInches'])
    
        # Channel 8: Height (mean, standardized 0-1)
        image[i, j, 8] = np.mean(temp_values[(i,j)]['heightInches']) if n > 0 else 0
        
        # Channel 9: Height (variance, standardized 0-1)
        image[i, j, 9] = np.var(temp_values[(i,j)]['heightInches'], ddof = 1) if n > 1 else 0
        
        # 22. PFF rating?
    
    # Convert matrix to a tensor
    tensor = torch.from_numpy(image)
    return tensor

### Check when 2 players are in the same matrix cell (same yard^2) in a play

In [91]:
# Single
# single_frame_data = tracking_df_clean[(tracking_df_clean['gameId'] == 2022090800) & (tracking_df_clean['playId'] == 101) & (tracking_df_clean['frameId'] == 1)]
# Double
# single_frame_data = tracking_df_clean[(tracking_df_clean['gameId'] == 2022090800) & (tracking_df_clean['playId'] == 2360) & (tracking_df_clean['frameId'] == 17)]
# Double
# single_frame_data = tracking_df_clean[(tracking_df_clean['gameId'] == 2022090800) & (tracking_df_clean['playId'] == 191) & (tracking_df_clean['frameId'] == 18)]
# Triple
single_frame_data = tracking_df_clean[(tracking_df_clean['gameId'] == 2022091100) & (tracking_df_clean['playId'] == 546) & (tracking_df_clean['frameId'] == 16)]
# 5 in one
# single_frame_data = tracking_df_clean[(tracking_df_clean['gameId'] == 2022101611) & (tracking_df_clean['playId'] == 3237) & (tracking_df_clean['frameId'] == 15)]

# Conflicts to test
# (2022090800, 101, 1) at (46, 25) --> single
# (2022090800, 191, 18) at (75, 29) --> double
# (2022090800, 2360, 17) at (68, 27) --> double
# (2022091100, 546, 16) at (41, 23) --> triple
# (2022101611, 3237, 15) at (79, 24) --> 5 in one

tensor = build_tensor(single_frame_data)

In [92]:
# 68, 27
i = 41
j = 23
vars = {'isOnOffense': 1, 
        's': max_s, 
        'a': max_a,
        'heightInches': max_height, 
        'weight': max_weight, 
        'isHomeTeam': 1,
        'O_std': 360,
        'Dir_std': 360
        }

for var in vars.keys():
    print(var)
    vals = single_frame_data[(single_frame_data['X_std'].astype(int) == i) & (single_frame_data['Y_std'].astype(int) == j)][var].values
    for val in vals:
        print(val / vars[var])
    print("Average: " + str(np.mean(np.dot(vals,(1 / vars[var])))))
    n = len(vals)
    print("Variance: " + str(np.var(np.dot(vals,(1 / vars[var])), ddof = 1))  if n > 1 else 0)
    print('-----')


isOnOffense
1.0
0.0
1.0
Average: 0.6666666666666666
Variance: 0.33333333333333337
-----
s
0.01751505155896252
0.07881773201533132
0.08374384026628953
Average: 0.06002554128019446
Variance: 0.0013614229378792646
-----
a
0.03896103865627229
0.013328776382408944
0.049555707062802476
Average: 0.033948507367161235
Variance: 0.0003469417290737476
-----
heightInches
0.9382716049382716
0.9506172839506173
0.9259259259259259
Average: 0.9382716049382717
Variance: 0.00015241579027587394
-----
weight
0.8289473684210527
0.9210526315789473
0.8394736842105263
Average: 0.8631578947368421
Variance: 0.002541551246537395
-----
isHomeTeam
0.0
1.0
0.0
Average: 0.3333333333333333
Variance: 0.33333333333333337
-----
O_std
0.10211111111111108
0.27705555555555555
0.10430555555555558
Average: 0.16115740740740744
Variance: 0.010075489454732516
-----
Dir_std
0.00816666666666666
0.42902777777777773
0.3216388888888888
Average: 0.2529444444444444
Variance: 0.04782021373456789
-----


In [93]:
single_frame_data[(single_frame_data['X_std'].astype(int) == i) & (single_frame_data['Y_std'].astype(int) == j)][['Dir_std','O_std']]

Unnamed: 0,Dir_std,O_std
55411,2.94,36.76
55525,154.45,99.74
55753,115.79,37.55


In [94]:
for c in range(12):
    print("channel: " + str(c))
    print(tensor[i, j, c])

# NEW SCHEMA
# Channel 0: % players at this location
# Channe; 1: Ball present in this cell?
# Channel 2: % offensive players in this cell
# Channel 3: % defensive player ratio
# Channel 4: Net velocity vector (x component, standardized 0-1)
# Channel 5: Net velocity vector (y component, standardized 0-1)
# Channel 6: Net acceleration vector (x component, standardized 0-1)
# Channel 7: Net acceleration vector (y component, standardized 0-1)
# Channel 8: Height (average, standardized 0-1)
# Channel 9: Height (variance, standardized 0-1)
# Channel 10: % home team in this cell
# Channel 11: % away team in this cell


channel: 0
tensor(0.1364, dtype=torch.float64)
channel: 1
tensor(0., dtype=torch.float64)
channel: 2
tensor(0.6667, dtype=torch.float64)
channel: 3
tensor(0.3333, dtype=torch.float64)
channel: 4
tensor(-0.0901, dtype=torch.float64)
channel: 5
tensor(0.1103, dtype=torch.float64)
channel: 6
tensor(0.0568, dtype=torch.float64)
channel: 7
tensor(0.0568, dtype=torch.float64)
channel: 8
tensor(0.9383, dtype=torch.float64)
channel: 9
tensor(0.0002, dtype=torch.float64)
channel: 10
tensor(0.3333, dtype=torch.float64)
channel: 11
tensor(0.6667, dtype=torch.float64)


In [None]:
# Conflicts to test
# (2022090800, 101, 1) at (46, 25) --> single
# (2022090800, 191, 18) at (75, 29) --> double
# (2022090800, 2360, 17) at (68, 27) --> double
# (2022091100, 546, 16) at (41, 23) --> triple
# (2022101611, 3237, 15) at (79, 24) --> 5 in one

## Step 3: Build tensors for every frame

In [None]:
def process_batch(group_of_plays):
    # Hold in an array (faster than concatenating a df every row)
    tensor_rows = []

    # Loop through every play
    for group_df in group_of_plays:
        game_id = group_df['gameId'].iloc[0]
        play_id = group_df['frameId'].iloc[0]
        # Loop through every frame in that play
        frame_groups = group_df.groupby(['frameId'])
        for frame_id, frame_df in frame_groups:
            # Build tensor for that frame
            tensor = build_tensor(frame_df)
            
            # Keep track of row
            new_row = {
                'gameId': [game_id], 
                'playId': [play_id], 
                'frameId': [frame_id], 
                'frame_cutoff': [frame_df['frame_cutoff'].iloc[0]], 
                'field_tensor': [tensor]
            }
            tensor_rows += [new_row]

    # Build dataframe
    tensor_df = pd.DataFrame(tensor_rows)
    return tensor_df

In [None]:
def get_batches(groupby_object, num_batches):
    # Get the keys of the groups
    group_keys = list(groupby_object.groups.keys())

    # Calculate the number of keys in each batch
    keys_per_batch = len(group_keys) // num_batches

    # Initialize an empty list to store the batches
    batches = []

    # Split the keys into batches
    for i in range(0, len(group_keys), keys_per_batch):
        batch_keys = group_keys[i:i + keys_per_batch]
        batch = [groupby_object.get_group(key) for key in batch_keys]
        batches.append(batch)

    return batches


In [None]:
tensor_df = pd.DataFrame(columns = ['gameId', 'playId', 'frameId', 
                                    'frame-cutoff', 'field_tensor'])

play_groups = tracking_df_clean.groupby(['gameId', 'playId'])
batches = get_batches(play_groups, 6)

for i in range(len(batches)):
    print("processing batch " + str(i))
    batch = batches[i]
    new_batch_df = process_batch(batch)
    tensor_df = pd.concat([tensor_df, new_batch_df])
tensor_df.head()

processing batch 0


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 1


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 2


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 3


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 4


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 5


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_

processing batch 6


  for frame_id, frame_df in frame_groups:
  for frame_id, frame_df in frame_groups:


Unnamed: 0,gameId,playId,frameId,frame-cutoff,field_tensor,frame_cutoff
0,[2022090800],[1],[1],,"[[[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.,...",[6]
1,[2022090800],[1],[2],,"[[[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.,...",[6]
2,[2022090800],[1],[3],,"[[[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.,...",[6]
3,[2022090800],[1],[4],,"[[[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.,...",[6]
4,[2022090800],[1],[5],,"[[[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.,...",[6]
