# Preprocessing to get tensors for all observations

## Import libraries

In [58]:
import numpy as np
import pandas as pd
import torch
from Preprocessing_v2 import *
from DataLoader import load_data

## Step 0: Load and Preprocess data

In [59]:
# Pull the games, players, plays and tracking tables from the project loader
games_df, players_df, plays_df, tracking_df = load_data()

loaded games df
shape: (136, 9)
-----
loaded players df
shape: (1683, 7)
-----
loaded plays df
shape: (12486, 35)
-----
loading tracking frames...


In [None]:
# Run the project preprocessing pipeline over all four tables; the result is
# the cleaned tracking table used by every later step
tracking_df_clean = preprocess_all_df(
    plays_df,
    games_df,
    players_df,
    tracking_df,
)

cleaning plays_df
final plays data shape: (6840, 289)
-----

cleaning games_df
-----

cleaning players_df
-----

cleaning tracking_df
original tracking df shape: (12187398, 17)
unique play and game id combos: (6840, 2)
filtered df shape: (8458178, 17)
number of merge errors: 0
joined plays and tracking dataframes
original tracking shape: (8458178, 17)
merged data shape: (8458178, 304)
-------
joined players and tracking dataframes
original tracking shape: (8458178, 304)
merged data shape: (8458178, 306)
-------
joined games and tracking dataframes
original tracking shape: (8458178, 306)
merged data shape: (8458178, 307)
-------
Old df shape:(8458178, 346)
New df shape:(8458178, 346)
-----



## Step 1: Filter observations by frame cutoff at handoff/run event

### 1A) Get Frame cutoffs

In [None]:
# Find, for every play, the frame at which the handoff/run event happens

# Step 1: Collect the (game, play, frame) triples tagged with either event
is_cutoff_event = tracking_df_clean['event'].isin(['run', 'handoff'])
frame_cutoffs = tracking_df_clean.loc[is_cutoff_event, ['gameId', 'playId', 'frameId']].drop_duplicates()

# Step 2: Deal with plays that have more than one such frame
# Option B (in use): discard those plays entirely; keep=False removes every
# row of a duplicated (gameId, playId) pair, not just the extra ones.
# Option A (alternative, not used): keep only the last event of each play via
#   frame_cutoffs.loc[frame_cutoffs.groupby(['gameId', 'playId'])['frameId'].idxmax()]
frame_cutoffs = frame_cutoffs.drop_duplicates(subset=['gameId', 'playId'], keep=False)

# Step 3: The surviving frameId is the play's cutoff frame
frame_cutoffs = frame_cutoffs.rename(columns={'frameId': 'frame_cutoff'})
frame_cutoffs

Unnamed: 0,gameId,playId,frame_cutoff
5,2022090800,80,6
708,2022090800,101,19
1838,2022090800,146,22
3122,2022090800,191,18
4296,2022090800,299,19
...,...,...,...
8451732,2022110700,3630,18
8453136,2022110700,3686,19
8454884,2022110700,3707,19
8456220,2022110700,3740,21


In [None]:
# Summary statistics of the per-play cutoff frame (one value per retained play)
frame_cutoffs['frame_cutoff'].describe()

count    6794.000000
mean       17.944215
std         4.214998
min         6.000000
25%        17.000000
50%        19.000000
75%        20.000000
max        49.000000
Name: frame_cutoff, dtype: float64

### 1B) Filter tracking_df at cutoffs

In [None]:
# Step 4: Attach each play's cutoff frame to its tracking rows.
# merge() defaults to an inner join, so plays without a row in frame_cutoffs drop out here.
tracking_df_clean = tracking_df_clean.merge(frame_cutoffs, on=['gameId', 'playId'])

# Step 5: Keep only the frames up to and including the cutoff frame
print(f"shape before filter: {tracking_df_clean.shape}")
at_or_before_cutoff = tracking_df_clean['frameId'].le(tracking_df_clean['frame_cutoff'])
tracking_df_clean = tracking_df_clean[at_or_before_cutoff]
print(f"shape after filter: {tracking_df_clean.shape}")

shape before filter: (8392214, 337)
shape after filter: (2802745, 337)


## Step 2: Build tensors for all frames

### 2A) Set up df and helper functions

In [None]:
# Empty table with the per-frame schema: the frame's identifiers, its play's
# cutoff frame, and the field tensor built for that frame
tensor_df = pd.DataFrame(
    columns=[
        'gameId',
        'playId',
        'frameId',
        'frame_cutoff',
        'field_tensor',
    ],
)

In [None]:
def is_out_of_bounds(x, y):
    """Return True when (x, y) falls outside the 120 x 54 field grid.

    A position is in bounds when 0 <= x < 120 and 0 <= y < 54, i.e. when it
    lies inside one of the cells 0..119 by 0..53.

    Args:
        x: Coordinate along the 120-cell axis (int or float).
        y: Coordinate along the 54-cell axis (int or float).

    Returns:
        bool: True if the position is off the grid, False otherwise.
        NaN coordinates compare as off the grid.
    """
    # Compare the raw coordinates rather than int()-truncating them first:
    # int() rounds toward zero, so e.g. x = -0.5 would turn into cell 0 and be
    # reported as in bounds even though it lies off the field.
    return not (0 <= x < 120 and 0 <= y < 54)

In [None]:
# Largest observed speed, acceleration, height and weight; used as the
# denominators that scale those features when the tensors are built.
# NOTE(review): the maxima are taken over every row of tracking_df_clean,
# including rows with a null nflId (the football) -- confirm that is intended.
max_s, max_a, max_height, max_weight = (
    tracking_df_clean[column].max()
    for column in ('s', 'a', 'heightInches', 'weight')
)

In [None]:
# Helper function that builds a tensor for a frame of tracking data
def build_tensor(single_frame_data, max_s = max_s, max_a = max_a, max_height = max_height, max_weight = max_weight):
    """Rasterize one frame of tracking data into a 120 x 54 x 22 field tensor.

    Every player row is assigned to the grid cell (floor(X_std), floor(Y_std));
    rows whose cell lies outside the grid are skipped. The football is the row
    whose nflId is null. Channels, per cell:

        0       number of players in the cell / 22
        1       1 in the cell that holds the football
        2, 3    fraction of the cell's players on offense / defense
        4, 5    speed: mean / variance
        6, 7    acceleration: mean / variance
        8, 9    offense direction angle: mean / variance
        10, 11  defense direction angle: mean / variance
        12, 13  offense orientation angle: mean / variance
        14, 15  defense orientation angle: mean / variance
        16, 17  height: mean / variance
        18, 19  weight: mean / variance
        20, 21  fraction of the cell's players on the home / away team

    Speed, acceleration, height and weight are divided by the matching max_*
    argument; angles are divided by 360 after mapping 0 to 360. Variances are
    sample variances (ddof=1) and are 0 when fewer than two values exist. A
    mean with no contributing player (e.g. the defense angle mean of a cell
    that holds only offensive players) is 0 rather than NaN.

    Args:
        single_frame_data: DataFrame with the rows of a single frame. Columns
            read: nflId, X_std, Y_std, s, a, heightInches, weight, Dir_std,
            O_std, isOnOffense, isHomeTeam.
        max_s: Divisor for speed. The default is the module-level value
            captured when this function is defined.
        max_a: Divisor for acceleration (default captured the same way).
        max_height: Divisor for heightInches (default captured the same way).
        max_weight: Divisor for weight (default captured the same way).

    Returns:
        torch.Tensor of shape (120, 54, 22), dtype float64.
    """
    # STEP 0: Create a blank matrix
    image = np.zeros((120, 54, 22))

    # STEP 1: Record ball location in Channel 1
    ball_rows = single_frame_data[single_frame_data['nflId'].isnull()]
    if len(ball_rows) > 0:
        # Read the scalar with .iloc[0]: calling int() on a Series is
        # deprecated in pandas and fails outright when the frame has no ball
        # row. A frame without a ball simply leaves channel 1 empty.
        ball_i = int(np.floor(ball_rows['X_std'].iloc[0]))
        ball_j = int(np.floor(ball_rows['Y_std'].iloc[0]))
        # Make sure ball is inbounds
        if not is_out_of_bounds(ball_i, ball_j):
            image[ball_i, ball_j, 1] = 1

    # STEP 2: Drop football from dataframe
    players = single_frame_data.dropna(subset=['nflId'])

    # STEP 3: Bucket every player's normalized features by grid cell.
    # Only the needed columns are pulled out and zipped; this avoids building
    # a full-width Series per row the way iterrows() does.
    columns = ['X_std', 'Y_std', 's', 'a', 'heightInches', 'weight',
               'Dir_std', 'O_std', 'isOnOffense', 'isHomeTeam']
    cells = {}
    for x, y, s, a, height, weight, direction, orientation, on_offense, is_home in zip(
            *(players[name].to_numpy() for name in columns)):
        # floor (not int truncation) so coordinates in (-1, 0) map to cell -1
        # and are rejected instead of landing in cell 0
        i = int(np.floor(x))
        j = int(np.floor(y))

        # Make sure player is in the frame
        if is_out_of_bounds(i, j):
            continue

        cell = cells.get((i, j))
        if cell is None:
            cell = cells[(i, j)] = {
                's': [],
                'a': [],
                'heightInches': [],
                'weight': [],
                'offense_dir': [],
                'defense_dir': [],
                'offense_o': [],
                'defense_o': [],
                'n_home': 0,
            }

        cell['s'].append(s / max_s)
        cell['a'].append(a / max_a)
        cell['heightInches'].append(height / max_height)
        cell['weight'].append(weight / max_weight)

        # An angle of 0 is stored as 360, so a real angle never normalizes to 0
        if direction == 0:
            direction = 360
        if orientation == 0:
            orientation = 360

        # One offense test for both the counts and the angle lists, so a
        # player can never be counted on one side and averaged into the other
        side = 'offense' if on_offense == 1 else 'defense'
        cell[side + '_dir'].append(direction / 360)
        cell[side + '_o'].append(orientation / 360)

        if is_home == 1:
            cell['n_home'] += 1

    # STEP 4: Turn the per-cell buckets into channel values
    for (i, j), cell in cells.items():
        n = len(cell['s'])                    # players in this cell (always >= 1)
        n_offense = len(cell['offense_dir'])  # of which on offense

        # Channel 0: player count, scaled by 22
        image[i, j, 0] = n / 22

        # Channel 2: % offensive players in this cell
        # Channel 3: % defensive players in this cell
        image[i, j, 2] = n_offense / n
        image[i, j, 3] = (n - n_offense) / n

        # Channels 4-19: (mean, variance) pairs
        image[i, j, 4], image[i, j, 5] = _mean_and_variance(cell['s'])
        image[i, j, 6], image[i, j, 7] = _mean_and_variance(cell['a'])
        image[i, j, 8], image[i, j, 9] = _mean_and_variance(cell['offense_dir'])
        image[i, j, 10], image[i, j, 11] = _mean_and_variance(cell['defense_dir'])
        image[i, j, 12], image[i, j, 13] = _mean_and_variance(cell['offense_o'])
        image[i, j, 14], image[i, j, 15] = _mean_and_variance(cell['defense_o'])
        image[i, j, 16], image[i, j, 17] = _mean_and_variance(cell['heightInches'])
        image[i, j, 18], image[i, j, 19] = _mean_and_variance(cell['weight'])

        # Channel 20: % home team in this cell
        # Channel 21: % away team in this cell
        image[i, j, 20] = cell['n_home'] / n
        image[i, j, 21] = (n - cell['n_home']) / n

        # TODO: possible channel 22 -- PFF rating?

    # Convert matrix to a tensor
    return torch.from_numpy(image)


def _mean_and_variance(values):
    """Return (mean, sample variance) of a list, using 0.0 where undefined.

    The mean of an empty list and the variance of fewer than two values are
    reported as 0.0, so no NaN is ever written into the field tensor.
    """
    count = len(values)
    if count == 0:
        return 0.0, 0.0
    mean = np.mean(values)
    variance = np.var(values, ddof=1) if count > 1 else 0.0
    return mean, variance

### 2B) Create tensors

In [None]:
# Collect one record per frame in a plain list and build the DataFrame once at
# the end (faster than concatenating a df every row)
tensor_rows = []

# One group per (game, play, frame). groupby sorts by key, so frames are
# visited play by play in frame order. Grouping on all three keys at once also
# avoids groupby(['frameId']), whose list-style key makes pandas >= 2 hand back
# 1-tuples such as (6,) instead of the scalar frame id.
frame_groups = tracking_df_clean.groupby(['gameId', 'playId', 'frameId'])
for (game_id, play_id, frame_id), frame_df in frame_groups:
    # Store scalars (and the tensor itself), not one-element lists: when a
    # DataFrame is built from a list of dicts, each dict value becomes the
    # cell as-is, so list-wrapped values would leave every cell holding a list.
    tensor_rows.append({
        'gameId': game_id,
        'playId': play_id,
        'frameId': frame_id,
        'frame_cutoff': frame_df['frame_cutoff'].iloc[0],
        'field_tensor': build_tensor(frame_df),
    })

# Build dataframe (columns are pinned so an empty result keeps the schema)
tensor_df = pd.DataFrame(
    tensor_rows,
    columns=['gameId', 'playId', 'frameId', 'frame_cutoff', 'field_tensor'],
)
tensor_df.head()

KeyboardInterrupt: 

In [55]:
# (rows, columns) of the per-frame tensor table
tensor_df.shape

(121913, 5)