In [19]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# Switch the working directory to the data folder so the relative file names used
# in this notebook resolve against it.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run
# elsewhere without editing it.
os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Emory/NFL Lab/data')
# Load the tracking export into the notebook-level frame `df`.
df = pd.read_csv('2023_tracking.csv')

In [20]:
# Restrict the working frame to the first 1500 distinct plays
# (distinct values are taken in order of first appearance).
play_ids = pd.unique(df['play_id_n'])[:1500]
in_subset = df['play_id_n'].isin(play_ids)
df = df.loc[in_subset]

In [21]:
# Sum the `player_to_predict` flag within every (play, frame) pair and keep the
# largest total; it sizes the per-target channels of the grid.
targets_per_frame = df.groupby(['play_id_n', 'frame_id'])['player_to_predict'].sum()
max_targets = targets_per_frame.max()
print(max_targets)

9


#### 2D Grid of Coordinates

In [22]:
def _add_gaussian(channel, cx, cy, sigma):
    """Add a Gaussian bump centred at (cx, cy), truncated to about 2*sigma, into `channel` in place."""
    height, width = channel.shape
    x_lo = max(0, int(np.floor(cx - 2 * sigma)))
    x_hi = min(width, int(np.ceil(cx + 2 * sigma)) + 1)
    y_lo = max(0, int(np.floor(cy - 2 * sigma)))
    y_hi = min(height, int(np.ceil(cy + 2 * sigma)) + 1)

    # A centre far outside the grid yields an empty (possibly negative-ended) window.
    # Bail out explicitly so a negative upper bound is never used as a slice end.
    if x_lo >= x_hi or y_lo >= y_hi:
        return

    xs = np.arange(x_lo, x_hi)
    ys = np.arange(y_lo, y_hi)
    dist_sq = (xs[None, :] - cx) ** 2 + (ys[:, None] - cy) ** 2
    channel[y_lo:y_hi, x_lo:x_hi] += np.exp(-dist_sq / (2 * sigma ** 2))


def pixel_map(player_data, target_player_ids, max_targets, grid_width=121, grid_height=55, sigma=.8):
    """Rasterise one frame of tracking rows into a stack of Gaussian heat-maps.

    Channel layout of the returned array (``2 + max_targets + 1`` channels):
        0                   non-target players whose ``player_side`` is 'Offense'
        1                   non-target players whose ``player_side`` is 'Defense'
        2 .. 2+max_targets-1  one channel per entry of ``target_player_ids`` (list order)
        2 + max_targets     ball landing location

    Parameters
    ----------
    player_data : pandas.DataFrame
        Rows of a single frame. Reads the columns ``x``, ``y``, ``player_side``,
        ``ball_land_x``, ``ball_land_y`` and, when present, ``nfl_id``.
    target_player_ids : list
        Ids that get their own channel; position in the list selects the channel.
        Ids at positions >= ``max_targets`` are not drawn at all.
    max_targets : int
        Number of per-target channels.
    grid_width, grid_height : int
        Grid size; ``x`` indexes columns and ``y`` indexes rows.
    sigma : float
        Standard deviation of each Gaussian bump, in grid cells.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(2 + max_targets + 1, grid_height, grid_width)``.
    """
    ball_channel = 2 + max_targets
    grid = np.zeros((ball_channel + 1, grid_height, grid_width), dtype=np.float32)

    # Pull the per-row attributes out once instead of calling .iloc for every pixel.
    x_vals = player_data['x'].values
    y_vals = player_data['y'].values
    if 'nfl_id' in player_data.columns:
        ids = player_data['nfl_id'].tolist()
    else:
        ids = [None] * len(player_data)
    sides = player_data['player_side'].tolist()

    for x, y, player_id, side in zip(x_vals, y_vals, ids, sides):
        if player_id in target_player_ids:
            target_idx = target_player_ids.index(player_id)
            if target_idx >= max_targets:
                # More targets than channels: this target is dropped entirely.
                continue
            channel = 2 + target_idx
        elif side == 'Offense':
            channel = 0
        elif side == 'Defense':
            channel = 1
        else:
            continue
        _add_gaussian(grid[channel], x, y, sigma)

    # The ball landing spot is taken from the first row of the frame.
    ball_x = player_data['ball_land_x'].iloc[0]
    ball_y = player_data['ball_land_y'].iloc[0]
    # The ball goes in the LAST channel; a fixed index of 3 would collide with the
    # second target's channel.
    _add_gaussian(grid[ball_channel], ball_x, ball_y, sigma)

    return grid

testing one play

In [7]:
# Build the grids for a single play (id 456) as a smoke test.
df_play_id = df.loc[df['play_id_n'] == 456]

# Target ids are collected once for the whole play so every frame uses the same channel order.
is_target = df_play_id['player_to_predict'] == True
target_player_ids = df_play_id.loc[is_target, 'nfl_id'].unique().tolist()


def _grid_for_frame(frame_rows):
    """Wrap the frame's heat-map stack in a one-field Series so apply() yields a 'grid' column."""
    return pd.Series({'grid': pixel_map(frame_rows, target_player_ids, max_targets)})


df_grids_t = df_play_id.groupby(['play_id_n','frame_id']).apply(_grid_for_frame)

  df_grids_t = df_play_id.groupby(['play_id_n','frame_id']).apply(


grid for all plays

In [None]:
# Resolve the target ids ONCE per play so a given target keeps the same channel in
# every frame of that play. Deriving them per frame lets the channel order drift
# whenever a frame's rows are ordered differently or a target is missing from it;
# the single-play test cell and train_seq_2_seq both use per-play ordering.
targets_by_play = {
    play_id: ids.tolist()
    for play_id, ids in (
        df.loc[df['player_to_predict'] == True]
        .groupby('play_id_n')['nfl_id']
        .unique()
        .items()
    )
}

# Inside apply(), each group's `.name` is its (play_id_n, frame_id) key.
df_grids = df.groupby(['play_id_n','frame_id']).apply(
    lambda x: pd.Series({'grid': pixel_map(x, targets_by_play.get(x.name[0], []), max_targets)})
)

In [17]:
# Turn the (play_id_n, frame_id) index back into columns, then order rows by play and frame.
df_grids = (
    df_grids
    .reset_index()
    .sort_values(['play_id_n', 'frame_id'])
)

visual test

In [None]:
# Inspect one frame (row 30) of the single-play grids.
sample_grid = df_grids_t['grid'].iloc[30]

# Target slots that actually contain a player in this frame
active_targets = [slot for slot in range(max_targets) if sample_grid[2 + slot].sum() > 0]
num_targets = len(active_targets)

# One panel per (channel, title, colour map): offense, defense, ball, then each active target.
# The ball sits in the last channel, at index 2 + max_targets.
panels = [
    (0, 'Offense Players', 'Reds'),
    (1, 'Defense Players', 'Blues'),
    (2 + max_targets, 'Ball Landing Location', 'Purples'),
]
panels.extend((2 + slot, f'Target Player {slot+1}', 'Greens') for slot in active_targets)

# Lay the panels out four per row
total_plots = 3 + num_targets
cols = 4
rows = (total_plots + cols - 1) // cols

fig, axes = plt.subplots(rows, cols, figsize=(14, 5*rows))
axes = axes.flatten() if total_plots > 1 else [axes]

for ax, (channel, title, cmap) in zip(axes, panels):
    ax.imshow(sample_grid[channel], origin='lower', cmap=cmap)
    ax.set_title(title)
    ax.set_xlabel('X (yards)')
    ax.set_ylabel('Y (yards)')

# Hide the unused axes in the last row
for spare_ax in axes[total_plots:]:
    spare_ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Persist the per-frame grids to a pickle; the relative path resolves against the
# current working directory.
df_grids.to_pickle("full_grids_1500.pkl")

### Transformer

In [4]:
import torch
import torch.nn as nn 
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import gc
from transformers import get_cosine_schedule_with_warmup
# Reload the per-frame grids from the pickle file name written earlier in this notebook
# (relative path, resolved against the current working directory).
# NOTE(review): unpickling can execute arbitrary code - only load files this notebook produced.
df_grids = pd.read_pickle("full_grids_1500.pkl")
# Seed PyTorch's global RNG.
torch.manual_seed(26)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x2172013ac90>

#### Global Input Seq Context

In [None]:
class CNN_DownSample(nn.Module):
    """Three-stage strided-convolution encoder for the heat-map grids.

    Each stage is Conv2d(kernel 3, stride 2, padding 1) -> BatchNorm2d -> GELU, with
    16, 32 and 64 output channels. With these settings every stage maps a spatial
    size n to ceil(n / 2).

    Parameters
    ----------
    input_channels : int, optional
        Number of channels of the incoming grid. When omitted, the notebook-level
        ``max_targets`` is used (``2 + max_targets + 1``); that global must exist
        in that case.
    """

    def __init__(self, input_channels=None):
        super().__init__()

        if input_channels is None:
            # Backward-compatible default: offense + defense + per-target channels + ball.
            input_channels = 2 + max_targets + 1
        input_chan = int(input_channels)

        # Strided convolutions do the down-sampling instead of max pooling
        # (the original author found pooling compressed the features too much).
        self.heatmap_encoder = nn.Sequential(
            nn.Conv2d(in_channels=input_chan, out_channels=16, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.GELU(),

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.GELU(),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.GELU()
        )

    def forward(self, x):
        """Encode a batch of grids ``(N, input_channels, H, W)`` into ``(N, 64, H', W')``."""
        return self.heatmap_encoder(x)
    

attention layer

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Parameters
    ----------
    d_model : int
        Width of the input and output token vectors; must be divisible by ``nhead``.
    nhead : int
        Number of attention heads.
    mask : optional
        Accepted for call-site compatibility and ignored; masks are passed to
        ``forward``.
    dropout : float
        Dropout probability applied to the attention weights.

    Raises
    ------
    ValueError
        If ``d_model`` is not divisible by ``nhead``.
    """

    def __init__(self, d_model, nhead, mask=None, dropout=0.15):
        super().__init__()

        if d_model % nhead != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by nhead ({nhead})")

        self.d_model = d_model
        self.nhead = nhead
        self.head_dim = d_model // nhead

        # One projection produces queries, keys and values side by side.
        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        # Mixes the concatenated head outputs back into d_model features.
        self.out_proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        # 1 / sqrt(head_dim) scaling of the dot products.
        self.scale = self.head_dim ** -0.5

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape ``(B, L, d_model)``.

        ``mask`` marks positions that may be attended to with non-zero values and
        blocked positions with 0. Supported shapes:
            (B, L)        key mask shared by every query and head
            (B, L, L)     per-query mask shared by every head
            (B, H, L, L)  full mask (size-1 dims broadcast)

        Returns a tensor of shape ``(B, L, d_model)``.
        """
        B, L, _ = x.shape

        q, k, v = self.qkv_proj(x).chunk(3, dim=-1)

        # (B, L, d_model) -> (B, nhead, L, head_dim) so each head attends independently.
        q = q.view(B, L, self.nhead, self.head_dim).transpose(1, 2)
        k = k.view(B, L, self.nhead, self.head_dim).transpose(1, 2)
        v = v.view(B, L, self.nhead, self.head_dim).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        if mask is not None:
            if mask.dim() == 2:
                mask = mask[:, None, None, :]
            elif mask.dim() == 3:
                mask = mask[:, None, :, :]
            elif mask.dim() != 4:
                raise ValueError(f"mask must have 2, 3 or 4 dimensions, got {mask.dim()}")
            # Use the dtype's most negative finite value so the fill is also valid in
            # reduced precision, where a literal -1e9 is out of range.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attn_weights = torch.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context = torch.matmul(attn_weights, v)

        # (B, nhead, L, head_dim) -> (B, L, d_model): concatenate the heads again.
        context = context.transpose(1, 2).contiguous().view(B, L, self.d_model)

        return self.out_proj(context)

transformer block

In [None]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention followed by a feed-forward net.

    Parameters
    ----------
    d_model : int
        Token width. The feed-forward net expands to ``2 * d_model`` and projects
        back, so for ``d_model=64`` the layer sizes are the previous 64 -> 128 -> 64.
    nhead : int
        Number of attention heads.
    mask : optional
        Forwarded to ``MultiHeadAttention``'s constructor.
    dropout : float
        Dropout probability used in attention, the feed-forward net and on both
        residual branches.
    """

    def __init__(self, d_model, nhead=4, mask=None, dropout=0.15):
        super(TransformerBlock, self).__init__()
        self.self_attn = MultiHeadAttention(d_model=d_model, nhead=nhead, dropout=dropout, mask=mask)

        # Position-wise feed-forward net, sized from d_model rather than a fixed 64
        # so the block works for any model width.
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, 2 * d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(2 * d_model, d_model)
        )

        # LayerNorm before each sub-layer (pre-norm), each with its own learned scale and shift.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x`` (``(B, L, d_model)``); ``mask`` is passed to the attention layer."""
        # Attention sub-layer: normalise, attend, add the residual.
        attn_output = self.self_attn(self.norm1(x), mask)
        x = x + self.dropout(attn_output)

        # Feed-forward sub-layer, again with a residual connection.
        ff_output = self.feed_forward(self.norm2(x))
        x = x + self.dropout(ff_output)

        return x

In [None]:
# use sinusoidal functions, simpler than learned values and generalizes better to unseen parameters
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to the input, then applies dropout.

    NOTE(review): a class with this same name is defined again later in the
    notebook; whichever cell runs last is the one in effect.

    Parameters
    ----------
    embed_size : int
        Width of the token vectors (odd sizes are supported).
    dropout : float
        Dropout probability applied after the table is added.
    max_length : int
        Number of positions stored in the table.
    """

    def __init__(self, embed_size, dropout, max_length=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One row per position, one column per frequency.
        positions = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        decay = -torch.log(torch.tensor(10000.0)) / embed_size
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * decay)
        angles = positions * div_term

        table = torch.zeros(max_length, embed_size)
        # Even columns take the sine of every frequency.
        table[:, 0::2] = torch.sin(angles)
        # Odd columns take the cosine; an odd embed_size has one fewer odd column,
        # so the last frequency is dropped there.
        num_odd_cols = embed_size // 2
        table[:, 1::2] = torch.cos(angles[:, :num_odd_cols])

        # A buffer is stored with the module and follows it across devices.
        self.register_buffer('pe', table)

    def forward(self, x):
        """Add the first ``x.size(1)`` table rows to ``x`` (``(B, L, embed_size)``) and apply dropout."""
        encoding = self.pe[:x.size(1), :].to(x.device)
        return self.dropout(x + encoding)

class TransEncoder(nn.Module):
    """Transformer encoder: linear input projection, positional encoding, stacked blocks, final LayerNorm.

    Parameters
    ----------
    input_dim : int
        Width of each incoming token.
    embed_size : int
        Model width after the input projection.
    num_layers : int
        Number of ``TransformerBlock`` layers.
    device
        Stored on the instance as ``self.device``; not otherwise used here.
    dropout : float
        Dropout probability for the positional encoding and every block.
    mask
        Forwarded to every block's constructor.
    max_length : int
        Size of the positional-encoding table.
    """

    def __init__(self, input_dim, embed_size, num_layers, device, dropout, mask, max_length):
        super().__init__()
        self.embed_size = embed_size
        self.device = device

        # Learned projection from the raw token width to the model width
        self.input_projection = nn.Linear(input_dim, embed_size)
        self.position_encoding = PositionalEncoding(embed_size, dropout, max_length)

        # Stack of identical encoder blocks, each with 4 attention heads
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embed_size, mask=mask, nhead=4, dropout=dropout))
        self.layers = nn.ModuleList(blocks)

        # Normalisation applied to the output of the last block
        self.norm = nn.LayerNorm(embed_size)

    def forward(self, x, mask):
        """Encode ``x``; ``mask`` is handed to every block unchanged."""
        hidden = self.position_encoding(self.input_projection(x))
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

##### Sequence

In [None]:
class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: self-attention, cross-attention to the encoder, feed-forward.

    Parameters
    ----------
    dropout : float
        Dropout probability for both attention modules, the residual branches and
        the feed-forward net.
    embedding : int
        Model width.
    nhead : int
        Number of attention heads for both attention modules.
    """

    def __init__(self, dropout, embedding, nhead):
        super().__init__()
        # Honour the caller's nhead instead of a fixed head count.
        self.attention_self = MultiHeadAttention(d_model=embedding, nhead=nhead, mask=None, dropout=dropout)
        self.cross_attention = MultiHeadAttention(d_model=embedding, nhead=nhead, mask=None, dropout=dropout)

        self.norm1 = nn.LayerNorm(embedding)
        self.norm2 = nn.LayerNorm(embedding)
        self.norm3 = nn.LayerNorm(embedding)

        self.dropout = nn.Dropout(dropout)

        # Position-wise feed-forward net (expand x2, project back).
        self.fcnn = nn.Sequential(
            nn.Linear(embedding, embedding*2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embedding*2, embedding)
        )

    def forward(self, decoder_input, encoded_context, target_mask):
        """Run one decoder layer.

        ``decoder_input`` is ``(B, L_q, embedding)``, ``encoded_context`` is
        ``(B, L_kv, embedding)``; ``target_mask`` is passed to the self-attention
        module as its mask. Returns a tensor shaped like ``decoder_input``.
        """
        # Self-attention among the decoder tokens.
        self_attn = self.attention_self(self.norm1(decoder_input), mask=target_mask)
        hidden = decoder_input + self.dropout(self_attn)

        # Cross-attention to the encoder output. The residual add must NOT be in place:
        # `hidden` was just fed to norm2, and autograd needs that input unmodified
        # to compute the LayerNorm gradient.
        cross_atn = self.encoder_cross_attention(self.norm2(hidden), encoded_context)
        hidden = hidden + self.dropout(cross_atn)

        # Feed-forward sub-layer.
        ffcn = self.fcnn(self.norm3(hidden))
        return hidden + self.dropout(ffcn)

    def encoder_cross_attention(self, query, key_value):
        """Attend from ``query`` (decoder) to ``key_value`` (encoder), reusing ``cross_attention``'s weights.

        The query uses the first ``d_model`` outputs of the shared ``qkv_proj``;
        keys and values use the remaining ``2 * d_model`` outputs. No mask is applied.
        """
        attn = self.cross_attention
        B, L_q, _ = query.shape
        L_kv = key_value.shape[1]

        q = attn.qkv_proj(query)[:, :, :attn.d_model]
        k, v = attn.qkv_proj(key_value)[:, :, attn.d_model:].chunk(2, dim=-1)

        # Split into heads: (B, L, d_model) -> (B, nhead, L, head_dim)
        q = q.view(B, L_q, attn.nhead, attn.head_dim).transpose(1, 2)
        k = k.view(B, L_kv, attn.nhead, attn.head_dim).transpose(1, 2)
        v = v.view(B, L_kv, attn.nhead, attn.head_dim).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) * attn.scale
        attn_weights = attn.dropout(torch.softmax(scores, dim=-1))

        context = torch.matmul(attn_weights, v)
        context = context.transpose(1, 2).contiguous().view(B, L_q, attn.d_model)

        return attn.out_proj(context)

In [None]:
class PositionalEncoding(nn.Module):
    """Sinusoidal position table added to the input, followed by dropout.

    NOTE(review): this repeats the PositionalEncoding class defined earlier in the
    notebook and replaces it once this cell runs; keep the two in sync or drop one.

    Parameters
    ----------
    embed_size : int
        Width of the token vectors (odd sizes are supported).
    dropout : float
        Dropout probability applied after the table is added.
    max_length : int
        Number of positions stored in the table.
    """

    def __init__(self, embed_size, dropout, max_length=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Column vector of positions times a row vector of frequencies.
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        log_step = -torch.log(torch.tensor(10000.0)) / embed_size
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * log_step)
        phase = position * div_term

        pe = torch.zeros(max_length, embed_size)
        # sine on the even columns
        pe[:, 0::2] = torch.sin(phase)
        # cosine on the odd columns; with an odd embed_size there is one fewer odd
        # column than frequencies, so the last frequency is left out
        odd_cols = embed_size // 2
        pe[:, 1::2] = torch.cos(phase[:, :odd_cols])

        # registered as a buffer so it is saved with, and moved with, the module
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x`` and apply dropout."""
        rows = self.pe[:x.size(1), :].to(x.device)
        return self.dropout(x + rows)

In [None]:
class TransDecoder(nn.Module):
    """Decoder that rolls learned per-target queries forward one step at a time.

    At every step the current per-target states get that step's positional row
    added, pass through all decoder layers and a LayerNorm, and each target slot is
    mapped to a 2-vector by its own linear head. The normalised states become the
    input of the next step.

    Parameters
    ----------
    target_mask
        Accepted but not used by the constructor.
    embedding : int
        Model width.
    dropout : float
        Dropout probability for the decoder layers.
    nhead : int
        Passed to each ``DecoderLayer``.
    layers : int
        Number of decoder layers.
    max_targets : int
        Number of target slots (learned queries and output heads).
    max_seq_len : int
        Stored on the instance; the positional table itself holds 150 positions.
    """

    def __init__(self, target_mask, embedding, dropout, nhead, layers, max_targets, max_seq_len):
        super().__init__()
        self.max_targets = max_targets
        self.max_seq_len = max_seq_len
        self.embedding = embedding

        # Only the `pe` table of this module is read in forward().
        self.pos_embed = PositionalEncoding(embed_size=embedding, dropout=0.15, max_length=150)

        decoder_layers = []
        for _ in range(layers):
            decoder_layers.append(DecoderLayer(embedding=embedding, dropout=dropout, nhead=nhead))
        self.layers = nn.ModuleList(decoder_layers)

        # One linear head per target slot.
        heads = []
        for _ in range(max_targets):
            heads.append(nn.Linear(embedding, 2))
        self.multi_decoder = nn.ModuleList(heads)

        # One learned query vector per target slot.
        self.target_queries = nn.Parameter(torch.randn(max_targets, embedding))

        self.norm = nn.LayerNorm(embedding)

    def forward(self, encoded_context, target_mask, future_steps):
        """Predict ``future_steps`` 2-vectors for every target slot.

        Returns a tensor of shape ``(B, max_targets, future_steps, 2)`` multiplied
        by ``target_mask`` (``(B, max_targets)``), so masked slots come out as zeros.
        """
        batch_size = encoded_context.shape[0]

        # Every sample starts from the same learned queries.
        current_input = self.target_queries.unsqueeze(0).expand(batch_size, -1, -1)

        per_step = []
        for step in range(future_steps):
            # Tag the states with this step's positional row.
            step_encoding = self.pos_embed.pe[step, :self.embedding]
            decoded = current_input + step_encoding.unsqueeze(0).unsqueeze(0)

            for layer in self.layers:
                decoded = layer(decoded, encoded_context, target_mask)
            decoded = self.norm(decoded)

            # Slot i is read out by head i.
            per_target = [head(decoded[:, slot, :]) for slot, head in enumerate(self.multi_decoder)]
            per_step.append(torch.stack(per_target, dim=1))

            # Feed the normalised states into the next step.
            current_input = decoded

        predictions = torch.stack(per_step, dim=2)

        # Zero the outputs of slots that hold no real target.
        return predictions * target_mask.unsqueeze(-1).unsqueeze(-1)


In [None]:
def train_seq_2_seq(df_grids, max_targets, input_seq_len, output_seq_len, tracking_df=None):
    """Build (input grids, target positions, target mask) tensors, one sample per play.

    A play contributes a sample only if it has at least
    ``input_seq_len + output_seq_len`` frames in ``df_grids`` and at least one
    player flagged ``player_to_predict`` in the tracking data; other plays are skipped.

    Parameters
    ----------
    df_grids : pandas.DataFrame
        One row per (play, frame) with columns ``play_id_n``, ``frame_id`` and
        ``grid`` (a numpy array per row).
    max_targets : int
        Number of target slots; only the first ``max_targets`` flagged players of a
        play are used.
    input_seq_len : int
        Number of leading frames used as model input.
    output_seq_len : int
        Number of following frames whose target positions are collected.
    tracking_df : pandas.DataFrame, optional
        Tracking rows with columns ``play_id_n``, ``frame_id``, ``nfl_id``,
        ``player_to_predict``, ``x`` and ``y``. Defaults to the notebook-level ``df``.

    Returns
    -------
    tuple
        ``(sequences, targets, masks)`` where ``sequences`` stacks, per play, the
        first ``input_seq_len`` grids plus small Gaussian noise; ``targets`` has
        shape ``(N, max_targets, output_seq_len, 2)`` holding ``x / 120`` and
        ``y / 53.3`` (zeros where a target has no row for a frame); ``masks`` has
        shape ``(N, max_targets)`` with 1 for used target slots. Returns
        ``(None, None, None)`` when no play qualifies.
    """
    tracking = df if tracking_df is None else tracking_df

    # Ids of the players to predict, per play, in order of first appearance.
    player_to_predict = (
        tracking[tracking['player_to_predict'] == True]
        .groupby('play_id_n')['nfl_id']
        .unique()
    )
    tracking_by_play = tracking.groupby('play_id_n')

    min_frames = input_seq_len + output_seq_len
    sequences, targets, masks = [], [], []

    # sort=False keeps plays in order of first appearance.
    for play_id, play_data in df_grids.groupby('play_id_n', sort=False):
        play_data = play_data.sort_values('frame_id')

        # Plays that are too short are skipped (no padding is done).
        if len(play_data) < min_frames:
            continue
        # Plays without any flagged player have nothing to predict.
        if play_id not in player_to_predict.index:
            continue

        players = player_to_predict[play_id][:max_targets]
        num_receivers = len(players)
        if num_receivers == 0:
            continue
        player_ids = players.tolist()

        # Slots beyond the play's real targets stay masked out.
        mask = torch.zeros(max_targets)
        mask[:num_receivers] = 1

        # Input: the first input_seq_len grids, with a little noise as regularisation.
        input_grids = [torch.from_numpy(grid).float() for grid in play_data['grid'].iloc[:input_seq_len]]
        sequence = torch.stack(input_grids, dim=0)
        input_sequence = sequence + torch.randn_like(sequence) * 0.0001

        # Index this play's target rows once: (frame_id, nfl_id) -> first (x, y) seen.
        play_rows = tracking_by_play.get_group(play_id)
        play_rows = play_rows[play_rows['nfl_id'].isin(player_ids)]
        first_xy = {}
        for frame_id, nfl_id, x, y in zip(play_rows['frame_id'].tolist(),
                                          play_rows['nfl_id'].tolist(),
                                          play_rows['x'].tolist(),
                                          play_rows['y'].tolist()):
            first_xy.setdefault((frame_id, nfl_id), (x, y))

        # Targets: normalised positions for the frames right after the input window.
        target_positions = torch.zeros(max_targets, output_seq_len, 2)
        target_frames = play_data['frame_id'].iloc[input_seq_len:min_frames].tolist()
        for step, frame_id in enumerate(target_frames):
            for slot, receiver_id in enumerate(player_ids):
                xy = first_xy.get((frame_id, receiver_id))
                if xy is not None:
                    target_positions[slot, step] = torch.tensor(
                        [float(xy[0]) / 120, float(xy[1]) / 53.3]
                    )

        sequences.append(input_sequence)
        targets.append(target_positions)
        masks.append(mask)

    if len(sequences) == 0:
        return None, None, None

    return (torch.stack(sequences, dim=0),
            torch.stack(targets, dim=0),
            torch.stack(masks, dim=0))

In [None]:
def train_mask(seq_len, device):
    """Return a ``(seq_len, seq_len)`` look-ahead mask.

    Entry ``[i, j]`` is 1 when ``j > i`` (a future position relative to ``i``) and
    0 otherwise.

    NOTE(review): ``MultiHeadAttention`` in this notebook blocks positions where the
    mask equals 0, i.e. the opposite convention; invert this mask before passing it
    there.
    """
    mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1)
    return mask

def padding_mask(lengths, max_len, device):
    """Return a boolean ``(len(lengths), max_len)`` mask that is True on real positions.

    Row ``i`` is True for the first ``lengths[i]`` positions and False afterwards.
    Lengths above ``max_len`` give an all-True row; zero or negative lengths give
    an all-False row.
    """
    # Compare every position index against each row's length in one broadcast.
    lengths = torch.as_tensor(lengths, device=device).reshape(-1, 1)
    positions = torch.arange(max_len, device=device).unsqueeze(0)
    return positions < lengths

class MaskedSequenceLoss(nn.Module):
    """Mean squared error averaged over the unmasked target slots only."""

    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets, target_mask):
        """Compute the masked MSE.

        ``target_mask`` has two dimensions and is broadcast over the last two
        dimensions of ``predictions``; squared errors are multiplied by it and the
        sum is divided by the sum of the broadcast mask. When the mask sums to 0
        the result is a zero that is still attached to the graph, so ``backward()``
        works and yields zero gradients.
        """
        mask = target_mask.unsqueeze(-1).unsqueeze(-1).expand_as(predictions)

        # Errors on padded / absent targets are multiplied by 0 and drop out.
        masked_mse = ((predictions - targets) ** 2) * mask

        valid = mask.sum()
        if valid > 0:
            return masked_mse.sum() / valid
        # Nothing to score: keep the graph connection instead of returning a
        # detached constant, which would make loss.backward() raise.
        return masked_mse.sum() * 0.0


In [None]:
class DJMooreSeq(nn.Module):
    def __init__(self, embed_size, encoder_layers, decoder_layers, 
                 max_targets, dropout, nheads, dev='cuda') -> None:
        super().__init__()
        
        # general vars
        self.embedding_size = embed_size
        self.max_targets = max_targets
        self.device = dev

        # context cnn
        self.context_cnn = CNN_DownSample()
        context_cnn_output = 64* 14 * 31

        # transformer encoder
        self.encoder = TransEncoder(input_dim=context_cnn_output,
                                    embed_size=embed_size,
                                    num_layers=encoder_layers,
                                    device=dev,
                                    mask=None,
                                    dropout=dropout,
                                    max_length=100)
        
        # decoder
        self.decoder = TransDecoder(target_mask=None,
                                    embedding=embed_size,
                                    dropout=dropout,
                                    nhead=nheads,
                                    layers=decoder_layers,
                                    max_targets=max_targets,
                                    max_seq_len=100)
    def forward():
        