In [None]:
# install required libraries here

!pip install albumentations av cv2 kaggle fastai fastkaggle pandas numpy

In [16]:
from fastkaggle import *
from google.colab import files
from fastai.data.all import *
from fastai.vision.all import *
from fastai.vision.widgets import *
import albumentations as A
import cv2
import pandas as pd

In [None]:
# Opens Colab's upload widget; pick your kaggle.json API token (the next cell copies it).
# The result is assigned instead of being left as the cell's last expression so the
# uploaded file contents (your Kaggle credentials) are not echoed into the cell output.
uploaded = files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [20]:
# Kaggle competition slug; also the name of the downloaded archive (see the log below).
comp = 'nfl-player-contact-detection'

# fastkaggle helper: fetches the competition data and returns the directory holding it.
# NOTE(review): `install` is presumably only acted on when running on Kaggle itself -
# confirm against the fastkaggle documentation.
path = setup_comp(comp, install='fastai "timm>=0.6.2.dev0"')

# Show what was downloaded.
path.ls()

Downloading nfl-player-contact-detection.zip to /content


100%|██████████| 3.84G/3.84G [00:34<00:00, 119MB/s]





## Data Exploration

The most important thing for us to do now that we have the data is to make sure we understand what it is we are given!

We do this by looking at the data description on the Kaggle site: https://www.kaggle.com/competitions/nfl-player-contact-detection

and printing out some of the data below.

In [None]:
from IPython.display import display, Video

# define the path to the *training* video files
# (the variable is called `test_data`, but it points at the 'train' folder)
test_data = os.path.join(path, 'train')

# get_files is a fast.ai function; here it collects every .mp4 under that folder
video_paths = get_files(test_data, extensions='.mp4')

# Preview the first video inline at 320x320.
# NOTE(review): embed=True presumably stores the video data in the notebook output,
# which can make the saved notebook large - confirm before committing outputs.
Video(str(video_paths[0]),embed=True,width=320,height=320)

There are three video angles of each play: Sideline, Endzone and All29.

There are also csv files:

* `train_labels.csv` - These are the labelled contacts for the videos in the /train folder for every player combination. What this means is that for every time step of a play we have an indicator saying whether player 1 has contacted EVERY other player, player 2 has contacted EVERY other player and so on. This is why this file is over 400MB: it is basically a registry with one row per (player, other player, time step) combination, each carrying a contact-occurred flag.

* `train_baseline_helmets.csv` - These are baseline helmet detection and assignment boxes for the training and test set. These are useful when predicting contacts. It provides the bounding boxes for all detected helmets. Not all helmets are detected in every frame.

* `train_player_tracking.csv` -  This is 10 Hz tracking data for each player on the field during the provided plays. What this means is that for every 1/10th of a second, we have the location, acceleration and direction of each player. This is useful for numerous reasons, including figuring out exactly how close players are to each other.

* `train_video_metadata.csv` - contains timestamps associated with each Sideline and Endzone view for syncing with the player tracking data.

## Feature Engineering

Reading the description of the columns in the train_labels.csv, the contact_id column is an amalgamation of several potentially useful data points. 

Let's do some feature engineering to parse them out into their own columns.

In [None]:
def expand_contact_id(df):
    """
    Split ``contact_id`` into separate columns.

    A contact id has the form ``<game_play>_<step>_<player_1>_<player_2>`` where
    ``game_play`` itself contains an underscore (e.g. ``58168_003392``). The id is
    split once from the right, so ``game_play`` is whatever precedes the last three
    fields rather than a fixed 12-character prefix.

    Args:
        df: DataFrame with a string ``contact_id`` column.

    Returns:
        The same ``df`` object, modified in place, with four added columns:
        ``game_play`` (str), ``step`` (int), ``nfl_player_id_1`` (str) and
        ``nfl_player_id_2`` (str; ids are kept as strings because the second id
        can be a non-numeric marker such as ``"G"``).
    """
    # One right-split instead of three full splits: [game_play, step, id_1, id_2].
    parts = df["contact_id"].str.rsplit("_", n=3)
    df["game_play"] = parts.str[0]
    df["step"] = parts.str[1].astype("int")
    df["nfl_player_id_1"] = parts.str[2]
    df["nfl_player_id_2"] = parts.str[3]
    return df

In [None]:
def _read_competition_csv(file_name):
    """Read one CSV file from the competition download directory ``path``."""
    return pd.read_csv(os.path.join(path, file_name))


# Labels get their contact_id expanded into game_play / step / player id columns.
labels = expand_contact_id(_read_competition_csv("train_labels.csv"))
print("Number of labelled contacts : ",len(labels))

# Per-player tracking records.
train_tracking = _read_competition_csv("train_player_tracking.csv")
print("Number of tracking records : ",len(train_tracking))

# Baseline helmet bounding boxes.
train_helmets = _read_competition_csv("train_baseline_helmets.csv")
print("Number of helmet detections : ",len(train_helmets))

# Video metadata.
train_video_metadata = _read_competition_csv("train_video_metadata.csv")

### Data Filtering

360 plays is too many to train on. Choose a subset.

In [None]:
# Plays to keep. A larger alternative subset:
#plays = ['58168_003392', '58172_003247', '58173_003606', '58174_001792', '58176_002844']
plays = ['58168_003392']


def _only_selected_plays(frame):
    """Return the rows of ``frame`` whose ``game_play`` is listed in ``plays``."""
    return frame[frame['game_play'].isin(plays)]


# Apply the same play filter to every table.
train_video_metadata = _only_selected_plays(train_video_metadata)
train_helmets = _only_selected_plays(train_helmets)
train_tracking = _only_selected_plays(train_tracking)
labels = _only_selected_plays(labels)

### Create Features

Create a feature set by joining data from several CSV files into one dataframe.

Join labeled data of whether players are contacting each other or the ground (train_labels.csv) with distance and acceleration tracking data (train_player_tracking.csv).


In [None]:
def create_features(df, tr_tracking, merge_col="step", use_cols=None):
    """
    Attach per-player tracking columns to every labelled player pair.

    Each row of ``df`` names two players (``nfl_player_id_1`` / ``nfl_player_id_2``).
    The tracking columns in ``use_cols`` are left-joined once per player on
    ``(game_play, merge_col, player id)`` and suffixed with ``_1`` / ``_2``.
    Rows without a tracking match keep NaN in the joined columns.

    Args:
        df: labelled pairs with ``game_play``, ``merge_col``, ``nfl_player_id_1``
            and ``nfl_player_id_2`` columns. Not modified.
        tr_tracking: tracking data with ``game_play``, ``merge_col``,
            ``nfl_player_id`` and every column named in ``use_cols``.
        merge_col: name of the time column shared by both frames.
        use_cols: tracking columns to copy over (any sequence of names);
            defaults to ``["x_position", "y_position"]``.

    Returns:
        ``(df_combo, output_cols)`` where ``df_combo`` is the joined frame sorted by
        ``game_play``, ``merge_col`` and both player ids, and ``output_cols`` lists
        the feature columns that were added: the ``_1`` columns, the ``_2`` columns,
        ``distance`` (only when both position columns were requested) and ``G_flug``.
    """
    # None sentinel avoids a shared mutable default; list() lets callers pass tuples.
    if use_cols is None:
        use_cols = ["x_position", "y_position"]
    use_cols = list(use_cols)

    # Cast the player id to str and select the needed columns once, not once per merge.
    tracking = tr_tracking.astype({"nfl_player_id": "str"})[
        ["game_play", merge_col, "nfl_player_id"] + use_cols
    ]

    # Both id columns are compared against the string-typed tracking id.
    df_combo = df.astype({"nfl_player_id_1": "str", "nfl_player_id_2": "str"})
    for suffix in ("_1", "_2"):
        df_combo = (
            df_combo
            .merge(
                tracking,
                left_on=["game_play", merge_col, "nfl_player_id" + suffix],
                right_on=["game_play", merge_col, "nfl_player_id"],
                # left join: keep every labelled row, even without a tracking match
                how="left",
            )
            # tag the freshly joined tracking columns with the player suffix
            .rename(columns={c: c + suffix for c in use_cols})
            # the join key coming from the tracking side is no longer needed
            .drop("nfl_player_id", axis=1)
        )

    df_combo = (
        df_combo
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )

    output_cols = [c + "_1" for c in use_cols] + [c + "_2" for c in use_cols]

    # Euclidean (L2) distance between the two players of each pair.
    # NaN positions propagate, so pairs without a tracking match get a NaN distance.
    if "x_position" in use_cols and "y_position" in use_cols:
        dx = df_combo["x_position_1"] - df_combo["x_position_2"]
        dy = df_combo["y_position_1"] - df_combo["y_position_2"]
        df_combo["distance"] = np.sqrt(np.square(dx) + np.square(dy))
        output_cols += ["distance"]

    # Flag rows whose second "player" id is the literal "G".
    # The column name (including its spelling) is kept because it is part of the output.
    df_combo['G_flug'] = (df_combo['nfl_player_id_2']=="G")
    output_cols += ["G_flug"]
    return df_combo, output_cols

In [None]:
# Tracking columns to copy onto each labelled player pair; create_features
# suffixes them with _1 / _2 for the two players of the pair.
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]

# NOTE: the tracking 'distance' column ends up as distance_1 / distance_2, while the
# un-suffixed 'distance' column of `train` is the player-to-player distance that
# create_features computes from the two positions.
train, feature_cols = create_features(labels, train_tracking, use_cols=use_cols)

# Preview the first 100 joined rows.
train.head(100)

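There is more we can do to clean up this data. We are trying to predict player contact. The `distance` column created by `create_features` is the Euclidean distance between the two players of each pair at that time step (the tracking file's own per-step travelled distance is stored as `distance_1` and `distance_2`). If that value is large, the two players are too far apart to be in contact, so those rows can be dropped. Rows whose distance is missing (NaN) are kept by the filter below.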

In [None]:
# Keep every row whose distance is NOT greater than two yards.
# The condition is a negation on purpose: rows with a missing (NaN) distance
# compare False against "> 2" and are therefore kept.
within_two_yards = ~(train['distance'] > 2)
train_filtered = train[within_two_yards].reset_index(drop=True)

From the Kaggle competition page:
"These videos all contain a frame rate of 59.94 HZ. The moment of snap occurs 5 seconds into the video."

Create a new 'frame' column using this information and the step column.

In [None]:
# Average video frame rate in frames per second.
FRAME_RATE = 59.94
# A step is 1/10th of a second.
STEPS_PER_SECOND = 10
# step=0 is the snap, which happens 5 s into each video
# (the data is labelled starting AFTER the snap).
SNAP_OFFSET_SECONDS = 5

# Seconds elapsed since the snap for every labelled row.
seconds_since_snap = train_filtered['step'] / STEPS_PER_SECOND

# Convert to a video frame number: truncate to int, then add one.
train_filtered['frame'] = (
    seconds_since_snap * FRAME_RATE + SNAP_OFFSET_SECONDS * FRAME_RATE
).astype('int') + 1

Next we use ffmpeg to convert each video into JPEG images, one per frame.

In [None]:
for video in tqdm(train_helmets.video.unique()):
    if 'Endzone2' not in video:
        !ffmpeg -i nfl-player-contact-detection/train/{video} -q:v 2 -f image2 working/{video}_%04d.jpg -hide_banner -loglevel error

## Lookup Tables

Create two look up tables:
1. find all players in a video
2. find all jpg frames belonging to a video

In [None]:
# Index the helmet detections by video so each video's rows can be pulled with .loc
train_helmets_new = train_helmets.set_index('video')

# Lookup table: video file name -> that video's helmet detections (index reset)
video2helmets = {
    video: train_helmets_new.loc[video].reset_index(drop=True)
    for video in tqdm(train_helmets.video.unique())
}

In [None]:
# Lookup table: video file name -> highest frame number extracted for that video
video2frames = {}

for game_play in tqdm(train_video_metadata.game_play.unique()):
    for view in ['Endzone', 'Sideline']:
        video = game_play + f'_{view}.mp4'
        # Extracted frames are named <video>_<NNNN>.jpg; the frame number is the
        # text between the last underscore and the first following dot.
        frame_numbers = [
            int(jpg_path.split('_')[-1].split('.')[0])
            for jpg_path in glob.glob(f'working/{video}*')
        ]
        video2frames[video] = max(frame_numbers)

## Data Augmentation

with help from Albumentations library

In [None]:
# Training-time augmentation pipeline (it is the default `aug` of TFDataset below).
# Background reading on test-time augmentation (TTA):
# https://www.kaggle.com/code/andrewkh/test-time-augmentation-tta-worth-it

# Transforms, in order: horizontal flip, shift/scale/rotate, brightness/contrast
# jitter (each with its own probability p), then Normalize with mean 0 and std 1.
train_aug = A.Compose([
    A.HorizontalFlip(p=0.75),
    A.ShiftScaleRotate(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=0.25),
    A.Normalize(mean=[0.], std=[1.])
])

# Validation pipeline: only the Normalize step, none of the transforms above it.
valid_aug = A.Compose([
    A.Normalize(mean=[0.], std=[1.])
])

## Data Pipeline

### 8.1 - 2.5D Training

#### We use a twist on traditional 2.5D training. 2.5D training involves stacking multiple similar images on top of each other and feeding them to a model. In this way, the model gets a better idea of what it is we are trying to optimize.

#### What we are dealing with here are hundreds of videos of American Football plays, where we already have all of the contacts labelled, so we know to a fairly high level of accuracy when contacts occur in the video. Our job is to feed 'what contact between players looks like' into the model.

#### What we do is to feed frames into the model which occur 'around the same time' as the point of contact. This means that for each contact that we are training on, we are using quite a few images.

#### For every record in train_labels.csv, create a Tensor.

#### The below function defines the transformations we must perform to ready all the data into Tensors

In [None]:
class TFDataset():
    """Generator-backed dataset: one model input per labelled player pair.

    For every row of ``df`` the :meth:`getitem` generator yields
    ``((img, feature), label)`` where

    * ``img`` is a stack of 256x256 grayscale crops centred on the pair's helmet
      boxes: one crop every ``STRIDE`` frames within ``+/- WINDOW`` frames of the
      labelled frame, for the Endzone view followed by the Sideline view
      (2 x 13 = 26 crops), transposed to ``(256, 256, 26)`` and passed through ``aug``;
    * ``feature`` is the row's ``feature_cols`` values (NaN replaced by -1) as float32;
    * ``label`` is the row's ``contact`` value as float32.

    The class reads the notebook-level names ``feature_cols``, ``video2helmets``
    (video name -> helmet detections) and ``video2frames`` (video name -> highest
    extracted frame number).

    Args:
        df: frame with ``frame``, ``game_play``, ``nfl_player_id_1``,
            ``nfl_player_id_2``, ``contact`` and every column in ``feature_cols``.
        aug: callable used as ``aug(image=img)["image"]``.
        mode: ``'train'`` adds a random offset of up to ``JITTER`` frames to the
            labelled frame; any other value uses the labelled frame as is.
        frame_dir: directory holding the extracted ``<video>_<NNNN>.jpg`` frames.
            Defaults to ``'working'``, the directory the extraction and
            ``video2frames`` cells use.
    """

    # How many frames to look behind and ahead of the (possibly jittered) frame.
    WINDOW = 24
    # Take every STRIDE-th frame inside the window.
    STRIDE = 4
    # Side length, in pixels, of the square crop around the helmets.
    CROP_SIZE = 256
    # Maximum random frame offset applied in 'train' mode.
    JITTER = 6

    def __init__(self, df, aug=train_aug, mode='train', frame_dir='working'):
        self.df = df
        self.frame = df.frame.values
        self.feature = df[feature_cols].fillna(-1).values
        self.players = df[['nfl_player_id_1','nfl_player_id_2']].values
        self.game_play = df.game_play.values
        self.aug = aug
        self.mode = mode
        self.frame_dir = frame_dir

    def __len__(self):
        return len(self.df)

    def _window_bboxes(self, video, players, frame):
        """Return one ``[left, width, top, height]`` row per frame in the window.

        Boxes of the given players are averaged per frame; frames without a
        detection are filled by interpolation in both directions. If no frame in
        the window has a detection, every row is NaN.
        """
        helmets = video2helmets[video]

        # Narrow to the frames inside the window (the result must be kept)...
        helmets = helmets[helmets['frame'].between(frame - self.WINDOW, frame + self.WINDOW)]
        # ...and to the players of interest ('G' never matches a helmet row).
        helmets = helmets[helmets.nfl_player_id.isin(players)]

        box_cols = ['left', 'width', 'top', 'height']
        mean_boxes = helmets.groupby('frame')[box_cols].mean()

        rows = [
            mean_boxes.loc[f, box_cols].tolist() if f in mean_boxes.index else [np.nan] * 4
            for f in range(frame - self.WINDOW, frame + self.WINDOW + 1)
        ]
        return pd.DataFrame(rows).interpolate(limit_direction='both').values

    @staticmethod
    def _crop_centered(img, center_x, center_y, size):
        """Return a ``size`` x ``size`` float32 crop of ``img`` centred on the point.

        Parts of the crop window that fall outside the image are left as zeros.
        Negative offsets are handled explicitly because a negative slice start
        would otherwise wrap around to the opposite image edge.
        """
        half = size // 2
        crop = np.zeros((size, size), dtype=np.float32)

        top, left = center_y - half, center_x - half
        src_top, src_left = max(top, 0), max(left, 0)
        src_bottom = min(top + size, img.shape[0])
        src_right = min(left + size, img.shape[1])

        if src_bottom > src_top and src_right > src_left:
            dst_top, dst_left = src_top - top, src_left - left
            crop[dst_top:dst_top + (src_bottom - src_top),
                 dst_left:dst_left + (src_right - src_left)] = img[src_top:src_bottom, src_left:src_right]
        return crop

    def getitem(self):
        """Yield ``((img, feature), label)`` for every row of ``self.df``, in order.

        Raises:
            FileNotFoundError: if a frame that should exist (its number is within
                the range recorded in ``video2frames``) cannot be read.
        """
        labels = self.df.contact.values

        for idx in range(len(self.df)):
            # Frame number of the labelled step (computed from `step` upstream).
            frame = self.frame[idx]

            # When training, shift the centre frame by a random +/- JITTER frames.
            if self.mode == 'train':
                frame = frame + random.randint(-self.JITTER, self.JITTER)

            # The two ids of the pair; the literal 'G' is kept as a string.
            players = [p if p == 'G' else int(p) for p in self.players[idx]]

            # Crops for this record, Endzone frames first, then Sideline frames.
            imgs = []

            for view in ['Endzone', 'Sideline']:
                video = self.game_play[idx] + f'_{view}.mp4'

                # One (interpolated) box per window frame, then every STRIDE-th box.
                bboxes = self._window_bboxes(video, players, frame)[::self.STRIDE]

                # A NaN sum (no detection at all in the window) compares False here.
                has_boxes = bboxes.sum() > 0

                sampled_frames = range(frame - self.WINDOW, frame + self.WINDOW + 1, self.STRIDE)
                for i, f in enumerate(sampled_frames):
                    img_new = np.zeros((self.CROP_SIZE, self.CROP_SIZE), dtype=np.float32)

                    # Extracted frames are numbered from 1 up to video2frames[video];
                    # anything outside that range stays an all-zero image.
                    if has_boxes and 1 <= f <= video2frames[video]:
                        frame_path = f'{self.frame_dir}/{video}_{f:04d}.jpg'
                        img = cv2.imread(frame_path, 0)
                        if img is None:
                            # cv2.imread signals failure by returning None.
                            raise FileNotFoundError(f'could not read frame image {frame_path}')

                        # Zoom in on the players: crop around the centre of the box.
                        x, w, y, h = bboxes[i]
                        img_new = self._crop_centered(
                            img, int(x + w / 2), int(y + h / 2), self.CROP_SIZE
                        )

                    imgs.append(img_new)

            # Cast all features for this record to float
            feature = np.float32(self.feature[idx])

            # (n_crops, H, W) -> (H, W, n_crops) so the crops become image channels
            img = np.array(imgs).transpose(1, 2, 0)

            img = self.aug(image=img)["image"]
            record_label = np.float32(labels[idx])

            yield (img, feature), record_label