## Kaggle competition: NFL Big Data Bowl
- Refer to [How many yards will an NFL player gain after receiving a handoff?](https://www.kaggle.com/c/nfl-big-data-bowl-2020)
- BERT: Bidirectional Encoder Representations from Transformers
- Pytorch Transformer model
- The notebook was originally obtained from [the kaggle soltuion notebook](https://www.kaggle.com/nyanpn/pytorch-transformer-public-14th-private-22nd)
- Refer to https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/119314

## Contents
- Load data and prepare features
- Prepare a customized data loader for pytorch models
- Define functions and debug them one by one
- Split data sets into train and validation
- Define Transformer model
- Parse options and instantiate a model with parsed options
- Train models on GPU/Cuda
- Ensemble through Snapshot

In [1]:
import copy
import csv
import time
from typing import Optional, Tuple, List, Dict, Type

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

from torch.nn.modules.module import Module
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.linear import Linear
from torch.nn.modules.normalization import LayerNorm
from torch.nn.parameter import Parameter
from torch.nn.init import xavier_uniform_
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from torch.nn.modules.container import ModuleList
#
from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
torch.manual_seed(0)
np.random.seed(0)

## Load  and prepare features

In [2]:
#######################################################################################################################
# Prep and Feature Engineering
#######################################################################################################################

# Thanks to: https://www.kaggle.com/cpmpml/initial-wrangling-voronoi-areas-in-python
# new: adjusted features
    #df.loc[:, 'S'] = 10 * df['Dis']    
    #df.loc[df.Season == 2017, 'A'] = df[df.Season == 2018]['A'].mean()
    
def prep(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    print(f"prepare features from source data with function: prep")       
    df['ToLeft'] = df.PlayDirection == "left"
    df['IsBallCarrier'] = df.NflId == df.NflIdRusher

    df.loc[df.VisitorTeamAbbr == "ARI", 'VisitorTeamAbbr'] = "ARZ"
    df.loc[df.HomeTeamAbbr == "ARI", 'HomeTeamAbbr'] = "ARZ"

    df.loc[df.VisitorTeamAbbr == "BAL", 'VisitorTeamAbbr'] = "BLT"
    df.loc[df.HomeTeamAbbr == "BAL", 'HomeTeamAbbr'] = "BLT"

    df.loc[df.VisitorTeamAbbr == "CLE", 'VisitorTeamAbbr'] = "CLV"
    df.loc[df.HomeTeamAbbr == "CLE", 'HomeTeamAbbr'] = "CLV"

    df.loc[df.VisitorTeamAbbr == "HOU", 'VisitorTeamAbbr'] = "HST"
    df.loc[df.HomeTeamAbbr == "HOU", 'HomeTeamAbbr'] = "HST"
    
    ## adjusted features from kaggle discussions
    ##06/06/2020
    df.loc[:, 'S'] = 10 * df['Dis']    
    df.loc[df.Season == 2017, 'A'] = df[df.Season == 2018]['A'].mean()
    
    # standardization
    df['TeamOnOffense'] = "home"
    df.loc[df.PossessionTeam != df.HomeTeamAbbr, 'TeamOnOffense'] = "away"
    df['IsOnOffense'] = df.Team == df.TeamOnOffense  # Is player on offense?
    df['YardLine_std'] = 100 - df.YardLine
    df.loc[df.FieldPosition.fillna('') == df.PossessionTeam, 'YardLine_std'] = df.loc[
        df.FieldPosition.fillna('') == df.PossessionTeam, 'YardLine']
    df['X_std'] = df.X
    df.loc[df.ToLeft, 'X_std'] = 120 - df.loc[df.ToLeft, 'X']
    df['Y_std'] = df.Y
    df.loc[df.ToLeft, 'Y_std'] = 160 / 3 - df.loc[df.ToLeft, 'Y']
    df['Orientation_std'] = df.Orientation
    df.loc[df.ToLeft, 'Orientation_std'] = np.mod(180 + df.loc[df.ToLeft, 'Orientation_std'], 360)
    df['Dir_std'] = df.Dir
    df.loc[df.ToLeft, 'Dir_std'] = np.mod(180 + df.loc[df.ToLeft, 'Dir_std'], 360)
    df['IsOffence'] = df['Team'] == df['TeamOnOffense']

    # translate Home/Visitor to Offence/Defense
    df['OffenceScoreBeforePlay'] = df['HomeScoreBeforePlay']
    df.loc[df.TeamOnOffense == "away", 'OffenceScoreBeforePlay'] = df.loc[
        df.TeamOnOffense == "away", 'VisitorScoreBeforePlay']
    df['DefenseScoreBeforePlay'] = df['VisitorScoreBeforePlay']
    df.loc[df.TeamOnOffense == "away", 'DefenseScoreBeforePlay'] = df.loc[
        df.TeamOnOffense == "away", 'HomeScoreBeforePlay']

    df['OffenceTeamAbbr'] = df['HomeTeamAbbr']
    df.loc[df.TeamOnOffense == "away", 'OffenceTeamAbbr'] = df.loc[df.TeamOnOffense == "away", 'VisitorTeamAbbr']
    df['DefenseTeamAbbr'] = df['VisitorTeamAbbr']
    df.loc[df.TeamOnOffense == "away", 'DefenseTeamAbbr'] = df.loc[df.TeamOnOffense == "away", 'HomeTeamAbbr']

    df['Year'] = pd.to_datetime(df.TimeSnap).dt.year
    df.loc[df['Year'] == 2017, 'Orientation_std'] = np.mod(90 + df.loc[df['Year'] == 2017, 'Orientation_std'], 360)

    player_features = ['X_std', 'Y_std', 'S', 'A', 'Dis', 'Orientation_std', 'Dir_std', 'NflId', 'JerseyNumber',
                       'PlayerHeight',
                       'PlayerWeight', 'PlayerBirthDate', 'PlayerCollegeName', 'Position', 'IsBallCarrier', 'IsOffence']

    play_features = ['YardLine_std', 'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance', 'FieldPosition',
                     'OffenceScoreBeforePlay', 'DefenseScoreBeforePlay', 'OffenseFormation', 'OffensePersonnel',
                     'DefendersInTheBox', 'DefensePersonnel', 'TimeHandoff', 'TimeSnap',
                     'OffenceTeamAbbr', 'DefenseTeamAbbr', 'Week', 'Stadium', 'Location',
                     'Turf', 'GameWeather', 'Temperature', 'Humidity', 'WindSpeed', 'WindDirection',
                     'TeamOnOffense', 'HomeTeamAbbr', 'Year']

    if 'Yards' in df:
        play_features.append('Yards')

    players = df[['PlayId', 'GameId'] + player_features].copy()

    play = df[['GameId', 'PlayId'] + play_features].copy().drop_duplicates(subset=['PlayId'])

    return play, players


def prep_players_nn(play, players, scaler=None, scaler_meta=None):
    print(f"prepare player data with function: prep_players_nn")    
    p = players.drop(['NflId', 'JerseyNumber', 'PlayerHeight', 'PlayerBirthDate', 'PlayerCollegeName', 'GameId'],
                     axis=1).copy()

    if scaler is None:
        is_training = True
    else:
        assert scaler_meta is not None
        is_training = False

    if 'YardLine_std' not in p:
        p = pd.merge(p, play[['YardLine_std', 'PlayId']], on='PlayId', how='left')

    p['IsBallCarrier'] = p['IsBallCarrier'].astype(int)
    p['IsOffence'] = p['IsOffence'].astype(int)
    p['X_std'] -= p['YardLine_std']

    p['Dir_cos'] = np.cos(np.deg2rad(90 - p['Dir_std']))
    p['Dir_sin'] = np.sin(np.deg2rad(90 - p['Dir_std']))
    
    ## fill in missing with 0
    p = p.fillna(0)

    rb = p[p['IsBallCarrier'] == 1][['X_std', 'Y_std', 'PlayId', 'S', 'Dir_sin', 'Dir_cos', 'Dir_std']]
    rb.columns = ['X_', 'Y_', 'PlayId', 'S_', 'Dir_sin_', 'Dir_cos_', 'Dir_std_']

    p = pd.merge(p, rb, how='left')
    p['DX'] = p['X_std'] - p['X_']
    p['DY'] = p['Y_std'] - p['Y_']

    # Relative angle from on Rusher's vector
    angles = 90.0 - np.rad2deg(np.arctan2(p['DY'], p['DX']))
    p['AngleFromRB'] = angles - p['Dir_std_']
    p['AngleFromRB'] = np.mod(p['AngleFromRB'] + 360, 360)
    p.loc[p['AngleFromRB'] > 180, 'AngleFromRB'] -= 360

    # dTheta of AngleFromRB
    dt = 0.001
    p['DX2'] = (p['X_std'] + dt * p['Dir_cos'] * p['S']) - (p['X_'] + dt * p['Dir_cos_'] * p['S_'])
    p['DY2'] = (p['Y_std'] + dt * p['Dir_sin'] * p['S']) - (p['Y_'] + dt * p['Dir_sin_'] * p['S_'])

    angles = 90.0 - np.rad2deg(np.arctan2(p['DY2'], p['DX2']))
    p['AngleFromRB2'] = angles - p['Dir_std_']
    p['AngleFromRB2'] = np.mod(p['AngleFromRB2'] + 360, 360)
    p.loc[p['AngleFromRB2'] > 180, 'AngleFromRB2'] -= 360
    p['AngleFromRB2'] = p['AngleFromRB2'] - p['AngleFromRB']
    p.loc[p['AngleFromRB2'] > 180, 'AngleFromRB2'] -= 360
    p.loc[p['AngleFromRB2'] < -180, 'AngleFromRB2'] += 360

    p.loc[p['IsBallCarrier'] == 1, 'AngleFromRB'] = 0
    p.loc[p['IsBallCarrier'] == 1, 'AngleFromRB2'] = 0

    p.drop(['DX2', 'DY2'], axis=1, inplace=True)

    p['AngleTan'] = np.arctan2(p['DY'], p['DX'])

    p = p.replace([np.inf, -np.inf], np.nan)
    p = p.fillna(0)
    p.drop(['X_', 'Y_', 'Dir_sin_', 'Dir_cos_', 'S_', 'Dir_std_', 'Orientation_std', 'Dir_std'], axis=1, inplace=True)

    concat = p.drop(['Position', 'PlayId', 'YardLine_std'], axis=1)

    if is_training:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(concat.values)
    else:
        scaled = scaler.transform(concat.values)
    df = pd.DataFrame(scaled, columns=concat.columns)

    df_rusher = df[p['IsBallCarrier'] == 1].reset_index(drop=True)
    drop_cols = ['IsBallCarrier', 'IsOffence', 'OffenseDist0', 'PlayerWeight', 'DX', 'DY', 'AngleFromRB', 'AngleToRB',
                 'DistDelta', 'AngleTan', 'AngleFromRB2', 'delaunay_adj']
    drop_cols = [c for c in drop_cols if c in df_rusher.columns]
    df_rusher.drop(drop_cols, axis=1, inplace=True)
    df_rusher = df_rusher.set_index(players['PlayId'].iloc[np.arange(0, len(players), 22)])

    # meta features
    meta = play[['YardLine_std', 'Distance']].copy()
    meta = meta.replace([np.inf, -np.inf], np.nan)
    meta = meta.fillna(0) ## fill in missing with 0

    if scaler_meta is not None:
        scaled_meta = scaler_meta.transform(meta.values)
    else:
        scaler_meta = StandardScaler()
        scaled_meta = scaler_meta.fit_transform(meta.values)
    scaled_meta = pd.DataFrame(scaled_meta, columns=meta.columns)

    return df.set_index(players['PlayId']), df_rusher, scaled_meta, scaler, scaler_meta


### A user-defined data class for training model

In [3]:
class Data(object):
    def __init__(self, players: torch.Tensor, rusher: torch.Tensor, meta: torch.Tensor, y: Optional[torch.Tensor],
                 yardLine: np.ndarray, year: np.ndarray, player_cols: List[str], rusher_cols: List[str],
                 meta_cols: List[str]):
        ## inputs with tensor and ndarray
        self.players = players
        self.rusher = rusher
        self.meta = meta
        self.y = y
        self.yardLine = yardLine
        self.year = year
        self.player_cols = player_cols
        self.rusher_cols = rusher_cols
        self.meta_cols = meta_cols

        assert self.players.size(0) == self.rusher.size(0)
        if yardLine is not None:
            assert len(yardLine) == self.players.size(0)

    def len(self):
        return self.players.size(0)

    def y_soft(self, sigma: float = 1.0):
        from scipy.ndimage.filters import gaussian_filter1d
        return torch.from_numpy(gaussian_filter1d(self.y.numpy(), sigma=sigma))

    def slice(self, begin: int, end: int) -> 'Data':
        ## inputs:(p, r, m, yd, yr) and target: y
        p = self.players[begin:end] if self.players is not None else None
        r = self.rusher[begin:end] if self.rusher is not None else None
        m = self.meta[begin:end] if self.meta is not None else None
        y = self.y[begin:end] if self.y is not None else None
        yd = self.yardLine[begin:end].copy() if self.yardLine is not None else None
        yr = self.year[begin:end].copy() if self.year is not None else None

        return Data(p, r, m, y, yd, yr, self.player_cols, self.rusher_cols, self.meta_cols)

    def _sample_by_mask(self, mask):
        ## sub-sample observations, which will be used for data from year 2017 
        mask_tensor = torch.from_numpy(mask)
        p = self.players[mask_tensor] if self.players is not None else None
        r = self.rusher[mask_tensor] if self.rusher is not None else None
        m = self.meta[mask_tensor] if self.meta is not None else None
        y = self.y[mask_tensor] if self.y is not None else None
        yd = self.yardLine[mask].copy() if self.yardLine is not None else None
        yr = self.year[mask].copy() if self.year is not None else None

        return Data(p, r, m, y, yd, yr, self.player_cols, self.rusher_cols, self.meta_cols)

    def downsample_2017(self, dropout_rate: float):
        assert 0 <= dropout_rate <= 1.0
        print(f"down sample for games in 2017 with rate: {dropout_rate}")
        dropout = np.random.choice([True, False], size=len(self.year), p=[1 - dropout_rate, dropout_rate])
        mask = (self.year != 2017) | dropout
        return self._sample_by_mask(mask)

    def shuffled(self):
        print(f"start shuffled")
        indices = np.random.permutation(self.players.shape[0])
        p = np.take(self.players, indices, axis=0)
        r = np.take(self.rusher, indices, axis=0) if self.rusher is not None else None
        m = np.take(self.meta, indices, axis=0) if self.meta is not None else None
        y = np.take(self.y, indices, axis=0) if self.y is not None else None
        yd = np.take(self.yardLine, indices, axis=0).copy() if self.yardLine is not None else None
        yr = np.take(self.year, indices, axis=0).copy() if self.year is not None else None
        return Data(p, r, m, y, yd, yr, self.player_cols, self.rusher_cols, self.meta_cols)

    def random_split(self, p: float):
        mask = np.random.choice([True, False], p=[p, 1 - p], size=self.meta.size(0))
        d1 = self._sample_by_mask(mask)
        d2 = self._sample_by_mask(~mask)
        return d1, d2

## Load  data from source files and split data into training and validation

In [4]:
#######################################################################################################################
# Load & Validation Split
#######################################################################################################################

def load_data_nn_test(test_df: pd.DataFrame, scaler: StandardScaler, scaler_meta: StandardScaler) -> Data:
    play, players = prep(test_df)
    players_nn, rusher_nn, meta_nn, _, _ = prep_players_nn(play, players, scaler, scaler_meta)
    X = players_nn.values.astype(np.float32).reshape((-1, 22, len(players_nn.columns)))
    Xr = rusher_nn.values.astype(np.float32)
    Xm = meta_nn.values.astype(np.float32)
    year = play['Year'].values.copy()

    return Data(torch.from_numpy(X), torch.from_numpy(Xr), torch.from_numpy(Xm),
                None, play['YardLine_std'].values.copy(), year,
                list(players_nn.columns), list(rusher_nn.columns), list(meta_nn.columns))


def load_data_nn(n_plays_sample=None,
                 nfolds=None, nidx=None,
                 skiprows=None,
                 game_set=None) -> Tuple[Data, Data, StandardScaler, StandardScaler]:
    print(f"load and prepare training and validation data")
    print(f"parameters to read_csv: nobs to read: {n_plays_sample}, nfolds:{nfolds}, skiprows{skiprows}")
    print(f"game_set for splitting data into train and validation:{game_set}")
    
    
    print(f"---start loading data from source file (train.csv) in load_data_nn()--")
    train = pd.read_csv('./input/train.csv', nrows=n_plays_sample, skiprows=skiprows)
    
    
    play, players = prep(train)
    
    players_nn, rusher_nn, meta_nn, scaler, scaler_meta = prep_players_nn(play, players, None, None)

    assert len(players_nn) == len(rusher_nn) * 22
    assert len(rusher_nn) == len(meta_nn)
    assert len(rusher_nn) == len(play)
    
    play_ids = np.array(players_nn.index[np.arange(0, len(players_nn), 22)])
    game_ids = np.array(players['GameId'].values[np.arange(0, len(players_nn), 22)])

    assert len(play_ids) == len(game_ids)

    if nfolds is not None:
        print(f"prepare data into {nfolds} folds")
        assert nidx is not None
        game_ids_valid = NFL_group_split(train, nfolds, nidx)
    else:
        if game_set is None:
            print(f"game set is None, and will use last 5 games for each team as validation")
        else:
            print(f"use game set to split data data:{game_set}")      
        game_ids_valid = NFL_validation_split(train, game_set)

    X = players_nn.values.astype(np.float32).reshape((-1, 22, len(players_nn.columns)))
    Xr = rusher_nn.values.astype(np.float32)
    Xm = meta_nn.values.astype(np.float32)

    y = np.vstack(play['Yards'].apply(return_delta).values).astype(np.float32)

    yardLine = play['YardLine_std'].values
    year = play['Year'].values

    valid_mask = np.isin(game_ids, np.array(list(game_ids_valid)))

    X_valid = torch.from_numpy(X[valid_mask])
    Xr_valid = torch.from_numpy(Xr[valid_mask])
    Xm_valid = torch.from_numpy(Xm[valid_mask])
    y_valid = torch.from_numpy(y[valid_mask])
    yd_valid = yardLine[valid_mask].copy()
    yr_valid = year[valid_mask].copy()

    X_train = torch.from_numpy(X[~valid_mask])
    Xr_train = torch.from_numpy(Xr[~valid_mask])
    Xm_train = torch.from_numpy(Xm[~valid_mask])
    y_train = torch.from_numpy(y[~valid_mask])
    yd_train = yardLine[~valid_mask].copy()
    yr_train = year[~valid_mask].copy()

    print(f'X_train: {X_train.shape}, {Xr_train.shape}, {Xm_train.shape}')
    print(f'X_valid: {X_valid.shape}, {Xr_valid.shape}, {Xm_valid.shape}')
    print(f'y_train: {y_train.shape}, y_valid: {y_valid.shape}')
    print(f'players: {list(players_nn.columns)}')
    print(f'rusher: {list(rusher_nn.columns)}')
    print(f'meta: {list(meta_nn.columns)}')

    dtrain = Data(X_train, Xr_train, Xm_train, y_train, yd_train, yr_train,
                  list(players_nn.columns), list(rusher_nn.columns), list(meta_nn.columns))
    dvalid = Data(X_valid, Xr_valid, Xm_valid, y_valid, yd_valid, yr_valid,
                  list(players_nn.columns), list(rusher_nn.columns), list(meta_nn.columns))

    assert len(X) == len(y)
    return dtrain, dvalid, scaler, scaler_meta


def return_delta(x):
    temp = np.zeros(199)
    temp[x + 99:] = 1
    return temp


def NFL_validation_split(df: pd.DataFrame, game_set=None):
    games = df[['GameId', 'PossessionTeam']].drop_duplicates()

    # Sort so the latest games are first and label the games with cumulative counter
    games = games.sort_values(['PossessionTeam', 'GameId'], ascending=[True, False])
    games['row_number'] = games.groupby(['PossessionTeam']).cumcount() + 1

    # Use last 5 games for each team as validation. There will be overlap since two teams will have the same
    # GameId
    game_set = game_set or {1, 2, 3, 4, 5}

    # Set of unique game ids
    game_ids = set(games[games['row_number'].isin(game_set)]['GameId'].unique().tolist())

    return game_ids


def NFL_group_split(df: pd.DataFrame, nfolds: int, nidx: int):
    kf = GroupKFold(nfolds)

    train_idx, valid_idx = list(kf.split(df, groups=df['GameId']))[nidx]

    return set(df['GameId'].iloc[valid_idx].unique())




def crps(y_pred, y_true):
    loss = torch.mean((torch.cumsum(y_pred, dim=1) - y_true) ** 2)
    return loss


## Ready to understand the data (shape, features,etc)

In [5]:
train_data, valid_data, scaler, scaler_meta = load_data_nn(None)
print(f"---Members of train_data:---\n {train_data.__dict__.keys()}")
## dir(train_data) ## list available functions and members

load and prepare training and validation data
parameters to read_csv: nobs to read: None, nfolds:None, skiprowsNone
game_set for splitting data into train and validation:None
---start loading data from source file (train.csv) in load_data_nn()--


  if (await self.run_code(code, result,  async_=asy)):


prepare features from source data with function: prep
prepare player data with function: prep_players_nn
game set is None, and will use last 5 games for each team as validation
X_train: torch.Size([27274, 22, 15]), torch.Size([27274, 7]), torch.Size([27274, 2])
X_valid: torch.Size([3733, 22, 15]), torch.Size([3733, 7]), torch.Size([3733, 2])
y_train: torch.Size([27274, 199]), y_valid: torch.Size([3733, 199])
players: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'PlayerWeight', 'IsBallCarrier', 'IsOffence', 'Dir_cos', 'Dir_sin', 'DX', 'DY', 'AngleFromRB', 'AngleFromRB2', 'AngleTan']
rusher: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'Dir_cos', 'Dir_sin']
meta: ['YardLine_std', 'Distance']
---Members of train_data:---
 dict_keys(['players', 'rusher', 'meta', 'y', 'yardLine', 'year', 'player_cols', 'rusher_cols', 'meta_cols'])


In [6]:
print(f"shuffle and downsample data for training")
data = train_data.shuffled()
downsample_2017 = 0.4
if downsample_2017 > 0.0:
    print(downsample_2017)
    data1 = data.downsample_2017(downsample_2017)

shuffle and downsample data for training
start shuffled
0.4
down sample for games in 2017 with rate: 0.4


## Define a BERT model:  Encoder from transformer

In [7]:
#######################################################################################################################
# The Model
#######################################################################################################################

class TransformerModel(nn.Module):
    def __init__(self, ninp: int, nemb: int = 1, nhead: int = 1, nhid: int = 32, nlayers: int = 4, nfinal: int = 1024,
                 dropout_encoder: float = 0.1, dropout_embed: float = 0.0, dropout_classifier: float = 0.0,
                 n_class: int = 199, ninp_rusher: int = 16, pre_LN: bool = False, rusher_emb: int = 32,
                 n_emb_layers: int = 2,
                 ninp_meta: int = 8, meta_emb: int = 32, gauss_noise: float = 0.0,
                 gauss_xy_noise: float = 0.0,
                 n_fin_layers: int = 3, dropout_attn: float = 0):
        """        
         :param ninp: Number of input dimensions(feature dimension per player): 15
         :param nemb: Embedding layer dimension
         :param nhead: number of multi-head attention heads
         :param nhid: hidden dimension of FeedForward(FFN) in transformer
         :param nlayers: number of transformer-encoder layers
         :param nfinal: Dimension of Linear layer after readout
         :param dropout_encoder: Self-Attention, dropout in Encoder
         :param dropout_embed: Embedding layer dropout
         :param dropout_classifier: dropout after readout
         :param n_class:
         :param ninp_rusher: Rusher feature dimension
         :param pre_LN: Whether to set Layer-Normalization placement method to pre-LN
         :param rusher_emb: Rusher's embedding dimension        
        """
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        
        #encoder_layers = TransformerEncoderLayer(nemb, nhead, nhid, dropout_encoder, pre_LN=pre_LN, dropout_attn=dropout_attn)
        #self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.config = BertConfig( 
            3, # no. vocaburary, no use
            hidden_size=nemb,
            num_hidden_layers=nlayers,
            num_attention_heads=nhead,
            intermediate_size=nhid,
            hidden_dropout_prob=dropout_encoder,
            attention_probs_dropout_prob=dropout_attn,
            max_position_embeddings=22,
            type_vocab_size=1 # 
        )

        print(f"bert encoder config:{self.config}")
        self.transformer_encoder  = BertEncoder(self.config)         
        
        #self.n_emb_layers = n_emb_layers
        ## input shape for Conv1d : batch x input channels x seq len
        ## output from Cov1d: batch x input channels x seq len, where the output seq len is decided by kernel size and stride
        self.conv1 = nn.Conv1d(in_channels=ninp, out_channels=nemb, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=nemb, out_channels=nemb, kernel_size=1)
        self.conv3 = nn.Conv1d(in_channels=nemb, out_channels=nemb, kernel_size=1)

        self.relu1 = nn.PReLU()
        self.relu2 = nn.PReLU()
        self.relu3 = nn.PReLU()
        self.relu4 = nn.PReLU()
        self.relu5 = nn.ReLU()
        if dropout_embed > 0:
            self.dropout1 = nn.Dropout(dropout_embed)
        else:
            self.dropout1 = None

        self.avgpool = nn.AvgPool1d(kernel_size=22)  # nn.MaxPool1d(kernel_size=22)

        assert n_fin_layers == 3

        self.linear = nn.Sequential(
            nn.Dropout(dropout_classifier),
            nn.Linear(nemb + rusher_emb + meta_emb, nfinal),
            nn.ReLU(),
            nn.Dropout(dropout_classifier),
            nn.Linear(nfinal, nfinal),
            nn.ReLU(),
            nn.Dropout(dropout_classifier),
            nn.Linear(nfinal, n_class)
        )

        self.activation = nn.Softmax(dim=1)
        self.linear2 = nn.Linear(ninp_rusher, rusher_emb)
        self.gauss_noise = gauss_noise
        self.gauss_xy_noise = gauss_xy_noise
        if meta_emb > 0:
            self.linear3 = nn.Linear(ninp_meta, meta_emb)
        else:
            self.linear3 = None

    def forward(self, x: Data, with_clip: bool = True):
        # src: [Batch x Players(22) x Player Vector]
        src = x.players
        src_rusher = x.rusher
        
        
        # src: [Batch x Player Vector x Players(22)] after permuation
        src = src.permute([0, 2, 1])

        if self.training:
            # gaussian augmentation on training data
            if self.gauss_noise > 0.0:
                noise = torch.randn_like(src) * self.gauss_noise
                src = src + noise
                src = src.to(device) ## to device

            if self.gauss_xy_noise > 0.0:
                # dx = torch.randn(src.size(0)) * self.gauss_xy_noise
                dy = torch.randn(src.size(0)) * self.gauss_xy_noise
                dy = dy.to(device)
                # Batch x 1 x 22
                # src[:, 0, :] += dx.reshape(src.size(0), 1).expand(src.size(0), src.size(2))
                src[:, 1, :] += dy.reshape(src.size(0), 1).expand(src.size(0), src.size(2))

                # src_rusher[:, 0] += dx
                src_rusher[:, 1] += dy

        # src: [Batch x Player Vector(embedded, 4*inp) x Players(22)]
        # conv1d expect: [batch_size, in_channels, len]
        # With kernel size 1, no interaction between players
        src = self.relu1(self.conv1(src))
        src = self.dropout1(src)
        src = self.relu2(self.conv2(src))
        src = self.dropout1(src)
        src = self.relu3(self.conv3(src))

        src = src.permute([2, 0, 1])

        # output: [Players(22) x Batch x Transformed Player Vector]
        head_mask = [None] * self.config.num_hidden_layers
        #extended_attention_mask, head_mask=head_mask
        # input shape for encoder: seq len x batch x embedding len
        output = self.transformer_encoder(src, attention_mask=None, head_mask=head_mask)[0]

        # output: [Batch x Transformed Player Vector x Players(22)]
        output = output.permute([1, 2, 0])

        # output: [Batch x Transformed Player Vector]
        output = torch.squeeze(self.avgpool(output), dim=2)

        if self.linear3 is not None:
            output = torch.cat((output, self.relu4(self.linear2(src_rusher)), self.relu5(self.linear3(x.meta))), dim=1)
        else:
            output = torch.cat((output, self.relu4(self.linear2(src_rusher))), dim=1)

        # output: [Batch x n_class]
        output = self.linear(output)
        output = self.activation(output)

        if not self.training and x.yardLine is not None:
            
            output = torch.cumsum(output, dim=1).cpu().numpy()

            output = np.clip(output, 0.0, 1.0)

            # mask
            if with_clip:
                left = 99 - x.yardLine
                right = 199 - x.yardLine
                for k in range(len(output)):
                    output[k, :left[k] + 1] = 0.0
                    output[k, right[k]:] = 1.0

        return output


In [8]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--ninp', default=15)
parser.add_argument('--nemb', default=128)
parser.add_argument('--nhead', default=1)
parser.add_argument('--nhid', default=96)  # number of hidden units in attention
parser.add_argument('--nlayers', default = 4)  # number of transformers stacked
parser.add_argument('--nfinal', default = 512)  # number of hidden units in final layers

parser.add_argument('--gamma', default=0.976)
parser.add_argument('--dropout_encoder', default=0.0)
parser.add_argument('--n_emb_layers', default=3)
parser.add_argument('--dropout_classifier', default=0.3)
parser.add_argument('--dropout_embed', default=0.15)
parser.add_argument('--dropout_attn', default=0.35)


parser.add_argument('--meta_emb', default=8)
parser.add_argument('--downsample_2017', default=0.4)
parser.add_argument('--gauss_noise', default=0.15)
parser.add_argument('--gauss_xy_noise', default=0.1)
parser.add_argument('--n_fin_layers', default=3)
parser.add_argument('--model_name', default='transformer_dsbowl')
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--lr', type=float, default=0.0001)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--weight_decay', type=float, default=1e-6)
parser.add_argument('--device', default='cuda:0')
parser.add_argument('--save_dir', default='.') ## model saved at the current directory
parser.add_argument('--log_filename', default='encoder_model_log.txt')
args = parser.parse_args("")  
args

Namespace(batch_size=16, device='cuda:0', downsample_2017=0.4, dropout_attn=0.35, dropout_classifier=0.3, dropout_embed=0.15, dropout_encoder=0.0, epochs=50, gamma=0.976, gauss_noise=0.15, gauss_xy_noise=0.1, log_filename='encoder_model_log.txt', lr=0.0001, meta_emb=8, model_name='transformer_dsbowl', n_emb_layers=3, n_fin_layers=3, nemb=128, nfinal=512, nhead=1, nhid=96, ninp=15, nlayers=4, save_dir='.', weight_decay=1e-06)

In [9]:
### prepare model specification from parsed options
params = {
    'ninp': args.ninp, ## from train_data.players.shape[2],
    'nemb': args.nemb,
    'nhead': args.nhead,
    'nhid': args.nhid,
    'nlayers': args.nlayers,
    'nfinal': args.nfinal,
    'ninp_rusher': train_data.rusher.shape[1],
    'pre_LN': True,
    'dropout_encoder': args.dropout_encoder,
    'dropout_embed': args.dropout_embed,
    'dropout_classifier': args.dropout_classifier,
    'n_emb_layers': args.n_emb_layers,
    'ninp_meta': train_data.meta.shape[1],
    'meta_emb': args.meta_emb,
    'gauss_noise': args.gauss_noise,
    'gauss_xy_noise': args.gauss_xy_noise,
    'n_fin_layers': args.n_fin_layers,
    'dropout_attn': args.dropout_attn
    
}

print(f"model specifications from params: {params}")
##NOTE: pass parameters with key value pairs: **params
model = TransformerModel(**params)

model specifications from params: {'ninp': 15, 'nemb': 128, 'nhead': 1, 'nhid': 96, 'nlayers': 4, 'nfinal': 512, 'ninp_rusher': 7, 'pre_LN': True, 'dropout_encoder': 0.0, 'dropout_embed': 0.15, 'dropout_classifier': 0.3, 'n_emb_layers': 3, 'ninp_meta': 2, 'meta_emb': 8, 'gauss_noise': 0.15, 'gauss_xy_noise': 0.1, 'n_fin_layers': 3, 'dropout_attn': 0.35}
bert encoder config:BertConfig {
  "attention_probs_dropout_prob": 0.35,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 96,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 22,
  "model_type": "bert",
  "num_attention_heads": 1,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "type_vocab_size": 1,
  "vocab_size": 3
}



In [10]:
model

TransformerModel(
  (transformer_encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.35, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=128, out_features=128, bias=True)
            (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=128, out_features=96, bias=True)
        )
        (output): BertOutput(
          (dense): Linear(in_features=96, out_features=128, bias=True)
          (LayerNorm): LayerNorm((128,), eps=1e-12

### Calculate the number of model's parameters

In [11]:
### model parameters
model_parameters = filter(lambda p: p.requires_grad,model.parameters())
print(f"number of model parameters:{sum([np.prod(p.size()) for p in model_parameters])}")

number of model parameters:852067


## Display first k layer's names and their shapes

In [12]:
## model structure and its weights
nLayers = 20
print(f"Shows the first {nLayers} layers from model")
i=0
for name, param in model.named_parameters():
    if i<nLayers:
        if param.requires_grad:
            print (name, param.shape)
            i += 1
        

Shows the first 20 layers from model
transformer_encoder.layer.0.attention.self.query.weight torch.Size([128, 128])
transformer_encoder.layer.0.attention.self.query.bias torch.Size([128])
transformer_encoder.layer.0.attention.self.key.weight torch.Size([128, 128])
transformer_encoder.layer.0.attention.self.key.bias torch.Size([128])
transformer_encoder.layer.0.attention.self.value.weight torch.Size([128, 128])
transformer_encoder.layer.0.attention.self.value.bias torch.Size([128])
transformer_encoder.layer.0.attention.output.dense.weight torch.Size([128, 128])
transformer_encoder.layer.0.attention.output.dense.bias torch.Size([128])
transformer_encoder.layer.0.attention.output.LayerNorm.weight torch.Size([128])
transformer_encoder.layer.0.attention.output.LayerNorm.bias torch.Size([128])
transformer_encoder.layer.0.intermediate.dense.weight torch.Size([96, 128])
transformer_encoder.layer.0.intermediate.dense.bias torch.Size([96])
transformer_encoder.layer.0.output.dense.weight torch.Si

### Prepare for training
- evaluation metric
- repeat/loop train and evaluation

In [13]:
def evaluate(eval_model: nn.Module, data: Data, with_clip: bool = True, device: torch.device =None):
    #print(f"in evalueate, device:{device}")
    eval_model.eval()  # Turn on the evaluation mode
    assert data.y is not None

    with torch.no_grad():
        y_actual = data.y.numpy()
        '''
        data.players = data.players.to(device)
        data.rusher = data.rusher.to(device)
        data.meta = data.meta.to(device)
        '''
        data = data_to_device(data, device)
        y_predicted = eval_model(data, with_clip)

        loss = np.sum((y_predicted - y_actual) ** 2) / (199 * len(y_predicted)) ## 199 predictions for one sample

    return loss
## move data to device for model evaluation
def data_to_device(data, device):
    ## data on device for model evaluation
    data.players = data.players.to(device)
    data.rusher = data.rusher.to(device)
    data.meta = data.meta.to(device)
    return data

In [14]:
#######################################################################################################################
# Training & Pseudo-Labeling
#######################################################################################################################

def train_model(model: nn.Module, scheduler, batch_size: int,
                train_data: Data, valid_data: Data, writer,
                epochs:int,
                downsample_2017: float,
                calc_train_loss: bool = True,
                params: Dict = None,
                device: torch.device =None):
    print(f"in train_model, device:{device}")
    np.random.seed(0)

    # Keep number of data in 1 epoch same between 1st/2nd stage
    std_batch_length = 12000 // batch_size + 1
    n_total_batch = 0
    epoch = 1
    epoch_start_time = time.time()
    ensemble = None
    print(f"epochs in train_model:{epochs}")
    model.eval()

    snapshots = SnapShots(torelance=1.007, interval=3)

    snapshots.add(model, 0, evaluate(model, valid_data,device=device))

    model.train()  # Turn on the train mode

    while epoch < epochs:
        print(f"start training at epoch: {epoch} and shuffled")
        data = train_data.shuffled()
        if downsample_2017 > 0.0:
            data = data.downsample_2017(downsample_2017)
        for i in range(0, data.len() - 1, batch_size):
            model.train()  # Turn on the train mode
            iend = min(i + batch_size, data.len())
            batch_data = data.slice(i, iend)
            optimizer.zero_grad()
            batch_data = data_to_device(batch_data, device)
            output = model(batch_data)

            loss = crps(output, batch_data.y.to(device))
            #print(f"loss type: {loss.is_cuda}")            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            n_total_batch += 1

            if n_total_batch > std_batch_length:
                model.eval()
                
                if calc_train_loss and epoch % 1 == 0:
                    #train_loss = evaluate(model, data, device=device)
                    train_loss = loss.item() ## from one batch only
                else:
                    train_loss = -1

                if epoch % 1 == 0 or epoch > 30:
                    ## data to device will be done in evaluate
                    ## IMPORTANT: due to limit memory, evaluate in batch mode on GPU
                    val_loss = 0
                    for i in range(0, valid_data.len() - 1, 100):
                        iend = min(i + 100, valid_data.len())
                        batch_valid_data = valid_data.slice(i, iend)                  
                        val_loss_batch = evaluate(model, batch_valid_data, device=device)
                        val_loss += val_loss_batch*(iend-i+1)
                        
                    val_loss = val_loss/valid_data.len()    
                    snapshots.add(model, epoch, val_loss/valid_data.len())
                else:
                    val_loss = -1

                print('| end of epoch {:3d} | lr: {:2.5f} | time: {:5.2f}s | train loss {:5.6f} | '
                      'valid loss {:5.9f}'.format(epoch, scheduler.get_lr()[0], (time.time() - epoch_start_time),
                                                  train_loss, val_loss))
                #scheduler.step()
                if writer is not None:
                    writer.writerow([epoch, scheduler.get_lr()[0], train_loss, val_loss])
                    f.flush()

                n_total_batch = 0
                epoch += 1
                epoch_start_time = time.time()
                model.train()  # Turn on the train mode
                scheduler.step()
                #print(f"after step, lr:{scheduler.get_lr()[0]}")
                if epoch in [40]:
                    batch_size *= 2
                    std_batch_length = 12000 // batch_size + 1

    # reload best model
    model = snapshots.load_best_single_model(model)
    eval_single = evaluate(model, valid_data)

    try:
        ensemble = snapshots.load_ensemble_model(TransformerModel, params)
        eval_ensemble = evaluate(ensemble, valid_data)
        print(f'final loss: {eval_single:.8f} (single) / {eval_ensemble:.8f} ({len(ensemble.models)} models)')
    except:
        pass

    return ensemble if ensemble is not None else model


### Ensemble through Snapshot

In [15]:
#######################################################################################################################
## SnapShot & Ensemble
#######################################################################################################################

class EnsembleModel(object):
    def __init__(self):
        self.models = []

    def add_model(self, model):
        self.models.append(model)

    def train(self):
        for m in self.models:
            m.train()

    def eval(self):
        for m in self.models:
            m.eval()

    def __call__(self, *input, **kwargs):
        assert len(self.models) >= 1

        base = self.models[0](*input, **kwargs)

        for m in self.models[1:]:
            base += m(*input, **kwargs)

        return base / len(self.models)


class SnapShot(object):
    def __init__(self, model: nn.Module, epoch: int, loss: float):
        self.state = copy.deepcopy(model.state_dict())
        self.epoch = epoch
        self.loss = loss
        torch.save(self.state, f'snapshot_{epoch}_{loss}')


class SnapShots(object):
    def __init__(self, interval: int = 3, torelance: float = 1.01, verbose: bool = True):
        self.best_model = None  # type: Optional[SnapShot]
        self.snap_shots = []  # type: List[SnapShot]
        self.best_val_loss = 1.0
        self.interval = interval
        self.torelance = torelance
        self.verbose = verbose

    def add(self, model: nn.Module, epoch: int, loss: float):
        if loss < self.best_val_loss:
            if self.verbose:
                print(f'best model is updated. epoch{epoch}, loss={loss:.7f}')
            self.best_model = SnapShot(model, epoch, loss)
            self.best_val_loss = loss
        if epoch % self.interval == 0:
            if self.verbose:
                print(f'Add snapshot. epoch{epoch}, loss={loss:.7f}')
            self.snap_shots.append(SnapShot(model, epoch, loss))

    def load_best_single_model(self, model: nn.Module):
        if self.best_model is not None:
            model.load_state_dict(self.best_model.state)
        return model

    def load_ensemble_model(self, cls: Type, params: Dict, max_models: int = 5):
        model = EnsembleModel()
        best = cls(**params)
        self.load_best_single_model(best)
        model.add_model(best)

        candidates = []  # Tuple[int, float]

        for i, s in enumerate(self.snap_shots):
            if s.loss > self.torelance * self.best_model.loss:
                continue
            if s.epoch == self.best_model.epoch:
                continue

            candidates.append((i, s.loss))

        # Collect top-n models from snapshot
        candidates = sorted(candidates, key = lambda x: x[1])
        if len(candidates) > max_models - 1:
            candidates = candidates[:max_models - 1]
        for i, _ in candidates:
            s = self.snap_shots[i]
            sub = cls(**params)
            sub.load_state_dict(s.state)
            model.add_model(sub)
            if self.verbose:
                print(f'add {s.epoch}-th epoch to ensemble (loss:{s.loss:.7f})')

        return model

In [16]:
args

Namespace(batch_size=16, device='cuda:0', downsample_2017=0.4, dropout_attn=0.35, dropout_classifier=0.3, dropout_embed=0.15, dropout_encoder=0.0, epochs=50, gamma=0.976, gauss_noise=0.15, gauss_xy_noise=0.1, log_filename='encoder_model_log.txt', lr=0.0001, meta_emb=8, model_name='transformer_dsbowl', n_emb_layers=3, n_fin_layers=3, nemb=128, nfinal=512, nhead=1, nhid=96, ninp=15, nlayers=4, save_dir='.', weight_decay=1e-06)

## Ready to train transformer model

In [17]:
%time
### debug purpose
print(f"single mode training")
 
mode = 'single'
n_holdout = 0
n_train = None
print(f"args:{args}")
#device='cuda:0'
#device ='cpu'
device = torch.device(args.device)
## For debug purpose, use a small number of epochs
args.epochs = 3
print(f"---train on device:{device}---")
train_data, valid_data, scaler, scaler_meta = load_data_nn(None) 
model = TransformerModel(**params)
#model = TransformerModel(**params)
model.to(device)
f = open(args.log_filename + '.csv', 'w+', newline='')
writer = csv.writer(f)
writer.writerow(train_data.player_cols)
writer.writerow(train_data.rusher_cols)
writer.writerow(['epoch', 'lr', 'train_loss', 'valid_loss'])

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=args.gamma)

model = train_model(model, scheduler, args.batch_size, train_data, valid_data, writer, 
                    args.epochs, downsample_2017, params=params, device=device)


CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.82 µs
single mode training
args:Namespace(batch_size=16, device='cuda:0', downsample_2017=0.4, dropout_attn=0.35, dropout_classifier=0.3, dropout_embed=0.15, dropout_encoder=0.0, epochs=50, gamma=0.976, gauss_noise=0.15, gauss_xy_noise=0.1, log_filename='encoder_model_log.txt', lr=0.0001, meta_emb=8, model_name='transformer_dsbowl', n_emb_layers=3, n_fin_layers=3, nemb=128, nfinal=512, nhead=1, nhid=96, ninp=15, nlayers=4, save_dir='.', weight_decay=1e-06)
---train on device:cuda:0---
load and prepare training and validation data
parameters to read_csv: nobs to read: None, nfolds:None, skiprowsNone
game_set for splitting data into train and validation:None
---start loading data from source file (train.csv) in load_data_nn()--


  if (await self.run_code(code, result,  async_=asy)):


prepare features from source data with function: prep
prepare player data with function: prep_players_nn
game set is None, and will use last 5 games for each team as validation
X_train: torch.Size([27274, 22, 15]), torch.Size([27274, 7]), torch.Size([27274, 2])
X_valid: torch.Size([3733, 22, 15]), torch.Size([3733, 7]), torch.Size([3733, 2])
y_train: torch.Size([27274, 199]), y_valid: torch.Size([3733, 199])
players: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'PlayerWeight', 'IsBallCarrier', 'IsOffence', 'Dir_cos', 'Dir_sin', 'DX', 'DY', 'AngleFromRB', 'AngleFromRB2', 'AngleTan']
rusher: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'Dir_cos', 'Dir_sin']
meta: ['YardLine_std', 'Distance']
bert encoder config:BertConfig {
  "attention_probs_dropout_prob": 0.35,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 96,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 22,
  "model_type": "bert",
  "num_attention_heads": 1,
  



start training at epoch: 2 and shuffled
start shuffled
down sample for games in 2017 with rate: 0.4
best model is updated. epoch2, loss=0.0000036
| end of epoch   2 | lr: 0.00010 | time: 18.83s | train loss 0.008589 | valid loss 0.013412119
best model is updated. epoch3, loss=0.0000036
Add snapshot. epoch3, loss=0.0000036
| end of epoch   3 | lr: 0.00009 | time: 17.77s | train loss 0.007942 | valid loss 0.013256483
bert encoder config:BertConfig {
  "attention_probs_dropout_prob": 0.35,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 96,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 22,
  "model_type": "bert",
  "num_attention_heads": 1,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "type_vocab_size": 1,
  "vocab_size": 3
}



 ### Train models in different modes:
 - Single mode
 - Ensemble mode
 - Grid mode

In [18]:
def train_in_mode(params, mode):
    if mode == 'grid':
        print(f"grid mode training")
        train_data, valid_data, scaler, scaler_meta = load_data_nn(None)
        for dropout_embed in [0.05, 0.1, 0.15]:
            for dropout_encoder in [0.2, 0.3, 0.4]:
                log_f = f'{log_filename}_embed{dropout_embed}_encoder{dropout_encoder}.csv'
                params['dropout_encoder'] = dropout_encoder
                params['dropout_embed'] = dropout_embed
                model = TransformerModel(**params)
                model.to(args.device)
                f = open(log_f, 'w+', newline='')
                writer = csv.writer(f)
                writer.writerow([dropout_embed, dropout_classifier])
                writer.writerow(['epoch', 'lr', 'train_loss', 'valid_loss'])

                optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
                scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=args.gamma)

                model = train_model(model, scheduler, args.batch_size, train_data, valid_data, 
                                    writer,  args.epochs, downsample_2017, params=params,device=args.device)

    elif mode == 'ensemble':
        print(f"ensemble training")
        models = EnsembleModel()
        game_sets = [
            {1, 3, 5, 7, 9},
            {2, 4, 6, 8, 10}
        ]

        for i in range(len(game_sets)):
            model = TransformerModel(**params)
            model.to(args.device)
            train_data, valid_data, scaler, scaler_meta = load_data_nn(None, nfolds=8, nidx=i) #load_data_nn(None, game_set=game_sets[i])
            optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)    
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=args.gamma)

            model = train_model(model, scheduler, args.batch_size, train_data, valid_data, None, args.epochs, downsample_2017,
                                calc_train_loss=True, params=param,device=args.device)
            models.add_model(model)
        model = models
    else:
        assert mode == 'single'
        print(f"single mode training")
        train_data, valid_data, scaler, scaler_meta = load_data_nn(None)        
        model = TransformerModel(**params)
        model.to(args.device)
        f = open(log_filename + '.csv', 'w+', newline='')
        writer = csv.writer(f)
        writer.writerow(train_data.player_cols)
        writer.writerow(train_data.rusher_cols)
        writer.writerow(['epoch', 'lr', 'train_loss', 'valid_loss'])

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=args.gamma) ## gamma decay factor of learning
        
        model = train_model(model, scheduler, args.batch_size, train_data, valid_data, writer, args.epochs, downsample_2017,
                            params=params, device=args.device)



In [None]:
## choose different mode to train model
mode = 'ensemble'
args.epochs = 10
log_filename = 'ensemble'
no_decay = ['bias', '.norm']
train_in_mode(params, mode)

ensemble training
bert encoder config:BertConfig {
  "attention_probs_dropout_prob": 0.35,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 96,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 22,
  "model_type": "bert",
  "num_attention_heads": 1,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "type_vocab_size": 1,
  "vocab_size": 3
}

load and prepare training and validation data
parameters to read_csv: nobs to read: None, nfolds:8, skiprowsNone
game_set for splitting data into train and validation:None
---start loading data from source file (train.csv) in load_data_nn()--


  exec(code_obj, self.user_global_ns, self.user_ns)


prepare features from source data with function: prep
prepare player data with function: prep_players_nn
prepare data into 8 folds
X_train: torch.Size([27129, 22, 15]), torch.Size([27129, 7]), torch.Size([27129, 2])
X_valid: torch.Size([3878, 22, 15]), torch.Size([3878, 7]), torch.Size([3878, 2])
y_train: torch.Size([27129, 199]), y_valid: torch.Size([3878, 199])
players: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'PlayerWeight', 'IsBallCarrier', 'IsOffence', 'Dir_cos', 'Dir_sin', 'DX', 'DY', 'AngleFromRB', 'AngleFromRB2', 'AngleTan']
rusher: ['X_std', 'Y_std', 'S', 'A', 'Dis', 'Dir_cos', 'Dir_sin']
meta: ['YardLine_std', 'Distance']
in train_model, device:cuda:0
epochs in train_model:10
best model is updated. epoch0, loss=0.0659919
Add snapshot. epoch0, loss=0.0659919
start training at epoch: 1 and shuffled
start shuffled
down sample for games in 2017 with rate: 0.4
best model is updated. epoch1, loss=0.0000172
| end of epoch   1 | lr: 0.00010 | time: 19.30s | train loss 0.085103 | valid los



start training at epoch: 2 and shuffled
start shuffled
down sample for games in 2017 with rate: 0.4
| end of epoch   2 | lr: 0.00010 | time: 19.26s | train loss 0.083864 | valid loss 0.066655046
Add snapshot. epoch3, loss=0.0000172
| end of epoch   3 | lr: 0.00009 | time: 19.28s | train loss 0.085332 | valid loss 0.066655046
start training at epoch: 4 and shuffled
start shuffled
down sample for games in 2017 with rate: 0.4
| end of epoch   4 | lr: 0.00009 | time: 18.90s | train loss 0.092580 | valid loss 0.066655046
| end of epoch   5 | lr: 0.00009 | time: 18.93s | train loss 0.085941 | valid loss 0.066655046
start training at epoch: 6 and shuffled
start shuffled
down sample for games in 2017 with rate: 0.4
Add snapshot. epoch6, loss=0.0000172
| end of epoch   6 | lr: 0.00009 | time: 17.78s | train loss 0.084727 | valid loss 0.066655046


In [None]:
def concat_dataset(l: Data, r: Data):
    p = torch.cat((l.players, r.players))
    rs = torch.cat((l.rusher, r.rusher))
    m = torch.cat((l.meta, r.meta))
    y = torch.cat((l.y, r.y)) if l.y is not None else None
    yd = np.concatenate([l.yardLine, r.yardLine]) if l.yardLine is not None else None
    yr = np.concatenate([l.year, r.year]) if l.year is not None else None
    print(p.size())
    print(yd.shape)
    return Data(p, rs, m, y, yd, yr, l.player_cols, l.rusher_cols, l.meta_cols)

In [None]:
def tta(test_df, sigma_dir=1.0, sigma_y=1.0):
    n_aug = 10
    test_df_aug = pd.concat([test_df]*n_aug)
    
    test_df_aug['Dir'] += np.random.normal(0, sigma_dir, size=len(test_df_aug))

    # yは共通で上げ下げ
    test_df_aug['Y'] += np.repeat(np.random.normal(0, sigma_y, size=n_aug), 22)
    test_df_aug['PlayId'] += np.repeat(np.arange(0, n_aug), 22)
    
    return test_df_aug

In [None]:
model.eval()  # Turn on the evaluation mode
n_prev = 0

original_data = concat_dataset(train_data, valid_data)

for (test_df, sample_prediction_df) in env.iter_test():
    try:
        test_df = tta(test_df)
        X_test = load_data_nn_test(test_df, scaler, scaler_meta)

    except Exception as e:
        print(f'### ERROR ### {e} / {test_df}')
        # submit as-is if something happened
        env.predict(sample_prediction_df)
        continue

    with torch.no_grad():
        predicted = model(X_test).mean(axis=0)
        sample_prediction_df.iloc[0,:] = np.squeeze(predicted)
        env.predict(sample_prediction_df)


In [None]:
env.write_submission_file()   