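## NFL Big Data Bowl

This notebook builds per-play features from the NFL Big Data Bowl 2020 player-tracking data and trains a neural network that predicts the distribution of yards gained on a rushing play (199 classes, scored with CRPS) using two repeats of 5-fold cross-validation.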

**Content**
1. [Importing Libraries](#1)
1. [Loading Dataset](#2)
1. [Feature Preprocess](#3)
1. [Feature Selection](#4)
1. [NN Model and CV](#5)
1. [Submission](#6)

<a id="1"></a> <br>
# Importing Libraries

In [1]:

# PACKAGES
import os
import time
import warnings
from kaggle.competitions import nflrush

import re
import codecs
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#import keras.backend as K
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Flatten, concatenate, Dropout, Lambda, BatchNormalization,GaussianNoise,LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
# !pip install -U keras-tuner
# # from tensorflow import keras
# # from tensorflow.keras import layers
# #import tensorflow.python as tensorflow
# from kerastuner.tuners import RandomSearch
# #from kerastuner.engine.hypermodel import HyperModel
# #from kerastuner.engine.hyperparameters import HyperParameters
# import kerastuner as kt

# SETTINGS
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): on a machine whose only GPU has index 0 this presumably hides the GPU
# altogether — confirm the intended device before relying on GPU training.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
# Initialise plotly's offline notebook mode so figures render inline.
init_notebook_mode(connected=True)
# Silences every warning category for the rest of the notebook (including
# deprecation warnings raised by pandas / TensorFlow).
warnings.filterwarnings('ignore')
# Wider DataFrame previews: up to 50 columns and 150 rows.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 150)



<a id="2"></a> <br>
# Loading Dataset

In [2]:
# DATA
# Read the raw training table. WindSpeed is read as text ('object') rather than
# letting pandas infer a dtype for it.
origin_df = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
# Working copy; origin_df is kept as the untouched raw frame.
train_df = origin_df.copy()
# One (GameId, PlayId, Yards) row per distinct combination — the prediction target per play.
outcomes = train_df[['GameId','PlayId','Yards']].drop_duplicates()
train_df.head()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,NflId,DisplayName,JerseyNumber,Season,YardLine,Quarter,GameClock,PossessionTeam,Down,Distance,FieldPosition,HomeScoreBeforePlay,VisitorScoreBeforePlay,NflIdRusher,OffenseFormation,OffensePersonnel,DefendersInTheBox,DefensePersonnel,PlayDirection,TimeHandoff,TimeSnap,Yards,PlayerHeight,PlayerWeight,PlayerBirthDate,PlayerCollegeName,Position,HomeTeamAbbr,VisitorTeamAbbr,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,496723,Eric Berry,29,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,212,12/29/1988,Tennessee,SS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,2495116,Allen Bailey,97,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,288,03/25/1989,Miami,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,2495493,Justin Houston,50,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,270,01/21/1989,Georgia,DE,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,2506353,Derrick Johnson,56,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-3,245,11/22/1982,Texas,ILB,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,2530794,Ron Parker,38,2017,35,1,14:14:00,NE,3,2,NE,0,0,2543773,SHOTGUN,"1 RB, 1 TE, 3 WR",6.0,"2 DL, 3 LB, 6 DB",left,2017-09-08T00:44:06.000Z,2017-09-08T00:44:05.000Z,8,6-0,206,08/17/1987,Newberry,FS,NE,KC,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


<a id="3"></a> <br>
# Feature Preprocess

In [3]:
#
# FEATURE UTILITY FUNCTION
#

# WindSpeed
def strToFloat(x):
    """Convert a value to ``float``, falling back to the sentinel ``-1``.

    Parameters
    ----------
    x : object
        Anything ``float()`` accepts (a number or a numeric string).

    Returns
    -------
    float or int
        ``float(x)`` when the conversion succeeds; ``-1`` when ``x`` cannot be
        converted (for example ``None`` or non-numeric text). A float NaN is
        returned unchanged because ``float(nan)`` succeeds.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures map to the sentinel. A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        return -1

# Weather
def mapWeather(txt):
    """Map a weather description to a signed score.

    The text is matched with case-sensitive lower-case keywords, so callers
    are expected to pass lower-cased text.

    Parameters
    ----------
    txt : str or NaN
        Weather description.

    Returns
    -------
    int or float
        ``0`` for missing or unrecognised text; otherwise the score of the
        first matching keyword group (indoor 3, sun 2, clear 1, cloudy -1,
        rain -2, snow -3), halved when the text contains ``'partly'``.
    """
    if pd.isna(txt):
        return 0

    # 'partly' halves the magnitude of whichever score is selected below.
    scale = 0.5 if 'partly' in txt else 1

    # Groups are tested in this order and the first hit wins.
    keyword_scores = (
        (('climate controlled', 'indoor', 'controlled climate'), 3),
        (('sunny', 'sun'), 2),
        (('clear',), 1),
        (('cloudy',), -1),
        (('rain', 'rainy'), -2),
        (('snow',), -3),
    )
    for keywords, score in keyword_scores:
        if any(keyword in txt for keyword in keywords):
            return score * scale
    return 0

# Personnel Offense/Defense
def offensePersonnelSplit(x):
    """Parse an offense personnel string such as ``"1 RB, 1 TE, 3 WR"``.

    Parameters
    ----------
    x : str
        Comma-separated ``"<count> <position>"`` groups.

    Returns
    -------
    dict
        Count per position. DB, DL, LB, OL, QB, RB, TE and WR are always
        present (default 0); any other position found in ``x`` is added.
    """
    counts = dict.fromkeys(('DB', 'DL', 'LB', 'OL', 'QB', 'RB', 'TE', 'WR'), 0)
    for group in x.split(","):
        tokens = group.split(" ")
        # The last token is the position code, the one before it the head count.
        position, number = tokens[-1], tokens[-2]
        counts[position] = int(number)
    return counts

def DefensePersonnelSplit(x):
    """Parse a defense personnel string such as ``"2 DL, 3 LB, 6 DB"``.

    Parameters
    ----------
    x : str
        Comma-separated ``"<count> <position>"`` groups.

    Returns
    -------
    dict
        Count per position. DB, DL, LB and OL are always present (default 0);
        any other position found in ``x`` is added.
    """
    counts = {position: 0 for position in ('DB', 'DL', 'LB', 'OL')}
    for group in x.split(","):
        tokens = group.split(" ")
        # The last token is the position code, the one before it the head count.
        position, number = tokens[-1], tokens[-2]
        counts[position] = int(number)
    return counts

# Orientation (why to category / 15?10?)
def orientationToCat(x):
    """Bucket an angle in degrees into 15-degree sectors, as a string label.

    Parameters
    ----------
    x : float
        Angle in degrees; values are clipped to ``[0, 359]`` before bucketing.

    Returns
    -------
    str
        ``"0"`` .. ``"23"`` for numeric input, or ``"nan"`` when the value
        cannot be bucketed (NaN or a non-numeric value).
    """
    try:
        return str(int(np.clip(x, 0, 360 - 1)/15))
    except (TypeError, ValueError):
        # int(NaN) raises ValueError; non-numeric input raises TypeError.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return "nan"

# Dir-relevant (new Y?)
def new_X(x_coordinate, play_direction):
    """Return the X coordinate mirrored about x = 60 for plays going 'left'.

    Any other ``play_direction`` leaves ``x_coordinate`` unchanged.
    """
    if play_direction == 'left':
        return 120.0 - x_coordinate
    return x_coordinate

def new_line(rush_team, field_position, yardline):
    """Convert a yard line to a 0-120 field coordinate.

    Returns ``10 + yardline`` when ``rush_team`` equals ``field_position``
    and ``60 + (50 - yardline)`` otherwise.
    """
    if rush_team == field_position:
        return 10.0 + yardline
    return 60.0 + (50 - yardline)

def new_orientation(angle, play_direction):
    """Mirror an angle (degrees) for plays that are not going 'right'.

    The angle is returned unchanged when the play goes 'right' or when it is
    exactly 0; otherwise ``360 - angle`` is returned.
    """
    if play_direction == 'right' or angle == 0.0:
        return angle
    return 360.0 - angle

def euclidean_distance(x1,y1,x2,y2):
    """Straight-line distance between the points (x1, y1) and (x2, y2)."""
    delta_x = x1 - x2
    delta_y = y1 - y2
    return np.sqrt(delta_x**2 + delta_y**2)

def back_direction(orientation):
    """Return 1 when the angle is strictly greater than 180 degrees, else 0."""
    if orientation > 180.0:
        return 1
    return 0

In [4]:
# deal with features
def _personnel_counts(frame, column, splitter, prefix):
    """Parse a personnel string column into per-position counts, one row per play.

    The string is parsed from the first row of each ``PlayId`` rather than from
    every 22nd row, so the result does not depend on each play occupying
    exactly 22 consecutive rows.

    Returns a frame with one ``prefix + position`` column per position plus
    ``PlayId``, ready to be merged back on ``PlayId``.
    """
    per_play = frame[['PlayId', column]].drop_duplicates('PlayId')
    counts = per_play[column].apply(lambda x: pd.Series(splitter(x)))
    counts.columns = [prefix + c for c in counts.columns]
    counts['PlayId'] = per_play['PlayId']
    return counts


def preprocess(train):
    """Clean the raw player-level table and add engineered per-row columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw tracking data, one row per player per play (the columns of
        ``train.csv``). The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * PlayerHeight, TimeHandoff, TimeSnap, PlayerBirthDate, WindSpeed,
          GameWeather, Turf, Quarter, Down, JerseyNumber, YardLine, X,
          Orientation and Dir are rewritten in cleaned / standardised form;
        * TimeDelta, PlayerAge, GameWeather_dense, IsRusher, RusherTeam,
          IsRusherTeam, Orientation_ob, Dir_ob, Orientation_sin/_cos,
          Dir_sin/_cos, diffScoreBeforePlay, diffScoreBeforePlay_binary_ob,
          Offense<POS>, Defense<POS> and OffenseFormation_<NAME> are added.
        Plays without a row where ``NflId == NflIdRusher`` are dropped by the
        inner merge that attaches ``RusherTeam``.
    """
    # Work on a copy: the column assignments below would otherwise modify the
    # caller's frame in place before the merges rebind ``train``.
    train = train.copy()

    # Height: "feet-inches" string -> total inches
    train['PlayerHeight'] = train['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    # Time: parse the timestamps column-wise instead of row by row
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    train['TimeHandoff'] = pd.to_datetime(train['TimeHandoff'], format=timestamp_format)
    train['TimeSnap'] = pd.to_datetime(train['TimeSnap'], format=timestamp_format)
    train['TimeDelta'] = (train['TimeHandoff'] - train['TimeSnap']).dt.total_seconds()

    # Age (in years) at the time of the handoff
    seconds_in_year = 60*60*24*365.25
    train['PlayerBirthDate'] = pd.to_datetime(train['PlayerBirthDate'], format="%m/%d/%Y")
    train['PlayerAge'] = (train['TimeHandoff'] - train['PlayerBirthDate']).dt.total_seconds() / seconds_in_year

    # WindSpeed: strip the unit, average ranges ("10-20") and gust notations
    # ("12 gusts up to 25"), map 'calm' to 0, then convert to a number.
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: 0.0 if x == 'calm' else x)
    # NOTE: entries that hold non-numeric text are not repaired here;
    # strToFloat turns them into its sentinel value.
    train['WindSpeed'] = train['WindSpeed'].apply(strToFloat)

    # Weather: normalise spelling variants, then map to a numeric score
    train['GameWeather'] = train['GameWeather'].str.lower()
    train['GameWeather'] = train['GameWeather'].apply(lambda x: "indoor" if not pd.isna(x) and "indoor" in x else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
    train['GameWeather_dense'] = train['GameWeather'].apply(mapWeather)

    # Rusher: flag the ball carrier and attach his team to every row of the play
    train['IsRusher'] = (train['NflId'] == train['NflIdRusher'])
    temp = train[train["IsRusher"]][["Team", "PlayId"]].rename(columns={"Team":"RusherTeam"})
    train = train.merge(temp, on = "PlayId")
    train["IsRusherTeam"] = (train["Team"] == train["RusherTeam"])

    # Orientation and Dir: 15-degree categorical buckets plus sin/cos encoding.
    # NOTE(review): these are computed from the raw angles, i.e. before the
    # play-direction standardisation applied further down — confirm intended.
    train["Orientation_ob"] = train["Orientation"].apply(orientationToCat).astype("object")
    train["Dir_ob"] = train["Dir"].apply(orientationToCat).astype("object")
    for angle_col in ("Orientation", "Dir"):
        radians = train[angle_col] / 360 * 2 * np.pi
        train[angle_col + "_sin"] = np.sin(radians)
        train[angle_col + "_cos"] = np.cos(radians)

    # Score difference (home minus visitor) and whether the home team leads
    train["diffScoreBeforePlay"] = train["HomeScoreBeforePlay"] - train["VisitorScoreBeforePlay"]
    train["diffScoreBeforePlay_binary_ob"] = (train["HomeScoreBeforePlay"] > train["VisitorScoreBeforePlay"]).astype("object")

    # Turf: collapse surface names into Natural / Artificial (unlisted names become NaN)
    Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial',
            'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 'UBU Speed Series-S5-M':'Artificial',
            'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural',
            'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural',
            'Field turf':'Artificial', 'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'}
    train['Turf'] = train['Turf'].map(Turf)

    # OffensePersonnel / DefensePersonnel: per-position head counts per play
    train = train.merge(_personnel_counts(train, 'OffensePersonnel', offensePersonnelSplit, "Offense"), on = "PlayId")
    train = train.merge(_personnel_counts(train, 'DefensePersonnel', DefensePersonnelSplit, "Defense"), on = "PlayId")

    # numerical -> categorical
    train["Quarter"] = train["Quarter"].astype("object")
    train["Down"] = train["Down"].astype("object")
    train["JerseyNumber"] = train["JerseyNumber"].astype("object")

    # Dir-relevant: express positions and angles as if every play went 'right'.
    # Plain iteration over the columns avoids positional indexing into
    # label-indexed row Series.
    train['YardLine'] = [new_line(team, side, line)
                         for team, side, line in zip(train['PossessionTeam'], train['FieldPosition'], train['YardLine'])]
    train['X'] = [new_X(x, direction) for x, direction in zip(train['X'], train['PlayDirection'])]
    train['Orientation'] = [new_orientation(angle, direction)
                            for angle, direction in zip(train['Orientation'], train['PlayDirection'])]
    train['Dir'] = [new_orientation(angle, direction)
                    for angle, direction in zip(train['Dir'], train['PlayDirection'])]

    # OffenseFormation: indicator columns for three formations
    off_list = ['I_FORM','SHOTGUN','SINGLEBACK']
    for offf in off_list:
        train['OffenseFormation_' + offf] = 0
        train.loc[train['OffenseFormation'] == offf,'OffenseFormation_'+offf] = 1

    return train

In [5]:
# Always preprocess a fresh copy of the raw frame so this cell gives the same
# result when it is re-run (feeding an already preprocessed train_df back in
# fails, e.g. PlayerHeight is no longer a "feet-inches" string).
train_df = preprocess(origin_df.copy())
# train_df = train_df.drop(columns=[])

<a id="4"></a> <br>
# Feature Selection

In [6]:
# create new features
def createFeatures(train, outcomes=None, deploy=False):
    """Aggregate preprocessed player-level rows into one feature row per play.

    Parameters
    ----------
    train : pandas.DataFrame
        Output of ``preprocess`` (one row per player per play, including the
        ``RusherTeam`` column).
    outcomes : pandas.DataFrame, optional
        Table with ``GameId``, ``PlayId`` and the target; merged into the
        result unless ``deploy`` is True.
    deploy : bool, default False
        When True the target is not attached (inference time).

    Returns
    -------
    pandas.DataFrame
        One row per play with, in this order: distances of all other players
        to the ball carrier, defense aggregates, closest-defender speed,
        offense aggregates, ball-carrier velocity components, the ball
        carrier's static columns, ``runner_vs_1stdefensor_speed`` and (when
        not deploying) the columns of ``outcomes``.

    Raises
    ------
    ValueError
        If ``deploy`` is False and ``outcomes`` is None.
    """
    if not deploy and outcomes is None:
        raise ValueError("outcomes is required when deploy=False")

    play_keys = ['GameId', 'PlayId']

    def rusher_rows(df):
        """Rows belonging to the ball carrier."""
        return df[df['NflId'] == df['NflIdRusher']]

    def with_rusher_xy(df):
        """Attach the ball carrier's position (RusherX, RusherY) to every player row."""
        rusher = rusher_rows(df)[play_keys + ['X', 'Y']].rename(columns={'X': 'RusherX', 'Y': 'RusherY'})
        return pd.merge(df, rusher, on=play_keys, how='inner')

    def defenders_with_distance(df):
        """Players not on the rusher's team, with their distance to the ball carrier."""
        players = with_rusher_xy(df)
        defense = players.loc[players['Team'] != players['RusherTeam'],
                              play_keys + ['X', 'Y', 'RusherX', 'RusherY', 'S']].copy()
        defense['def_dist_to_back'] = euclidean_distance(defense['X'], defense['Y'],
                                                         defense['RusherX'], defense['RusherY'])
        return defense

    def defensor_speed_features(df):
        """Speed of the closest defender and his distance / speed ratio."""
        defense = defenders_with_distance(df)
        closest_idx = defense.groupby(play_keys)['def_dist_to_back'].idxmin()
        # Take speed and distance from the same (closest) row so nothing
        # depends on two separately built frames lining up by position.
        closest = defense.loc[closest_idx.values, play_keys + ['def_dist_to_back', 'S']].reset_index(drop=True)
        closest['def_S'] = closest['S'] + 0.05  # offset keeps the divisions below finite
        closest['min_time_to_tackle_S'] = closest['def_dist_to_back'] / closest['def_S']
        return closest[play_keys + ['def_S', 'min_time_to_tackle_S']]

    # back features
    def back_features(df):
        """Position and direction flags of the ball carrier."""
        carriers = rusher_rows(df)[play_keys + ['NflIdRusher', 'X', 'Y', 'Orientation', 'Dir', 'YardLine']].copy()
        carriers['back_from_scrimmage'] = carriers['YardLine'] - carriers['X']
        carriers['back_oriented_down_field'] = carriers['Orientation'].apply(back_direction)
        carriers['back_moving_down_field'] = carriers['Dir'].apply(back_direction)
        carriers = carriers.rename(columns={'X': 'back_X', 'Y': 'back_Y'})
        return carriers[play_keys + ['NflIdRusher', 'back_X', 'back_Y', 'back_from_scrimmage',
                                     'back_oriented_down_field', 'back_moving_down_field']]

    def features_relative_to_back(df, carriers):
        """Distance statistics and player counts around the ball carrier (both teams)."""
        players = pd.merge(df[play_keys + ['NflId', 'X', 'Y', 'Team', 'RusherTeam']], carriers,
                           on=play_keys, how='inner')
        players = players[players['NflId'] != players['NflIdRusher']].copy()
        players['dist_to_back'] = euclidean_distance(players['X'], players['Y'],
                                                     players['back_X'], players['back_Y'])
        is_defender = players['Team'] != players['RusherTeam']
        is_teammate = players['Team'] == players['RusherTeam']
        for radius in (5, 10):
            within = players['dist_to_back'] <= float(radius)
            players['def_in_box_%d' % radius] = (within & is_defender).astype(int)
            players['off_in_box_%d' % radius] = (within & is_teammate).astype(int)

        player_distance = players.groupby(play_keys + ['back_from_scrimmage', 'back_oriented_down_field',
                                                       'back_moving_down_field'])\
                                 .agg({'dist_to_back':['min','max','mean','std'], 'def_in_box_5':['sum'], 'off_in_box_5':['sum'],
                                       'def_in_box_10':['sum'], 'off_in_box_10':['sum']})\
                                 .reset_index()
        player_distance.columns = ['GameId','PlayId','back_from_scrimmage','back_oriented_down_field','back_moving_down_field',
                                   'min_dist','max_dist','mean_dist','std_dist','def_in_box_5','off_in_box_5','def_in_box_10','off_in_box_10']
        # Make the 10-yard counts exclusive of the 5-yard counts (players between 5 and 10 yards).
        player_distance['def_in_box_10'] = player_distance['def_in_box_10'] - player_distance['def_in_box_5']
        player_distance['off_in_box_10'] = player_distance['off_in_box_10'] - player_distance['off_in_box_5']
        return player_distance

    # defense
    def defense_features(df):
        """Per-play aggregates over the players not on the rusher's team."""
        defense = defenders_with_distance(df)
        defense['tackle_time'] = defense['def_dist_to_back'] / (defense['S'] + 0.05)
        defense['new_X'] = defense['X'] - defense['RusherX']

        defense = defense.groupby(play_keys)\
                         .agg({'def_dist_to_back':['min','max','mean','std'],'Y':['min','max','std'],'X':['min','max','std'],
                               'tackle_time':['min','max','mean','std'],'new_X':['std']})\
                         .reset_index()
        defense.columns = ['GameId','PlayId','def_min_dist','def_max_dist','def_mean_dist','def_std_dist',
                           'Y_min','Y_max','def_Y_std','X_min','X_max','def_X_std','def_tackle_time_min','def_tackle_time_max','def_tackle_time_mean',
                           'def_tackle_time_std','def_new_X_std']
        defense['def_Y_spread'] = defense['Y_max'] - defense['Y_min']
        defense['def_X_spread'] = defense['X_max'] - defense['X_min']
        return defense.drop(columns=['Y_min','Y_max','X_min','X_max'])

    # rusher
    def rusher_features(df):
        """Absolute velocity components of the ball carrier derived from Dir and S."""
        rusher = rusher_rows(df)[play_keys + ['Dir', 'S']].copy()
        radian_angle = (90 - rusher['Dir']) * np.pi / 180.0
        rusher['v_horizontal'] = np.abs(rusher['S'] * np.cos(radian_angle))
        rusher['v_vertical'] = np.abs(rusher['S'] * np.sin(radian_angle))
        return rusher.drop(columns=['Dir', 'S'])

    # offense
    def offense_features(df):
        """Per-play aggregates over the rusher's teammates (the rusher himself excluded)."""
        players = with_rusher_xy(df)
        # Both conditions are combined into one mask built on the same frame.
        is_teammate = (players['Team'] == players['RusherTeam']) & (players['NflId'] != players['NflIdRusher'])
        offense = players.loc[is_teammate, play_keys + ['X', 'Y', 'RusherX', 'RusherY', 'S']].copy()
        offense['off_dist_to_back'] = euclidean_distance(offense['X'], offense['Y'],
                                                         offense['RusherX'], offense['RusherY'])
        offense['tackle_time'] = offense['off_dist_to_back'] / (offense['S'] + 0.05)
        offense['new_X'] = offense['X'] - offense['RusherX']

        offense = offense.groupby(play_keys)\
                         .agg({'off_dist_to_back':['min','max','mean','std'],'Y':['min','max','std'],'X':['min','max','std'],
                               'tackle_time':['min','max','mean','std'],'new_X':['std']})\
                         .reset_index()
        offense.columns = ['GameId','PlayId','off_min_dist','off_max_dist','off_mean_dist','off_std_dist',
                           'Y_min','Y_max','off_Y_std', 'X_min','X_max','off_X_std','off_tackle_time_min','off_tackle_time_max','off_tackle_time_mean',
                           'off_tackle_time_std','off_new_X_std']
        offense['off_Y_spread'] = offense['Y_max'] - offense['Y_min']
        offense['off_X_spread'] = offense['X_max'] - offense['X_min']
        return offense.drop(columns=['Y_min','Y_max','X_min','X_max'])

    # other features
    def static_features(df):
        """Play-level and ball-carrier columns taken from the rusher's row."""
        basic_feas = ['GameId','PlayId','X','Y','S','A','Dis','Orientation','Dir',
                      'YardLine','Quarter','Distance','DefendersInTheBox', 'OffenseFormation_I_FORM',
                      'OffenseFormation_SHOTGUN','OffenseFormation_SINGLEBACK', 'Down']
        add_new_feas = ['PlayerHeight', 'PlayerAge', 'WindSpeed', 'GameWeather_dense','Dir_sin',
                        'Dir_cos', 'diffScoreBeforePlay'] # PlayerWeight

        rusher_static = rusher_rows(df)[add_new_feas+basic_feas].drop_duplicates()
        rusher_static.fillna(-999,inplace=True) # missing values are replaced by the sentinel -999
        return rusher_static

    def combine_features(relative_to_back, defense, defensor, offense, rush, static):
        """Inner-join all feature groups on (GameId, PlayId)."""
        df = pd.merge(relative_to_back,defense,on=play_keys,how='inner')
        df = pd.merge(df, defensor, on=play_keys,how='inner')
        df = pd.merge(df,offense,on=play_keys,how='inner')
        df = pd.merge(df,rush,on=play_keys,how='inner')
        df = pd.merge(df,static,on=play_keys,how='inner')
        df['runner_vs_1stdefensor_speed'] = df['S']/df['def_S']
        return df

    back_feats = back_features(train)
    rel_back = features_relative_to_back(train, back_feats)
    def_feats = defense_features(train)
    off_feats = offense_features(train)
    defensor_speed_feats = defensor_speed_features(train)
    rush_feats = rusher_features(train)
    static_feats = static_features(train)
    train = combine_features(rel_back, def_feats, defensor_speed_feats, off_feats, rush_feats, static_feats)

    # train or test data
    if not deploy:
        train = pd.merge(train, outcomes, on=play_keys, how='inner')

    return train

In [7]:
# Collapse the player-level rows into one feature row per play and attach the target from `outcomes`.
# NOTE: this rebinds train_df to the per-play table, so the cell cannot simply be re-run —
# the player-level columns createFeatures reads (e.g. NflId) are no longer present afterwards.
train_df = createFeatures(train_df, outcomes)

<a id="5"></a> <br>
# NN Model and CV

In [8]:
# construct X and y
X = train_df.copy()
yards = X.Yards
pca = PCA(n_components=50)  # instantiated only; the PCA steps below are disabled

# One-hot target with 199 classes: column index = yards + 99
y = np.zeros((yards.shape[0], 199))
y[np.arange(yards.shape[0]), 99 + yards.values] = 1

# Everything except the identifiers and the target is a model input.
X = X.drop(columns=['GameId','PlayId','Yards'])
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so validation folds contribute to the scaling statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# pca.fit(X)
# X = pca.transform(X)
# print(pca.explained_variance_ratio_[:50].sum())
# train and test data (a fixed 85/15 hold-out split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=12345)

In [9]:
# callback
class CRPSCallback(Callback):
    """Keras callback that logs the validation CRPS after every epoch.

    The score is written to ``logs['CRPS_score_val']`` so that callbacks placed
    after this one in the callback list can monitor it.

    Parameters
    ----------
    validation : tuple
        ``(X_valid, y_valid)`` where ``y_valid`` is the one-hot target.
    predict_batch_size : int, default 20
        Stored on the instance; not used by the current implementation.
    include_on_batch : bool, default False
        When True a placeholder value is also written to the batch-level logs.
    """
    def __init__(self, validation, predict_batch_size=20, include_on_batch=False):
        super(CRPSCallback, self).__init__()
        self.validation = validation
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch
        # Prints the length of the validation tuple, not the number of samples.
        print('validation shape',len(self.validation))

    @staticmethod
    def _crps(y_true, y_pred):
        """Mean squared difference between the cumulative true and predicted distributions."""
        cdf_true = np.clip(np.cumsum(y_true, axis=1), 0, 1)
        cdf_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
        return ((cdf_true - cdf_pred) ** 2).sum(axis=1).sum(axis=0) / (y_true.shape[1] * y_true.shape[0])

    def on_batch_begin(self, batch, logs=None):
        pass

    def on_train_begin(self, logs=None):
        # The 'metrics' entry of ``params`` is not guaranteed to exist in every
        # Keras version, so the metric name is only registered when it does.
        metrics = self.params.get('metrics') if self.params else None
        if metrics is not None and 'CRPS_score_val' not in metrics:
            metrics.append('CRPS_score_val')

    def on_batch_end(self, batch, logs=None):
        if self.include_on_batch and logs is not None:
            logs['CRPS_score_val'] = float('-inf')

    def on_epoch_end(self, epoch, logs=None):
        # ``logs=None`` instead of a shared mutable default dict.
        if logs is None:
            logs = {}
        # Placeholder that is overwritten below whenever validation data is available.
        logs['CRPS_score_val'] = float('-inf')

        if (self.validation):
            X_valid, y_valid = self.validation[0], self.validation[1]
            y_pred = self.model.predict(X_valid)
            logs['CRPS_score_val'] = np.round(self._crps(y_valid, y_pred), 6)

# NN Model
def get_model(x_tr,y_tr,x_val,y_val):
    """Build, train and score one feed-forward network.

    Parameters
    ----------
    x_tr, y_tr : numpy.ndarray
        Training features and one-hot targets (199 columns).
    x_val, y_val : numpy.ndarray
        Validation features and one-hot targets; used for early stopping,
        checkpointing and the returned score.

    Returns
    -------
    (model, crps)
        The trained model with the checkpointed best weights loaded, and its
        CRPS on ``(x_val, y_val)`` rounded to 6 decimals.

    Notes
    -----
    Weights are checkpointed to ``best_model.h5`` in the working directory;
    the file is overwritten by every call.
    """
    # model: three Dense -> Dropout -> BatchNorm stages and a 199-way softmax.
    # The input width comes from x_tr only (no dependence on a global array).
    inp = Input(shape = (x_tr.shape[1],))
    x = inp
    for units in (512, 256, 256):
        x = Dense(units, activation='relu')(x)
        x = Dropout(0.5)(x)
        x = BatchNormalization()(x)
    out = Dense(199, activation='softmax')(x)
    model = Model(inp,out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[])

    # Both callbacks monitor the value that CRPSCallback writes into the epoch
    # logs, so CRPSCallback has to come first in the callback list.
    es = EarlyStopping(monitor='CRPS_score_val',
                       mode='min',
                       restore_best_weights=True,
                       verbose=1,
                       patience=10)

    mc = ModelCheckpoint('best_model.h5',monitor='CRPS_score_val',mode='min',
                         save_best_only=True, verbose=1, save_weights_only=True)

    bsz = 1024

    model.fit(x_tr, y_tr,callbacks=[CRPSCallback(validation = (x_val,y_val)),es,mc], epochs=100, batch_size=bsz,verbose=1)
    model.load_weights("best_model.h5")

    # CRPS of the best checkpoint on the validation data
    y_pred = model.predict(x_val)
    y_true = np.clip(np.cumsum(y_val, axis=1), 0, 1)
    y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * x_val.shape[0])
    crps = np.round(val_s, 6)

    return model,crps

In [10]:
# # NN Model
# def CRPS(y_true,y_pred):
# #     val_s = ((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * x_val.shape[0])
# #     crps = np.round(val_s, 6)
#     sum1 = tf.keras.backend.sum(
#         ((y_true - y_pred) ** 2),
#         axis=1, keepdims=True)
#     return tf.keras.backend.sum(sum1,axis=0, keepdims=True)/ tf.dtypes.cast((199 * tf.shape(y_true)[0]), tf.float32)
   
# def build_model(hp):
# #     # model
#     # dense1 = hp.Int('units',min_value=256,max_value=1024,step=32,default=1024)
#     # dropout1 = hp.Choice('dropout', [0.5, 0.2, 0.8])
#     dense2 = hp.Int('units',min_value=256,max_value=1024,step=32,default=512)
#     dropout2 = hp.Choice('dropout', [0.5, 0.2, 0.8])
#     dense3 = 256 # hp.Int('units',min_value=128,max_value=512,step=32,default=256)
#     dropout3 = 0.5 # hp.Choice('dropout', [0.5, 0.2, 0.8])
    
#     inp = Input(shape = (X_train.shape[1],))
#     #x = GaussianNoise(stddev=0.25)(inp)
#     x = Dense(units=512, activation='relu')(inp)
#     x = Dropout(0.5)(x)
#     x = BatchNormalization()(x)
#     #x = GaussianNoise(stddev=0.1)(x)
    
#     x = Dense(units=dense2, activation='relu')(x)
#     x = Dropout(dropout2)(x)
#     x = BatchNormalization()(x)
#     #x = GaussianNoise(stddev=0.1)(x)
        
#     x = Dense(units=dense3, activation='relu')(x)
#     x = Dropout(dropout3)(x)
#     x = BatchNormalization()(x)
    
#     out = Dense(199, activation='softmax')(x)
#     model = Model(inp,out)
#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[CRPS])
#     return model
# tuner = RandomSearch(
#     build_model,
#     objective=kt.Objective('val_CRPS', 'min'),
#     max_trials=5,
#     executions_per_trial=2,
#     directory='.')

# #tuner.search_space_summary()

# tuner.search(x=X,
#              y=y,
#              epochs=100, validation_split=0.2)



In [11]:
#tuner.results_summary()

In [12]:
# from tensorflow.keras.utils import plot_model
# model = tuner.get_best_models(num_models=1)
# plot_model(model[0], to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [13]:
# Cross Validation
# Two rounds of shuffled 5-fold splitting, one seed per round. Each fold's model
# (as returned by get_model) is stored in `models`, and the score returned with
# it is stored in `crps_csv`.
losses = []
models = []
crps_csv = []
s_time = time.time()

for k in range(2):
    # 42 + k gives each round its own shuffle
    kfold = KFold(n_splits=5, shuffle=True, random_state=42 + k)
    for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(yards)):
        print("-----------")
        print("-----------")
        tr_x = X[tr_inds]
        tr_y = y[tr_inds]
        val_x = X[val_inds]
        val_y = y[val_inds]
        model, crps = get_model(tr_x, tr_y, val_x, val_y)
        models.append(model)
        # fold number is 1-based within the current round
        print("the %d fold crps is %f" % (k_fold + 1, crps))
        crps_csv.append(crps)

-----------
-----------
validation shape 2
Epoch 1/100
Epoch 00001: CRPS_score_val improved from inf to 0.08334, saving model to best_model.h5
Epoch 2/100
Epoch 00002: CRPS_score_val improved from 0.08334 to 0.07816, saving model to best_model.h5
Epoch 3/100
Epoch 00003: CRPS_score_val improved from 0.07816 to 0.06920, saving model to best_model.h5
Epoch 4/100
Epoch 00004: CRPS_score_val improved from 0.06920 to 0.05082, saving model to best_model.h5
Epoch 5/100
Epoch 00005: CRPS_score_val improved from 0.05082 to 0.03209, saving model to best_model.h5
Epoch 6/100
Epoch 00006: CRPS_score_val improved from 0.03209 to 0.02073, saving model to best_model.h5
Epoch 7/100
Epoch 00007: CRPS_score_val improved from 0.02073 to 0.01600, saving model to best_model.h5
Epoch 8/100
Epoch 00008: CRPS_score_val improved from 0.01600 to 0.01422, saving model to best_model.h5
Epoch 9/100
Epoch 00009: CRPS_score_val improved from 0.01422 to 0.01363, saving model to best_model.h5
Epoch 10/100
Epoch 00010:

In [14]:
# Average of the per-fold scores collected in crps_csv.
mean_crps = np.mean(crps_csv)
print("mean crps is %f" % mean_crps)

mean crps is 0.012682


In [15]:
# Individual fold scores, in the order they were appended during cross validation.
print(crps_csv)

[0.01276, 0.012664, 0.012623, 0.012491, 0.012913, 0.012374, 0.013009, 0.012251, 0.013213, 0.012524]


<a id="6"></a> <br>
# Submission

In [16]:
%%time
# Create the competition environment and take its test iterator; the loop below
# unpacks each item it yields as (test_df, sample_prediction_df).
env = nflrush.make_env()
iter_test = env.iter_test()

def predict(x_te):
    """Return the element-wise average of every model's prediction for ``x_te``.

    Each entry of the module-level ``models`` list is asked for
    ``m.predict(x_te, batch_size=1024)``; the outputs are summed in list order
    and divided by the number of models.

    Parameters
    ----------
    x_te : feature matrix passed unchanged to each model's ``predict``.

    Returns
    -------
    The averaged prediction, with the same shape as a single model's output.

    Raises
    ------
    ValueError
        If ``models`` is empty (previously this surfaced as an
        ``UnboundLocalError`` on ``y_pred``).
    """
    if not models:
        raise ValueError("predict() needs at least one trained model in `models`")

    y_pred = None
    for m in models:
        fold_pred = m.predict(x_te, batch_size=1024)
        # `a + b` rather than `a += b`: never write into an array owned by a model
        y_pred = fold_pred if y_pred is None else y_pred + fold_pred
    return y_pred / len(models)

# Score each item yielded by the test iterator and hand the result back to the env.
for test_df, sample_prediction_df in iter_test:
    basetable = createFeatures(preprocess(test_df), deploy=True)
    # the id columns are dropped before the features are scaled
    basetable = basetable.drop(columns=['GameId', 'PlayId'])
    scaled_basetable = scaler.transform(basetable)
    # scaled_basetable = pca.transform(basetable)

    # averaged model output -> running sum along axis 1, clipped into [0, 1]
    y_pred = predict(scaled_basetable)
    cdf = np.clip(np.cumsum(y_pred, axis=1), 0, 1)
    # only the first row is submitted
    y_pred = cdf[0].tolist()

    preds_df = pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns)
    env.predict(preds_df)

env.write_submission_file()

Your submission file has been saved!  Once you `Commit` your Notebook and it finishes running, you can submit the file to the competition from the Notebook Viewer `Output` tab.
CPU times: user 31min 40s, sys: 9.38 s, total: 31min 50s
Wall time: 31min 15s
