In [1]:
import pandas as pd
import numpy as np
import re
from string import punctuation
import math
from sklearn.model_selection import train_test_split

import tensorflow.keras 
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
import time
# Load the full training set once for the notebook. PlayId is cast to str so that it
# compares equal to the string PlayIds that function_1 builds internally.
data=pd.read_csv('train.csv',low_memory=False)
data['PlayId']=data['PlayId'].astype(str)

## Function 1

Takes a single play from the dataset (a frame of shape (22, 52)) and prints the top 5 predicted yardages with their accompanying probabilities.

In [2]:
%%time
def function_1(point):
    """Print the five most likely rushing yardages for a single play.

    ``train.csv`` is re-read and cleaned in full (the imputation statistics are
    computed over the whole file), the rows of the requested play are collapsed
    into one feature row, and that row is scored with the saved Keras model
    ``nfl_pred.hdf5``.

    Parameters
    ----------
    point : pandas.DataFrame
        Rows of one play. Only ``point.PlayId`` is read; its first unique value
        selects the play.

    Returns
    -------
    None
        Timings, the top-5 predictions and the ground truth are printed.
    """
    np.random.seed(43)   # makes the 2017 'Dis' re-sampling below repeatable
    start=time.time()
    data=pd.read_csv('train.csv',low_memory=False)
    end=time.time()
    print('Time taken to read data:', round(end-start),'s')

    start=time.time()

    # ---- Missing-value imputation ----
    # Orientation / Dir are filled with the mean of the player's Position.
    # Fix: DirDict used to be populated with the Orientation means (copy-paste slip),
    # so missing Dir values were imputed from the wrong column.
    OrientDict=data.groupby('Position')['Orientation'].mean().to_dict()
    DirDict=data.groupby('Position')['Dir'].mean().to_dict()

    data.Dir=data.Dir.fillna(data.Position.map(DirDict))
    data.Orientation=data.Orientation.fillna(data.Position.map(OrientDict))

    # Temperature is filled with the mean temperature of the same (Season, Week).
    tempsperweek=data.groupby(['Season','Week'],as_index=False)['Temperature'].mean()
    for season in (2017,2018,2019):
        season_rows=tempsperweek.loc[tempsperweek.Season==season]
        week_temp=dict(zip(season_rows['Week'],season_rows['Temperature']))
        missing=(data.Season==season) & (data.Temperature.isna())
        data.loc[missing,'Temperature']=data.Temperature.fillna(data.Week.map(week_temp))

    data.loc[data.FieldPosition.isna(),'FieldPosition']='UNKNOWN'
    # Most frequent formation for missing OffenseFormation
    data.loc[data.OffenseFormation.isna(),'OffenseFormation']=data.OffenseFormation.value_counts().idxmax()

    data.loc[data.DefendersInTheBox.isna(),'DefendersInTheBox']=data.DefendersInTheBox.median()
    data.loc[data.Humidity.isna(),'Humidity']=data.Humidity.median()

    # ---- Text column clean-up ----
    #code inspired from https://www.kaggle.com/sanshengshi/lightgbm-clean-stadiumtype
    def StadiumType(txt):
        """Return 0 for indoor/closed stadiums, 1 otherwise (outdoor, open or unspecified)."""
        txt=str(txt).lower().strip()
        if 'indoor' in txt or 'closed' in txt:
            return 0
        return 1
    data["StadiumType"]=data["StadiumType"].apply(StadiumType)

    def Gameweather(txt):
        """Map a free-text weather description to a numeric code (first matching rule wins)."""
        txt=str(txt).lower().strip()
        if 'clear' in txt or 'sun' in txt or 'controlled' in txt or 'indoor' in txt:
            return 0
        if 'rain' in txt:
            return 1
        if 'cloud' in txt or 'overcast' in txt:
            return 0.5
        # 'overcast' was re-tested here originally but can never reach this branch
        if 'snow' in txt:
            return -0.5
        return 0
    data["GameWeather"]=data["GameWeather"].apply(Gameweather)

    def Windspeed(txt):
        """Parse a wind-speed string into a float.

        'a-b' ranges give their midpoint, alphanumeric strings give their leading
        number (7.0, the median, when there is none), anything else gives 0.
        NOTE(review): strings with spaces (e.g. '6 mph') are not alphanumeric and
        therefore map to 0 - confirm this is intended.
        """
        if pd.isna(txt):
            return 7.0                   # Median Value
        if '-' in txt:
            a,b=txt.split('-')
            return (float(a)+float(b))/2
        elif txt.isalnum():
            leading_digits=re.match(r'(\d+)',str(txt))
            if leading_digits:
                return float(leading_digits[0])
            else:
                return 7.0
        else:
            return 0
    # After the str cast a missing value is the string 'nan', which takes the
    # alphanumeric branch above and ends up as 7.0.
    data["WindSpeed"]=data["WindSpeed"].astype(str)
    data["WindSpeed"]=data["WindSpeed"].apply(Windspeed)

    # code based from https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    def WindDirection(txt):
        """Convert a compass description to degrees (n=0, e=90, ...); -1 when missing or unrecognised."""
        #Cleaning the values
        if pd.isna(txt):
            return -1
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = txt.replace('from', '')
        txt = txt.replace(' ', '')
        txt = txt.replace('north', 'n')
        txt = txt.replace('south', 's')
        txt = txt.replace('west', 'w')
        txt = txt.replace('east', 'e')

        #assigning the values: sixteenths of a full turn, including common misspellings
        deg=360
        sixteenths={'n':0,
                    'nne':1,'nen':1,
                    'ne':2,
                    'ene':3,'nee':3,
                    'e':4,
                    'ese':5,'see':5,
                    'se':6,
                    'ses':7,'sse':7,
                    's':8,
                    'ssw':9,'sws':9,
                    'sw':10,
                    'sww':11,'wsw':11,
                    'w':12,
                    'wnw':13,'nww':13,
                    'nw':14,
                    'nwn':15,'nnw':15}
        if txt in sixteenths:
            return sixteenths[txt]/16*deg
        return -1
    data["WindDirection"]=data["WindDirection"].apply(WindDirection)

    # Harmonise team abbreviations between the Home/Visitor columns and the other team columns
    TeamMap={'ARI':'ARZ','BAL':'BLT','CLE':'CLV','HOU':'HST'}

    for k,v in TeamMap.items():
        data.loc[data['VisitorTeamAbbr']==k,'VisitorTeamAbbr']=v
        data.loc[data['HomeTeamAbbr']==k,'HomeTeamAbbr']=v
    data['PlayId']=data['PlayId'].astype(str)
    # 'feet-inches' -> metres, pounds -> kilograms
    data['PlayerHeight']=data['PlayerHeight'].astype(str)
    data['PlayerHeight']=data['PlayerHeight'].str.split('-').apply(lambda x: int(x[0])*0.3048+ int(x[1])*0.0254)
    data['PlayerWeight']=data['PlayerWeight']*0.453592

    # Downs 1 and 2 are flagged 1, later downs 0.
    # NOTE(review): the original comment spoke of the "first 3 downs" but the
    # condition is x<3 - confirm which one is intended.
    data['Down']=data.Down.apply(lambda x: 1 if x<3 else 0)

    # ---- Standardise play direction and dependent features ----
    data['Left']=data.PlayDirection=='left'

    data['X_std']=data.X
    data.loc[data.Left,'X_std']=120-data.loc[data.Left,'X']
    data['Y_std']=data.Y
    data.loc[data.Left,'Y_std']=160/3-data.loc[data.Left,'Y']

    #Rusher variable to indicate the rushing player
    data['Rusher']=data.NflId==data.NflIdRusher

    data['Offense'] = "home"
    # If attacking team is not home team, it is treated as away team
    data.loc[data.PossessionTeam != data.HomeTeamAbbr, 'Offense'] = "away"
    # If FieldPosition and PossessionTeam are the same the YardLine is kept, otherwise it is measured from the other end
    data['YardLine_std'] = 100 - data.YardLine
    data.loc[data.FieldPosition == data.PossessionTeam,'YardLine_std'] = data.loc[data.FieldPosition == data.PossessionTeam,'YardLine']

    # Degrees to radians; 90 is subtracted because 0 degrees points up in the raw data,
    # so that 0 means "towards the right" and pi means "towards the left".
    data['Dir_std']=np.mod(90-data.Dir,360)*np.pi/180
    # Left-moving plays: theta -> pi - theta.
    # NOTE(review): positions above are flipped in both X and Y, while this mirrors
    # the direction about the vertical axis only - confirm this is intended.
    data.loc[data['Left'],'Dir_std']=np.mod(np.pi-data.loc[data['Left'],'Dir_std'],2*np.pi)

    data.drop(['X','Y','Dir','YardLine'],axis=1,inplace=True)

    # ---- Data engineering ----
    # 2017 'Dis' values of 0.0 / 0.01 are partly replaced by uniform draws in [0.1, 0.4].
    # Range and row counts were chosen by comparing the 2017 distribution with 2018/19.
    for dis_value,n_rows in ((0.0,2200),(0.01,4000)):
        candidates=data.loc[(data.Season==2017) & (data.Dis==dis_value)]
        n_rows=min(n_rows,len(candidates))   # never request more rows than exist
        index=candidates.sample(n_rows).index
        data.loc[index.values,'Dis']=np.round(np.random.uniform(0.1,0.4,n_rows),decimals=2)

    #2017 orientation is off by a phase of 90 degrees
    data.loc[data.Season==2017,'Orientation']=np.mod(90+data.loc[data.Season==2017,'Orientation'],360)

    end=time.time()
    print('Time to preprocess full data',round(end-start),'s')

    start=time.time()

    # Keep only the requested play. str() makes the comparison work even when the
    # caller did not cast PlayId; .copy() avoids assigning into a slice below.
    play_id=str(point.PlayId.unique()[0])
    data=data.loc[data.PlayId==play_id].copy()

    # One row per play (the rusher's row) carrying the game-state columns.
    # Taken before the relative features are added so they do not leak into it.
    df_rush=data.loc[data.Rusher]

    # ---- Features relative to the rusher ----
    #https://www.kaggle.com/jccampos/nfl-2020-winner-solution-the-zoo
    # Speed components along the horizontal and vertical axes
    data['S_x'] = data['S']*data['Dir_std'].apply(math.cos)
    data['S_y'] = data['S']*data['Dir_std'].apply(math.sin)

    #Momentum=mass*velocity along both axes
    data['M_x']=data.PlayerWeight*data['S_x']
    data['M_y']=data.PlayerWeight*data['S_y']

    # Rusher values per PlayId, broadcast to every row of the play
    rush=data.loc[data.Rusher].set_index('PlayId')
    mapp=rush[['X_std','Y_std','S_x','S_y','M_x','M_y']].to_dict(orient='index')

    def rusher_value(col):
        """Series aligned with data holding the rusher's value of col for each row's play."""
        return data['PlayId'].apply(lambda pid:mapp[pid][col])

    rush_x=rusher_value('X_std')
    rush_y=rusher_value('Y_std')
    rush_Sx=rusher_value('S_x')
    rush_Sy=rusher_value('S_y')
    rush_Mx=rusher_value('M_x')
    rush_My=rusher_value('M_y')

    #Euclidean distance between rusher and other players (-1 marks the rusher's own zero gap)
    data['gap']=((rush_x-data['X_std'])**2+(rush_y-data['Y_std'])**2)**0.5
    data['inverse_gap']=data['gap'].apply(lambda x: 1/x if x!=0 else -1)
    #Relative speeds between rusher and other players
    data['RelS_x']=rush_Sx-data['S_x']
    data['RelS_y']=rush_Sy-data['S_y']
    #Relative momentum between rusher and other players
    data['RelM_x']=rush_Mx-data['M_x']
    data['RelM_y']=rush_My-data['M_y']

    # Collapse the players of each play into one row of sorted values.
    # The slicing assumes every play occupies 22 consecutive rows, in the order
    # of data.PlayId.unique().
    playids=data.PlayId.unique()
    A=data['gap'].values
    B=data['RelS_x'].values
    C=data['RelS_y'].values
    D=data['inverse_gap'].values
    E=data['RelM_x'].values
    F=data['RelM_y'].values
    n=len(playids)
    val_dict={}
    for i in range(0,n):
        rows=slice(i*22,i*22+22)
        # gap descending, inverse gap ascending, everything else descending
        val_dict[playids[i]]=sorted(A[rows],reverse=True)+sorted(D[rows])+\
                            sorted(B[rows],reverse=True)+sorted(C[rows],reverse=True)+\
                            sorted(E[rows],reverse=True)+sorted(F[rows],reverse=True)
    val_cols=['{0}_{1}'.format(prefix,i+1)
              for prefix in ('P','P_Invd','Sx','Sy','Mx','My')
              for i in range(0,22)]

    rel_data=pd.DataFrame(data=list(val_dict.values()),columns=val_cols,index=list(val_dict.keys()))
    rel_data.index.name='PlayId'

    #Combining game-state and relative-player dataframes
    final_df=pd.merge(df_rush,rel_data,on='PlayId')
    Y=np.array(final_df.Yards.values).reshape(-1,1)

    #Dropping redundant features or features which don't seem useful
    final_df=final_df.drop([ 'GameId','S','NflId','Team',
           'DisplayName', 'JerseyNumber', 'Season', 'GameClock',
           'PossessionTeam', 'FieldPosition',
           'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'NflIdRusher',
           'OffenseFormation', 'OffensePersonnel', 'DefendersInTheBox',
           'DefensePersonnel', 'PlayDirection', 'TimeHandoff', 'TimeSnap',
           'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate', 'PlayerCollegeName',
           'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr', 'Stadium',
           'Location', 'StadiumType', 'Turf','Left', 'X_std', 'Y_std',
           'Rusher', 'Offense', 'Dir_std','Yards'],axis=1)

    final_df=final_df.set_index('PlayId')

    # One-hot target over 199 classes: column 99 + yards is set to 1
    test=np.zeros((Y.shape[0],199))
    for idx, t in enumerate(list(Y)):
        test[idx][99 + t[0]] = 1

    #Loading model and CRPS function (needed to deserialize the saved model)
    def crps_nn(y_true,y_pred):
        """Mean squared difference between the cumulative sums of y_pred and y_true, divided by 199."""
        loss = K.mean(K.sum((K.cumsum(y_pred, axis = 1) - K.cumsum(y_true, axis=1))**2, axis=1))/199
        return loss

    model=load_model('nfl_pred.hdf5',custom_objects={'crps_nn':crps_nn})

    preds=model.predict(final_df)

    # Classes of the first (only) row ordered from most to least probable
    top5=np.argsort(preds,axis=1)[0][::-1]
    top5_prob=np.sort(preds,axis=1)[0][::-1]
    print("="*25)
    print('Top 5 Predictions')
    for i,j in zip(top5[:5],top5_prob[:5]):
        print('{0} Yards, Probability {1:.2f}%'.format(i-99,j*100))
    print("-"*25)
    truth_idx=np.argmax(test)
    print('Ground Truth',truth_idx-99,'Yards, Predicted with',round(preds[0][truth_idx]*100,2),'% confidence')

    end=time.time()
    print("="*25)

    print('Time taken to predict a single datapoint',round(end-start),'s')
    return

Wall time: 0 ns


In [3]:
%%time
# Pick one play at random (unseeded) and pass all of its rows to function_1.
point_idx=data.sample(1)['PlayId'].values[0]
point=data.loc[data.PlayId==point_idx]
function_1(point)

Time taken to read data: 4 s
Time to preprocess full data 7 s
Top 5 Predictions
3 Yards, Probability 15.25%
2 Yards, Probability 15.03%
4 Yards, Probability 13.59%
1 Yards, Probability 10.66%
5 Yards, Probability 8.88%
-------------------------
Ground Truth 5 Yards, Predicted with 8.88 % confidence
Time taken to predict a single datapoint 3 s
Wall time: 15 s


***

## Function 2 

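This function takes the whole dataset, prints the CRPS (Continuous Ranked Probability Score) of the model, and returns the play ids, the predicted probabilities and the true yardages.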

In [2]:
%%time

def function_2(data):
    """Score every play in ``data`` with the saved model and print the CRPS.

    The frame is cleaned, the rows of each play are collapsed into one feature
    row, and the rows are scored with the Keras model ``nfl_pred.hdf5``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw contents of ``train.csv``. A copy is processed, so the caller's frame
        is left untouched.

    Returns
    -------
    tuple
        ``(play_ids, preds, Y)``: the ``PlayId`` index of the feature table, the
        model output for every play (expected to have 199 columns so that it
        lines up with the one-hot targets), and the true yards as an ``(n, 1)`` array.
    """
    # Work on a private copy: everything below assigns into / drops from the frame.
    data=data.copy()

    # ---- Missing-value imputation ----
    # Orientation / Dir are filled with the mean of the player's Position.
    # Fix: DirDict used to be populated with the Orientation means (copy-paste slip),
    # so missing Dir values were imputed from the wrong column.
    OrientDict=data.groupby('Position')['Orientation'].mean().to_dict()
    DirDict=data.groupby('Position')['Dir'].mean().to_dict()

    data.Dir=data.Dir.fillna(data.Position.map(DirDict))
    data.Orientation=data.Orientation.fillna(data.Position.map(OrientDict))

    # Temperature is filled with the mean temperature of the same (Season, Week).
    tempsperweek=data.groupby(['Season','Week'],as_index=False)['Temperature'].mean()
    for season in (2017,2018,2019):
        season_rows=tempsperweek.loc[tempsperweek.Season==season]
        week_temp=dict(zip(season_rows['Week'],season_rows['Temperature']))
        missing=(data.Season==season) & (data.Temperature.isna())
        data.loc[missing,'Temperature']=data.Temperature.fillna(data.Week.map(week_temp))

    data.loc[data.FieldPosition.isna(),'FieldPosition']='UNKNOWN'
    # Most frequent formation for missing OffenseFormation
    data.loc[data.OffenseFormation.isna(),'OffenseFormation']=data.OffenseFormation.value_counts().idxmax()

    data.loc[data.DefendersInTheBox.isna(),'DefendersInTheBox']=data.DefendersInTheBox.median()
    data.loc[data.Humidity.isna(),'Humidity']=data.Humidity.median()

    # ---- Text column clean-up ----
    #code inspired from https://www.kaggle.com/sanshengshi/lightgbm-clean-stadiumtype
    def StadiumType(txt):
        """Return 0 for indoor/closed stadiums, 1 otherwise (outdoor, open or unspecified)."""
        txt=str(txt).lower().strip()
        if 'indoor' in txt or 'closed' in txt:
            return 0
        return 1
    data["StadiumType"]=data["StadiumType"].apply(StadiumType)

    def Gameweather(txt):
        """Map a free-text weather description to a numeric code (first matching rule wins)."""
        txt=str(txt).lower().strip()
        if 'clear' in txt or 'sun' in txt or 'controlled' in txt or 'indoor' in txt:
            return 0
        if 'rain' in txt:
            return 1
        if 'cloud' in txt or 'overcast' in txt:
            return 0.5
        # 'overcast' was re-tested here originally but can never reach this branch
        if 'snow' in txt:
            return -0.5
        return 0
    data["GameWeather"]=data["GameWeather"].apply(Gameweather)

    def Windspeed(txt):
        """Parse a wind-speed string into a float.

        'a-b' ranges give their midpoint, alphanumeric strings give their leading
        number (7.0, the median, when there is none), anything else gives 0.
        NOTE(review): strings with spaces (e.g. '6 mph') are not alphanumeric and
        therefore map to 0 - confirm this is intended.
        """
        if pd.isna(txt):
            return 7.0                   # Median Value
        if '-' in txt:
            a,b=txt.split('-')
            return (float(a)+float(b))/2
        elif txt.isalnum():
            leading_digits=re.match(r'(\d+)',str(txt))
            if leading_digits:
                return float(leading_digits[0])
            else:
                return 7.0
        else:
            return 0
    # After the str cast a missing value is the string 'nan', which takes the
    # alphanumeric branch above and ends up as 7.0.
    data["WindSpeed"]=data["WindSpeed"].astype(str)
    data["WindSpeed"]=data["WindSpeed"].apply(Windspeed)

    # code based from https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    def WindDirection(txt):
        """Convert a compass description to degrees (n=0, e=90, ...); -1 when missing or unrecognised."""
        #Cleaning the values
        if pd.isna(txt):
            return -1
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = txt.replace('from', '')
        txt = txt.replace(' ', '')
        txt = txt.replace('north', 'n')
        txt = txt.replace('south', 's')
        txt = txt.replace('west', 'w')
        txt = txt.replace('east', 'e')

        #assigning the values: sixteenths of a full turn, including common misspellings
        deg=360
        sixteenths={'n':0,
                    'nne':1,'nen':1,
                    'ne':2,
                    'ene':3,'nee':3,
                    'e':4,
                    'ese':5,'see':5,
                    'se':6,
                    'ses':7,'sse':7,
                    's':8,
                    'ssw':9,'sws':9,
                    'sw':10,
                    'sww':11,'wsw':11,
                    'w':12,
                    'wnw':13,'nww':13,
                    'nw':14,
                    'nwn':15,'nnw':15}
        if txt in sixteenths:
            return sixteenths[txt]/16*deg
        return -1
    data["WindDirection"]=data["WindDirection"].apply(WindDirection)

    # Harmonise team abbreviations between the Home/Visitor columns and the other team columns
    TeamMap={'ARI':'ARZ','BAL':'BLT','CLE':'CLV','HOU':'HST'}

    for k,v in TeamMap.items():
        data.loc[data['VisitorTeamAbbr']==k,'VisitorTeamAbbr']=v
        data.loc[data['HomeTeamAbbr']==k,'HomeTeamAbbr']=v
    data['PlayId']=data['PlayId'].astype(str)
    # 'feet-inches' -> metres, pounds -> kilograms
    data['PlayerHeight']=data['PlayerHeight'].astype(str)
    data['PlayerHeight']=data['PlayerHeight'].str.split('-').apply(lambda x: int(x[0])*0.3048+ int(x[1])*0.0254)
    data['PlayerWeight']=data['PlayerWeight']*0.453592

    # Downs 1 and 2 are flagged 1, later downs 0.
    # NOTE(review): the original comment spoke of the "first 3 downs" but the
    # condition is x<3 - confirm which one is intended.
    data['Down']=data.Down.apply(lambda x: 1 if x<3 else 0)

    # ---- Standardise play direction and dependent features ----
    data['Left']=data.PlayDirection=='left'

    data['X_std']=data.X
    data.loc[data.Left,'X_std']=120-data.loc[data.Left,'X']
    data['Y_std']=data.Y
    data.loc[data.Left,'Y_std']=160/3-data.loc[data.Left,'Y']

    #Rusher variable to indicate the rushing player
    data['Rusher']=data.NflId==data.NflIdRusher

    data['Offense'] = "home"
    # If attacking team is not home team, it is treated as away team
    data.loc[data.PossessionTeam != data.HomeTeamAbbr, 'Offense'] = "away"
    # If FieldPosition and PossessionTeam are the same the YardLine is kept, otherwise it is measured from the other end
    data['YardLine_std'] = 100 - data.YardLine
    data.loc[data.FieldPosition == data.PossessionTeam,'YardLine_std'] = data.loc[data.FieldPosition == data.PossessionTeam,'YardLine']

    # Degrees to radians; 90 is subtracted because 0 degrees points up in the raw data,
    # so that 0 means "towards the right" and pi means "towards the left".
    data['Dir_std']=np.mod(90-data.Dir,360)*np.pi/180
    # Left-moving plays: theta -> pi - theta.
    # NOTE(review): positions above are flipped in both X and Y, while this mirrors
    # the direction about the vertical axis only - confirm this is intended.
    data.loc[data['Left'],'Dir_std']=np.mod(np.pi-data.loc[data['Left'],'Dir_std'],2*np.pi)

    data.drop(['X','Y','Dir','YardLine'],axis=1,inplace=True)

    # ---- Data engineering ----
    # 2017 'Dis' values of 0.0 / 0.01 are partly replaced by uniform draws in [0.1, 0.4].
    # Range and row counts were chosen by comparing the 2017 distribution with 2018/19.
    # NOTE(review): no seed is set in this function, so the rows that get
    # re-sampled (and hence the printed score) can vary slightly between runs.
    for dis_value,n_rows in ((0.0,2200),(0.01,4000)):
        candidates=data.loc[(data.Season==2017) & (data.Dis==dis_value)]
        n_rows=min(n_rows,len(candidates))   # never request more rows than exist
        index=candidates.sample(n_rows).index
        data.loc[index.values,'Dis']=np.round(np.random.uniform(0.1,0.4,n_rows),decimals=2)

    #2017 orientation is off by a phase of 90 degrees
    data.loc[data.Season==2017,'Orientation']=np.mod(90+data.loc[data.Season==2017,'Orientation'],360)

    # One row per play (the rusher's row) carrying the game-state columns.
    # Taken before the relative features are added so they do not leak into it.
    df_rush=data.loc[data.Rusher]

    # ---- Features relative to the rusher ----
    #https://www.kaggle.com/jccampos/nfl-2020-winner-solution-the-zoo
    # Speed components along the horizontal and vertical axes
    data['S_x'] = data['S']*data['Dir_std'].apply(math.cos)
    data['S_y'] = data['S']*data['Dir_std'].apply(math.sin)

    #Momentum=mass*velocity along both axes
    data['M_x']=data.PlayerWeight*data['S_x']
    data['M_y']=data.PlayerWeight*data['S_y']

    # Rusher values per PlayId, broadcast to every row of the play
    rush=data.loc[data.Rusher].set_index('PlayId')
    mapp=rush[['X_std','Y_std','S_x','S_y','M_x','M_y']].to_dict(orient='index')

    def rusher_value(col):
        """Series aligned with data holding the rusher's value of col for each row's play."""
        return data['PlayId'].apply(lambda pid:mapp[pid][col])

    rush_x=rusher_value('X_std')
    rush_y=rusher_value('Y_std')
    rush_Sx=rusher_value('S_x')
    rush_Sy=rusher_value('S_y')
    rush_Mx=rusher_value('M_x')
    rush_My=rusher_value('M_y')

    #Euclidean distance between rusher and other players (-1 marks the rusher's own zero gap)
    data['gap']=((rush_x-data['X_std'])**2+(rush_y-data['Y_std'])**2)**0.5
    data['inverse_gap']=data['gap'].apply(lambda x: 1/x if x!=0 else -1)
    #Relative speeds between rusher and other players
    data['RelS_x']=rush_Sx-data['S_x']
    data['RelS_y']=rush_Sy-data['S_y']
    #Relative momentum between rusher and other players
    data['RelM_x']=rush_Mx-data['M_x']
    data['RelM_y']=rush_My-data['M_y']

    # Collapse the players of each play into one row of sorted values.
    # The slicing assumes every play occupies 22 consecutive rows, in the order
    # of data.PlayId.unique().
    playids=data.PlayId.unique()
    A=data['gap'].values
    B=data['RelS_x'].values
    C=data['RelS_y'].values
    D=data['inverse_gap'].values
    E=data['RelM_x'].values
    F=data['RelM_y'].values
    n=len(playids)
    val_dict={}
    for i in range(0,n):
        rows=slice(i*22,i*22+22)
        # gap descending, inverse gap ascending, everything else descending
        val_dict[playids[i]]=sorted(A[rows],reverse=True)+sorted(D[rows])+\
                            sorted(B[rows],reverse=True)+sorted(C[rows],reverse=True)+\
                            sorted(E[rows],reverse=True)+sorted(F[rows],reverse=True)
    val_cols=['{0}_{1}'.format(prefix,i+1)
              for prefix in ('P','P_Invd','Sx','Sy','Mx','My')
              for i in range(0,22)]

    rel_data=pd.DataFrame(data=list(val_dict.values()),columns=val_cols,index=list(val_dict.keys()))
    rel_data.index.name='PlayId'

    #Combining game-state and relative-player dataframes
    final_df=pd.merge(df_rush,rel_data,on='PlayId')
    Y=np.array(final_df.Yards.values).reshape(-1,1)

    #Dropping redundant features or features which don't seem useful
    final_df=final_df.drop([ 'GameId','S','NflId','Team',
           'DisplayName', 'JerseyNumber', 'Season', 'GameClock',
           'PossessionTeam', 'FieldPosition',
           'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'NflIdRusher',
           'OffenseFormation', 'OffensePersonnel', 'DefendersInTheBox',
           'DefensePersonnel', 'PlayDirection', 'TimeHandoff', 'TimeSnap',
           'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate', 'PlayerCollegeName',
           'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr', 'Stadium',
           'Location', 'StadiumType', 'Turf','Left', 'X_std', 'Y_std',
           'Rusher', 'Offense', 'Dir_std','Yards'],axis=1)

    final_df=final_df.set_index('PlayId')

    # One-hot targets over 199 classes: column 99 + yards is set to 1
    test=np.zeros((Y.shape[0],199))
    for idx, t in enumerate(list(Y)):
        test[idx][99 + t[0]] = 1

    #Loading model and CRPS function (needed to deserialize the saved model)
    def crps_nn(y_true,y_pred):
        """Mean squared difference between the cumulative sums of y_pred and y_true, divided by 199."""
        loss = K.mean(K.sum((K.cumsum(y_pred, axis = 1) - K.cumsum(y_true, axis=1))**2, axis=1))/199
        return loss
    model=load_model('nfl_pred.hdf5',custom_objects={'crps_nn':crps_nn})

    # Predict once and reuse the result for both the score and the return value
    # (the model used to be evaluated twice on the same input).
    preds=model.predict(final_df)

    # CRPS: mean squared gap between the clipped cumulative target and cumulative prediction
    y_true=np.clip(np.cumsum(test, axis=1), 0, 1)
    y_pred=np.clip(np.cumsum(preds, axis=1), 0, 1)
    cr=((y_true - y_pred) ** 2).sum(axis=1).sum(axis=0) / (199 * final_df.shape[0])

    print('Continuous Ranked Probability Score:',round(cr,8))

    return final_df.index,preds,Y

Wall time: 0 ns


In [3]:
%%time
# Score the whole training set: p = PlayId index, n = model predictions, r = true yards (one column).
data=pd.read_csv('train.csv',low_memory=False)
p,n,r=function_2(data)

Continuous Ranked Probability Score: 0.0134318
Wall time: 18.6 s


In [4]:
# True yardage of the play in row 1.
r[1,0]

3

In [5]:
# One row per play: [PlayId, margin between the two highest predicted probabilities,
# 1 if the most probable class equals the true yardage else 0].
# The row count is taken from the predictions instead of the hard-coded 31007,
# so the cell keeps working when the number of plays changes.
x=np.zeros((len(n),3))
x[:,0]=p.values
for idx,i in enumerate(n):
    prob=np.sort(i)[::-1]
    x[idx,1]=prob[0]-prob[1]
    # class index 99 corresponds to 0 yards
    x[idx,2]=1 if r[idx,0]==np.argmax(i)-99 else 0


In [6]:
# Keep only the plays whose most probable class matched the true yardage (flag column == 1).
mask=x[:,2]==1
x=x[mask,:]

In [7]:
# Of those, select the plays where the top class beats the runner-up by more than 0.05
# and recover their PlayIds as ints.
indexes=np.argwhere(x[:,1]>0.05)[:,0]
final_index=[int(x[i,0]) for i in indexes]

In [8]:
# Number of plays that passed both filters.
len(final_index)

556

In [9]:
# Reload the raw, unprocessed data.
data=pd.read_csv('train.csv',low_memory=False)

In [10]:
# Keep only the rows belonging to the selected plays.
data=data[data.PlayId.isin(final_index)]

In [11]:
# Display the filtered rows.
data

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
308,2017090700,20170907001177,away,16.63,18.83,1.94,2.53,0.21,339.25,33.96,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
309,2017090700,20170907001177,away,17.12,22.33,1.06,1.11,0.17,305.74,350.30,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
310,2017090700,20170907001177,away,17.55,28.83,0.61,0.08,0.07,35.66,107.79,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
311,2017090700,20170907001177,away,16.48,23.78,2.20,1.24,0.31,346.32,67.88,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
312,2017090700,20170907001177,away,14.01,25.76,3.57,1.13,0.45,2.95,95.38,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
682105,2019112500,20191125003496,home,23.34,20.92,1.29,2.31,0.14,12.17,28.13,...,12,Los Angeles Memorial Coliseum,"Los Angeles, CA",Outdoor,Grass,Clear,62.0,64.0,5,WSW
682106,2019112500,20191125003496,home,20.63,25.29,0.90,0.82,0.09,119.81,39.01,...,12,Los Angeles Memorial Coliseum,"Los Angeles, CA",Outdoor,Grass,Clear,62.0,64.0,5,WSW
682107,2019112500,20191125003496,home,21.73,21.18,3.80,2.32,0.37,37.32,354.26,...,12,Los Angeles Memorial Coliseum,"Los Angeles, CA",Outdoor,Grass,Clear,62.0,64.0,5,WSW
682108,2019112500,20191125003496,home,22.32,23.44,1.01,0.89,0.12,72.71,347.88,...,12,Los Angeles Memorial Coliseum,"Los Angeles, CA",Outdoor,Grass,Clear,62.0,64.0,5,WSW


In [12]:
# Save the raw rows of the selected plays.
data.to_csv('Bestresulting.csv')

In [13]:
# Filter Final_data.csv down to the same plays and save the result (index column included).
final_df=pd.read_csv('Final_data.csv')
final_df=final_df[final_df.PlayId.isin(final_index)]
final_df.to_csv('Final_Best_data.csv',index=True)