In [1]:
import pandas as pd 
import numpy as np
import math
import seaborn as sns
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
#READ IN FILES
games = pd.read_csv('games.csv')
players = pd.read_csv('players.csv')
plays = pd.read_csv('plays.csv')
tackles = pd.read_csv('tackles.csv')
week1 = pd.read_csv('tracking_week_1.csv')
week2 = pd.read_csv('tracking_week_2.csv')
week3 = pd.read_csv('tracking_week_3.csv')
week4 = pd.read_csv('tracking_week_4.csv')
week5 = pd.read_csv('tracking_week_5.csv')
week6 = pd.read_csv('tracking_week_6.csv')
week7 = pd.read_csv('tracking_week_7.csv')
week8 = pd.read_csv('tracking_week_8.csv')
week9 = pd.read_csv('tracking_week_9.csv')

data_frames = [week1, week2, week3, week4, week5, week6, week7, week8, week9]
weeks = pd.concat(data_frames, ignore_index=True)

In [3]:
#CLEAN OUT SOME VARIOUS TACKLES
tackleMade = tackles[tackles['tackle'] == 1][['gameId','nflId', 'playId', 'tackle']]
###Are there plays with multiple tackles? --> remove those
tackleStuff = tackleMade.groupby(['gameId', 'playId'])['tackle'].count().reset_index(name = 'counts')
tackleStuff2 = tackleStuff[tackleStuff['counts'] !=1 ]
tackleMade = tackleMade[(tackleMade['gameId'] != '2022091107') & (tackleMade['playId'] != 1879)]

In [11]:
#All of the Plays where the pass was caught
weeksUpdates = weeks[(weeks['event'] == 'pass_outcome_caught') ]

#in the Plays data, we want to get the ball carrier & Tackler
plays1 = plays[['gameId', 'playId','ballCarrierId', 'defensiveTeam']]
weeksUpdates0 = weeksUpdates.merge(plays1, how = 'left', on = ['gameId', 'playId'])

#Defensive players are where club = defensive team
weeksUpdates1 =weeksUpdates0[weeksUpdates0['club'] == weeksUpdates0['defensiveTeam']]

#get the offenive players
offensivePlayers = weeksUpdates0[(weeksUpdates0['club'] != weeksUpdates0['defensiveTeam']) &
                                (weeksUpdates0['club'] != 'football')]

#who made the tackle
weeksUpdates2 = weeksUpdates1.merge(tackleMade, how = 'inner', on= ['gameId', 'playId'])

#now we want the ballcarriers location
ballCarrier = weeksUpdates0[weeksUpdates0['nflId'] == weeksUpdates0['ballCarrierId']]
ballCarrier = ballCarrier[['gameId', 'playId', 'frameId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'nflId']]

ballCarrierLocation = weeksUpdates2.merge(ballCarrier, how = 'left', on = ['gameId', 'playId', 'frameId'])


ballCarrierLocation = ballCarrierLocation.drop_duplicates()
#ballCarrierLocation['tackler'].fillna(value = 0, inplace = True)
ballCarrierLocation['madeTackle'] = (ballCarrierLocation['nflId_x'] == ballCarrierLocation['nflId_y']).astype(int)
ballCarrierLocation.drop(['ballCarrierId', 'defensiveTeam'], axis = 1 , inplace = True)

column_mapping = {
    'x_x': 'x',
    'y_x': 'y',
    's_x': 's',
    'a_x': 'a',
    'dis_x': 'dis',
    'o_x': 'o',
    'dir_x': 'dir',
    'x_y': 'x_ball',
    'y_y': 'y_ball',
    's_y': 's_ball',
    'a_y': 'a_ball',
    'dis_y': 'dis_ball',
    'o_y': 'o1_ball',
    'dir_y': 'dir1_ball',
    'tackler': 'madeTackle'
    #'nflId': 'nflId_x'
}
ballCarrierLocation1 = ballCarrierLocation.rename(columns = column_mapping)

In [12]:
#This will be used later for finding number of blockers in betweer
offensivePlayers = offensivePlayers[offensivePlayers['nflId'] != offensivePlayers['ballCarrierId'] ]
offensivePlayers = offensivePlayers.merge(ballCarrier, how = 'left', on = ['gameId', 'playId', 'frameId'])
offensivePlayers['blockerDistanceToBallCarrier'] = np.sqrt((offensivePlayers['x_x'] - offensivePlayers['x_y'])**2 + 
                                         (offensivePlayers['y_x'] - offensivePlayers['y_y'])**2)
offensivePlayers = offensivePlayers.drop(['x_y', 'y_y','s_y', 'a_y','dis_y','o_y', 'dir_y' ,'time', 'club', 'time', 
                                          'jerseyNumber', 'defensiveTeam', 'event', 'ballCarrierId', 'displayName',
                                         's_x', 'a_x', 'dis_x', 'o_x', 'dir_x', 'nflId_x'], axis = 1)
columnsRename = {'x_x':'x_block', 'y_x': 'y_block'}

offensivePlayers = offensivePlayers.rename(columns = columnsRename)

In [13]:
#CLEAN UP SOME PLAYS THAT ARE INCORRECT
test = ballCarrierLocation1.groupby(['gameId','playId'])['club'].count().reset_index(name = 'counts')
test = test[test['counts']== 11]
test = test[['gameId', 'playId']]
ballCarrierLocation1 = ballCarrierLocation1.merge(test, how = 'inner', on = ['gameId', 'playId'])

In [14]:
#Distance to ball carrier

ballCarrierLocation1['distanceToBallCarrier'] = np.sqrt((ballCarrierLocation1['x'] - ballCarrierLocation1['x_ball'])**2 + 
                                          (ballCarrierLocation1['y'] - ballCarrierLocation1['y_ball'])**2)

#CLOSEST DEFENDER --> is the given defender the one closest to the ball carrier
ballCarrierLocation1['IsClosestDefender'] = ballCarrierLocation1.groupby(['gameId', 'playId'])['distanceToBallCarrier'].transform(lambda x: x == x.min()).astype(int)

#IS THE TACKLER INFRONT
ballCarrierLocation1['inFront'] = ballCarrierLocation1.apply(
    lambda row: 1 if ((row['playDirection'] == 'left' and row['x_ball'] > row['x']) or 
                      (row['playDirection'] == 'right' and row['x_ball'] < row['x'])) else 0, axis=1)

#Distance Rank
ballCarrierLocation1['distanceRank'] = ballCarrierLocation1.groupby(['gameId', 'playId', 'frameId'])['distanceToBallCarrier'].rank(method='min')

#Dist to ClosestSideLine
ballCarrierLocation1['closestSideLine'] = ballCarrierLocation1.apply(
    lambda row: (53.3 - row['y_ball']) if row['y_ball'] >= 26.65 else row['y_ball'], axis=1)

#Relative X Position:
conditions = [
    (ballCarrierLocation1['playDirection'] == 'right'),(ballCarrierLocation1['playDirection'] == 'left') 
]

choices = [ (ballCarrierLocation1['x'] - ballCarrierLocation1['x_ball']), (ballCarrierLocation1['x_ball'] - ballCarrierLocation1['x'])  ]          
ballCarrierLocation1['relativeX'] = np.select(conditions, choices)

#What is the closest defender on the plays distance to the ball carrier
ballCarrierLocation1['closestDistance'] = ballCarrierLocation1.groupby(['gameId', 'playId', 'frameId'])['distanceToBallCarrier'].transform('min')

#Find the distance between the given defender and the closest defender. If the current defender is the closest
#Then this will be negative because it will do 2nd closest - closest (Current)
ballCarrierLocation1['distBetween'] = ballCarrierLocation1['distanceToBallCarrier'] - ballCarrierLocation1['closestDistance']

#If the current defender is the closestDefender, then we want to get the distance to the second closest defender
x = ballCarrierLocation1.groupby(['gameId', 'playId', 'frameId'])['distanceToBallCarrier'].apply(lambda x: x.nsmallest(2).values[-1] if len(x.nsmallest(2).values) > 1 else None)
x = x.reset_index()
x = x.rename(columns = {'distanceToBallCarrier' : 'secondDist'})
ballCarrierLocation1 = ballCarrierLocation1.merge(x , how = 'left', on = ['gameId', 'playId', 'frameId'])

ballCarrierLocation1.loc[ballCarrierLocation1['distBetween'] == 0, 'distBetween'] = ballCarrierLocation1['closestDistance'] - ballCarrierLocation1['secondDist']
#ballCarrierLocation1['2ndClosest'] = x['distanceToBallCarrier']

ballCarrierLocation2 = ballCarrierLocation1.merge(offensivePlayers, how = 'left', on = ['gameId', 'playId', 'frameId'])
playersJoined = ballCarrierLocation1.copy()

In [15]:
offensivePlayers.head()

Unnamed: 0,gameId,playId,frameId,playDirection,x_block,y_block,nflId_y,blockerDistanceToBallCarrier
0,2022090800,56,6,left,88.8,26.7,42489.0,12.614856
1,2022090800,56,6,left,88.21,29.31,42489.0,10.456003
2,2022090800,56,6,left,89.87,26.18,42489.0,13.745854
3,2022090800,56,6,left,75.58,16.36,42489.0,19.69837
4,2022090800,56,6,left,90.75,29.99,42489.0,12.254387


In [16]:
#GET THE COUNT OF BLOCKERS BETWEEN DEFENDER AND BALL CARRIER
x = (ballCarrierLocation2['blockerDistanceToBallCarrier'] <= ballCarrierLocation2['distanceToBallCarrier']) & \
                     (((ballCarrierLocation2['x_block'] >= ballCarrierLocation2['x']) & (ballCarrierLocation2['x_block'] <= ballCarrierLocation2['x_ball'])) |
                     ((ballCarrierLocation2['x_block'] >= ballCarrierLocation2['x_ball']) & (ballCarrierLocation2['x_block'] <= ballCarrierLocation2['x'])))


ballCarrierLocation2['inBetween'] = x
ballCarrierLocation2['inBetween'] = ballCarrierLocation2['inBetween'].astype(int)
blockerCount = ballCarrierLocation2.groupby(['gameId', 'playId','nflId_x', 'frameId'])['inBetween'].sum().reset_index(name='countOfBlockersBetween')
playersJoined = playersJoined.merge(blockerCount, how = 'left', on = ['gameId', 'playId', 'nflId_x', 'frameId'])


In [17]:
###############MAKE THE MODEL##################

split_idx = int(len(playersJoined) * .8)

# Split the DataFrame into two
playersJoined1 = playersJoined.iloc[:split_idx, :] #Train
playersJoined1 = playersJoined1.sample(frac = 1, random_state = 42)

playersJoined2 = playersJoined.iloc[split_idx:, :] #Test
playersJoined2 = playersJoined2.sample(frac = 1, random_state = 42)

#########################################################################################
features =[ 'countOfBlockersBetween', 'distanceRank' ,  'distBetween' ,'closestSideLine', 
            'distanceToBallCarrier', 'relativeX' ]
#########################################################################################

X_train = playersJoined1[features]
y_train = playersJoined1['madeTackle']

X_test= playersJoined2[features]
y_test =playersJoined2['madeTackle']
                            
model1 = LogisticRegression(max_iter=2000)
scaler = StandardScaler()

pipe = make_pipeline(scaler, model1)

pipe.fit(X_train, y_train)                              
##########################33

predictions = pipe.predict_proba(X_test)
predictions1 = pipe.predict(X_test) 
output = pd.DataFrame(predictions , columns = ['probNoTackle', 'probTackle'])

#########
#NEED TO FIX THE COpying over?
result_df = playersJoined2.copy()
result_df = result_df.reset_index()
result_df = result_df.join(output)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print("Accuracy:", accuracy_score(y_test, predictions1))
print("Precision:", precision_score(y_test, predictions1))
print("Recall:", recall_score(y_test, predictions1))
print("F1 Score:", f1_score(y_test, predictions1))

Accuracy: 0.9397179899687707
Precision: 0.7417910447761195
Recall: 0.5171696149843913
F1 Score: 0.609442060085837


In [22]:
Final_Result.columns


Index(['index', 'gameId', 'playId', 'nflId_x', 'displayName', 'frameId',
       'time', 'jerseyNumber', 'club', 'playDirection', 'x', 'y', 's', 'a',
       'dis', 'o', 'dir', 'event', 'nflId_y', 'tackle', 'x_ball', 'y_ball',
       's_ball', 'a_ball', 'dis_ball', 'o1_ball', 'dir1_ball', 'nflId_x',
       'madeTackle', 'distanceToBallCarrier', 'IsClosestDefender', 'inFront',
       'distanceRank', 'closestSideLine', 'relativeX', 'closestDistance',
       'distBetween', 'secondDist', 'countOfBlockersBetween', 'probNoTackle',
       'probTackle', 'expectedTackler', 'nflId_y', 'position'],
      dtype='object')

In [23]:
#REFIT THE MODEL TO THE ENTIRE SET
X_Final = playersJoined[features]
Final_Probabilities = model1.predict_proba(X_Final)
Final_Output = pd.DataFrame(Final_Probabilities, columns = ['probNoTackle', 'probTackle'])

Final_Results = playersJoined.copy()
Final_Results = Final_Results.reset_index()
Final_Results = Final_Results.join(Final_Output)

#NORMALIZE SO NO PLAY HAS OVER 1 EXPECTATION
Final_Results['probTackle'] = Final_Results.groupby(['gameId', 'playId'])['probTackle'].transform(lambda x: x / x.sum())

#Lets add a column that assigns 1 for the play grouping to whoever has the highest prob of making a tackle
Final_Results['expectedTackler'] = Final_Results.groupby(['gameId', 'playId'])['probTackle'].transform(lambda x: x == x.max()).astype(int)


Final_Result = Final_Results.merge(players[['nflId', 'position']], how = 'left' , left_on = ['nflId_x'], right_on = ['nflId'])
Final_Result.drop(['gameId', 'playId', 'nflId_x', 'displayName', 
                    'jerseyNumber', 'club', 'x', 'y', 's', 'a',
                   'dis', 'o', 'dir', 'event', 'nflId_y', 'tackle', 'x_ball', 'y_ball',
                   's_ball', 'a_ball', 'dis_ball', 'o1_ball', 'dir1_ball', 'nflId_x',
                     'IsClosestDefender', 
                      'closestDistance', 'secondDist',  'probNoTackle',
                    'nflId_y', 'position'], axis = 1, inplace = True)

  Final_Result = Final_Results.merge(players[['nflId', 'position']], how = 'left' , left_on = ['nflId_x'], right_on = ['nflId'])


Unnamed: 0,index,frameId,time,playDirection,madeTackle,distanceToBallCarrier,inFront,distanceRank,closestSideLine,relativeX,distBetween,countOfBlockersBetween,probTackle,expectedTackler
0,0,6,2022-09-08 20:24:05.700000,left,0,7.067538,1,3.0,17.71,1.74,4.239536,0,9.233263e-09,0
1,1,6,2022-09-08 20:24:05.700000,left,0,13.527265,0,9.0,17.71,-12.30,10.699262,4,8.573717e-34,0
2,2,6,2022-09-08 20:24:05.700000,left,0,29.415605,1,11.0,17.71,9.23,26.587602,2,4.091757e-38,0
3,3,6,2022-09-08 20:24:05.700000,left,1,2.828003,1,1.0,17.71,1.70,-1.023296,0,9.458327e-01,1
4,4,6,2022-09-08 20:24:05.700000,left,0,12.502404,0,6.0,17.71,-12.26,9.674401,3,3.464040e-32,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52828,52828,6,2022-11-07 22:50:42.400000,right,0,11.485473,0,5.0,20.74,-8.75,3.378526,1,2.337972e-23,0
52829,52829,6,2022-11-07 22:50:42.400000,right,0,10.680262,0,4.0,20.74,-10.58,2.573315,2,5.428489e-24,0
52830,52830,6,2022-11-07 22:50:42.400000,right,0,27.426866,1,10.0,20.74,21.81,19.319919,4,1.695220e-18,0
52831,52831,6,2022-11-07 22:50:42.400000,right,0,9.185173,0,3.0,20.74,-8.93,1.078225,0,8.977590e-20,0


In [24]:
import statsmodels.api as sm

X = playersJoined1[features]
y = playersJoined1['madeTackle']
X = sm.add_constant(X)

model = sm.Logit(y,X)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.155228
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:             madeTackle   No. Observations:                42266
Model:                          Logit   Df Residuals:                    42259
Method:                           MLE   Df Model:                            6
Date:                Wed, 03 Jan 2024   Pseudo R-squ.:                  0.4904
Time:                        13:15:49   Log-Likelihood:                -6560.9
converged:                       True   LL-Null:                       -12875.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -0.1038      0.085     -1.216      0.224      -0.271     