In [None]:
import pandas as pd 
import numpy as np
import math
import seaborn as sns
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# READ IN FILES
# Game, player, play and tackle tables (paths are relative to the working directory).
games = pd.read_csv('games.csv')
players = pd.read_csv('players.csv')
plays = pd.read_csv('plays.csv')
tackles = pd.read_csv('tackles.csv')

# Weekly tracking files, read in week order 1..9. Each frame is still bound to
# week1..week9 so anything that refers to a single week keeps working.
data_frames = [pd.read_csv(f'tracking_week_{week_number}.csv') for week_number in range(1, 10)]
week1, week2, week3, week4, week5, week6, week7, week8, week9 = data_frames

# One long tracking frame covering all weeks, with a fresh 0..n-1 index.
weeks = pd.concat(data_frames, ignore_index=True)

In [None]:
# CLEAN OUT SOME VARIOUS TACKLES
# Rows of the tackles table where a tackle was credited (tackle == 1).
tackleMade = tackles[tackles['tackle'] == 1][['gameId', 'nflId', 'playId', 'tackle']]

# Number of credited tacklers per play; tackleStuff2 lists the plays that have
# anything other than exactly one, which is handy for inspection.
tackleStuff = tackleMade.groupby(['gameId', 'playId'])['tackle'].count().reset_index(name = 'counts')
tackleStuff2 = tackleStuff[tackleStuff['counts'] != 1]

# Remove every play with more than one credited tackler, so each remaining play
# has exactly one row in tackleMade.
# NOTE: this replaces a hard-coded filter on game 2022091107 / play 1879. That
# filter compared gameId to a string (presumably never equal to the numeric ids
# read from the CSV) and AND-ed two "!=" tests, so it dropped playId 1879 from
# every game instead of dropping the multi-tackler plays.
tacklersOnPlay = tackleMade.groupby(['gameId', 'playId'])['tackle'].transform('count')
tackleMade = tackleMade[tacklersOnPlay == 1]

In [None]:
# Tracking rows at the frame tagged with the 'pass_outcome_caught' event.
weeksUpdates = weeks[weeks['event'].eq('pass_outcome_caught')]

# Attach each play's ball carrier id and defensive team to those tracking rows.
plays1 = plays[['gameId', 'playId', 'ballCarrierId', 'defensiveTeam']]
weeksUpdates0 = weeksUpdates.merge(plays1, how='left', on=['gameId', 'playId'])

# Defenders: rows whose club is the play's defensive team.
weeksUpdates1 = weeksUpdates0[weeksUpdates0['club'].eq(weeksUpdates0['defensiveTeam'])]

# Offense: every other row except the ones tracking the football itself.
offensivePlayers = weeksUpdates0[
    weeksUpdates0['club'].ne(weeksUpdates0['defensiveTeam'])
    & weeksUpdates0['club'].ne('football')
]

# Pair every defender row with the credited tackler(s) of the same play.
# After this merge nflId_x is the defender and nflId_y is the tackler.
weeksUpdates2 = weeksUpdates1.merge(tackleMade, how='inner', on=['gameId', 'playId'])

# Tracking row of the ball carrier (the row whose nflId equals ballCarrierId).
ballCarrier = weeksUpdates0.loc[
    weeksUpdates0['nflId'] == weeksUpdates0['ballCarrierId'],
    ['gameId', 'playId', 'frameId', 'x', 'y', 's', 'a', 'dis', 'o', 'dir', 'nflId'],
]

# Put the ball carrier's tracking values next to each defender row (same frame),
# then remove exact duplicate rows.
ballCarrierLocation = (
    weeksUpdates2
    .merge(ballCarrier, how='left', on=['gameId', 'playId', 'frameId'])
    .drop_duplicates()
)

# Target: 1 when this defender is the credited tackler, otherwise 0.
ballCarrierLocation['madeTackle'] = (
    ballCarrierLocation['nflId_x'] == ballCarrierLocation['nflId_y']
).astype(int)
ballCarrierLocation = ballCarrierLocation.drop(columns=['ballCarrierId', 'defensiveTeam'])

# The merge suffixed the overlapping tracking columns: _x belongs to the
# defender, _y to the ball carrier. Give them readable names.
defenderColumnNames = {
    'x_x': 'x',
    'y_x': 'y',
    's_x': 's',
    'a_x': 'a',
    'dis_x': 'dis',
    'o_x': 'o',
    'dir_x': 'dir',
}
ballColumnNames = {
    'x_y': 'x_ball',
    'y_y': 'y_ball',
    's_y': 's_ball',
    'a_y': 'a_ball',
    'dis_y': 'dis_ball',
    'o_y': 'o1_ball',
    'dir_y': 'dir1_ball',
}
column_mapping = {**defenderColumnNames, **ballColumnNames, 'tackler': 'madeTackle'}
ballCarrierLocation1 = ballCarrierLocation.rename(columns=column_mapping)

In [None]:
# Blockers, used later to count how many stand between a defender and the ball
# carrier: offensive players other than the ball carrier himself.
offensivePlayers = offensivePlayers[offensivePlayers['nflId'].ne(offensivePlayers['ballCarrierId'])]

# Bring in the ball carrier's tracking row for the same frame. Overlapping
# columns come back suffixed: _x is the blocker, _y is the ball carrier.
offensivePlayers = offensivePlayers.merge(ballCarrier, how='left', on=['gameId', 'playId', 'frameId'])

# Straight-line distance from each blocker to the ball carrier.
blockerDx = offensivePlayers['x_x'] - offensivePlayers['x_y']
blockerDy = offensivePlayers['y_x'] - offensivePlayers['y_y']
offensivePlayers['blockerDistanceToBallCarrier'] = np.sqrt(blockerDx**2 + blockerDy**2)

# Keep only what the blocker count needs: drop the ball carrier's tracking
# values and the blocker's descriptive / kinematic columns.
blockerColumnsToDrop = [
    'x_y', 'y_y', 's_y', 'a_y', 'dis_y', 'o_y', 'dir_y', 'time', 'club', 'time',
    'jerseyNumber', 'defensiveTeam', 'event', 'ballCarrierId', 'displayName',
    's_x', 'a_x', 'dis_x', 'o_x', 'dir_x',
]
offensivePlayers = offensivePlayers.drop(columns=blockerColumnsToDrop)

# The remaining position columns describe the blocker.
columnsRename = {'x_x': 'x_block', 'y_x': 'y_block'}
offensivePlayers = offensivePlayers.rename(columns=columnsRename)

In [None]:
# CLEAN UP SOME PLAYS THAT ARE INCORRECT
# Keep only the plays that have exactly 11 rows with a non-null club.
test = (
    ballCarrierLocation1
    .groupby(['gameId', 'playId'])['club']
    .count()
    .reset_index(name='counts')
)
test = test.loc[test['counts'] == 11, ['gameId', 'playId']]

# The inner join discards every row belonging to a play that is not in `test`.
ballCarrierLocation1 = ballCarrierLocation1.merge(test, how='inner', on=['gameId', 'playId'])

In [None]:
# Per-defender features measured against the ball carrier.
# Every group-wise feature uses the same (game, play, frame) grouping.
playFrameKeys = ['gameId', 'playId', 'frameId']


def _secondSmallest(distances):
    """Return the second-smallest value of `distances`, or None if fewer than two are available."""
    twoSmallest = distances.nsmallest(2).values
    return twoSmallest[-1] if len(twoSmallest) > 1 else None


#Distance to ball carrier
ballCarrierLocation1['distanceToBallCarrier'] = np.sqrt(
    (ballCarrierLocation1['x'] - ballCarrierLocation1['x_ball'])**2 +
    (ballCarrierLocation1['y'] - ballCarrierLocation1['y_ball'])**2)

# Smallest defender distance within each play/frame, aligned to the rows.
closestInFrame = ballCarrierLocation1.groupby(playFrameKeys)['distanceToBallCarrier'].transform('min')

#CLOSEST DEFENDER --> is the given defender the one closest to the ball carrier
# (previously grouped by play only; now per frame like the other features)
ballCarrierLocation1['IsClosestDefender'] = (
    ballCarrierLocation1['distanceToBallCarrier'] == closestInFrame
).astype(int)

#IS THE TACKLER INFRONT
# 1 when the defender is further along the play direction than the ball
# carrier: lower x on a 'left' play, higher x on a 'right' play. Otherwise 0.
playDirection = ballCarrierLocation1['playDirection']
defenderAtLowerX = ballCarrierLocation1['x_ball'] > ballCarrierLocation1['x']
defenderAtHigherX = ballCarrierLocation1['x_ball'] < ballCarrierLocation1['x']
ballCarrierLocation1['inFront'] = (
    ((playDirection == 'left') & defenderAtLowerX) |
    ((playDirection == 'right') & defenderAtHigherX)
).astype(int)

#Distance Rank (1 = closest; ties share the lowest rank)
ballCarrierLocation1['distanceRank'] = ballCarrierLocation1.groupby(playFrameKeys)['distanceToBallCarrier'].rank(method='min')

#Dist to ClosestSideLine
# Distance from the ball carrier's y to the nearer of y = 0 and y = 53.3
# (53.3 is presumably the field width in yards -- confirm against the data dictionary).
ballCarrierLocation1['closestSideLine'] = np.where(
    ballCarrierLocation1['y_ball'] >= 26.65,
    53.3 - ballCarrierLocation1['y_ball'],
    ballCarrierLocation1['y_ball'])

#Relative X Position:
# Signed x offset of the defender from the ball carrier, positive in the play
# direction. Rows with any other playDirection get np.select's default of 0.
conditions = [
    (ballCarrierLocation1['playDirection'] == 'right'), (ballCarrierLocation1['playDirection'] == 'left')
]
choices = [
    (ballCarrierLocation1['x'] - ballCarrierLocation1['x_ball']),
    (ballCarrierLocation1['x_ball'] - ballCarrierLocation1['x'])
]
ballCarrierLocation1['relativeX'] = np.select(conditions, choices)

#What is the closest defender on the plays distance to the ball carrier
ballCarrierLocation1['closestDistance'] = closestInFrame

#Gap between the given defender and the closest defender (0 for the closest one, positive otherwise)
ballCarrierLocation1['distBetween'] = ballCarrierLocation1['distanceToBallCarrier'] - ballCarrierLocation1['closestDistance']

#Distance of the second-closest defender in each play/frame
secondDistByFrame = (
    ballCarrierLocation1.groupby(playFrameKeys)['distanceToBallCarrier']
    .apply(_secondSmallest)
    .rename('secondDist')
    .reset_index()
)
ballCarrierLocation1 = ballCarrierLocation1.merge(secondDistByFrame, how = 'left', on = playFrameKeys)

#For the closest defender, replace the 0 gap with (closest - second closest).
#This is zero or negative: it says how far ahead of the next defender he is.
ballCarrierLocation1.loc[ballCarrierLocation1['distBetween'] == 0, 'distBetween'] = ballCarrierLocation1['closestDistance'] - ballCarrierLocation1['secondDist']

#One row per (defender, blocker) pair for the blocker count.
#Only the blocker columns that are needed are merged in. Merging the whole
#offensivePlayers frame makes pandas re-suffix any column names it shares with
#this frame (for example the defender id 'nflId_x'), which breaks the later
#group-by on 'nflId_x'.
blockerColumns = ['gameId', 'playId', 'frameId', 'x_block', 'y_block', 'blockerDistanceToBallCarrier']
ballCarrierLocation2 = ballCarrierLocation1.merge(offensivePlayers[blockerColumns], how = 'left', on = playFrameKeys)
playersJoined = ballCarrierLocation1.copy()

In [None]:
# GET THE COUNT OF BLOCKERS BETWEEN DEFENDER AND BALL CARRIER
# ballCarrierLocation2 has one row per (defender, blocker) pair. A blocker
# counts as "in between" when he is no further from the ball carrier than the
# defender is AND his x lies within the closed interval spanned by the
# defender's x and the ball carrier's x, in either order.
blockerNotFurtherThanDefender = (
    ballCarrierLocation2['blockerDistanceToBallCarrier'] <= ballCarrierLocation2['distanceToBallCarrier']
)
blockerXFromDefenderToBall = ballCarrierLocation2['x_block'].between(
    ballCarrierLocation2['x'], ballCarrierLocation2['x_ball'])
blockerXFromBallToDefender = ballCarrierLocation2['x_block'].between(
    ballCarrierLocation2['x_ball'], ballCarrierLocation2['x'])

x = blockerNotFurtherThanDefender & (blockerXFromDefenderToBall | blockerXFromBallToDefender)
ballCarrierLocation2['inBetween'] = x.astype(int)

# Sum the flags per defender and frame, then attach the count to the
# one-row-per-defender frame.
defenderKeys = ['gameId', 'playId', 'nflId_x', 'frameId']
blockerCount = (
    ballCarrierLocation2
    .groupby(defenderKeys)['inBetween']
    .sum()
    .reset_index(name='countOfBlockersBetween')
)
playersJoined = playersJoined.merge(blockerCount, how='left', on=defenderKeys)


In [None]:
###############MAKE THE MODEL##################

# Train on the first 80% of plays (in row order) and test on the rest.
# The split is made on whole plays rather than at a raw row position, so the
# rows of one play can never end up on both sides: defenders of the same play
# share the ball carrier's features and have rank-type features computed
# within the play.
playNumber = playersJoined.groupby(['gameId', 'playId'], sort=False).ngroup()  # 0, 1, 2, ... in order of first appearance
trainPlayCount = int(playNumber.nunique() * .8)
isTrainRow = playNumber < trainPlayCount
split_idx = int(isTrainRow.sum())  # number of training rows

# Shuffle the rows inside each partition (fixed seed for reproducibility).
playersJoined1 = playersJoined[isTrainRow].sample(frac = 1, random_state = 42)   #Train
playersJoined2 = playersJoined[~isTrainRow].sample(frac = 1, random_state = 42)  #Test

#########################################################################################
features =[ 'countOfBlockersBetween', 'distanceRank' ,  'distBetween' ,'closestSideLine',
            'distanceToBallCarrier', 'relativeX' ]
#########################################################################################

X_train = playersJoined1[features]
y_train = playersJoined1['madeTackle']

X_test = playersJoined2[features]
y_test = playersJoined2['madeTackle']

# Standardise the features, then fit a logistic regression. Always predict
# through `pipe` so the same scaling is applied to new rows.
model1 = LogisticRegression(max_iter=2000)
scaler = StandardScaler()

pipe = make_pipeline(scaler, model1)

pipe.fit(X_train, y_train)

predictions = pipe.predict_proba(X_test)   # columns: P(madeTackle = 0), P(madeTackle = 1)
predictions1 = pipe.predict(X_test)        # hard 0/1 labels
output = pd.DataFrame(predictions , columns = ['probNoTackle', 'probTackle'])

# Attach the probabilities to the test rows. `output` has a fresh 0..n-1 index
# in the same row order as playersJoined2, so after reset_index() the join is
# a positional match. reset_index() keeps the original row label in an
# 'index' column.
result_df = playersJoined2.copy()
result_df = result_df.reset_index()
result_df = result_df.join(output)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
print("Accuracy:", accuracy_score(y_test, predictions1))
print("Precision:", precision_score(y_test, predictions1))
print("Recall:", recall_score(y_test, predictions1))
print("F1 Score:", f1_score(y_test, predictions1))

In [None]:
#SCORE THE ENTIRE SET WITH THE FITTED PIPELINE
# NOTE: nothing is refit here; the pipeline trained on the training rows is
# applied to every row. Predictions must go through `pipe`, not `model1`:
# model1 was fitted on the scaler's standardised output, so feeding it raw
# feature values gives wrong probabilities.
X_Final = playersJoined[features]
Final_Probabilities = pipe.predict_proba(X_Final)
Final_Output = pd.DataFrame(Final_Probabilities, columns = ['probNoTackle', 'probTackle'])

# Positional join: Final_Output has a fresh 0..n-1 index in playersJoined's row order.
Final_Results = playersJoined.copy()
Final_Results = Final_Results.reset_index()
Final_Results = Final_Results.join(Final_Output)

#NORMALIZE SO NO PLAY HAS OVER 1 EXPECTATION
# Rescale so the tackle probabilities of each play sum to 1.
playProbTotal = Final_Results.groupby(['gameId', 'playId'])['probTackle'].transform('sum')
Final_Results['probTackle'] = Final_Results['probTackle'] / playProbTotal

#Lets add a column that assigns 1 for the play grouping to whoever has the highest prob of making a tackle
playProbMax = Final_Results.groupby(['gameId', 'playId'])['probTackle'].transform('max')
Final_Results['expectedTackler'] = (Final_Results['probTackle'] == playProbMax).astype(int)

# Look up each defender's position. The players key is renamed to the
# defender id column first, so the merge adds only 'position' and cannot clash
# with the nflId / nflId_x / nflId_y columns already present.
playerPositions = players[['nflId', 'position']].rename(columns = {'nflId': 'nflId_x'})
Final_Result = Final_Results.merge(playerPositions, how = 'left', on = 'nflId_x')

# Drop the working columns. errors='ignore' because not every listed column is
# created by this notebook (e.g. 'inFrontRank'), and a missing one should not
# abort the cell.
columnsToDiscard = ['nflId', 'club', 'playDirection','tackle','x', 'y','o', 'dir', 'event', 'IsClosestDefender',
                    'frameId', 'time', 'inFront', 'nflId_y','a', 'secondDist', 'inFrontRank', 'closestDistance',
                    'probNoTackle','s', 's_ball', 'a_ball','dir1_ball','o1_ball','x_ball', 'y_ball',  'dis', 'dis_ball']
Final_Result = Final_Result.drop(columns = columnsToDiscard, errors = 'ignore')

# Add the quarter and game clock of each play.
Final_Result = Final_Result.merge(plays[['gameId', 'playId','quarter', 'gameClock']], how = 'left', on = ['gameId', 'playId'])

In [None]:
import statsmodels.api as sm

# Fit a statsmodels Logit on the training rows, using the same feature columns
# plus an explicit constant (intercept) column, and print its summary table.
y = playersJoined1['madeTackle']
X = sm.add_constant(playersJoined1[features])

model = sm.Logit(y, X)
result = model.fit()

print(result.summary())