In [1]:
from zipfile import ZipFile

folder = 'nfl-big-data-bowl-2024'
endpoint = '.zip'
output = folder + endpoint

with ZipFile(output, 'r') as zObject:
	zObject.extractall(folder)

In [9]:
import os

folder = 'nfl-big-data-bowl-2024'
csv_files = [file for file in os.listdir(folder) if file.endswith('.csv')]
csv_files.sort()

In [10]:
import pandas as pd
dfs = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder, file)) for file in csv_files}
# Unificar
df_joined = pd.concat(dfs[f'tracking_week_{i}'] for i in range(1, 10))

In [11]:
df_joined[['gameId', 'playId']].drop_duplicates()

Unnamed: 0,gameId,playId
0,2022090800,56
506,2022090800,80
1196,2022090800,101
2323,2022090800,122
3082,2022090800,146
...,...,...
1143606,2022110700,3658
1144963,2022110700,3686
1146711,2022110700,3707
1148045,2022110700,3740


In [172]:
df = df_joined.drop(columns=['displayName', 'jerseyNumber', 'time'])
df.head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,1,BUF,left,88.37,27.27,1.62,1.15,0.16,231.74,147.9,
1,2022090800,56,35472.0,2,BUF,left,88.47,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,3,BUF,left,88.56,27.01,1.57,0.49,0.15,230.98,147.05,
3,2022090800,56,35472.0,4,BUF,left,88.64,26.9,1.44,0.89,0.14,232.38,145.42,
4,2022090800,56,35472.0,5,BUF,left,88.72,26.8,1.29,1.24,0.13,233.36,141.95,


A continuación se eliminan frames no útiles para nuestra red

In [173]:
# Variables de filtrado
play_condition = (df['event'] == 'pass_forward') | (df['event'] == 'pass_shovel') | (df['event'] == 'run')
play_id_fields = ('gameId', 'playId')
event_id_fields = (*play_id_fields, 'frameId')

# Obtener Ids de la jugada
event_ids = df.loc[play_condition, event_id_fields]
event_ids.drop_duplicates(inplace=True)

# Eliminar jugadas que no contienen alguno de los 3 eventos
df = pd.merge(df, event_ids.loc[:, play_id_fields], on=play_id_fields, how='inner')

# Columna auxiliar que indica en que momento ocurre el evento
df = pd.merge(df, event_ids, on=play_id_fields, suffixes=('', 'Event'))

# Eliminar frames a partir del evento
df = df.loc[df.loc[:, 'frameId'] <= df.loc[:, 'frameIdEvent'], :]
df.head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,dis,o,dir,event,frameIdEvent
0,2022090800,80,35472.0,1,BUF,left,82.42,26.13,2.43,0.37,0.24,196.42,140.67,,6
1,2022090800,80,35472.0,2,BUF,left,82.58,25.96,2.34,0.93,0.24,203.85,138.66,,6
2,2022090800,80,35472.0,3,BUF,left,82.73,25.79,2.2,1.39,0.22,209.5,136.4,,6
3,2022090800,80,35472.0,4,BUF,left,82.87,25.66,1.96,1.87,0.2,214.57,133.71,,6
4,2022090800,80,35472.0,5,BUF,left,82.99,25.55,1.62,2.42,0.16,222.35,130.47,,6


Cambiamos los valores de la pelota NA por 0

In [174]:
# Orientación de la pelota utilizamos 0
df['dir'] = df['dir'].fillna(0)
df['o'] = df['o'].fillna(0)
df.loc[df.loc[:, 'club'] == 'football', :].head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,dis,o,dir,event,frameIdEvent
660,2022090800,80,,1,football,left,83.470001,30.459999,0.85,3.46,0.08,0.0,0.0,,6
661,2022090800,80,,2,football,left,83.330002,30.559999,1.93,8.59,0.18,0.0,0.0,,6
662,2022090800,80,,3,football,left,83.139999,30.74,3.03,9.67,0.26,0.0,0.0,,6
663,2022090800,80,,4,football,left,82.900002,30.99,4.01,8.54,0.35,0.0,0.0,,6
664,2022090800,80,,5,football,left,82.599998,31.32,5.02,8.34,0.45,0.0,0.0,,6


Cambiamos el playDirection para que tome valor 0 ó 1 en vez de cadena

In [175]:
df['playDirection'] = df['playDirection'].replace({'left': 0, 'right': 1})
df.head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,dis,o,dir,event,frameIdEvent
0,2022090800,80,35472.0,1,BUF,0,82.42,26.13,2.43,0.37,0.24,196.42,140.67,,6
1,2022090800,80,35472.0,2,BUF,0,82.58,25.96,2.34,0.93,0.24,203.85,138.66,,6
2,2022090800,80,35472.0,3,BUF,0,82.73,25.79,2.2,1.39,0.22,209.5,136.4,,6
3,2022090800,80,35472.0,4,BUF,0,82.87,25.66,1.96,1.87,0.2,214.57,133.71,,6
4,2022090800,80,35472.0,5,BUF,0,82.99,25.55,1.62,2.42,0.16,222.35,130.47,,6


Añadimos columnas relevantes

In [176]:
# Variables relevantes en la jugada
# intento, yardas restantes, linea de yardas, cuarto
play_relevant_vars = ('down', 'yardsToGo', 'yardlineNumber', 'quarter')

for var in play_relevant_vars:
    df = pd.merge(df, dfs['plays'].loc[:, (var, *play_id_fields)], on=play_id_fields, how='left')

# Posicion del jugador
df = pd.merge(df, dfs['players'].loc[:, ('position', 'nflId')], on='nflId', how='left')

# Jugadores designados como receptores
df['allowedReceiver'] = 0
df.loc[(df['position'] == 'QB')|(df['position'] == 'TE')|(df['position'] == 'WR')|(df['position'] == 'RB'), 'allowedReceiver'] = 1

# Local o visitante
df = pd.merge(df, dfs['games'].loc[:, ('homeTeamAbbr', 'gameId')], on='gameId', how='left')
df.loc[df.loc[:, 'club'] == df.loc[:, 'homeTeamAbbr'], 'local'] = 1
df.loc[df.loc[:, 'club'] == 'football', 'local'] = 0
df.loc[df.loc[:, 'club'] != df.loc[:, 'homeTeamAbbr'], 'local'] = -1

df.head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,...,event,frameIdEvent,down,yardsToGo,yardlineNumber,quarter,position,allowedReceiver,homeTeamAbbr,local
0,2022090800,80,35472.0,1,BUF,0,82.42,26.13,2.43,0.37,...,,6,2,4,31,1,G,0,LA,-1.0
1,2022090800,80,35472.0,2,BUF,0,82.58,25.96,2.34,0.93,...,,6,2,4,31,1,G,0,LA,-1.0
2,2022090800,80,35472.0,3,BUF,0,82.73,25.79,2.2,1.39,...,,6,2,4,31,1,G,0,LA,-1.0
3,2022090800,80,35472.0,4,BUF,0,82.87,25.66,1.96,1.87,...,,6,2,4,31,1,G,0,LA,-1.0
4,2022090800,80,35472.0,5,BUF,0,82.99,25.55,1.62,2.42,...,,6,2,4,31,1,G,0,LA,-1.0


Etiqueta de quién es el receptor en la jugada

In [177]:
# Etiqueta
df = pd.merge(df, dfs['plays'].loc[:, ('ballCarrierId', 'playId', 'gameId')], on=('playId', 'gameId'), how='left')
df['ballCarrier'] = df['nflId'] == df['ballCarrierId']
df['ballCarrier'] = df['ballCarrier'].astype(int)
df.head()

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,...,down,yardsToGo,yardlineNumber,quarter,position,allowedReceiver,homeTeamAbbr,local,ballCarrierId,ballCarrier
0,2022090800,80,35472.0,1,BUF,0,82.42,26.13,2.43,0.37,...,2,4,31,1,G,0,LA,-1.0,46076,0
1,2022090800,80,35472.0,2,BUF,0,82.58,25.96,2.34,0.93,...,2,4,31,1,G,0,LA,-1.0,46076,0
2,2022090800,80,35472.0,3,BUF,0,82.73,25.79,2.2,1.39,...,2,4,31,1,G,0,LA,-1.0,46076,0
3,2022090800,80,35472.0,4,BUF,0,82.87,25.66,1.96,1.87,...,2,4,31,1,G,0,LA,-1.0,46076,0
4,2022090800,80,35472.0,5,BUF,0,82.99,25.55,1.62,2.42,...,2,4,31,1,G,0,LA,-1.0,46076,0


In [178]:
# Ordenamos, lo que lleva a que la columna local sea irrelevante
df.sort_values(by=['gameId', 'playId', 'frameId', 'local'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,...,down,yardsToGo,yardlineNumber,quarter,position,allowedReceiver,homeTeamAbbr,local,ballCarrierId,ballCarrier
0,2022090800,80,35472.0,1,BUF,0,82.42,26.13,2.43,0.37,...,2,4,31,1,G,0,LA,-1.0,46076,0
1,2022090800,80,42392.0,1,BUF,0,81.75,28.39,1.18,2.36,...,2,4,31,1,C,0,LA,-1.0,46076,0
2,2022090800,80,42489.0,1,BUF,0,69.42,22.59,5.78,5.05,...,2,4,31,1,WR,1,LA,-1.0,46076,0
3,2022090800,80,44875.0,1,BUF,0,82.93,24.59,1.04,3.38,...,2,4,31,1,T,0,LA,-1.0,46076,0
4,2022090800,80,44985.0,1,BUF,0,72.64,8.50,5.55,4.09,...,2,4,31,1,WR,1,LA,-1.0,46076,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287449,2022110700,3581,48537.0,20,NO,0,81.44,27.81,0.44,2.11,...,2,4,28,4,DE,0,NO,1.0,46101,0
287450,2022110700,3581,48544.0,20,NO,0,81.52,29.75,0.52,1.07,...,2,4,28,4,DT,0,NO,1.0,46101,0
287451,2022110700,3581,52482.0,20,NO,0,83.55,20.72,5.13,0.99,...,2,4,28,4,OLB,0,NO,1.0,46101,0
287452,2022110700,3581,53505.0,20,NO,0,78.52,44.18,3.30,2.74,...,2,4,28,4,CB,0,NO,1.0,46101,0


In [179]:
columnas = ['x','y','s','a','dis','o','dir','down','yardsToGo','yardlineNumber','quarter']
ds = df.loc[:, :]
for columna in columnas:
  # Normalización min-max
  min_val = ds[columna].min()
  max_val = ds[columna].max()
  ds[columna] = (ds[columna] - min_val) / (max_val - min_val)

In [192]:
import tensorflow as tf

numeric_feature_names = ['playDirection', 'x','y','s','a','dis','o','dir','down','yardsToGo','yardlineNumber','allowedReceiver','quarter']
numeric_features = ds[numeric_feature_names]

snapshots = tf.convert_to_tensor(numeric_features)
snapshots = tf.reshape(snapshots, (-1, 23, 13))

target = 'ballCarrier'
label_features = ds[target]
labels = tf.convert_to_tensor(label_features)
labels = tf.reshape(labels, (-1, 23))
snapshots.shape, labels.shape

(TensorShape([12498, 23, 13]), TensorShape([12498, 23]))

In [193]:
ds_nfl =  tf.data.Dataset.from_tensor_slices((snapshots, labels))
ds_nfl

<_TensorSliceDataset element_spec=(TensorSpec(shape=(23, 13), dtype=tf.float64, name=None), TensorSpec(shape=(23,), dtype=tf.int32, name=None))>

In [194]:
for elem in ds_nfl.take(1):
  print(elem)

(<tf.Tensor: shape=(23, 13), dtype=float64, numpy=
array([[0.        , 0.67738194, 0.46672598, 0.11516587, 0.01435221,
        0.0609137 , 0.54561111, 0.39075   , 0.33333333, 0.125     ,
        0.6122449 , 0.        , 0.        ],
       [0.        , 0.67183099, 0.5069395 , 0.05592417, 0.09154383,
        0.03299492, 0.75333333, 0.95644444, 0.33333333, 0.125     ,
        0.6122449 , 0.        , 0.        ],
       [0.        , 0.56967688, 0.40373665, 0.27393364, 0.19588828,
        0.14720812, 0.82991667, 0.79963889, 0.33333333, 0.125     ,
        0.6122449 , 1.        , 0.        ],
       [0.        , 0.68160729, 0.43932384, 0.0492891 , 0.13110938,
        0.03045685, 0.68516667, 0.365     , 0.33333333, 0.125     ,
        0.6122449 , 0.        , 0.        ],
       [0.        , 0.5963546 , 0.15302491, 0.26303317, 0.15865011,
        0.14467005, 0.80427778, 0.64616667, 0.33333333, 0.125     ,
        0.6122449 , 1.        , 0.        ],
       [0.        , 0.68881524, 0.53807829, 

In [195]:
num_train = len(ds_nfl)

RATIO = 0.2
test_size = int(RATIO * num_train)

ds_train = ds_nfl.skip(test_size)
ds_test = ds_nfl.take(test_size)

In [196]:
num_train = len(ds_train)

RATIO = 0.2
validation_size = int(RATIO * num_train)

ds_fit = ds_nfl.skip(validation_size)
ds_val = ds_nfl.take(validation_size)

In [197]:
def normalize(snapshot, label):
    # Calcular la media y la desviación estándar por columnas
    mean = tf.reduce_mean(snapshot, axis=0)
    std_dev = tf.math.reduce_std(snapshot, axis=0)

    # Normalizar por columnas
    normalized_tensor = (snapshot - mean) / std_dev

    return normalized_tensor, label

In [198]:
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 1024
ds_fit = (
    ds_fit
    .cache()
    .shuffle(len(ds_fit))
    #.map(normalize, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

ds_val = (
    ds_val
    .cache()
    #.map(normalize, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [199]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Reduce learning rate
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=4, min_lr=0.0001)

epoch_wait = 5
# Crear el EarlyStopping callback con la función on_train_end
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=epoch_wait)

# Definimos los callbacks
callbacks = (early_stopping, )

In [200]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.regularizers import l2

# Establecer la semilla global
seed_value = 42
tf.random.set_seed(seed_value)

# Arquitectura de la red convolucional
model = models.Sequential()
model.add(layers.Input(shape=(23, 13)))

# Capa de convolución 1D con padding 'same' para mantener la longitud original de la secuencia
model.add(layers.Conv1D(64, kernel_size=3, activation='relu', padding='same', kernel_regularizer=l2(0.01)))
model.add(layers.Dropout(0.5))

# Capa de aplanado para conectar con una capa densa
model.add(layers.Flatten())

# Capa densa con Batch Normalization
model.add(layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))

# Capa de salida
model.add(layers.Dense(23, activation='softmax', kernel_regularizer=l2(0.01)))

# Compilar el modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Resumen de la arquitectura del modelo
model.summary()


Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_10 (Conv1D)          (None, 23, 64)            2560      
                                                                 
 dropout_20 (Dropout)        (None, 23, 64)            0         
                                                                 
 flatten_10 (Flatten)        (None, 1472)              0         
                                                                 
 dense_20 (Dense)            (None, 128)               188544    
                                                                 
 batch_normalization_10 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 dropout_21 (Dropout)        (None, 128)               0         
                                                     

In [201]:
history = model.fit(ds_fit, epochs=100, validation_data=ds_val, callbacks=callbacks)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 58: early stopping


In [202]:
BATCH_SIZE = 128
ds_eval = (
    ds_test.cache()
    #.map(normalize, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
test_loss, test_acc = model.evaluate(ds_eval)

print('Test accuracy:', test_acc)

 1/20 [>.............................] - ETA: 0s - loss: 2.5230 - accuracy: 0.6016

Test accuracy: 0.47258904576301575


In [203]:
jugadas_seleccionadas = df.drop_duplicates(subset=('gameId', 'playId'))
jugadas_seleccionadas

Unnamed: 0,gameId,playId,nflId,frameId,club,playDirection,x,y,s,a,...,down,yardsToGo,yardlineNumber,quarter,position,allowedReceiver,homeTeamAbbr,local,ballCarrierId,ballCarrier
0,2022090800,80,35472.0,1,BUF,0,82.42,26.13,2.43,0.37,...,2,4,31,1,G,0,LA,-1.0,46076,0
138,2022090800,1757,40107.0,1,BUF,0,23.65,32.02,6.10,3.29,...,3,1,31,2,FS,0,LA,-1.0,34452,0
276,2022090800,2093,35472.0,1,BUF,0,55.90,21.71,0.00,0.00,...,2,9,45,3,G,0,LA,-1.0,46076,0
736,2022090800,2137,35472.0,1,BUF,0,57.26,25.31,2.52,1.26,...,3,7,43,3,G,0,LA,-1.0,46076,0
874,2022090800,2464,35472.0,1,BUF,0,108.57,29.18,0.76,4.16,...,1,15,6,3,G,0,LA,-1.0,46076,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285315,2022110700,2440,42361.0,1,BAL,0,46.22,36.31,0.92,0.53,...,1,10,45,3,CB,0,NO,-1.0,45244,0
285844,2022110700,2582,33131.0,1,BAL,0,30.50,27.71,1.08,1.45,...,1,10,16,3,DE,0,NO,-1.0,37110,0
285982,2022110700,2965,38557.0,1,BAL,0,46.92,24.96,0.00,0.00,...,3,1,36,4,G,0,NO,-1.0,46101,0
286442,2022110700,3560,38557.0,1,BAL,0,88.54,25.00,0.00,0.00,...,1,10,22,4,G,0,NO,-1.0,46101,0
