In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

In [2]:
def pass_or_rush(play):
    if not pd.isna(play['qbSpike']) and play['qbSpike']:
        return 'none'
    elif not pd.isna(play['qbKneel']) and play['qbKneel']:
        return 'none'
    elif not pd.isna(play['qbSneak']) and play['qbSneak']:
        return 'none'
    elif play['passResult'] == 'R':
        return 'none'
    elif not pd.isna(play['rushLocationType']):
        return 'rush'
    elif not pd.isna(play['passLocationType']):
        return 'pass'
    elif not pd.isna(play['passResult']):
        print("PASS WITHOUT INFO")
        return 'pass'
    else:
        print("can't determine play type")
        return 'none'

In [3]:
cur_path = os.path.os.getcwd()
print(cur_path)
data_path = os.path.abspath(os.path.join(cur_path, '../nfl_data/2025/'))
print(data_path)
distances_path = os.path.abspath(os.path.join(cur_path, '../distances/'))
print(distances_path)

d:\masters-degree\masters-project\notebooks
d:\masters-degree\masters-project\nfl_data\2025
d:\masters-degree\masters-project\distances


In [4]:
games = pd.read_csv(os.path.join(data_path, 'games.csv'))
plays = pd.read_csv(os.path.join(data_path, 'plays.csv'))

games.sort_values(['week'], ascending=[True], inplace=True)

games_info = games[['gameId', 'week']]
plays = pd.merge(plays, games_info, on='gameId')
plays.sort_values(['week'], ascending=[True], inplace=True)

files = ['tracking_week_1.csv', 'tracking_week_2.csv', 'tracking_week_3.csv', 'tracking_week_4.csv', 'tracking_week_5.csv', 'tracking_week_6.csv', 'tracking_week_7.csv']

In [5]:
def keep_n_smallest(row):
    row_copy = row.copy()
    row_copy[row_copy == 0] = np.inf
    smallest_indices = row_copy.nsmallest(2).index
    row[~row.index.isin(smallest_indices)] = 0
    return row

In [6]:
def keep_values_below_x_or_n_smallest(row, X, N):
    row_copy = row.copy()
    # Manter valores menores que X
    row_copy[row_copy >= X] = np.inf
    # Encontrar os Ã­ndices dos N menores valores
    smallest_indices = row_copy.nsmallest(N).index
    # Definir todos os outros valores como zero
    row[~row.index.isin(smallest_indices) & (row >= X)] = 0
    return row

In [7]:
dir_name = '5y_3_closest/'

In [8]:
## DONT RUN THIS ANYMORE, 94 MINUTES TO RUN
## THIS IS THE CODE TO GENERATE THE DISTANCES BETWEEN PLAYERS, ALL THE DISTANCES ARE STORED IN THE FOLDER 'distances/all_edges'
## FOR WEEK 4 THERE IS A PASS WITHOUT INFO

# dir_name = '5_closest/'


X = 5
N = 3

for i, file in enumerate(files):
    data = pd.read_csv(data_path + '/' + file)
    print('Reading file {file}...'.format(file=file))
    for index, row in plays[plays['week'] == i+1].iterrows():
        play_data = data[(data['playId'] == row['playId']) & (data['gameId'] == row['gameId']) & (data['frameType'] == "SNAP") & (data['displayName'] != "football")]
        
        play_type = pass_or_rush(row)
        if play_type == 'none':
            continue
        
        file_name = dir_name + str(row['gameId']) + '_' + str(row['playId']) + '.csv'
        
        coords = play_data[['x', 'y']].values
        dist_matrix = cdist(coords, coords, metric='euclidean')
        
        dist_df = pd.DataFrame(dist_matrix, index=play_data['nflId'], columns=play_data['nflId'])
        dist_df = dist_df.apply(lambda row: keep_values_below_x_or_n_smallest(row, X, N), axis=1)
        # dist_df = dist_df.apply(lambda row: keep_n_smallest(row), axis=1)
        dist_df.to_csv(os.path.join(distances_path, file_name))

Reading file tracking_week_1.csv...
Reading file tracking_week_2.csv...
Reading file tracking_week_3.csv...
Reading file tracking_week_4.csv...
PASS WITHOUT INFO
Reading file tracking_week_5.csv...
Reading file tracking_week_6.csv...
Reading file tracking_week_7.csv...


In [8]:
df = pd.read_csv(os.path.join(distances_path, f'{dir_name}/2022090800_56.csv'), index_col=0)

# Converter o DataFrame em uma matriz NumPy
adjacency_matrix = df.values

print(adjacency_matrix)

[[ 0.          0.          4.69502929  1.61595173  0.          0.
   0.          0.          2.80807763  1.80005555  0.          4.27200187
   0.          4.02880876  0.          2.61007663  0.          3.08079535
   0.          0.          4.73016913  0.        ]
 [ 0.          0.          0.          0.          0.          0.
   0.          0.          4.39219763  0.          0.          0.
   0.          0.          0.          4.08129881  0.          0.
   0.          0.          0.          3.7319298 ]
 [ 4.69502929  0.          0.          3.0904045   0.          0.
   0.          3.6806657   0.          0.          0.          0.
   0.          0.          2.82207371  2.73528792  0.          2.33238076
   0.          0.          2.02074244  3.96364731]
 [ 1.61595173  0.          3.0904045   0.          0.          0.
   0.          0.          3.69399513  3.33636029  0.          4.52009956
   0.          0.          4.74203543  1.58012658  0.          1.65849329
   0.          

In [9]:
dis_files = os.listdir(os.path.join(distances_path, dir_name))
print(len(dis_files))

pass_files = []
rush_files = []

for file in dis_files:
    file_name = file.split('.')[0]
    game_id = int(file_name.split('_')[0])
    play_id = int(file_name.split('_')[1])
    
    play = plays[(plays['gameId'] == game_id) & (plays['playId'] == play_id)].iloc[0]
    play_type = pass_or_rush(play)
    
    if play_type == 'pass':
        pass_files.append(file)
    elif play_type == 'rush':
        rush_files.append(file)
    else:
        raise Exception('Invalid play type')
    
print(len(pass_files))
print(len(rush_files))

12271
PASS WITHOUT INFO
7453
4818


In [10]:
sum_distances_pass = 0
count_pass = 0

for pass_file in pass_files:
    # if count_pass % 1000 == 0:
    #     print(count_pass)
        
    pass_df = pd.read_csv(os.path.join(distances_path, dir_name, pass_file), index_col=0)
    sum_non_zero = pass_df[pass_df > 0].sum().sum()
    sum_distances_pass += sum_non_zero
    count_pass += 1
    
    # break

print('---- PASS ----')
print(f'Sum of distances: {sum_distances_pass}')
print(f'Number of plays: {count_pass}')
print(f'Mean distance: {sum_distances_pass / count_pass}')

sum_distances_rush = 0
count_rush = 0

for rush_file in rush_files:
    # if count_rush % 1000 == 0:
    #     print(count_rush)
        
    pass_df = pd.read_csv(os.path.join(distances_path, dir_name, rush_file), index_col=0)
    sum_non_zero = pass_df[pass_df > 0].sum().sum()
    sum_distances_rush += sum_non_zero
    count_rush += 1
    
    # break

print()
print('---- RUSH ----')
print(f'Sum of distances: {sum_distances_rush}')
print(f'Number of plays: {count_rush}')
print(f'Mean distance: {sum_distances_rush / count_rush}')

print()
print('---- TOTAL ----')
print(f'Sum of distances: {sum_distances_pass + sum_distances_rush}')
print(f'Number of plays: {count_pass + count_rush}')
print(f'Mean distance: {(sum_distances_pass + sum_distances_rush) / (count_pass + count_rush)}')

---- PASS ----
Sum of distances: 3695600.410739805
Number of plays: 7453
Mean distance: 495.85407362670134

---- RUSH ----
Sum of distances: 2459175.2131774407
Number of plays: 4818
Mean distance: 510.41411647518487

---- TOTAL ----
Sum of distances: 6154775.623917246
Number of plays: 12271
Mean distance: 501.5708274726792


In [11]:
## DONT RUN THIS ANYMORE, unless it's necessary, 536 MINUTES TO RUN
## THIS IS THE CODE TO CALCULATE THE NORMS BETWEEN THE DIFFERENCES OF THE DISTANCES BETWEEN PLAYERS

norms = []
i = 0
for pass_file in pass_files:
    if i % 100 == 0:
        print(i)
    pf = pd.read_csv(os.path.join(distances_path, dir_name, pass_file), index_col=0).values
    
    for rush_file in rush_files:
        rf = pd.read_csv(os.path.join(distances_path, dir_name, rush_file), index_col=0).values
        
        dif_matrix = pf - rf
        norm = np.linalg.norm(dif_matrix)
        norms.append(norm)
    
    i += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400


In [12]:
mean_norm = np.mean(norms)
std_norm = np.std(norms)

print('Mean norm:', mean_norm) # 190.60586513023134
print('Std norm:', std_norm) # 32.49119488356777

Mean norm: 73.56670296801938
Std norm: 10.799991107191932


In [14]:
data = [
    {'edge_type': 'all_edges', 'mean_norm': 190.60586513023134, 'std_norm': 32.49119488356777, 'mean_pass_distance': 4969.517567081221, 'mean_rush_distance': 4354.41503844643, 'mean_total_distance': 4728.007993047937},
    {'edge_type': '2_closest', 'mean_norm': 40.340777577782376, 'std_norm': 4.85426954399721, 'mean_pass_distance': 167.22764933510695, 'mean_rush_distance': 150.3409284739069, 'mean_total_distance': 160.59736483431143},
    {'edge_type': '3_closest', 'mean_norm': 53.65738308891193, 'std_norm': 6.49565441843677, 'mean_pass_distance': 278.8973435100625, 'mean_rush_distance': 249.91796946563363, 'mean_total_distance': 267.5190838616183},
    {'edge_type': '5_closest', 'mean_norm': 77.88521520323044, 'std_norm': 9.612970829043928, 'mean_pass_distance': 548.4222362709286, 'mean_rush_distance': 488.8334711608759, 'mean_total_distance': 525.0257184402518},
    {'edge_type': 'mean_3_closest', 'mean_norm': 105.40489397180272, 'std_norm': 5.726103869716602, 'mean_pass_distance': 1479.6876410052944, 'mean_rush_distance': 1646.76600909736, 'mean_total_distance': 1545.2881281267653},
    {'edge_type': 'mean_5_closest', 'mean_norm': 114.06714587401328, 'std_norm': 9.662376595769185, 'mean_pass_distance': 1585.8577395976033, 'mean_rush_distance': 1719.8054573954084, 'mean_total_distance': 1638.4500388682272},
    {'edge_type': '5y_3_closest', 'mean_norm': 73.56670296801938, 'std_norm': 10.799991107191932, 'mean_pass_distance': 495.85407362670134, 'mean_rush_distance': 510.41411647518487, 'mean_total_distance': 501.5708274726792}
]

df_distances = pd.DataFrame(data)
# df_distances.set_index("edge_type", inplace=True)
df_distances['percentage'] = (df_distances['mean_norm'] / df_distances['mean_total_distance']) * 100

df_distances.to_csv(os.path.join(distances_path, 'distances.csv'))

df_distances

Unnamed: 0,edge_type,mean_norm,std_norm,mean_pass_distance,mean_rush_distance,mean_total_distance,percentage
0,all_edges,190.605865,32.491195,4969.517567,4354.415038,4728.007993,4.03142
1,2_closest,40.340778,4.85427,167.227649,150.340928,160.597365,25.119203
2,3_closest,53.657383,6.495654,278.897344,249.917969,267.519084,20.057404
3,5_closest,77.885215,9.612971,548.422236,488.833471,525.025718,14.834552
4,mean_3_closest,105.404894,5.726104,1479.687641,1646.766009,1545.288128,6.821051
5,mean_5_closest,114.067146,9.662377,1585.85774,1719.805457,1638.450039,6.961893
6,5y_3_closest,73.566703,10.799991,495.854074,510.414116,501.570827,14.667261


In [None]:
# edge type      |        Mean         |      Std norm      
# ---------------|---------------------|--------------------
# all_edges      | 190.60586513023134  | 32.49119488356777
# 3_closest      | 53.65738308891193   | 6.49565441843677
# 5_closest      | 77.88521520323044   | 9.612970829043928
# mean_3_closest | 105.40489397180272  | 5.726103869716602
# mean_5_closest | 114.06714587401328  | 9.662376595769185

In [None]:
GCN - graph convolutional network
SOFT MAX 
2 niveis
DGL / Pytorch Geometric