In [1]:
import pandas as pd
from scipy.sparse import lil_matrix
import scipy.spatial.distance

In [2]:
# Load the team statistics table. Each row holds one stat for one team
# (teamName / displayName / value / rank, as the preview in the next cell shows).
stats = pd.read_csv('Full_NFL_Stats_2022.csv')

In [3]:
# Preview the first rows to check the column layout before filtering.
stats.head()

Unnamed: 0.1,Unnamed: 0,teamName,displayName,description,abbreviation,value,perGameValue,rank
0,0,Falcons,Fumbles,The number of times a player/team has fumbled ...,FUM,21.0,1.0,13.0
1,1,Falcons,Fumbles Lost,The number of times a fumble is recovered by t...,LST,12.0,,4.0
2,2,Falcons,Forced Fumbles,The total number of forced fumbles.,FF,8.0,,62.0
3,3,Falcons,Fumbles Recovered,The number of fumbles recovered.,FR,7.0,,23.0
4,4,Falcons,Fumbles Touchdowns,The number of times a fumbles is recovered and...,FTD,1.0,,8.0


In [4]:
# Drop the columns that are not used in the analysis below.
# Reassigning (instead of inplace=True) and passing errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns
# are already gone. `axis=1` was redundant alongside `columns=`.
stats = stats.drop(columns=['perGameValue', 'Unnamed: 0'], errors='ignore')

In [5]:
# Rows for the 'Rushing Yards Per Game' stat, ordered by ascending rank so the
# highest rank numbers (lowest yardage in the output below) land at the bottom.
ypg = (
    stats
    .loc[stats['displayName'] == 'Rushing Yards Per Game']
    .sort_values(by='rank')
)

In [6]:
# Show the last five rows, i.e. the largest rank numbers after the ascending sort.
ypg.tail()

Unnamed: 0,teamName,displayName,description,abbreviation,value,rank
3627,Rams,Rushing Yards Per Game,The average number of rushing yards per game.,YDS/G,97.706,27.0
887,Bengals,Rushing Yards Per Game,The average number of rushing yards per game.,YDS/G,95.5,29.0
6367,Chargers,Rushing Yards Per Game,The average number of rushing yards per game.,YDS/G,89.647,30.0
8559,Texans,Rushing Yards Per Game,The average number of rushing yards per game.,YDS/G,86.824,31.0
7189,Buccaneers,Rushing Yards Per Game,The average number of rushing yards per game.,YDS/G,76.941,32.0


In [7]:
# Rows for the 'Rushing Fumbles Lost' stat. The sort is descending on rank so
# that the rank-1 rows (the highest fumble counts in the output below) end up
# at the bottom, where .tail() picks them up.
fumbles = (
    stats
    .loc[stats['displayName'] == 'Rushing Fumbles Lost']
    .sort_values(by='rank', ascending=False)
)

In [8]:
# Show the last five rows; with the descending sort these are the smallest rank numbers.
fumbles.tail()

Unnamed: 0,teamName,displayName,description,abbreviation,value,rank
4720,Saints,Rushing Fumbles Lost,The number of times there is a run and then th...,LST,4.0,3.0
2802,Colts,Rushing Fumbles Lost,The number of times there is a run and then th...,LST,4.0,3.0
8008,Jaguars,Rushing Fumbles Lost,The number of times there is a run and then th...,LST,4.0,3.0
336,Bills,Rushing Fumbles Lost,The number of times there is a run and then th...,LST,5.0,1.0
8556,Texans,Rushing Fumbles Lost,The number of times there is a run and then th...,LST,5.0,1.0


In [9]:
# Rows for the 'Rushing 1st downs' stat, ascending by rank, keeping only the
# first (best-ranked) row per team.
# NOTE(review): the output below shows this displayName under two different
# abbreviations (FDR and FD), so the surviving row per team may come from
# either one - confirm that mixing them is intended.
downs = (
    stats
    .loc[stats['displayName'] == 'Rushing 1st downs']
    .sort_values(by='rank')
    .drop_duplicates(subset=['teamName'])
)

In [10]:
# Show the last five rows, i.e. the largest rank numbers after the ascending sort.
downs.tail()

Unnamed: 0,teamName,displayName,description,abbreviation,value,rank
2995,Colts,Rushing 1st downs,The number of times a rush results in a first ...,FDR,91.0,28.0
8554,Texans,Rushing 1st downs,The number of times a first down is picked up ...,FD,90.0,29.0
3817,Rams,Rushing 1st downs,The number of times a rush results in a first ...,FDR,89.0,30.0
5461,Jets,Rushing 1st downs,The number of times a rush results in a first ...,FDR,79.0,31.0
7184,Buccaneers,Rushing 1st downs,The number of times a first down is picked up ...,FD,79.0,31.0


In [11]:
# Rows whose abbreviation is 'RUSH', ascending by rank, one row kept per team
# (the first, i.e. best-ranked, occurrence).
# NOTE(review): presumably rushing touchdowns, judging by the variable name -
# confirm which stat 'RUSH' denotes in the source data.
td = (
    stats
    .loc[stats['abbreviation'] == 'RUSH']
    .sort_values(by='rank')
    .drop_duplicates(subset=['teamName'])
)

In [12]:
# Rows for the 'ESPN RB Rating' stat, ordered by ascending rank (the
# sort_values default), so rank 1 comes first.
espnrbrate = (
    stats
    .loc[stats['displayName'] == 'ESPN RB Rating']
    .sort_values(by='rank')
)

In [13]:
# Display the rating table, ordered by ascending rank.
espnrbrate

Unnamed: 0,teamName,displayName,description,abbreviation,value,rank
601,Bears,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3494.0,1.0
5533,Eagles,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3479.0,2.0
53,Falcons,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3133.0,3.0
4985,Giants,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3104.0,4.0
8273,Ravens,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3090.0,5.0
1423,Cowboys,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3063.0,6.0
1149,Browns,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,3020.0,7.0
6629,49ers,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,2940.0,8.0
1971,Lions,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,2864.0,9.0
7725,Panthers,ESPN RB Rating,The ESPN Widereceiver Rating.,ESPNRB,2650.0,10.0


In [14]:
# Find the teams that appear in the bottom BOTTOM_N rows of *every* rushing
# metric frame. Each frame was sorted so that its weakest teams come last
# (note that `fumbles` is sorted descending on rank for that reason).
BOTTOM_N = 16
rb_frames = [ypg, fumbles, downs, td, espnrbrate]

bottom_rb = []

for df in rb_frames:

    # Count a team at most once per metric; not every frame was de-duplicated
    # by team, and a repeated row would otherwise distort the tally.
    bottom_names = df['teamName'].tail(BOTTOM_N).drop_duplicates().tolist()

    bottom_rb += bottom_names

# Single-pass tally. Dict insertion order makes the result order deterministic
# (first appearance in the frames) instead of depending on set iteration order.
name_counts = {}
for name in bottom_rb:
    name_counts[name] = name_counts.get(name, 0) + 1

# A team qualifies only if it showed up in all frames; the threshold is
# derived from the frame list rather than hard-coded.
final_names = [name for name, count in name_counts.items() if count == len(rb_frames)]

print(final_names)

['Colts', 'Broncos', 'Saints', 'Texans', 'Chargers']


In [15]:
# Load the player table used for the similarity search; it must contain a
# 'PlayerName' column (used as the index in the next cell).
# NOTE(review): presumably per-running-back statistics - confirm the schema.
rbsim = pd.read_csv('RBData.csv')

In [16]:
# Index the table by player name so a single player's row can be fetched with
# .loc; the remaining columns are what gets passed to cdist below.
# NOTE(review): assumes every remaining column is numeric - confirm.
rbsimdex_df = rbsim.set_index('PlayerName')

In [17]:
# Compute the cosine distance from every player row to the target player's row.
t_player = 'Josh Jacobs'
t_player_stat = rbsimdex_df.loc[t_player]
# With a single query row cdist returns an (n_rows, 1) matrix; take column 0.
dist = scipy.spatial.distance.cdist(
    rbsimdex_df,
    [t_player_stat],
    metric="cosine",
)[:, 0]
# Pair each row label of `rbsim` with its distance (both are in the same row order).
query_distances = [(row_label, distance) for row_label, distance in zip(rbsim.index, dist)]

In [18]:
# Print the 11 smallest distances: the target player itself (distance 0)
# followed by its ten nearest neighbours.
for sim_play, sim_stats in sorted(query_distances, key=lambda x: x[1])[:11]:
    # sim_play is a label taken from rbsim.index, so look the name up by label
    # (.loc). Positional .iloc only matched while the index was the default
    # 0..n-1 range and would print the wrong player after any filter or reindex.
    print(rbsim.loc[sim_play, 'PlayerName'], sim_stats)

Josh Jacobs 0.0
Saquon Barkley 0.00012242843860421626
Derrick Henry 0.00012653435125820067
Jeff Wilson Jr. 0.00034540304662544763
Dalvin Cook 0.00036857321035899115
AJ Dillon 0.0005109675936229019
Isiah Pacheco 0.0006438239747932339
Najee Harris 0.000682681382390582
Nick Chubb 0.0008231992419467371
Latavius Murray 0.0009717954650405591
Tony Pollard 0.0010029583156941912
