In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import nfl_data_py as nfl

In [2]:
def player_id_to_int(player_id):
#     print(player_id)
    return int(player_id[3:])

def save_player_id_map(df, path='csv/player_id_map.csv'):
    df.to_csv(path)
    
def prep_players(df):
    
    #  Extract lists of all unique passers and receivers (as well as their IDs)
    receiver_dict = dict(list(df.groupby(['receiver_player_id','receiver_player_name']).groups.keys()))
    receiver_ids, receiver_names = list(receiver_dict.keys()), list(receiver_dict.values())

    passer_dict = dict(list(df.groupby(['passer_player_id','passer_player_name']).groups.keys()))
    passer_ids, passer_names = list(passer_dict.keys()), list(passer_dict.values())

    #  Lists of receiver and passers can be combined into a list of all players with their IDs
    receivers = {
        'player_id': receiver_ids,
        'player_name': receiver_names
    }

    passers = {
        'player_id': passer_ids,
        'player_name': passer_names
    }

    passer_df = pd.DataFrame.from_dict(passers)
    receiver_df = pd.DataFrame.from_dict(receivers)

    #  This dataframe allows you to get a players name, give their ID, we can save this separately
    player_id_index = pd.concat([passer_df,receiver_df])  # save this to its own CSV file
    save_player_id_map(player_id_index)
    
    return passer_df, receiver_df, player_id_index

def prep_X(df, receiver_df, passer_df):
    
    #  The number of receivers and passers make up the dimensions of X
    rec_ct = receiver_df.shape[0]
    pass_ct = passer_df.shape[0]

    X = np.zeros((rec_ct,pass_ct))

    #  Iterate over the rows of our main dataframe to tally the passing yards for complete pass plays
    #  "For every completed play, get receiver index and passer index and -> X[receiver_idx][passer_idx] += gain"
    for row in df.itertuples(index=False):
        if row.play_type_nfl=='PASS' and not row.interception and not row.qb_spike and not row.incomplete_pass:
            gain = row.receiving_yards

            rec_id = row.receiver_player_id
            receiver_idx = receiver_df.index[receiver_df['player_id'] == rec_id].tolist()[0]

            pass_id = row.passer_player_id
            passer_idx = passer_df.index[passer_df['player_id'] == pass_id].tolist()[0]

    #         print(f'{passer_df.iloc[passer_idx].player_name} to  {receiver_df.iloc[receiver_idx].player_name} for {gain} yards')

            X[receiver_idx][passer_idx] += gain
    return X

def normalize_rows(W_T): # accepts W.T as a parameter
    W_T_norm = np.zeros_like(W_T)
    
    for row in range(W_T.shape[0]):
        row_sum = np.sum(W_T[row])
        W_T_norm[row] = W_T[row] / row_sum
        
    return W_T_norm

def normalize_columns(H_T): # accepts H.T as a parameter
    H_T_norm = np.zeros_like(H_T)
    
    for col in range(H_T.shape[1]):
        col_sum = np.sum(H_T[:,col])
        H_T_norm[:,col] = H_T[:,col] / col_sum
        
    return H_T_norm

def fetch_receiver(index, player_id_index):
    return player_id_index.iloc[index]['player_name']

In [3]:
relevant_columns = [
    'game_id',
    'pass', # 1
    'play_type_nfl', # PASS
    'incomplete_pass', # 0
    'success', # 1
    'receiving_yards',
    'passer_player_name',
    'receiver_player_name',
    'passer_player_id',
    'receiver_player_id',
    'interception',
    'qb_spike',
    'passer',
    'rusher',
    'receiver',
    'air_yards',
    'desc'
]

In [4]:
time_period = list(range(2016,2021+1))

nfl_season_pbp = nfl.import_pbp_data(time_period)[relevant_columns]
nfl_season_pbp = nfl_season_pbp.loc[nfl_season_pbp['play_type_nfl'] == 'PASS']
nfl_season_pbp = nfl_season_pbp.loc[pd.notna(nfl_season_pbp['receiver_player_id'])]

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
Downcasting floats.


In [5]:
passer_df, receiver_df, player_id_index = prep_players(nfl_season_pbp)
X = prep_X(nfl_season_pbp, receiver_df, passer_df)

# NMF must be non-negative! We will treat net-negative yardage as zero-gain
X[X<0] = 0

X.shape

(1137, 248)

In [6]:
groupings = 32

model = NMF(n_components=groupings, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [7]:
W_norm = normalize_columns(W)


most_frequent_word_weights = []
most_frequent_words = []
topics_summary = []

for col in range(W.shape[1]):
    top_10 = {}
    
    top_10_weights = []
    top_10_words = []
    
    sorted_W_column_weights = np.flip(np.sort(W_norm[:,col]))
    sorted_W_column_indices = np.flip(np.argsort(W_norm[:,col]))
    
    
    for i in range(10):
        word = {}
        
        index = sorted_W_column_indices[i]
        weight = sorted_W_column_weights[i]
        
        word['index'] = index
        word['receiver'] = fetch_receiver(index, player_id_index)
        word['weight'] = weight
        
        top_10[i] = word

    topics_summary.append(top_10)

In [8]:
topics_summary[0]

{0: {'index': 381, 'receiver': 'B.Powell', 'weight': 0.13805656567599225},
 1: {'index': 866, 'receiver': 'C.Grayson', 'weight': 0.07967529729483014},
 2: {'index': 24, 'receiver': 'M.Moore', 'weight': 0.04784851873725433},
 3: {'index': 313, 'receiver': 'M.Wallace', 'weight': 0.046915738715455686},
 4: {'index': 47, 'receiver': 'C.Kaepernick', 'weight': 0.04465554727924409},
 5: {'index': 937, 'receiver': 'S.Culkin', 'weight': 0.0396007458190678},
 6: {'index': 302, 'receiver': 'T.Hightower', 'weight': 0.03661900415907419},
 7: {'index': 659, 'receiver': 'M.LaCosse', 'weight': 0.03273296973779006},
 8: {'index': 288, 'receiver': 'M.Ryan', 'weight': 0.030318040846211213},
 9: {'index': 559, 'receiver': 'J.Janis', 'weight': 0.02691532274702591}}

In [9]:
# def format_vertical_headers(df):
#     """Display a dataframe with vertical column headers"""
#     styles = [dict(selector="th", props=[('width', '4px')]),
#               dict(selector="th.col_heading",
#                    props=[("writing-mode", "vertical-rl"),
#                           ('transform', 'rotateZ(180deg)'), 
#                           ('height', '300px'),
#                           ('vertical-align', 'top')])]
#     return (df.fillna('').style.set_table_styles(styles))

# format_vertical_headers(pd.DataFrame(data))

In [10]:
# creating the dataframe
df_X = pd.DataFrame(data = X, 
                  index = list(receiver_df.player_name), 
                  columns = list(passer_df.player_name))
  
# displaying the dataframe
df_X

Unnamed: 0,T.Brady,D.Brees,S.Hill,J.McCown,C.Palmer,T.Romo,M.Schaub,E.Manning,A.Lee,L.Fitzgerald,...,B.DiNucci,J.Hurts,J.Burrow,D.Mills,K.Toney,J.Fields,T.Lawrence,M.Jones,T.Lance,Z.Wilson
T.Brady,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
S.Smith Sr.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
D.Brees,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.Gates,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.Johnson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Am.Rodgers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
J.Williams,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.Schwartz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
H.Long,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
sum_of_rows = H.sum(axis=0)
H_norm = H / sum_of_rows[np.newaxis]

df_H = pd.DataFrame(data = H_norm, 
                  index = [f'rec_group_{n}' for n in range(H_norm.shape[0])], 
                  columns = list(passer_df.player_name))
  
# displaying the dataframe
# df_H

  H_norm = H / sum_of_rows[np.newaxis]


In [12]:
sum_of_rows = W.sum(axis=1)
W_norm = W / sum_of_rows[:, np.newaxis]

df_W = pd.DataFrame(data = W_norm, 
                  index = list(receiver_df.player_name), 
                  columns = [f'rec_group_{n}' for n in range(W_norm.shape[1])])
  
# displaying the dataframe
# df_W

  W_norm = W / sum_of_rows[:, np.newaxis]


In [13]:
for group_id in range(groupings):
    print(f'GROUP {group_id} RECEIVERS')
    # display(df_W.nlargest(25,f'rec_group_{group_id}'))
    print(df_W.nlargest(10,f'rec_group_{group_id}').iloc[:10].T.columns)

    print('\n\n')

    print(f'QUARTERBACKS THAT PERFORM BEST WITH GROUP {group_id}')
    # display(df_H.T.nlargest(25,f'rec_group_{group_id}')
    print(df_H.T.nlargest(5,f'rec_group_{group_id}').iloc[:,:10].T.columns)

    print('\n')

GROUP 0 RECEIVERS
Index(['M.Perry', 'L.Bowden', 'P.Laird', 'B.Wilds', 'J.Marshall', 'S.Wilson',
       'C.Wilkins', 'C.Cox', 'P.Williams', 'I.Ford'],
      dtype='object')



QUARTERBACKS THAT PERFORM BEST WITH GROUP 0
Index(['A.Wilson', 'T.Tagovailoa', 'L.Bowden', 'R.Fitzpatrick', 'J.Cutler'], dtype='object')


GROUP 1 RECEIVERS
Index(['D.Robinson', 'G.Dieter', 'M.Kemp', 'D.Thompson', 'D.Gore', 'N.Gray',
       'B.Pringle', 'D.Yelder', 'Darr.Williams', 'E.Fisher'],
      dtype='object')



QUARTERBACKS THAT PERFORM BEST WITH GROUP 1
Index(['T.Kelce', 'T.Townsend', 'C.Henne', 'P.Mahomes', 'T.Way'], dtype='object')


GROUP 2 RECEIVERS
Index(['J.Thomas', 'K.Warring', 'B.Howell', 'C.Gillaspia', 'A.Blue',
       'S.Mitchell', 'K.Coutee', 'J.Akins', 'C.Thompson', 'J.Prosch'],
      dtype='object')



QUARTERBACKS THAT PERFORM BEST WITH GROUP 2
Index(['D.Watson', 'T.Savage', 'T.Yates', 'B.Osweiler', 'C.Streveler'], dtype='object')


GROUP 3 RECEIVERS
Index(['K.Johnson', 'A.Isabella', 'R.Moor