In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw receiving stats were taken from
# https://www.pro-football-reference.com/years/2018/receiving.htm
# The first column of the file is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='data/nfl_receivers_2018.txt',
    index_col=0,
)
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 494 entries, 1 to 494
Data columns (total 17 columns):
Player    494 non-null object
Tm        494 non-null object
Age       494 non-null int64
Pos       373 non-null object
G         494 non-null int64
GS        494 non-null int64
Tgt       494 non-null int64
Rec       494 non-null int64
Ctch%     494 non-null object
Yds       494 non-null int64
Y/R       494 non-null float64
TD        494 non-null int64
Lng       494 non-null int64
Y/Tgt     494 non-null float64
R/G       494 non-null float64
Y/G       494 non-null float64
Fmb       494 non-null int64
dtypes: float64(4), int64(9), object(4)
memory usage: 69.5+ KB


In [3]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0_level_0,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Michael Thomas*+\ThomMi05,NOR,25,WR,16,16,147,125,85.0%,1405,11.2,9,72,9.6,7.8,87.8,2
2,Zach Ertz*\ErtzZa00,PHI,28,TE,16,16,156,116,74.4%,1163,10.0,8,34,7.5,7.3,72.7,1
3,DeAndre Hopkins*+\HopkDe00,HOU,26,WR,16,16,163,115,70.6%,1572,13.7,11,49,9.6,7.2,98.3,2
4,Julio Jones*\JoneJu02,ATL,29,WR,16,16,170,113,66.5%,1677,14.8,8,58,9.9,7.1,104.8,2
5,Adam Thielen*\ThieAd00,MIN,28,WR,16,16,153,113,73.9%,1373,12.2,9,68,9.0,7.1,85.8,1


In [4]:
# new features from player names
# The raw 'Player' strings carry marker characters: a '*' sets ProBowl and a
# '+' sets FirstTeamAllPro. Both flags are stored as 0/1 integers.
# regex=False makes each check a literal substring test ('*' and '+' are regex
# metacharacters), and the vectorised .str accessor avoids a Python-level
# lambda call per row.
df['ProBowl'] = df['Player'].str.contains('*', regex=False).astype(int)
df['FirstTeamAllPro'] = df['Player'].str.contains('+', regex=False).astype(int)

In [5]:
# clean up player names
# Raw values look like 'Michael Thomas*+\ThomMi05': the display name, optional
# '*' / '+' markers, a backslash, then the player id.
# regex=False is passed explicitly because '*' and '+' are regex
# metacharacters and the default of `regex` differs between pandas versions;
# a literal replacement is what is wanted here.
name_id_split = (df['Player'].str.replace('*', '', regex=False)
                             .str.replace('+', '', regex=False)
                             .str.split('\\', expand=True))

# Column 0 is the text before the backslash (the name), column 1 the text after it (the id).
df['Player'] = name_id_split[0]
# Place the id right after 'Player', i.e. as the second column.
df.insert(1, 'Id', name_id_split[1])

In [6]:
# Turn the catch-rate strings (e.g. '85.0%') into decimal fractions (0.85):
# strip the percent sign, parse as float, then scale down by 100.
df['Ctch%'] = (df['Ctch%'].str.replace('%', '')
                          .astype(float)
                          .div(100))
# The column no longer holds a percentage, so give it a matching name.
df.rename(columns={'Ctch%': 'Ctch_Rate'}, inplace=True)


In [7]:
# one-hot encode positions
#
# 'Pos' may list several positions separated by '/', so get_dummies(sep='/')
# yields one 0/1 indicator column per distinct position.
#
# Missing positions are real NaN values, not the text 'nan', and the .str
# methods pass NaN through unchanged. They are therefore filled with
# 'unknown' *before* any string handling so that those rows receive their own
# UNKNOWN_pos indicator instead of an all-zero encoding.
#
# Left/right variants (rcb/lcb, rdt/ldt) are collapsed into cb / dt; values
# are lower-cased for matching and upper-cased again for the column names.
positions_df = (df['Pos'].fillna('unknown')
                         .str.lower()
                         .str.replace('rcb', 'cb')
                         .str.replace('lcb', 'cb')
                         .str.replace('rdt', 'dt')
                         .str.replace('ldt', 'dt')
                         .str.upper()
                         .str.get_dummies(sep='/')
                         .add_suffix('_pos'))

# Swap the original text column for the indicator columns (joined on the index).
df = df.drop('Pos', axis=1).join(positions_df)

In [8]:
# Show the first five rows transposed, so each column of the wide frame is
# displayed as a row.
df.head().transpose()

Rk,1,2,3,4,5
Player,Michael Thomas,Zach Ertz,DeAndre Hopkins,Julio Jones,Adam Thielen
Id,ThomMi05,ErtzZa00,HopkDe00,JoneJu02,ThieAd00
Tm,NOR,PHI,HOU,ATL,MIN
Age,25,28,26,29,28
G,16,16,16,16,16
GS,16,16,16,16,16
Tgt,147,156,163,170,153
Rec,125,116,115,113,113
Ctch_Rate,0.85,0.744,0.706,0.665,0.739
Yds,1405,1163,1572,1677,1373


In [9]:
# Write the processed table to disk as a gzip-compressed CSV.
df.to_csv(path_or_buf='data/nfl.csv.gz', compression='gzip')