In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

In [2]:
# Collect the profile links of every drafted player that the draft pages mark
# as active, for the 2000 through 2015 draft classes.
all_active_players = []
for year in range(2000, 2016):
    # Build the draft-page URL for this year.
    url = "http://www.pro-football-reference.com/years/{}/draft.htm".format(year)
    # Parse inside a context manager so the HTTP response is closed as soon as
    # the page has been read, instead of leaking one open connection per year.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "lxml")
    # Active drafted players are bolded in the draft table on pfr, so select
    # the anchors nested inside <strong> within the #drafts element.
    player_html = soup.select("#drafts strong a")
    # Keep only the href of each anchor.
    player_links = [player["href"] for player in player_html]
    # Append this year's links to the running list.
    all_active_players.extend(player_links)

In [3]:
# Extract the player ID from every scraped link. The capture group is the text
# between a "/" and a following "." -- assumes hrefs look like
# /dir/dir/<id>.<ext>; TODO confirm against the scraped links.
id_pattern = re.compile(r"/.*/.*/(.*)\.")
active_player_ids = []
for link in all_active_players:
    # .group(1) raises AttributeError if a link does not match the pattern.
    active_player_ids.append(id_pattern.search(link).group(1))

In [4]:
# Sanity check: look at the first five extracted player IDs.
# NOTE(review): the original note here says Charles Woodson still needs to be
# dropped because he is retired; no visible cell does that -- confirm whether
# it is handled elsewhere.
active_player_ids[:5]


['janikseb01', 'LechSh20', 'BradTo00', 'BreeDr00', 'PeppJu99']

In [6]:
# Load the cleaned draft data set from disk into a DataFrame.
draft_df = pd.read_csv(
    filepath_or_buffer="data/clean_data/pfr_nfl_draft_data_CLEAN.csv"
)

In [7]:
# Show the first five rows of the freshly loaded frame.
draft_df.head(n=5)

Unnamed: 0,Draft_Yr,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,...,Rush_Yds,Rush_TD,Rec,Rec_Yds,Rec_TD,Tkl,Def_Int,Sk,College,Player_ID
0,1967,1,1,BAL,Bubba Smith,DE,22.0,1976.0,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Michigan St.,SmitBu00
1,1967,1,2,MIN,Clint Jones,RB,22.0,1973.0,0,0,...,2178.0,20.0,38.0,431.0,0.0,0.0,0.0,0.0,Michigan St.,JoneCl00
2,1967,1,3,SFO,Steve Spurrier,QB,22.0,1976.0,0,0,...,258.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,Florida,SpurSt00
3,1967,1,4,MIA,Bob Griese,QB,22.0,1980.0,2,8,...,994.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,Purdue,GrieBo00
4,1967,1,5,HOU,George Webster,LB,21.0,1976.0,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,Michigan St.,WebsGe00


In [8]:
# Keep only draft classes before 2016. Take an explicit copy so that the
# column assignments made on draft_df in later cells write to an independent
# frame rather than to a filtered slice (avoids SettingWithCopyWarning).
draft_df = draft_df.loc[draft_df.Draft_Yr < 2016].copy()

In [9]:
# Show the last five rows to confirm where the filtered frame ends.
draft_df.tail(n=5)

Unnamed: 0,Draft_Yr,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,...,Rush_Yds,Rush_TD,Rec,Rec_Yds,Rec_TD,Tkl,Def_Int,Sk,College,Player_ID
15587,2015,7,252,DEN,Josh Furman,DB,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Oklahoma St.,FurmJo00
15588,2015,7,253,NWE,Xzavier Dickson,OLB,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Alabama,DickXz00
15589,2015,7,254,SFO,Rory 'Busta' Anderson,TE,22.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,South Carolina,AndeRo02
15590,2015,7,255,IND,Denzell Goode,T,24.0,2015.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mars Hill,GoodDe01
15591,2015,7,256,ARI,Gerald Christian,TE,24.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Louisville,ChriGe00


In [10]:
# For the last ten rows, check whether each Player_ID appears in the scraped
# list of active player IDs.
draft_df["Player_ID"].isin(active_player_ids).tail(10)

15582     True
15583     True
15584     True
15585     True
15586     True
15587    False
15588    False
15589    False
15590     True
15591     True
Name: Player_ID, dtype: bool

In [11]:
# Boolean mask: True where the row's Player_ID is in the scraped active list.
active = draft_df["Player_ID"].isin(active_player_ids)

# Retired is the complement of that mask, stored as an integer flag
# (1 = not in the active list, 0 = in the active list).
draft_df["Retired"] = (~active).astype(int)

In [12]:
# Print the column names, non-null counts and dtypes, which now include the
# new Retired column.
draft_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15592 entries, 0 to 15591
Data columns (total 31 columns):
Draft_Yr     15592 non-null int64
Rnd          15592 non-null int64
Pick         15592 non-null int64
Tm           15592 non-null object
Player       15592 non-null object
Pos          15592 non-null object
Age          15592 non-null float64
To           15592 non-null float64
AP1          15592 non-null int64
PB           15592 non-null int64
St           15592 non-null int64
CarAV        15592 non-null float64
DrAV         15592 non-null float64
G            15592 non-null float64
Cmp          15592 non-null float64
Att          15592 non-null float64
Yds          15592 non-null float64
TD           15592 non-null float64
Int          15592 non-null float64
Rush_Att     15592 non-null float64
Rush_Yds     15592 non-null float64
Rush_TD      15592 non-null float64
Rec          15592 non-null float64
Rec_Yds      15592 non-null float64
Rec_TD       15592 non-null float64
Tkl   

In [13]:
# Among players flagged as active (Retired == 0), count how often each "To"
# value occurs.
draft_df["To"][draft_df["Retired"].eq(0)].value_counts()

2015.0    1009
0.0         55
2014.0      43
2013.0       4
2012.0       1
Name: To, dtype: int64

In [14]:
# List the active players (Retired == 0) whose "To" value is 2014.
draft_df.loc[
    draft_df["Retired"].eq(0) & draft_df["To"].eq(2014),
    ["Player", "Player_ID"],
]

Unnamed: 0,Player,Player_ID
13591,Jordy Nelson,NelsJo00
13612,Chad Henne,HennCh01
13698,Orlando Scandrick,ScanOr99
13742,John Sullivan,SullJo24
13867,Will Beatty,BeatWi20
14081,Maurkice Pouncey,PounMa20
14146,Corey Peters,PeteCo00
14177,Dennis Pitta,PittDe00
14220,Arthur Jones,JoneAr22
14339,Phil Taylor,TaylPh00


In [15]:
# List the active players (Retired == 0) whose "To" value is 2013, lifting the
# row-display limit for this one output so nothing is truncated.
with pd.option_context("display.max_rows", None):
    display(
        draft_df.loc[
            draft_df["Retired"].eq(0) & draft_df["To"].eq(2013),
            ["Player", "Player_ID"],
        ]
    )

Unnamed: 0,Player,Player_ID
13715,Josh Johnson,JohnJo05
14688,Orson Charles,CharOr00
14824,Daryl Richardson,RichDa00
15067,Michael Bowie,BowiMi00


In [16]:
# List the active players (Retired == 0) whose "To" value is 2011.
# NOTE(review): the value_counts output for active players lists 2012 (one
# player) and no 2011 entries -- confirm whether 2012 was the intended year.
draft_df.loc[
    draft_df["Retired"].eq(0) & draft_df["To"].eq(2011),
    ["Player", "Player_ID"],
]

Unnamed: 0,Player,Player_ID


In [17]:
def calc_duration(player):
    """
    Calculate the number of years played for a single player record.

    `player` is any mapping-like row exposing "To" and "Draft_Yr" (for
    example a DataFrame row passed by `apply(..., axis=1)`).

    Returns 0 when the "To" value is 0, which is treated as the player never
    having played a season. Otherwise returns 'To' - 'Draft_Yr' + 1.
    """
    final_year = player["To"]

    # Any non-zero "To" value yields an inclusive year span.
    if final_year != 0:
        return final_year - player["Draft_Yr"] + 1

    # A "To" value of 0 means no season was played.
    return 0

In [18]:
# Compute each player's career length row by row; calc_duration already takes
# a single row, so it is handed to apply directly.
draft_df["Duration"] = draft_df.apply(calc_duration, axis=1)

In [19]:
# Show the first five rows again to confirm the new Retired and Duration columns.
draft_df.head(n=5)


Unnamed: 0,Draft_Yr,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,...,Rec,Rec_Yds,Rec_TD,Tkl,Def_Int,Sk,College,Player_ID,Retired,Duration
0,1967,1,1,BAL,Bubba Smith,DE,22.0,1976.0,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,Michigan St.,SmitBu00,1,10.0
1,1967,1,2,MIN,Clint Jones,RB,22.0,1973.0,0,0,...,38.0,431.0,0.0,0.0,0.0,0.0,Michigan St.,JoneCl00,1,7.0
2,1967,1,3,SFO,Steve Spurrier,QB,22.0,1976.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,Florida,SpurSt00,1,10.0
3,1967,1,4,MIA,Bob Griese,QB,22.0,1980.0,2,8,...,0.0,0.0,0.0,0.0,0.0,0.0,Purdue,GrieBo00,1,14.0
4,1967,1,5,HOU,George Webster,LB,21.0,1976.0,3,3,...,0.0,0.0,0.0,0.0,5.0,0.0,Michigan St.,WebsGe00,1,10.0


In [20]:
# Write the augmented frame to CSV in the working directory, leaving out the
# row index.
draft_df.to_csv(
    path_or_buf="nfl_survival_analysis_data.csv",
    index=False,
)