In [2]:
from sys import argv
from requests import get
import pandas as pd 
from bs4 import BeautifulSoup
import numpy as np


In [3]:
years = [i for i in range(2018, 2020)]
weeks = [i for i in range(1,18)]
yearweek = [(a,b) for a in years for b in weeks]

In [4]:
urlDict = {
    
}

for year, week in yearweek:
    urlDict.update( {'Passing{year}week{week}'.format(year = year, week = week) : """https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=pass_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=pass_rating&from_link=1""".format(year = year, week = week)} )
    urlDict.update( {'Receiving{year}week{week}'.format(year = year, week = week) : """https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rec&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rec_yds&from_link=1""".format(year=year, week=week)})
    urlDict.update( {'Rushing{year}week{week}'.format(year = year, week = week) : """https://www.pro-football-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min={year}&year_max={year}&season_start=1&season_end=-1&age_min=0&age_max=99&game_type=A&league_id=&team_id=&opp_id=&game_num_min=0&game_num_max=99&week_num_min={week}&week_num_max={week}&game_day_of_week=&game_location=&game_result=&handedness=&is_active=&is_hof=&c1stat=rush_att&c1comp=gt&c1val=1&c2stat=&c2comp=gt&c2val=&c3stat=&c3comp=gt&c3val=&c4stat=&c4comp=gt&c4val=&order_by=rush_yds&from_link=1""".format(year=year, week=week)})
 

In [5]:
dfs = []

defColumnSettings = {
    'axis':1,
    'inplace': True
}

In [6]:
for key, url in urlDict.items():
    response = get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    table = soup.find('table', {'id': 'results'})
    
    df = pd.read_html(str(table))[0]
    
    df.columns = df.columns.droplevel(level=0)
    
    #df.drop(['Result', 'Week', 'G#', 'Opp', 'Unnamed: 7_level_1', 'Age', 'Rk', 'Lg', 'Day'], **defColumnSettings)
    df.drop(['Unnamed: 7_level_1', 'Date', 'Lg'], **defColumnSettings)
    
    df = df[df['Pos'] != 'Pos']
    
    df.set_index(['Player', 'Pos', 'Age'], inplace = True)
    
    if 'Passing' in key:
        #df = df[['Yds', 'TD', 'Int', 'Att', 'Cmp']]
        df.rename({'Yds': 'PassingYds', 'Att': 'PassingAtt', 'Y/A': 'Y/PassingAtt', 'TD': 'PassingTD'}, **defColumnSettings)
    elif 'Receiving' in key:
        #df = df[['Rec', 'Tgt', 'Yds', 'TD']]
        #df.drop('Ctch%', **defColumnSettings)
        df.rename({'Yds': 'ReceivingYds', 'TD': 'ReceivingTD'}, **defColumnSettings)
    elif 'Rushing' in key:
        #df.drop('Y/A', **defColumnSettings)
        df.rename({'Att': 'RushingAtt', 'Yds': 'RushingYds', 'TD': 'RushingTD'}, **defColumnSettings)
    dfs.append(df)

In [7]:
df = pd.concat(dfs, join = 'outer', ignore_index = False, sort = False)
df.fillna(0, inplace = True)

df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rk,Tm,Opp,Result,G#,Week,Day,Cmp,PassingAtt,Cmp%,...,Rec,ReceivingYds,Y/R,ReceivingTD,Ctch%,Y/Tgt,RushingAtt,RushingYds,Y/A,RushingTD
Player,Pos,Age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Ryan Fitzpatrick,QB,35.289,1,TAM,NOR,W 48-40,1,1,Sun,21,28,75.0,...,0,0,0,0,0,0,0,0,0,0
Aaron Rodgers,QB,34.281,2,GNB,CHI,W 24-23,1,1,Sun,20,30,66.67,...,0,0,0,0,0,0,0,0,0,0
Drew Brees,QB,39.237,3,NOR,TAM,L 40-48,1,1,Sun,37,45,82.22,...,0,0,0,0,0,0,0,0,0,0
Patrick Mahomes,QB,22.357,4,KAN,LAC,W 38-28,1,1,Sun,15,27,55.56,...,0,0,0,0,0,0,0,0,0,0
Joe Flacco,QB,33.236,5,BAL,BUF,W 47-3,1,1,Sun,25,34,73.53,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df.rename(columns = {'Rk': 'Rank'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rank,Tm,Opp,Result,G#,Week,Day,Cmp,PassingAtt,Cmp%,...,ReceivingYds,Y/R,ReceivingTD,Ctch%,Y/Tgt,RushingAtt,RushingYds,Y/A,RushingTD,FantasyPoints
Player,Pos,Age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Ryan Fitzpatrick,QB,35.289,1,TAM,NOR,W 48-40,1,1,Sun,21,28,75.00,...,0,0,0,0,0,0,0,0,0,32.68
Aaron Rodgers,QB,34.281,2,GNB,CHI,W 24-23,1,1,Sun,20,30,66.67,...,0,0,0,0,0,0,0,0,0,23.44
Drew Brees,QB,39.237,3,NOR,TAM,L 40-48,1,1,Sun,37,45,82.22,...,0,0,0,0,0,0,0,0,0,29.56
Patrick Mahomes,QB,22.357,4,KAN,LAC,W 38-28,1,1,Sun,15,27,55.56,...,0,0,0,0,0,0,0,0,0,26.24
Joe Flacco,QB,33.236,5,BAL,BUF,W 47-3,1,1,Sun,25,34,73.53,...,0,0,0,0,0,0,0,0,0,21.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Alex Erickson,WR,27.053,96,CIN,CLE,W 33-23,16,17,Sun,0,0,0,...,0,0,0,0,0,1,5,5.00,0,0.50
Will Grier,QB,24.270,97,CAR,NOR,L 10-42,16,17,Sun,0,0,0,...,0,0,0,0,0,3,5,1.67,0,0.50
Buddy Howell,RB,23.277,98,HOU,TEN,L 14-35,16,17,Sun,0,0,0,...,0,0,0,0,0,3,5,1.67,0,0.50
Philip Rivers,QB,38.021,99,LAC,KAN,L 21-31,16,17,Sun,0,0,0,...,0,0,0,0,0,1,5,5.00,0,0.50


In [35]:
df['Winner/Not Winner'] = np.where(df['Tm'] =='Winner', 'T', 'F')
    

In [39]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rk,Tm,Opp,Result,G#,Week,Day,Cmp,PassingAtt,Cmp%,...,Y/R,ReceivingTD,Ctch%,Y/Tgt,RushingAtt,RushingYds,Y/A,RushingTD,FantasyPoints,Winner/Not Winner
Player,Pos,Age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Ryan Fitzpatrick,QB,35.289,1,TAM,NOR,W 48-40,1,1,Sun,21,28,75.0,...,0,0,0,0,0,0,0,0,32.68,F
Aaron Rodgers,QB,34.281,2,GNB,CHI,W 24-23,1,1,Sun,20,30,66.67,...,0,0,0,0,0,0,0,0,23.44,F
Drew Brees,QB,39.237,3,NOR,TAM,L 40-48,1,1,Sun,37,45,82.22,...,0,0,0,0,0,0,0,0,29.56,F
Patrick Mahomes,QB,22.357,4,Winners,LAC,W 38-28,1,1,Sun,15,27,55.56,...,0,0,0,0,0,0,0,0,26.24,F
Joe Flacco,QB,33.236,5,BAL,BUF,W 47-3,1,1,Sun,25,34,73.53,...,0,0,0,0,0,0,0,0,21.44,F


In [11]:
df.dtypes

Rk              object
Tm              object
Opp             object
Result          object
G#              object
Week            object
Day             object
Cmp             object
PassingAtt      object
Cmp%            object
PassingYds      object
PassingTD       object
Int             object
Rate            object
Sk              object
Yds.1           object
Y/PassingAtt    object
AY/A            object
Tgt             object
Rec             object
ReceivingYds    object
Y/R             object
ReceivingTD     object
Ctch%           object
Y/Tgt           object
RushingAtt      object
RushingYds      object
Y/A             object
RushingTD       object
dtype: object

In [15]:
fant_stats = ['PassingYds', 'PassingTD', 'Int', 'Rec', 'ReceivingYds', 'ReceivingTD', 'RushingYds', 'RushingTD']

In [19]:
for stats in fant_stats:
    df[stats] = df[stats].astype(str).astype('int64')

In [20]:
df.dtypes

Rk              object
Tm              object
Opp             object
Result          object
G#              object
Week            object
Day             object
Cmp             object
PassingAtt      object
Cmp%            object
PassingYds       int64
PassingTD        int64
Int              int64
Rate            object
Sk              object
Yds.1           object
Y/PassingAtt    object
AY/A            object
Tgt             object
Rec              int64
ReceivingYds     int64
Y/R             object
ReceivingTD      int64
Ctch%           object
Y/Tgt           object
RushingAtt      object
RushingYds       int64
Y/A             object
RushingTD        int64
dtype: object

In [21]:
df['FantasyPoints'] = df['PassingYds']/25 + df['PassingTD']*4 - df['Int']*2 + 0.5* df['Rec'] + df['ReceivingYds']/10 + df['ReceivingTD']*6 + df['RushingYds']/10 + df['RushingTD']*6


In [40]:
stats = pd.read_csv('datasets/allstats.csv')

In [41]:
stats.head()

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Rk,Date,Tm,Opp,Result,G#,...,ReceivingYds,Y/R,ReceivingTD,Ctch%,Y/Tgt,RushingAtt,RushingYds,Y/A,RushingTD,FantasyPoints
0,0,Mike Tomczak,QB,36.324,1,1999-09-12,PIT,CLE,W 43-0,1,...,0,0.0,0,0,0.0,0,0,0.0,0,11.12
1,1,Tom Tupa,QB/P,33.218,2,1999-09-12,NYJ,NWE,L 28-30,1,...,0,0.0,0,0,0.0,0,0,0.0,0,14.6
2,2,Dan Marino*,QB,37.363,3,1999-09-13,MIA,DEN,W 38-21,1,...,0,0.0,0,0,0.0,0,0,0.0,0,16.6
3,3,Brad Johnson,QB,30.364,4,1999-09-12,WAS,DAL,L 35-41,1,...,0,0.0,0,0,0.0,0,0,0.0,0,23.28
4,4,Steve McNair,QB,26.21,5,1999-09-12,TEN,CIN,W 36-35,1,...,0,0.0,0,0,0.0,0,0,0.0,0,23.64


In [42]:
stats.dtypes

Unnamed: 0         int64
Player            object
Pos               object
Age              float64
Rk                 int64
Date              object
Tm                object
Opp               object
Result            object
G#                 int64
Week               int64
Day               object
Cmp                int64
PassingAtt         int64
Cmp%             float64
PassingYds         int64
PassingTD          int64
Int                int64
Rate             float64
Sk                 int64
Yds.1              int64
Y/PassingAtt     float64
AY/A             float64
Home T/F          object
Tgt                int64
Rec                int64
ReceivingYds       int64
Y/R              float64
ReceivingTD        int64
Ctch%             object
Y/Tgt            float64
RushingAtt         int64
RushingYds         int64
Y/A              float64
RushingTD          int64
FantasyPoints    float64
dtype: object

In [52]:
home = stats[['Player', 'Pos', 'Opp', 'Home T/F', 'Date', 'Result']]

In [53]:
home.head()

Unnamed: 0,Player,Pos,Opp,Home T/F,Date,Result
0,Mike Tomczak,QB,CLE,F,1999-09-12,W 43-0
1,Tom Tupa,QB/P,NWE,T,1999-09-12,L 28-30
2,Dan Marino*,QB,DEN,F,1999-09-13,W 38-21
3,Brad Johnson,QB,DAL,T,1999-09-12,L 35-41
4,Steve McNair,QB,CIN,T,1999-09-12,W 36-35
