In [2]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Scrape the per-season player totals table from basketball-reference.com
# for every season in the range below and stack the results into one
# DataFrame (Totals_df), tagging each row with its season.
url_template = "http://www.basketball-reference.com/leagues/NBA_{year}_totals.html"

# One DataFrame per season is collected here and concatenated once after the
# loop. Appending to a growing DataFrame inside the loop re-copies every
# previously collected row on each iteration.
season_frames = []

for year in range(1990, 2017):  # seasons 1990 through 2016 inclusive
    url = url_template.format(year=year)  # page for this season

    # Parse the page, then release the connection even if parsing fails.
    response = urlopen(url)
    try:
        soup = BeautifulSoup(response, "lxml")
    finally:
        response.close()

    # Look up the table rows once; the first row holds the column headers.
    table_rows = soup.findAll('tr')
    column_headers = [th.getText() for th in table_rows[0].findAll('th')]

    # Every row after the header row. A row that contains no <td> cells
    # yields an empty list here; the cleaning step later in the notebook
    # filters on a non-null Player value.
    data_rows = table_rows[1:]
    player_data = [[td.getText() for td in row.findAll('td')]
                   for row in data_rows]

    # Turn this season's rows into a DataFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # Insert the Season column as the first column
    year_df.insert(0, 'Season', year)

    season_frames.append(year_df)

# Build the combined frame in a single pass with a fresh 0..n-1 index.
Totals_df = pd.concat(season_frames, ignore_index=True)

In [4]:
# Preview the first rows of the combined multi-season frame.
Totals_df.head()

Unnamed: 0,Season,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1990,1,Mark Acres,C,27,ORL,80,50,1691,138,...,0.692,154,277,431,67,36,25,70,248,362
1,1990,2,Michael Adams,PG,27,DEN,79,74,2690,398,...,0.85,49,176,225,495,121,3,141,133,1221
2,1990,3,Mark Aguirre,SF,30,DET,78,40,2005,438,...,0.756,117,188,305,145,34,19,121,201,1099
3,1990,4,Danny Ainge,PG,30,SAC,75,68,2727,506,...,0.831,69,257,326,453,113,18,185,238,1342
4,1990,5,Mark Alarie,PF,26,WSB,82,10,1893,371,...,0.812,151,223,374,142,60,39,101,219,860


In [5]:
def coerce_numeric_columns(frame):
    """Return a copy of ``frame`` with string columns parsed as numbers.

    Each object-dtype column is passed through ``pd.to_numeric`` with
    ``errors='coerce'``, so entries that cannot be parsed become NaN.
    A column in which no entry parses at all is left unchanged instead of
    being replaced by an all-NaN column.

    This replaces ``DataFrame.convert_objects(convert_numeric=True)``,
    which is deprecated and absent from current pandas releases.
    """
    converted = frame.copy()
    for name in converted.select_dtypes(include=['object']).columns:
        as_numbers = pd.to_numeric(converted[name], errors='coerce')
        if as_numbers.notnull().any():
            converted[name] = as_numbers
    return converted


# convert to proper data types (done before filtering, on the full frame)
Totals_df = coerce_numeric_columns(Totals_df)
# get rid of rows that have no Player value
Totals_df = Totals_df[Totals_df.Player.notnull()]
# replace the remaining null values with zero
Totals_df = Totals_df.fillna(0)
# change the added Season column to float so it matches the other numeric columns
Totals_df['Season'] = Totals_df['Season'].astype(float)
Totals_df.dtypes

  from ipykernel import kernelapp as app


Season    float64
Rk        float64
Player     object
Pos        object
Age       float64
Tm         object
G         float64
GS        float64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

In [6]:
# Drop Rk, GS, ORB and DRB from the frame; these columns are not expected
# to have much influence.
unwanted_columns = ['Rk', 'GS', 'ORB', 'DRB']
Totals_df = Totals_df.drop(unwanted_columns, axis=1)

In [7]:
# Show a bounded preview instead of rendering the whole multi-season frame
# as cell output.
Totals_df.head(10)

Unnamed: 0,Season,Player,Pos,Age,Tm,G,MP,FG,FGA,FG%,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS
0,1990,Mark Acres,C,27,ORL,80,1691,138,285,0.484,...,83,120,0.692,431,67,36,25,70,248,362
1,1990,Michael Adams,PG,27,DEN,79,2690,398,989,0.402,...,267,314,0.850,225,495,121,3,141,133,1221
2,1990,Mark Aguirre,SF,30,DET,78,2005,438,898,0.488,...,192,254,0.756,305,145,34,19,121,201,1099
3,1990,Danny Ainge,PG,30,SAC,75,2727,506,1154,0.438,...,222,267,0.831,326,453,113,18,185,238,1342
4,1990,Mark Alarie,PF,26,WSB,82,1893,371,785,0.473,...,108,133,0.812,374,142,60,39,101,219,860
5,1990,Steve Alford,PG,25,DAL,41,302,63,138,0.457,...,35,37,0.946,25,39,15,3,16,22,168
6,1990,Randy Allen,SG,25,SAC,63,746,106,239,0.444,...,23,43,0.535,138,23,16,19,28,102,235
7,1990,Greg Anderson,PF,25,MIL,60,1291,219,432,0.507,...,91,170,0.535,373,24,32,54,80,176,529
8,1990,Nick Anderson,SG,22,ORL,81,1785,372,753,0.494,...,186,264,0.705,316,124,69,34,138,140,931
9,1990,Richard Anderson,PF,29,CHH,54,604,88,211,0.417,...,18,23,0.783,127,55,20,9,26,64,231


In [8]:
# Write the cleaned totals out as a CSV file (path is relative to the
# notebook's working directory).
output_path = "../Data/Totals_data_1990_to_2016.csv"
Totals_df.to_csv(output_path)
