In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
import pandas as pd
from scipy import stats
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from urllib.request import urlopen
import json
import dill

In [2]:
# Basketball-Reference per-game stats page for the 1950 season
url = (
    'https://www.basketball-reference.com'
    '/leagues/NBA_1950_per_game.html'
)

In [3]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel indefinitely, and raise_for_status() surfaces HTTP errors here instead
# of as a confusing parsing failure in a later cell.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [4]:
# Parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=r.content, features='html.parser')

In [5]:
# gathering information on the players I need data from:
# one entry per <tbody> on the page, each holding that body's <tr> rows
player_list = [body.find_all('tr') for body in soup.find_all('tbody')]

In [6]:
# The previous cell appended one list of <tr> rows per <tbody>, so the rows are
# nested one level down; keep only the rows of the first <tbody>.
# NOTE: this rebinds player_list to its own first element, so re-running this
# cell without re-running the previous one indexes a second time and leaves a
# single row instead of the list of rows.
player_list=player_list[0]

In [7]:
# creating list of attributes for every player:
# a single flat list holding the text of every <td> cell, row after row
b = [cell.text for row in player_list for cell in row.find_all('td')]

In [8]:
# Peek at the first 29 scraped cell values
b[:29]

['Curly Armstrong',
 'G-F',
 '31',
 'FTW',
 '63',
 '',
 '',
 '2.3',
 '8.2',
 '.279',
 '',
 '',
 '',
 '2.3',
 '8.2',
 '.279',
 '.279',
 '2.7',
 '3.8',
 '.705',
 '',
 '',
 '',
 '2.8',
 '',
 '',
 '',
 '3.4',
 '7.3']

In [9]:
# Cut the flat list into consecutive slices of 29 values, matching the
# 29 columns declared for the DataFrame in the next cell
row_width = 29
chunks = [b[start:start + row_width] for start in range(0, len(b), row_width)]

In [10]:
# Empty frame with one row slot per chunk and one column per scraped value
stat_columns = [
    'name', 'pos', 'age', 'team', 'gp', 'gs', 'mp',
    'fg', 'fga', 'fg%',
    '3p', '3pa', '3p%',
    '2p', '2pa', '2p%', 'efg%',
    'ft', 'fta', 'ft%',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'fouls', 'pts',
]
df = pd.DataFrame(index=range(len(chunks)), columns=stat_columns)

In [11]:
# Fill the pre-built frame one whole column at a time.
# The original wrote cell by cell through chained indexing (df.name[i] = ...,
# df['fg%'][i] = ...), which assigns into an intermediate Series; pandas does
# not guarantee that reaches df, and under Copy-on-Write it never does.
# Column position k in df matches value k of every chunk, because the columns
# were declared in the same order as the scraped cells.
for col_pos, col_name in enumerate(df.columns):
    df[col_name] = [row[col_pos] for row in chunks]

In [12]:
# Rows whose name already appeared earlier in the table; of those, keep only
# each name's last row
repeated_rows = df[df['name'].duplicated()]
final_team_df = repeated_rows.drop_duplicates(subset='name', keep='last')

In [13]:
# Display the rows selected above: the last row of every name that occurs more than once
final_team_df

Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,fg%,...,ft%,orb,drb,trb,ast,stl,blk,tov,fouls,pts
5,Ed Bartels,F,24,NYK,2,,,0.5,2.0,0.25,...,0.667,,,,0.0,,,,1.0,2.0
10,Charlie Black,F-C,28,AND,29,,,3.5,13.0,0.267,...,0.688,,,,3.0,,,,4.6,9.6
31,Jake Carter,F-C,25,AND,11,,,0.9,2.7,0.333,...,0.667,,,,0.7,,,,2.9,3.5
35,John Chaney,F-C,29,SHE,10,,,1.5,4.9,0.306,...,0.706,,,,0.5,,,,1.0,4.2
40,Paul Cloyd,G-F,29,WAT,4,,,1.5,4.5,0.333,...,0.4,,,,0.3,,,,0.3,3.5
59,Gene Englund,F-C,32,TRI,22,,,2.2,5.7,0.389,...,0.767,,,,1.1,,,,3.2,7.5
63,Johnny Ezersky,F-G,27,BOS,16,,,2.3,8.5,0.265,...,0.686,,,,1.4,,,,2.4,6.7
77,Hoot Gibson,F-C,28,WAT,30,,,2.1,6.4,0.335,...,0.683,,,,1.2,,,,3.4,5.6
87,Al Guokas,F-G,24,PHW,16,,,0.4,1.8,0.25,...,1.0,,,,0.6,,,,1.7,1.1
98,Billy Hassett,G,28,MNL,42,,,0.9,3.5,0.262,...,0.522,,,,1.6,,,,2.0,2.6


In [14]:
# Renumber the rows 0..n-1 and discard the old row labels
final_team_df = final_team_df.reset_index(drop=True)

In [15]:
# Lookup table: player name -> team on that player's last row
final_team_dict = dict(zip(final_team_df['name'], final_team_df['team']))

In [16]:
# Keep only the first row seen for each player name
# (keep='first' is the pandas default)
df = df.drop_duplicates(subset=['name'])

In [17]:
# Make sure every name is a plain string
df['name'] = df['name'].astype(str)

In [18]:
# Renumber the de-duplicated rows 0..n-1 and discard the old row labels
df = df.reset_index(drop=True)

In [19]:
# replace "TOT" with the final team that player played for
# A single .loc assignment writes straight into df. The original
# df.team.iloc[i] = ... assigned through an intermediate Series, which pandas
# does not guarantee to propagate back (and never does under Copy-on-Write).
traded = df['name'].isin(list(final_team_dict))
df.loc[traded, 'team'] = df.loc[traded, 'name'].map(final_team_dict)

In [20]:
def create_season_dataframe(year):
    """Scrape one season's per-game player table from Basketball-Reference.

    Parameters
    ----------
    year : int or str
        Season identifier interpolated into the page URL
        (``.../leagues/NBA_{year}_per_game.html``). The notebook passes
        strings such as ``'1950'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct player name, indexed 0..n-1, with the 29 columns
        listed in ``columns`` below. Every value is the cell text as scraped
        (no numeric conversion). When a name occurs on several rows, the stats
        come from its first row and ``team`` from its last row.

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no ``<tbody>``, or a data row does not have
        exactly one cell per expected column.
    """
    columns = ['name', 'pos', 'age', 'team', 'gp', 'gs', 'mp', 'fg', 'fga', 'fg%',
               '3p', '3pa', '3p%', '2p', '2pa', '2p%', 'efg%', 'ft', 'fta', 'ft%',
               'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'fouls', 'pts']

    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    # A timeout prevents one stalled request from hanging the whole scrape, and
    # raise_for_status() reports HTTP errors as such rather than as a parse error.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    # Only the first <tbody> on the page is used.
    table_body = soup.find('tbody')
    if table_body is None:
        raise ValueError(f'no <tbody> found at {url}')

    # Build the table row by row. Slicing one flat list of cells into groups
    # of 29 would silently shift every following row if a single row had a
    # different number of cells.
    rows = []
    for table_row in table_body.find_all('tr'):
        cells = [cell.text for cell in table_row.find_all('td')]
        if not cells:
            # Rows without any <td> contribute no values.
            continue
        if len(cells) != len(columns):
            raise ValueError(
                f'expected {len(columns)} cells per row at {url}, got {len(cells)}'
            )
        rows.append(cells)

    df = pd.DataFrame(rows, columns=columns)

    # Team on each name's last row. For a name that occurs once this is simply
    # its own team, so mapping it back below leaves such rows unchanged.
    last_team = (
        df.drop_duplicates(subset='name', keep='last')
        .set_index('name')['team']
    )

    # Keep the first row per name, then overwrite its team with the last one.
    # NOTE: rows are matched purely by name, so two different players sharing
    # a name within one season would be merged into a single row.
    season = df.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
    season['name'] = season['name'].astype(str)
    season['team'] = season['name'].map(last_team)

    return season

In [21]:
# Season years 1947 through 2019 inclusive, as strings
yrs_to_survey = [str(year) for year in range(1947, 2020)]

In [22]:
# Scrape every requested season, keeping the ones that parse successfully.
# This stays best-effort (a failing season is skipped), but failures are now
# recorded and reported. The original bare `except:` hid every error, including
# KeyboardInterrupt, behind a dummy assignment.
dfs = []
failed_years = {}  # year -> exception raised while scraping that year
for yr in yrs_to_survey:
    try:
        dfs.append(create_season_dataframe(yr))
    except Exception as exc:
        failed_years[yr] = exc

if failed_years:
    print(f"Skipped {len(failed_years)} season(s): {', '.join(failed_years)}")

In [23]:
# Stack the per-season frames into one table with a fresh 0..n-1 index.
# NOTE(review): the per-season frames carry no year column, so rows from
# different seasons cannot be told apart after this point. Confirm whether a
# season column should be added before concatenating.
final_df = pd.concat(dfs, ignore_index=True)

In [24]:
# Load the draft table from a local tab-separated file, decoded as ISO-8859-1
draft=pd.read_csv('NBA_Draft_1980_2017.tsv',sep='\t',encoding="ISO-8859-1")

In [25]:
# Preview the first five draft rows
draft.head()

Unnamed: 0,year,round,pick,player,position,school
0,1980,1,1.0,"Carroll, Joe Barry",F/C,Purdue
1,1980,1,2.0,"Griffith, Darrell",G,Louisville
2,1980,1,3.0,"McHale, Kevin",F/C,Minnesota
3,1980,1,4.0,"Ransey, Kelvin",G,Ohio State
4,1980,1,5.0,"Ray, James",F,Jacksonville


In [26]:
# Summarize column dtypes and non-null counts of the draft table
draft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 6 columns):
year        2167 non-null int64
round       2167 non-null int64
pick        2166 non-null float64
player      2167 non-null object
position    2167 non-null object
school      2167 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 101.7+ KB


In [27]:
# Spot-check the first value of the float-typed pick column
draft.pick[0]

1.0

In [28]:
# Spot-check the first value of the integer-typed round column
draft['round'][0]

1

In [29]:
# Derive a single "tot_pick" number per draftee as pick * round.
# NOTE(review): a product does not identify a unique overall slot (for example
# round 2 / pick 3 and round 1 / pick 6 both give 6). Confirm this is the
# intended formula.
# The one row with a missing pick (see draft.info()) produces NaN here.
draft['tot_pick']=draft.pick*draft['round']

In [30]:
# Re-check dtypes and non-null counts now that tot_pick has been added
draft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2167 entries, 0 to 2166
Data columns (total 7 columns):
year        2167 non-null int64
round       2167 non-null int64
pick        2166 non-null float64
player      2167 non-null object
position    2167 non-null object
school      2167 non-null object
tot_pick    2166 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 118.6+ KB


In [31]:
# Lookup tables keyed by the draft file's player string:
#   draft_dict  -> tot_pick value
#   school_dict -> school
# If a player string occurs more than once, the later row wins.
# NOTE(review): draft.head() shows players written as "Last, First", while the
# scraped names look like "First Last". Confirm the keys really match.
draft_dict = {}
school_dict = {}
for player_name, pick_value, school_name in zip(
    draft['player'], draft['tot_pick'].to_numpy(), draft['school']
):
    draft_dict[player_name] = pick_value
    school_dict[player_name] = school_name

In [32]:
# Add the two draft-derived columns, empty (None) until the lookups fill them
for new_col in ('school', 'selection'):
    final_df[new_col] = None

In [33]:
# Copy the draft pick onto every row whose name is a key of draft_dict; other
# rows keep their current value. One .loc assignment writes directly into
# final_df. The original final_df.selection[i] = ... assigned through an
# intermediate Series, which pandas does not guarantee to propagate back (and
# never does under Copy-on-Write).
has_pick = final_df['name'].isin(list(draft_dict))
final_df.loc[has_pick, 'selection'] = final_df.loc[has_pick, 'name'].map(draft_dict)

In [34]:
# Copy the school onto every row whose name is a key of school_dict; other
# rows keep their current value. One .loc assignment writes directly into
# final_df. The original final_df.school[i] = ... assigned through an
# intermediate Series, which pandas does not guarantee to propagate back (and
# never does under Copy-on-Write).
has_school = final_df['name'].isin(list(school_dict))
final_df.loc[has_school, 'school'] = final_df.loc[has_school, 'name'].map(school_dict)

In [35]:
# Display the combined table, now including the school and selection columns
final_df

Unnamed: 0,name,pos,age,team,gp,gs,mp,fg,fga,fg%,...,drb,trb,ast,stl,blk,tov,fouls,pts,school,selection
0,Curly Armstrong,G-F,31,FTW,63,,,2.3,8.2,.279,...,,,2.8,,,,3.4,7.3,,
1,Cliff Barker,SG,29,INO,49,,,2.1,5.6,.372,...,,,2.2,,,,2.0,5.7,,
2,Leo Barnhorst,SF,25,CHS,67,,,2.6,7.4,.349,...,,,2.1,,,,2.9,6.5,,
3,Ed Bartels,F,24,NYK,15,,,1.5,5.7,.256,...,,,1.3,,,,1.9,4.2,,
4,Ralph Beard,G,22,INO,60,,,5.7,15.6,.363,...,,,3.9,,,,2.2,14.9,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21351,Trae Young,PG,20,ATL,81,81,30.9,6.5,15.5,.418,...,2.9,3.7,8.1,0.9,0.2,3.8,1.7,19.1,Oklahoma,7
21352,Cody Zeller,C,26,CHO,49,47,25.4,3.9,7.0,.551,...,4.6,6.8,2.1,0.8,0.8,1.3,3.3,10.1,,
21353,Tyler Zeller,C,29,MEM,6,1,15.5,2.7,5.0,.533,...,2.2,4.0,0.7,0.2,0.5,0.7,3.3,7.7,,
21354,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,.553,...,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8,,


In [36]:
# Write the combined table to a CSV in the working directory. With pandas'
# default settings the row index is written too, as the first (unnamed) column.
final_df.to_csv('nba_season_data.csv')