# Basic Web Scraping with Python - NBA Data

### Importing Libraries

In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

### Getting raw HTML data

In [2]:
data_url = "https://www.basketball-reference.com/boxscores/"
data_url

'https://www.basketball-reference.com/boxscores/'

In [3]:
uClient = uReq(data_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, 'html.parser')

### Find and extract relevant data

In [11]:
tables = page_soup.findAll('table', {'class': 'suppress_all sortable stats_table'})

In [12]:
tables

[<table class="suppress_all sortable stats_table" data-cols-to-freeze=",1" id="confs_standings_E">
 <caption>Conference Standings Table</caption>
 <colgroup><col/><col/><col/><col/><col/><col/><col/></colgroup>
 <thead>
 <tr>
 <th aria-label="Eastern Conference" class="poptip sort_default_asc left" data-stat="team_name" scope="col">Eastern Conference</th>
 <th aria-label="Wins" class="poptip right" data-stat="wins" data-tip="Wins" scope="col">W</th>
 <th aria-label="Losses" class="poptip right" data-stat="losses" data-tip="Losses" scope="col">L</th>
 <th aria-label="Win-Loss Percentage" class="poptip right" data-stat="win_loss_pct" data-tip="Win-Loss Percentage" scope="col">W/L%</th>
 <th aria-label="GB" class="poptip sort_default_asc right" data-stat="gb" data-tip="Games Behind" scope="col">GB</th>
 <th aria-label="Points Per Game" class="poptip right" data-stat="pts_per_g" data-tip="Points Per Game" scope="col">PS/G</th>
 <th aria-label="Opponent Points Per Game" class="poptip right"

In [25]:
tables[0].find('tbody').findAll('tr')[0].findAll('td')[0].text

'42'

In [27]:
tables = page_soup.findAll('table', {'class': 'suppress_all sortable stats_table'})    # Find the HTML tag with the data we are looking for and fetch all instances of it
table_frames = []

for table in tables:
    
    header = []
    body = []
    
    for head in table.find('thead').findAll('th'):
        header.append(head.text)
    
    for line in table.find('tbody').findAll('tr'):
        row = []
        
        row.append(line.find('th').find('a').text)
        
        for item in line.findAll('td'):
            row.append(item.text)
        
        body.append(row)
    
    
    frame = pd.DataFrame(body, columns=header)    
        
    
    table_frames.append(frame)


table_frames[0]

Unnamed: 0,Eastern Conference,W,L,W/L%,GB,PS/G,PA/G
0,Boston Celtics,42,17,0.712,—,117.9,111.7
1,Milwaukee Bucks,41,17,0.707,0.5,114.8,111.6
2,Philadelphia 76ers,38,19,0.667,3.0,114.4,110.5
3,Cleveland Cavaliers,38,23,0.623,5.0,111.8,106.1
4,Brooklyn Nets,34,24,0.586,7.5,114.0,112.3
5,New York Knicks,33,27,0.55,9.5,114.6,112.3
6,Miami Heat,32,27,0.542,10.0,108.3,108.3
7,Atlanta Hawks,29,30,0.492,13.0,116.4,116.7
8,Washington Wizards,28,30,0.483,13.5,113.6,113.3
9,Toronto Raptors,28,31,0.475,14.0,113.3,112.5


In [30]:
table_frames[0].iloc[:, 3]

0     .712
1     .707
2     .667
3     .623
4     .586
5     .550
6     .542
7     .492
8     .483
9     .475
10    .441
11    .433
12    .407
13    .283
14    .254
Name: W/L%, dtype: object

In [31]:
# convert data to numeric

for table in table_frames:
    
    for num_column in range(6):
        table.iloc[:, num_column+1] = pd.to_numeric(table.iloc[:, num_column+1], errors='coerce')

  table.iloc[:, num_column+1] = pd.to_numeric(table.iloc[:, num_column+1], errors='coerce')
  table.iloc[:, num_column+1] = pd.to_numeric(table.iloc[:, num_column+1], errors='coerce')


In [33]:
table_frames[1]

Unnamed: 0,Western Conference,W,L,W/L%,GB,PS/G,PA/G
0,Denver Nuggets,41,18,0.695,,117.1,112.7
1,Memphis Grizzlies,35,22,0.614,5.0,116.1,112.2
2,Sacramento Kings,32,25,0.561,8.0,119.5,117.2
3,Los Angeles Clippers,33,28,0.541,9.0,111.4,111.1
4,Phoenix Suns,32,28,0.533,9.5,112.7,111.3
5,Dallas Mavericks,31,29,0.517,10.5,113.0,112.5
6,Minnesota Timberwolves,31,30,0.508,11.0,115.7,115.8
7,New Orleans Pelicans,30,29,0.508,11.0,114.9,113.5
8,Golden State Warriors,29,29,0.5,11.5,118.6,118.5
9,Oklahoma City Thunder,28,29,0.491,12.0,117.9,116.2
