In [1]:
import sys
# Put the parent directory first on the module search path so that the
# project-local `src` package imported further down can be found.
# NOTE(review): assumes the notebook sits one level below the project root — confirm.
sys.path.insert(0, '../')

In [2]:
# Environment check: display the path of the Python interpreter running this kernel.
sys.executable

'c:\\Users\\denis\\anaconda3\\envs\\ucm\\python.exe'

# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
from src.scrapper import scrape_nba_finals, scrape_NBA_team_data

# Get data

**Example NBA finals**

In [4]:
# Call the project helper imported from src.scrapper and display what it returns.
# NOTE(review): presumably this downloads the NBA Finals history table — confirm in src/scrapper.py.
scrape_nba_finals()

Unnamed: 0,Year,Lg,Champion,Runner-Up,Finals MVP,Unnamed: 6,Points,Rebounds,Assists,Win Shares
0,2023,NBA,Denver Nuggets,Miami Heat,N. Jokić,,N. Jokić (600),N. Jokić (269),N. Jokić (190),N. Jokić (5.0)
1,2022,NBA,Golden State Warriors,Boston Celtics,S. Curry,,J. Tatum (615),A. Horford (214),J. Tatum (148),J. Butler (3.8)
2,2021,NBA,Milwaukee Bucks,Phoenix Suns,G. Antetokounmpo,,G. Antetokounmpo (634),G. Antetokounmpo (269),J. Holiday (199),G. Antetokounmpo (3.7)
3,2020,NBA,Los Angeles Lakers,Miami Heat,L. James,,A. Davis (582),L. James (226),L. James (184),A. Davis (4.5)
4,2019,NBA,Toronto Raptors,Golden State Warriors,K. Leonard,,K. Leonard (732),D. Green (223),D. Green (187),K. Leonard (4.9)
5,2018,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (748),D. Green (222),L. James (198),L. James (5.2)
6,2017,NBA,Golden State Warriors,Cleveland Cavaliers,K. Durant,,L. James (591),K. Love (191),L. James (141),L. James (4.3)
7,2016,NBA,Cleveland Cavaliers,Golden State Warriors,L. James,,K. Thompson (582),D. Green (228),R. Westbrook (198),L. James (4.7)
8,2015,NBA,Golden State Warriors,Cleveland Cavaliers,A. Iguodala,,L. James (601),D. Howard (238),L. James (169),S. Curry (3.9)
9,2014,NBA,San Antonio Spurs,Miami Heat,K. Leonard,,K. Durant (563),T. Duncan (211),R. Westbrook (153),L. James (4.3)


**Example team season**

In [2]:
# Season whose standings page is scraped; also interpolated into the URL below.
year = 2022

# URL to scrape, notice f string:
url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"

# Download and parse inside a `with` block so the HTTP response is closed as
# soon as BeautifulSoup has consumed it (it used to be left open).
with urlopen(url) as html:
    soup = BeautifulSoup(html, features="html.parser")

# Text of every <th> cell found under the first <tr> of the parsed document.
# `find_all` / `get_text` are the current names of the older `findAll` /
# `getText` aliases; they return the same results.
titles = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]

# Position just past the first "SRS" label, i.e. the end of the first header run.
srs_end = titles.index("SRS") + 1

# Column headers: skip the leading label, keep everything up to and including "SRS".
headers = titles[1:srs_end]

# Everything after that first header run is kept for the row-title extraction.
titles = titles[srs_end:]

In [3]:
# Labels that introduce a division block rather than naming a team.
divisions = ["Atlantic Division", "Central Division",
             "Southeast Division", "Northwest Division",
             "Pacific Division", "Southwest Division",
             "Midwest Division"]

# Keep only the titles that come before the "Eastern Conference" label; when
# that label is absent, fall back to the whole list. Only the "label not found"
# ValueError is handled, so unrelated failures are no longer silently swallowed.
try:
    row_titles = titles[:titles.index("Eastern Conference")]
except ValueError:
    row_titles = titles

# Remove the non-teams (column headers and division labels).
# A new list is built instead of calling .remove() on the list being iterated:
# that pattern skips the element following each removal, raises when a header
# is missing, and, in the fallback branch, also modified `titles` itself.
non_team_labels = set(headers) | set(divisions)
row_titles = [title for title in row_titles if title not in non_team_labels]

# Separate east and west: names before the "Western Conference" label belong
# to the East, names after it to the West. Raises ValueError if the label is missing.
west_marker = row_titles.index("Western Conference")
east_titles = row_titles[:west_marker]
west_titles = row_titles[west_marker + 1:]

# Team names only: the same sequence with the conference label taken out.
row_titles = east_titles + west_titles


In [5]:
# Every <tr> in the parsed page except the first one (the row the titles were read from).
# `find_all` is the current name of the older `findAll` alias.
rows = soup.find_all('tr')[1:]

In [7]:
# One list of <td> cell texts per table row; a row without <td> cells gives [].
# Iterates over the rows directly instead of indexing through range(len(rows)).
team_stats = [[cell.get_text() for cell in row.find_all('td')]
              for row in rows]

In [9]:
# remove empty elements
team_stats = [e for e in team_stats if e != []]

# only keep needed rows
team_stats = team_stats[0:len(row_titles)]

In [12]:
# Sanity check: how many stat rows remain after filtering and truncation.
len(team_stats)

30

In [13]:
# Prefix every stats row with its season and team name, and the header list
# with the two matching column names. Both lists are modified in place, so the
# guard stops a second run of this cell from prepending the same values again
# (which would leave duplicate "Year"/"Team" columns in the resulting table).
if headers[:2] != ["Year", "Team"]:
    for team_name, stats in zip(row_titles, team_stats):
        # Resulting row layout: [year, team name, <scraped cells...>]
        stats[:0] = [year, team_name]

    # add year, team columns to headers
    headers[:0] = ["Year", "Team"]

In [15]:
# Column layout of the table that accumulates the standings.
final_columns = [
    "Year", "Team",
    "W", "L", "W/L%", "GB", "PS/G", "PA/G", "SRS",
    "Playoffs", "Losing_season",
]

# Start from an empty frame that already carries those columns.
final_df = pd.DataFrame(columns=final_columns)
    

In [16]:
# create a dataframe with all aquired info
year_standings = pd.DataFrame(team_stats, columns = headers)

# add a column to dataframe to indicate playoff appearance
year_standings["Playoffs"] = ["Y" if "*" in ele else "N" for ele in year_standings["Team"]]
# remove * from team names
year_standings["Team"] = [ele.replace('*', '') for ele in year_standings["Team"]]
# add losing season indicator (win % < .5)
year_standings["Losing_season"] = ["Y" if float(ele) < .5 else "N" for ele in year_standings["W/L%"]]

# append new dataframe to final_df
final_df = pd.concat([final_df, year_standings], ignore_index=True)

In [21]:
# Strip the "*" marker from the Eastern names so they can be matched against
# team names that have had the marker removed.
east_titles = [name.replace('*', '') for name in east_titles]

In [22]:
# Eastern Conference teams, most wins first. The "W" values come from scraped
# cell text, so they are converted with pd.to_numeric for the comparison; a
# plain sort would order them as strings (e.g. "9" ahead of "53").
east_df = (
    final_df[final_df["Team"].isin(east_titles)]
    .sort_values(by="W", key=pd.to_numeric, ascending=False)
)


In [23]:
# Display the Eastern Conference subset.
east_df

Unnamed: 0,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Playoffs,Losing_season
0,2022,Miami Heat,53,29,0.646,—,110.0,105.6,4.23,Y,N
1,2022,Boston Celtics,51,31,0.622,2.0,111.8,104.5,7.02,Y,N
2,2022,Milwaukee Bucks,51,31,0.622,2.0,115.5,112.1,3.22,Y,N
3,2022,Philadelphia 76ers,51,31,0.622,2.0,109.9,107.3,2.57,Y,N
4,2022,Toronto Raptors,48,34,0.585,5.0,109.4,107.1,2.38,Y,N
5,2022,Chicago Bulls,46,36,0.561,7.0,111.6,112.0,-0.38,Y,N
6,2022,Brooklyn Nets,44,38,0.537,9.0,112.9,112.1,0.82,Y,N
7,2022,Cleveland Cavaliers,44,38,0.537,9.0,107.8,105.7,2.04,N,N
8,2022,Atlanta Hawks,43,39,0.524,10.0,113.9,112.4,1.55,Y,N
9,2022,Charlotte Hornets,43,39,0.524,10.0,115.3,114.9,0.53,N,N


In [17]:
# Display the accumulated standings table.
final_df

Unnamed: 0,Year,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Playoffs,Losing_season
0,2022,Miami Heat,53,29,0.646,—,110.0,105.6,4.23,Y,N
1,2022,Boston Celtics,51,31,0.622,2.0,111.8,104.5,7.02,Y,N
2,2022,Milwaukee Bucks,51,31,0.622,2.0,115.5,112.1,3.22,Y,N
3,2022,Philadelphia 76ers,51,31,0.622,2.0,109.9,107.3,2.57,Y,N
4,2022,Toronto Raptors,48,34,0.585,5.0,109.4,107.1,2.38,Y,N
5,2022,Chicago Bulls,46,36,0.561,7.0,111.6,112.0,-0.38,Y,N
6,2022,Brooklyn Nets,44,38,0.537,9.0,112.9,112.1,0.82,Y,N
7,2022,Cleveland Cavaliers,44,38,0.537,9.0,107.8,105.7,2.04,N,N
8,2022,Atlanta Hawks,43,39,0.524,10.0,113.9,112.4,1.55,Y,N
9,2022,Charlotte Hornets,43,39,0.524,10.0,115.3,114.9,0.53,N,N
