# Step 1: Web-scraping

In [2]:
# Importing libraries to web-scrape
import os
import shutil
import time
from io import StringIO

import requests

#### Web-scraping
When web-scraping, be sure to not overload the website by making too many requests. Be sure to minimize your number of requests and only take data from a limited number of pages.

In [3]:
# Seasons whose MVP voting we collect: 1991 through 2021.
# range() excludes its stop value, so the stop is one past the final season.
first_season, last_season = 1991, 2021
years = [season for season in range(first_season, last_season + 1)]

# Show the seasons on one line, separated by commas
print(", ".join(str(season) for season in years))

1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021


## Downloading MVP votes over the years

In [4]:
# URL template for the per-season awards page; "{}" is filled in with the season year
url_start = "https://www.basketball-reference.com/awards/awards_{}.html"

# Seconds to wait between requests so the website is not overloaded
# (see the web-scraping note earlier in this notebook)
request_delay = 3

for year in years:
    # Store url for that specific year in years
    url = url_start.format(year)

    # Get its data using the requests library.
    # The timeout stops a stalled connection from hanging the loop forever.
    data = requests.get(url, timeout=30)

    # Fail loudly on an HTTP error status instead of caching the error page,
    # which would only surface later as a confusing parsing failure
    data.raise_for_status()

    # Make sure the cache folder exists before writing into it
    cache_path = r"cache\mvp\{}.html".format(year)
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Write the page's HTML (returned by data.text) into the cache file for that year.
    # NOTE(review): the file is written with the platform default encoding; the cells that
    # read the cache open it the same way, so change both together if an encoding is pinned.
    with open(cache_path, "w") as f:
        f.write(data.text)

    # Pause before requesting the next page
    time.sleep(request_delay)

#### Parsing the MVP votes using BeautifulSoup

In [5]:
# Importing library to parse data
from bs4 import BeautifulSoup

Example to parse one page using bs4 by reading the file, then creating a parse object and finally getting rid of the over-header to easily import the data into pandas.

We first look through the `tr` (table row) elements, find the one with the class `over_header`, and remove it.

In [6]:
# Reading the cached page for the first year.
# The raw string uses single backslashes only, so it names exactly the file
# that the download loop wrote for 1991.
with open(r"cache\mvp\1991.html") as f:
    page = f.read()

# Creating parse object
soup = BeautifulSoup(page, 'html.parser')

# Finding and removing the over-header row so the table is easier to load into pandas.
# soup.find returns None when nothing matches, so only decompose a real match.
over_header = soup.find('tr', class_="over_header")
if over_header is not None:
    over_header.decompose()

In [7]:
# Tables on the website are identified by a unique id; the MVP table uses id="mvp".
# Collect the elements carrying that id and keep the first match.
mvp_matches = soup.find_all(id="mvp")
mvp_table = mvp_matches[0]

Note: `mvp_table` is the HTML table element that holds the details of every player who received MVP votes that year.

## Scraping the MVPs

#### Using pandas to load the data into a data-frame to perform analyses

In [8]:
import pandas as pd

In [9]:
# Convert the MVP table to an HTML string and let pandas parse it.
# pd.read_html returns a list of data frames (one per table it finds), so [0] picks
# the single table passed in here.
# The string is wrapped in StringIO because recent pandas versions deprecate passing
# literal HTML text directly to read_html.
mvp_1991 = pd.read_html(StringIO(str(mvp_table)))[0]

In [10]:
# Displaying the 1991 MVP voting data frame parsed from mvp_table
mvp_1991

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,19.4,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,25.6,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,27.6,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,29.0,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225
5,6,Clyde Drexler,28,POR,1.0,75.0,960,0.078,82,34.8,21.5,6.7,6.0,1.8,0.7,0.482,0.319,0.794,12.4,0.209
6,7,Kevin Johnson,24,PHO,0.0,32.0,960,0.033,77,36.0,22.2,3.5,10.1,2.1,0.1,0.516,0.205,0.843,12.7,0.22
7,8,Dominique Wilkins,31,ATL,0.0,29.0,960,0.03,81,38.0,25.9,9.0,3.3,1.5,0.8,0.47,0.341,0.829,11.4,0.177
8,9T,Larry Bird,34,BOS,0.0,25.0,960,0.026,60,38.0,19.4,8.5,7.2,1.8,1.0,0.454,0.389,0.891,6.6,0.14
9,9T,Terry Porter,27,POR,0.0,25.0,960,0.026,81,32.9,17.0,3.5,8.0,2.0,0.1,0.515,0.415,0.823,13.0,0.235


In [11]:
# Showing only the top row of the 1991 MVP table (Michael Jordan. Surprise, surprise...)
mvp_1991.head(1)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,31.5,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321


In [12]:
# Tag every row with its season so rows stay distinguishable once several years are combined
mvp_1991 = mvp_1991.assign(Year=1991)

In [13]:
# The first rows of the frame now end with the newly added "Year" column
mvp_1991.head()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991
2,3,David Robinson,25,SAS,6.0,476.0,960,0.496,82,37.7,...,13.0,2.5,1.5,3.9,0.552,0.143,0.762,17.0,0.264,1991
3,4,Charles Barkley,27,PHI,2.0,222.0,960,0.231,67,37.3,...,10.1,4.2,1.6,0.5,0.57,0.284,0.722,13.4,0.258,1991
4,5,Karl Malone,27,UTA,0.0,142.0,960,0.148,82,40.3,...,11.8,3.3,1.1,1.0,0.527,0.286,0.77,15.5,0.225,1991


The last couple of cells have just been an example of how to work with a year's data. Now, creating a list of data-frames to store data of the MVPs across the years.

In [14]:
# Parse every cached MVP page into its own data frame and collect them in a list
dfs = []

for year in years:
    cache_path = r"cache\mvp\{}.html".format(year)
    with open(cache_path) as f:
        page = f.read()

    # Performing all the steps for each table through the years
    soup = BeautifulSoup(page, 'html.parser')

    # Drop the over-header row when present; soup.find returns None when nothing matches
    over_header = soup.find('tr', class_="over_header")
    if over_header is not None:
        over_header.decompose()

    # Locate the MVP table and fail with a message that names the offending cache file
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        raise ValueError("No element with id 'mvp' found in {}".format(cache_path))

    # StringIO: recent pandas versions deprecate passing literal HTML text to read_html
    mvp_df = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp_df["Year"] = year
    dfs.append(mvp_df)

In [15]:
# Stack the per-year frames, in list order, into one master MVP data frame.
# `dfs` is rebuilt later in this notebook for other data, so run this cell
# right after the MVP parsing loop.
mvps = pd.concat(objs=dfs)

# Peek at the last rows of the combined frame
mvps.tail()

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.3,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021
14,13T,Kawhi Leonard,29,LAC,0.0,1.0,1010,0.001,52,34.1,...,6.5,5.2,1.6,0.4,0.512,0.398,0.885,8.8,0.238,2021


In [16]:
# Save the combined MVP data to mvps.csv so it can be reused later
mvps.to_csv(path_or_buf="mvps.csv")

## Scraping the details for all the players

Since our aim is to predict who will become the MVP, we need to take into consideration the data from all of the other players in the league.

Our new aim is to obtain all the players' data from all the years from 1991 to 2021 and map that data to our MVP data so we can create some training data for our machine learning model.


In [17]:
# URL template for the per-game player stats page; "{}" is replaced by the season year
player_stats_url = (
    "https://www.basketball-reference.com"
    "/leagues/NBA_{}_per_game.html"
)

In [18]:
# Seconds to wait between requests so the website is not overloaded
# (see the web-scraping note earlier in this notebook)
request_delay = 3

for year in years:
    url = player_stats_url.format(year)

    # The timeout stops a stalled connection from hanging the loop forever
    data = requests.get(url, timeout=30)

    # Fail loudly on an HTTP error status instead of caching the error page
    data.raise_for_status()

    # Raw string: "\p" and "\{" are not valid escape sequences in a normal string literal.
    # The resulting path is unchanged.
    cache_path = r"cache\player\{}.html".format(year)
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        # Make sure the cache folder exists before writing into it
        os.makedirs(cache_dir, exist_ok=True)

    # Downloading all the html files across the years
    with open(cache_path, "w") as f:
        f.write(data.text)

    # Pause before requesting the next page
    time.sleep(request_delay)

#### Using Selenium to work around JavaScript

The issue with scraping this table is that the website renders it using JavaScript, so the HTML returned by `requests` may not contain the complete table. To scrape a JavaScript-rendered page, we use the Selenium library.

In [19]:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# install selenium chrome driver from https://chromedriver.chromium.org/downloads

In [20]:
# Specify driver path
# The location can be overridden with the CHROMEDRIVER_PATH environment variable;
# when it is not set, the original local install path is used.
from selenium.webdriver.chrome.service import Service

chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\deshi\Downloads\chromedriver_win32\chromedriver.exe",
)

# The path is handed over through a Service object, which replaces the deprecated
# `executable_path` keyword argument of webdriver.Chrome
driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path))

  driver = webdriver.Chrome(


In [21]:
# Render each year's player stats page in the browser and cache the resulting html
try:
    # For each of the years in the list
    for year in years:
        # Get the url for this year
        url = player_stats_url.format(year)

        # Telling the driver to go and render that url in the browser
        driver.get(url)

        # Running JavaScript in the browser to scroll down, then pausing for two seconds
        # to give the page time to render the table
        driver.execute_script("window.scrollTo(1,10000)")
        time.sleep(2)

        # Raw string: "\p" and "\{" are not valid escape sequences in a normal string literal.
        # The resulting path is unchanged.
        cache_path = r"cache\player\{}.html".format(year)
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        # Getting html data for every player in the league at that point and saving it in the cache by year
        with open(cache_path, "w+") as f:
            f.write(driver.page_source)
finally:
    # Always shut the browser session down, even when a page fails, so that no browser
    # process is left running. Re-run the driver cell before running this cell again.
    driver.quit()

WebDriverException: Message: chrome not reachable
  (Session info: chrome=100.0.4896.127)
Stacktrace:
Backtrace:
	Ordinal0 [0x009C7413+2389011]
	Ordinal0 [0x00959F61+1941345]
	Ordinal0 [0x0084C520+836896]
	Ordinal0 [0x00840682+788098]
	Ordinal0 [0x008413FF+791551]
	Ordinal0 [0x00842752+796498]
	Ordinal0 [0x0083C0D9+770265]
	Ordinal0 [0x0084D9D0+842192]
	Ordinal0 [0x008A3AE2+1194722]
	Ordinal0 [0x00893F66+1130342]
	Ordinal0 [0x0086E546+976198]
	Ordinal0 [0x0086F456+980054]
	GetHandleVerifier [0x00B79632+1727522]
	GetHandleVerifier [0x00C2BA4D+2457661]
	GetHandleVerifier [0x00A5EB81+569713]
	GetHandleVerifier [0x00A5DD76+566118]
	Ordinal0 [0x00960B2B+1968939]
	Ordinal0 [0x00965988+1989000]
	Ordinal0 [0x00965A75+1989237]
	Ordinal0 [0x0096ECB1+2026673]
	BaseThreadInitThunk [0x75A96739+25]
	RtlGetFullPathName_UEx [0x77C98E7F+1215]
	RtlGetFullPathName_UEx [0x77C98E4D+1165]


Now, like before, creating a list of data frames to store all of the data of all of the players in the league from 1991 - 2021 and merging it all together into one big data frame called players.

Fun fact: The following cell takes the longest to execute with an avg time of execution of 1:10.0 minutes due to the sheer size of the data.

In [22]:
# Parse every cached player stats page into its own data frame
dfs = []
for year in years:
    # Raw string: "\p" and "\{" are not valid escape sequences in a normal string literal.
    # The resulting path is unchanged.
    cache_path = r"cache\player\{}.html".format(year)
    with open(cache_path) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first row with class "thead" found in the page, if there is one.
    # NOTE(review): only that first match is removed; any further "thead" rows stay in
    # the parsed data - confirm that the later cleaning step expects that.
    header_row = soup.find('tr', class_="thead")
    if header_row is not None:
        header_row.decompose()

    # Locate the per-game stats table and fail with a message that names the offending cache file
    player_table = soup.find(id="per_game_stats")
    if player_table is None:
        raise ValueError(
            "No element with id 'per_game_stats' found in {}".format(cache_path)
        )

    # StringIO: recent pandas versions deprecate passing literal HTML text to read_html
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df["Year"] = year
    dfs.append(player_df)

In [23]:
# Stack the per-year player frames, in list order, into one big data frame.
# `dfs` is reused elsewhere in this notebook, so run this right after the player parsing loop.
players = pd.concat(objs=dfs)

In [24]:
# Previewing the first rows of the combined players data frame
players.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991
1,2,Mahmoud Abdul-Rauf,PG,21,DEN,67,19,22.5,6.2,15.1,...,0.5,1.3,1.8,3.1,0.8,0.1,1.6,2.2,14.1,1991
2,3,Mark Acres,C,28,ORL,68,0,19.3,1.6,3.1,...,2.1,3.2,5.3,0.4,0.4,0.4,0.6,3.2,4.2,1991
3,4,Michael Adams,PG,28,DEN,66,66,35.5,8.5,21.5,...,0.9,3.0,3.9,10.5,2.2,0.1,3.6,2.5,26.5,1991
4,5,Mark Aguirre,SF,31,DET,78,13,25.7,5.4,11.7,...,1.7,3.1,4.8,1.8,0.6,0.3,1.6,2.7,14.2,1991


In [25]:
# Save the combined player data to players.csv
players.to_csv(path_or_buf="players.csv")

## Scraping the details for all the teams

All of the code for extracting, storing and compiling the data is the same as before, since it's the same process we used to extract the previous data. The only things that change are the url and the method of scraping - since there are two tables (one per conference) that need to be scraped.

In [26]:
# URL template for the per-season standings page; "{}" is replaced by the season year
team_stats_url = (
    "https://www.basketball-reference.com"
    "/leagues/NBA_{}_standings.html"
)

In [27]:
# Seconds to wait between requests so the website is not overloaded
# (see the web-scraping note earlier in this notebook)
request_delay = 3

for year in years:
    url = team_stats_url.format(year)

    # The timeout stops a stalled connection from hanging the loop forever
    data = requests.get(url, timeout=30)

    # Fail loudly on an HTTP error status instead of caching the error page
    data.raise_for_status()

    # Make sure the cache folder exists before writing into it
    cache_path = r"cache\team\{}.html".format(year)
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    # Store the standings page html in the cache by year
    with open(cache_path, "w") as f:
        f.write(data.text)

    # Pause before requesting the next page
    time.sleep(request_delay)

In [28]:
# Here, the division standings are in the form of two tables:
# one for the Eastern conference and one for the Western conference.
# Each entry is (id of the html table, name of the column that holds the team names).
conference_tables = [
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
]

dfs = []
for year in years:
    cache_path = r"cache\team\{}.html".format(year)
    with open(cache_path) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first row with class "thead" found in the page, if there is one.
    # NOTE(review): only that first match is removed; any further "thead" rows stay in
    # the parsed data - confirm that the later cleaning step expects that.
    header_row = soup.find('tr', class_="thead")
    if header_row is not None:
        header_row.decompose()

    # Same steps for both conferences, Eastern first and then Western
    for table_id, name_column in conference_tables:
        conference_table = soup.find(id=table_id)
        if conference_table is None:
            raise ValueError(
                "No element with id '{}' found in {}".format(table_id, cache_path)
            )

        # StringIO: recent pandas versions deprecate passing literal HTML text to read_html
        conference_df = pd.read_html(StringIO(str(conference_table)))[0]
        conference_df["Year"] = year

        # Move the team names out of the conference-named column into a shared "Team"
        # column; pop() removes the old column, so "Team" ends up last, after "Year"
        conference_df["Team"] = conference_df.pop(name_column)
        dfs.append(conference_df)

In [29]:
# Stack the per-year, per-conference frames, in list order, into one teams data frame.
# `dfs` is reused elsewhere in this notebook, so run this right after the team parsing loop.
teams = pd.concat(objs=dfs)

In [30]:
# Displaying the combined teams data frame
teams

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,.683,—,111.5,105.7,5.22,1991,Boston Celtics*
1,44,38,.537,12.0,105.4,105.6,-0.39,1991,Philadelphia 76ers*
2,39,43,.476,17.0,103.1,103.3,-0.43,1991,New York Knicks*
3,30,52,.366,26.0,101.4,106.4,-4.84,1991,Washington Bullets
4,26,56,.317,30.0,102.9,107.5,-4.53,1991,New Jersey Nets
...,...,...,...,...,...,...,...,...,...
13,42,30,.583,—,112.4,110.2,2.26,2021,Dallas Mavericks*
14,38,34,.528,4.0,113.3,112.3,1.07,2021,Memphis Grizzlies*
15,33,39,.458,9.0,111.1,112.8,-1.58,2021,San Antonio Spurs
16,31,41,.431,11.0,114.6,114.9,-0.20,2021,New Orleans Pelicans


In [31]:
# Save the combined team data to teams.csv
teams.to_csv(path_or_buf="teams.csv")

##### *We've now acquired all the data we needed from the website. I.e., the data for MVPs, Players and Teams over the years 1991-2021 in the league. We've stored them in .csv files so we don't need to keep scraping the website over and over again thereby minimizing the number of requests made to the site and improving the efficiency of the code.*