In [15]:
# Pre-scraping for basketball-reference: build the "<year>_games-<month>" page tags.

# Years 2010 through 2019, and the month pages wanted for each of them.
years = list(range(2010, 2020))
months = [
    "games-october",
    "games-november",
    "games-december",
    "games-january",
    "games-february",
    "games-march",
]

# One tag per (year, month) pair in year-major order, followed by the 2022 pages.
year_months = [f"{year}_{month}" for year in years for month in months]
year_months += [f"2022_{month}" for month in months]

# Drop October and November of 2012 because of the lockout season.
for lockout_tag in ("2012_games-october", "2012_games-november"):
    year_months.remove(lockout_tag)

NOTES:

Season 2019-2020: Irregular because of the COVID season, so we'll use every month before March. The October page also has an unusual tag: 2020_games-october-2019

Season 2020-2021: Irregular because the season did not start until December. The playoffs started in May, so only add December-April.

Season 2011-2012: Lockout-shortened season; there were no October or November games.

Every other season: Scrape October-March

In [3]:
# Scraping the data from basketball-reference
import requests

url_start = "https://www.basketball-reference.com/leagues/NBA_{}.html"

# Download one schedule page per tag in year_months and save the raw HTML
# under Game_Results/ as "<tag>.html".
for year_month in year_months:
    url = url_start.format(year_month)
    # A timeout keeps a single stalled request from hanging the whole loop.
    data = requests.get(url, timeout=30)
    # Stop on an HTTP error status (404, 429, ...) instead of silently saving
    # an error page that would only fail later when the table is parsed.
    data.raise_for_status()
    with open("Game_Results/{}.html".format(year_month), "w") as f:
        f.write(data.text)

In [16]:
from bs4 import BeautifulSoup
import pandas as pd

In [24]:
# Grabbing the tables out of the scraped HTML files
from io import StringIO

for year_month in year_months:
    with open("Game_Results/{}.html".format(year_month)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")

    # Look for the id "schedule" that contains the data
    schedule_table = soup.find_all(id="schedule")
    # pandas 2.1+ deprecates passing literal HTML text to read_html, so the
    # markup is wrapped in a file-like object first.
    schedule = pd.read_html(StringIO(str(schedule_table)))[0]

    # The file is written without a ".csv" suffix; the renaming step that
    # follows parses this bare "<year>_games-<month>" name.
    schedule.to_csv("Game_Results_CSV/{}".format(year_month))
    

In [25]:
import os

path = "/Users/Deen/Desktop/Neural_Nets/Final_Project/Game_Results_CSV/"

files = os.listdir(path)

# Month name used in the scraped file name -> number used in the new file name.
month_numbers = {
    "october": 1,
    "november": 2,
    "december": 3,
    "january": 4,
    "february": 5,
    "march": 6,
}

# Renaming the files to a simpler format of "season_month"
for file in files:
    # Break "<year>_games-<month>" into its components. Anything that does not
    # have that shape (".DS_Store", files already renamed by an earlier run,
    # unknown month names) is left untouched rather than crashing the loop or
    # being deleted by mistake.
    year_text, _, month_tag = file.partition("_")
    tag_parts = month_tag.split("-")
    if len(year_text) != 4 or not year_text.isdecimal():
        continue
    if len(tag_parts) < 2 or tag_parts[1] not in month_numbers:
        continue

    # The year in the file name is the later year of the season (2010 -> 2009-10).
    season_end = int(year_text)
    month = month_numbers[tag_parts[1]]

    # We remove october and november, because there have not been enough games played yet to predict these outcomes
    if month in (1, 2):
        os.remove(path + file)

    # We removed december of the lockout season, because there were not enough games played yet to predict these outcomes
    elif season_end == 2012 and month == 3:
        os.remove(path + file)

    # Rename everything else per the new naming convention, e.g. "2009-10_3.csv"
    else:
        season = "{}-{:02d}".format(season_end - 1, season_end % 100)
        # The destination is prefixed with `path`; a bare file name would be
        # resolved against the current working directory and move the file
        # out of this folder.
        os.rename(path + file, path + season + "_" + str(month) + ".csv")

# The files are renamed in place, so they stay inside the "Game_Results_CSV"
# folder and no manual move is needed afterwards.

In [5]:
from selenium import webdriver

In [6]:
# Create the Safari WebDriver instance; the nba.com scraping cell uses it to
# load each stats page (driver.get) and read the rendered driver.page_source.
driver = webdriver.Safari()

In [20]:
# Pre-scraping process for nba.com (building the list of years we will look for)

# Ten season labels counting back from "2018-19" to "2009-10", then "2021-22".
nba_years = ["{}-{}".format(2018 - offset, 19 - offset) for offset in range(10)]
nba_years += ["2021-22"]

['2018-19', '2017-18', '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12', '2010-11', '2009-10', '2021-22']


In [17]:
import time

url_start = "https://www.nba.com/stats/teams/traditional/?sort=MIN&dir=-1&Season={}&SeasonType=Regular%20Season&Month="

# Save one rendered stats page per (season, month) under Monthly_Stats_New/.
for year in nba_years:
    # 2011-12 is a special year because of the lockout season, so only months
    # 3-6 are requested for it; every other season gets months 1-6.
    first_month = 3 if year == "2011-12" else 1

    for month_number in range(first_month, 7):
        url = url_start.format(year) + str(month_number)
        driver.get(url)
        driver.execute_script("window.scrollTo(1,10000)")
        # Pause for two seconds before reading the page source.
        time.sleep(2)
        html = driver.page_source
        with open("Monthly_Stats_New/{}_".format(year) + str(month_number) + ".html", "w+") as f:
            f.write(html)

In [26]:
# Grabbing the tables out of the scraped HTML files (EXCLUDING OCTOBER)
from io import StringIO


def months_with_usable_pages(season):
    """Return the month numbers whose saved page should be parsed for `season`.

    - "2011-12": months 3-6, the only months scraped for that season.
    - "2012-13": months 2-5, because March (month 6) did not scrape properly.
    - any other season: months 2-6.

    Month 1 (October) is never included.
    """
    if season == "2011-12":
        return range(3, 7)
    if season == "2012-13":
        return range(2, 6)
    return range(2, 7)


def export_monthly_stats(season, month_number):
    """Parse one saved nba.com page and write its first table to Monthly_Stats_CSV/.

    Reads "Monthly_Stats_New/<season>_<month_number>.html" and writes
    "Monthly_Stats_CSV/<season>_<month_number>.csv".
    """
    with open("Monthly_Stats_New/{}_{}.html".format(season, month_number)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    stats_table = soup.find_all(class_="nba-stat-table__overflow")
    # pandas 2.1+ deprecates passing literal HTML text to read_html, so the
    # markup is wrapped in a file-like object first.
    stats = pd.read_html(StringIO(str(stats_table)))[0]

    stats.to_csv("Monthly_Stats_CSV/{}_{}.csv".format(season, month_number))


for year in nba_years:
    for month_number in months_with_usable_pages(year):
        export_monthly_stats(year, month_number)

In [27]:
path = "/Users/Deen/Desktop/Neural_Nets/Final_Project/Monthly_Stats_CSV/"

files = os.listdir(path)

for file in files:
    # Expected name: "<season>_<month>.csv", e.g. "2018-19_6.csv". Anything
    # else in the folder (".DS_Store", stray files) is skipped instead of
    # crashing the loop with an IndexError/ValueError.
    stem, extension = os.path.splitext(file)
    season, _, month_text = stem.rpartition("_")
    if extension != ".csv" or not season or not month_text.isdecimal():
        continue
    month = int(month_text)

    # Remove all the last months because they won't be predicting anything!
    if month == 6:
        print(month)
        os.remove(path + file)

6
6
6
6
6
6
6
6
6
6


We now have game data for 2009-2022 (excluding the COVID years). We will want to combine these dataframes, but first we should clean them up. First, I don't think the month of October should be used to predict anything: so few games are played that month that there is a high potential for noise. Thus, we delete all CSVs from October.