This Jupyter notebook scrapes Basketball Reference to find the date of each team's last game of the season. The last game differs from team to team because some teams miss the playoffs and the teams that qualify are eliminated at different times. These dates can be used to estimate the length of an injury when a player is injured during the season and does not return before their team's season ends.

In [80]:
import math
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from unidecode import unidecode
from tqdm import tqdm
import datetime

In [4]:
def getMonthSoup(season, month, timeout = 30):
    """Download and parse one month of a season's schedule from Basketball Reference.

    Parameters
    ----------
    season : int or str
        Season identifier interpolated into the URL (e.g. 2023).
    month : str
        Month slug interpolated into the URL (e.g. 'june' or 'october-2020').
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed HTML of the requested schedule page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games-{month}.html"
    response = requests.get(url, timeout = timeout)
    # Fail here, with the status code in the message, rather than parsing an
    # error page and crashing later when the expected table is missing.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def getTable(soup, tableid):
    """Look up a <table> element by its ``id`` attribute.

    Returns whatever ``soup.find`` yields for a 'table' tag whose id equals
    ``tableid``.
    """
    id_filter = {'id' : tableid}
    return soup.find('table', id_filter)

def getTeamName(string):
    """Reduce a full team name to its nickname.

    The nickname is the text after the last space, except for names
    containing 'Trail Blazers', the one two-word nickname handled here.
    """
    two_word_nickname = 'Trail Blazers'
    if two_word_nickname in string:
        return two_word_nickname
    # rpartition gives the text after the final space, or the whole string
    # when it contains no space at all.
    _, _, nickname = string.rpartition(' ')
    return nickname

In [84]:
def toDF(gamesDict):
    """Turn a {team: date} mapping into a two-column DataFrame.

    The keys go into the 'Team' column and the values into the 'Last Game'
    column, in the mapping's iteration order.
    """
    team_names = list(gamesDict.keys())
    game_dates = list(gamesDict.values())
    return pd.DataFrame({'Team' : team_names, 'Last Game' : game_dates})

def getLastGames(season, reverse_months, num_teams = 30, delay = 3):
    """Find the day after each team's final game of a season.

    The monthly schedule pages are scanned from the latest month backwards,
    and each page from its last row upwards, so the first time a team name is
    seen corresponds to its last listed game.

    Parameters
    ----------
    season : int or str
        Season identifier passed through to ``getMonthSoup``.
    reverse_months : iterable of str
        Month slugs ordered from latest to earliest (e.g. ['june', 'may', 'april']).
    num_teams : int, optional
        Number of distinct teams to look for; scanning stops once this many
        have been found.
    delay : float, optional
        Seconds to pause after each page download.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team' (nickname as produced by ``getTeamName``) and
        'Last Game' (a ``datetime.date`` one day after the team's last listed game).

    Raises
    ------
    ValueError
        If a downloaded page does not contain a table with id 'schedule'.
    """
    lastGames = {}
    for m in reverse_months:
        # Once every team has a date, earlier months cannot change the result,
        # so skip the remaining downloads (and their pauses) entirely.
        if len(lastGames) >= num_teams:
            break
        s = getMonthSoup(season, m)
        time.sleep(delay)
        t = getTable(s, 'schedule')
        if t is None:
            raise ValueError(f"No 'schedule' table found for season {season}, month {m!r}")
        # Collect the rows once and walk them bottom-up instead of re-querying
        # the table on every iteration.
        for row in reversed(t.find_all('tr')):
            # Rows without any <td> cell carry no game data (presumably header rows).
            if row.find('td') is None:
                continue
            date = row.find('th', {'data-stat' : 'date_game'}).text
            for column in ('home_team_name', 'visitor_team_name'):
                team = row.find('td', {'data-stat' : column}).text
                # Keep only the first date seen for a team, i.e. its latest game.
                lastGames.setdefault(team, date)
            if len(lastGames) >= num_teams:
                break
    df = toDF(lastGames)
    df['Team'] = df['Team'].apply(getTeamName)
    # Shift by one day while the column is still datetime64, then reduce to
    # plain dates; this avoids arithmetic on an object column of date values.
    df['Last Game'] = (pd.to_datetime(df['Last Game']) + pd.Timedelta(days = 1)).dt.date
    return df

In [87]:
# Month pages to scan for each season, latest month first. The 2020 and 2021
# seasons use their own lists (presumably because those seasons ran on a
# shifted calendar); every other season uses the default list.
default_months = ['june', 'may', 'april']
special_months = {
    2020 : ['october-2020', 'september', 'august', 'march'],
    2021 : ['july', 'june', 'may'],
}

df_list = []
for season in tqdm(range(2019, 2024)):
    months = special_months.get(season, default_months)
    df_list.append(getLastGames(season, months))

# Stack the per-season tables into one frame.
last_df = pd.concat(df_list)

100%|█████████████████████████████████████████████| 5/5 [00:54<00:00, 10.89s/it]


In [90]:
# Uncomment to save the last-game dates to disk.
# last_df.to_csv('last_games.csv', index = False)

# Old approach: playoff round reached by each team

In [117]:
def getSeasonSoup(season, playoffs = True, timeout = 30):
    """Download and parse a season summary page from Basketball Reference.

    Parameters
    ----------
    season : int or str
        Season identifier interpolated into the URL (e.g. 2023).
    playoffs : bool, optional
        When True (the default) fetch the playoffs page for the season;
        otherwise fetch the league page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed HTML of the requested page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    section = 'playoffs' if playoffs else 'leagues'
    url = f"https://www.basketball-reference.com/{section}/NBA_{season}.html"
    response = requests.get(url, timeout = timeout)
    # Fail here, with the status code in the message, rather than parsing an
    # error page and crashing later when the expected table is missing.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def getTable(soup, tableid):
    """Return the 'table' tag in ``soup`` whose id attribute is ``tableid``.

    NOTE: this repeats the ``getTable`` definition from earlier in the
    notebook; running this cell simply rebinds the same name.
    """
    wanted_attrs = {'id' : tableid}
    table = soup.find('table', wanted_attrs)
    return table

def playoffResults(start_yr, end_yr):
    """Collect the teams appearing in each playoff series for a range of seasons.

    For every season from ``start_yr`` to ``end_yr`` (inclusive) the playoffs
    page is downloaded and its 'all_playoffs' table is scanned. Each row with
    a <strong> label contributes one record per linked team in that row.

    Parameters
    ----------
    start_yr : int
        First season to scrape.
    end_yr : int
        Last season to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Season' and 'Round', in page order within each season.

    Raises
    ------
    ValueError
        If a season's page does not contain a table with id 'all_playoffs'.
    """
    teams = []
    rounds = []
    seasons = []
    for i in tqdm(range(start_yr, end_yr + 1)):
        soup = getSeasonSoup(i)
        # Pause between requests so the site is not hit in rapid succession.
        time.sleep(3)
        table = getTable(soup, tableid = 'all_playoffs')
        if table is None:
            raise ValueError(f"No 'all_playoffs' table found for season {i}")
        for row in table.find_all('tr'):
            # Only rows with a <strong> element are used; its text is the round name.
            label = row.find('strong')
            if label is None:
                continue
            rd = label.text
            for data in row.find_all('td'):
                for link in data.find_all('a'):
                    # The 'Series Stats' link sits next to the team links and is not a team.
                    if link.text == 'Series Stats':
                        continue
                    teams.append(link.text)
                    rounds.append(rd)
                    seasons.append(i)
    data = {'Team' : teams, 'Season' : seasons, 'Round' : rounds}
    return pd.DataFrame(data)

In [112]:
# Scrape the playoff series for the 2019 through 2023 seasons (inclusive),
# then display the resulting table.
results = playoffResults(start_yr = 2019, end_yr = 2023)
results

100%|█████████████████████████████████████████████| 5/5 [00:17<00:00,  3.44s/it]


Unnamed: 0,Team,Season,Round
0,Toronto Raptors,2019,Finals
1,Golden State Warriors,2019,Finals
2,Toronto Raptors,2019,Eastern Conference Finals
3,Milwaukee Bucks,2019,Eastern Conference Finals
4,Golden State Warriors,2019,Western Conference Finals
...,...,...,...
145,Sacramento Kings,2023,Western Conference First Round
146,Los Angeles Lakers,2023,Western Conference First Round
147,Memphis Grizzlies,2023,Western Conference First Round
148,Phoenix Suns,2023,Western Conference First Round


In [118]:
# Fetch the 2023 league page (not the playoffs page) and pull out the table
# with id 'per_game-team'.
# NOTE: despite its name, teams_df holds whatever getTable returns (a parsed
# HTML element), not a pandas DataFrame.
teams_soup = getSeasonSoup(season = 2023, playoffs = False)
teams_df = getTable(soup = teams_soup, tableid = 'per_game-team')

In [130]:
# Gather the link text of every 'team' cell that contains a link (cells
# without one are skipped), sorted alphabetically.
team_links = (cell.find('a') for cell in teams_df.find_all('td', {'data-stat' : 'team'}))
nba_teams = sorted(link.text for link in team_links if link is not None)

In [133]:
# Build one row per (season, team): the round recorded for the team's first
# appearance in that season's results, or 'Missed Playoffs' when the team
# does not appear at all.
all_years = results['Season'].unique()
new_data = {'Team' : [], 'Season' : [], 'Round' : []}
for season in all_years:
    season_rows = results[results['Season'] == season]
    # Map each team to the round of its first row for this season.
    first_round_seen = {}
    for team_name, round_name in zip(season_rows['Team'], season_rows['Round']):
        first_round_seen.setdefault(team_name, round_name)
    for team in nba_teams:
        new_data['Team'].append(team)
        new_data['Season'].append(season)
        new_data['Round'].append(first_round_seen.get(team, 'Missed Playoffs'))
teamResults = pd.DataFrame(new_data)

In [139]:
def convertRound(rd):
    """Strip the conference qualifier from a round name.

    Every occurrence of 'Eastern ' and then of 'Western ' is removed, so for
    example 'Eastern Conference Finals' becomes 'Conference Finals'.
    """
    for conference_prefix in ('Eastern ', 'Western '):
        rd = rd.replace(conference_prefix, '')
    return rd

In [141]:
# Drop the 'Eastern ' / 'Western ' qualifier from every round label, then
# display the table. Re-running this cell is harmless: labels that are already
# stripped pass through convertRound unchanged.
teamResults = teamResults.assign(Round = teamResults['Round'].apply(convertRound))
teamResults

Unnamed: 0,Team,Season,Round
0,Atlanta Hawks,2019,Missed Playoffs
1,Boston Celtics,2019,Conference Semifinals
2,Brooklyn Nets,2019,Conference First Round
3,Charlotte Hornets,2019,Missed Playoffs
4,Chicago Bulls,2019,Missed Playoffs
...,...,...,...
145,Sacramento Kings,2023,Conference First Round
146,San Antonio Spurs,2023,Missed Playoffs
147,Toronto Raptors,2023,Missed Playoffs
148,Utah Jazz,2023,Missed Playoffs


In [145]:
# Uncomment to save the per-team playoff results to disk.
# teamResults.to_csv('playoff_results.csv', index = False)