In [2]:
#install unidecode library to remove discrepant text characters
#while handling non-ASCII characters to translate from website to a pandas dataframe
!pip install unidecode


Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [3]:
#get useful libraries
import numpy as np
import pandas as pd
import random
import time
from unidecode import unidecode

In [4]:
#list of the 30 lowercase team abbreviations used in basketball-reference urls
#(written as one whitespace-separated string and split into a list)
teams = """
    atl bos brk cho chi cle dal den det gsw
    hou ind lac lal mem mia mil min nop nyk
    okc orl phi pho por sac sas tor uta was
""".split()
len(teams)

30

In [5]:
#list of season years (as strings, ready to be concatenated into a url) to iterate through while web scraping
#range's upper bound is exclusive, so this yields '2015' through '2023'
seasons = [str(year) for year in range(2015, 2024)]
len(seasons)

9

In [6]:
#list of stat column names as they appear in the basketball-reference table
stats = (
    ['FG', 'FGA', 'FG%']
    + ['3P', '3PA', '3P%']
    + ['FT', 'FTA', 'FT%']
    + ['ORB', 'TRB', 'AST']
    + ['STL', 'BLK', 'TOV', 'PF']
)

#rename maps that split the stats into team and opponent columns:
#  team columns keep the plain stat name as key      -> 'Tm_<stat>'
#  opponent columns are keyed by '<stat>.1'          -> 'Opp_<stat>'
tm_stats_dict = dict(zip(stats, (f'Tm_{name}' for name in stats)))

opp_stats_dict = {}
for name in stats:
    opp_stats_dict[f'{name}.1'] = f'Opp_{name}'

In [7]:
#Scrape the basic game-log table for every (season, team) pair from basketball-reference,
#clean each table, and combine them into a single dataframe named nba_df.
#Relies on teams, seasons, tm_stats_dict and opp_stats_dict defined in earlier cells.

#one cleaned dataframe per team/season page; they are concatenated once after the loop
#(concatenating onto a growing dataframe inside the loop re-copies all earlier rows every time)
team_frames = []

#iterate through all teams for each season
for season in seasons:
    for team in teams:
        #url containing gamelog data for this team/season
        url = 'https://www.basketball-reference.com/teams/' + team + '/' + season + '/gamelog/'
        print(url) #track progress

        #read html table having the gamelog; header=1 takes the column names from the table's second row
        team_df = pd.read_html(url, header=1, attrs={'id':'tgl_basic'})[0]

        #keep only rows whose 'Rk' value parses as a number; everything else is dropped
        #(presumably repeated header rows / blank separator rows inside the table - confirm against the page)
        #to_numeric with errors='coerce' works whether 'Rk' was parsed as text or as numbers and
        #always gives a clean boolean mask. The previous check compared the .str accessor itself
        #to '' (a constant True, so it filtered nothing) and .str.isnumeric() is not usable on a
        #numeric column and yields NaN for non-string cells.
        is_game_row = pd.to_numeric(team_df['Rk'], errors='coerce').notna()
        team_df = team_df[is_game_row]

        #remove irrelevant columns
        team_df = team_df.drop(columns=['Rk', 'Unnamed: 24'])

        #rename confusing columns for clarity
        team_df = team_df.rename(columns={'Unnamed: 3': 'Home', 'Tm':'Tm_Pts', 'Opp.1':'Opp_Pts'})
        team_df = team_df.rename(columns=tm_stats_dict)
        team_df = team_df.rename(columns=opp_stats_dict)

        #make the 'home' column binary (0 when the cell is '@', 1 for anything else including blanks)
        team_df['Home'] = (team_df['Home'] != '@').astype(int)

        #add season and team columns at beginning of dataframe
        team_df.insert(loc=0, column='Season', value=season)
        team_df.insert(loc=1, column='Team', value=team.upper())

        #hold on to this team's game logs for the final concatenation
        team_frames.append(team_df)

        #random 4-6 second pause between page requests
        time.sleep(random.randint(4,6))

#combine all game logs into the main dataframe (empty dataframe if nothing was scraped)
nba_df = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

print(nba_df)

https://www.basketball-reference.com/teams/atl/2015/gamelog/
https://www.basketball-reference.com/teams/bos/2015/gamelog/
https://www.basketball-reference.com/teams/brk/2015/gamelog/
https://www.basketball-reference.com/teams/cho/2015/gamelog/
https://www.basketball-reference.com/teams/chi/2015/gamelog/
https://www.basketball-reference.com/teams/cle/2015/gamelog/
https://www.basketball-reference.com/teams/dal/2015/gamelog/
https://www.basketball-reference.com/teams/den/2015/gamelog/
https://www.basketball-reference.com/teams/det/2015/gamelog/
https://www.basketball-reference.com/teams/gsw/2015/gamelog/
https://www.basketball-reference.com/teams/hou/2015/gamelog/
https://www.basketball-reference.com/teams/ind/2015/gamelog/
https://www.basketball-reference.com/teams/lac/2015/gamelog/
https://www.basketball-reference.com/teams/lal/2015/gamelog/
https://www.basketball-reference.com/teams/mem/2015/gamelog/
https://www.basketball-reference.com/teams/mia/2015/gamelog/
https://www.basketball-r

In [8]:
#write the combined game logs to a csv file, leaving out the dataframe's row index
output_csv = 'nbaGamelogs.csv'
nba_df.to_csv(path_or_buf=output_csv, index=False)