In [4]:
# Loading the relevant packages

import os
from io import StringIO
from random import randint
from time import sleep

import pandas as pd
import requests
from basketball_reference_web_scraper import client

In [5]:
# Setting up the relative path of the data folder

data_folder = '../data'
# exist_ok=True turns this into a no-op when the folder already exists, so the
# cell can be re-run safely without a separate check-then-create step. If the
# path exists but is a regular file, this now fails here instead of later at
# the first to_csv call.
os.makedirs(data_folder, exist_ok=True)

In [6]:
# Scraping the basketball-references website for the MVP statistics from 1980 till 2023

def get_mvp_candidates(year, timeout=30):
    """Download the awards page for one season and return its first table.

    Args:
        year: Season end year inserted into the basketball-reference awards URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The first table parsed from the page (presumably the MVP voting table;
        confirm against the live page) with the top header level dropped and a
        ``year`` column added, or ``None`` when the download or the parsing fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    url = f"https://www.basketball-reference.com/awards/awards_{year}.html#mvp"

    sleep(randint(5, 9))  # Add a random delay between requests to not get rate limited

    try:
        # Without a timeout a stalled connection would hang the whole scraping loop.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # read_html expects a path or file-like object; passing the raw HTML
        # string directly is deprecated in recent pandas versions.
        tables = pd.read_html(StringIO(response.text))
        mvp_candidate_table = tables[0].droplevel(level=0, axis=1)
        mvp_candidate_table['year'] = year
        return mvp_candidate_table

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {year}: {e}")
        return None

    except ValueError as e:
        # Raised by read_html when the page has no tables, and by droplevel
        # when the table header has only one level.
        print(f"Error parsing data for {year}: {e}")
        return None

# Seasons to scrape: end years 1980 through 2023 (the stop value of range is excluded)
years = range(1980, 2024)

# Gather one table per season, leaving out the seasons whose download failed
concatenated_tables = []
for year in years:
    mvp_candidates = get_mvp_candidates(year)
    if mvp_candidates is None:
        continue
    concatenated_tables.append(mvp_candidates)

In [7]:
# Saving the dataset to the data folder


# Decide where the CSV goes, then stack the per-season tables and write them out
mvp_csv_path = os.path.join(data_folder, 'mvp_candidates.csv')
mvp_table = pd.concat(concatenated_tables)
mvp_table.to_csv(mvp_csv_path, index=False)

In [8]:
# Scraping the team statistics from 1980 till 2023

def get_team_stats(year, timeout=30):
    """Download one season's standings page and return both conference tables combined.

    The first two tables on the page are treated as the Eastern and Western
    Conference standings (in that order). Each is cleaned with
    ``cleaning_conference_standings`` and gets a ``seed`` column: the rank of
    the team's wins within its conference, most wins first. Tied teams share
    the average of their ranks (pandas' default ranking method).

    Args:
        year: Season end year inserted into the standings URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with the rows of both conferences, or ``None`` when the
        download fails or the page does not contain two parseable tables.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html#all_confs_standings_E%22"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    sleep(randint(5, 9))  # Add a random delay between requests to not get rate limited by the website

    try:
        # Without a timeout a stalled connection would hang the whole scraping loop.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # read_html expects a path or file-like object; passing the raw HTML
        # string directly is deprecated in recent pandas versions.
        tables = pd.read_html(StringIO(response.text))

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {year}: {e}")
        return None

    except ValueError as e:
        # read_html raises ValueError when the page contains no tables.
        print(f"Error parsing data for {year}: {e}")
        return None

    if len(tables) < 2:
        print(f"Error parsing data for {year}: expected 2 conference tables, found {len(tables)}")
        return None

    conference_standings = []
    for conference_table in tables[:2]:
        standing = cleaning_conference_standings(conference_table, year)
        # Division header rows (when present) make pandas read 'W' as text;
        # convert before ranking so wins are compared as numbers, not strings.
        wins = pd.to_numeric(standing['W'], errors='coerce')
        standing['seed'] = wins.rank(ascending=False)
        conference_standings.append(standing)

    return pd.concat(conference_standings)
    
def cleaning_conference_standings(team_df, year):
    """Tidy one conference standings table.

    The first column is renamed to ``team``, rows whose team text contains
    'Division' are removed, asterisks are stripped from the team names and a
    ``year`` column is added. The input frame is left untouched.

    Args:
        team_df: Standings table whose first column holds the team names.
        year: Value written to the new ``year`` column.

    Returns:
        A new, independent DataFrame with the cleaned rows.
    """
    team_df = team_df.rename(columns={team_df.columns[0]: "team"})

    # na=False: a missing team name counts as "not a division header" instead
    # of producing a NaN mask entry that cannot be negated. Such rows are kept.
    is_division_header = team_df['team'].str.contains('Division', na=False)

    # .copy() gives an independent frame so the assignments below do not write
    # into a slice of the renamed table (chained-assignment warning).
    team_df = team_df[~is_division_header].copy()
    team_df['team'] = team_df['team'].str.replace("*", "", regex=False)
    team_df['year'] = year
    return team_df

# Collect one standings table per season; seasons for which get_team_stats
# returned None are left out.
concatenated_tables = []
for year in years:
    team_standings = get_team_stats(year)
    # Test the value fetched in this iteration, not a variable left over from
    # another cell, so the filter reflects this season's download.
    if team_standings is not None:
        concatenated_tables.append(team_standings)

In [9]:
# Saving the dataset to the data folder

# Build the output path first, then combine every season's standings and export them
team_standings_csv = os.path.join(data_folder, 'team_standings.csv')
team_standings = pd.concat(concatenated_tables)
team_standings.to_csv(team_standings_csv, index=False)

In [10]:
# Using the basketball-references api to scrape the advanced statistics from 1980 till 2023

def get_advanced_stats(year):
    """Fetch the advanced player statistics for one season through the scraper client.

    Args:
        year: Season end year passed to ``client.players_advanced_season_totals``.

    Returns:
        A DataFrame built from the client's result with a ``year`` column
        added, or ``None`` if anything raised while fetching or building it.
    """
    # Pause before every request (as the other scrapers do) so that failed
    # attempts are throttled too, instead of only the successful ones.
    sleep(randint(5, 9))

    try:
        advanced_stats = pd.DataFrame(client.players_advanced_season_totals(season_end_year=year))
        advanced_stats['year'] = year
        return advanced_stats

    # NOTE(review): the client's exception types are not visible here, so the
    # broad catch is kept; narrow it once they are known.
    except Exception as e:
        print(f"Error fetching data for year {year}: {e}")
        return None
    
# Collect one advanced-stats table per season; seasons for which
# get_advanced_stats returned None are left out.
concatenated_tables = []
for year in years:
    advanced_stats_data = get_advanced_stats(year)
    # Test the value fetched in this iteration, not a variable left over from
    # another cell, so the filter reflects this season's download.
    if advanced_stats_data is not None:
        concatenated_tables.append(advanced_stats_data)

In [11]:
# Saving the advanced statistics to the data folder

# Build the output path first, then combine every season's advanced stats and export them
advanced_stats_csv = os.path.join(data_folder, 'advanced_stats.csv')
advanced_stats = pd.concat(concatenated_tables)
advanced_stats.to_csv(advanced_stats_csv, index=False)

# 2024 DATA

In [12]:
# Using the basketball-references api to get the regular season statistics for each player in the 2023-24 regular season

# Ask the client for the season totals first, then load the result into a DataFrame
season_totals_2024 = client.players_season_totals(season_end_year=2024)
player_2024_data = pd.DataFrame(season_totals_2024)

In [13]:
# List the column names of the 2024 season-totals frame
player_2024_data.columns

Index(['slug', 'name', 'positions', 'age', 'team', 'games_played',
       'games_started', 'minutes_played', 'made_field_goals',
       'attempted_field_goals', 'made_three_point_field_goals',
       'attempted_three_point_field_goals', 'made_free_throws',
       'attempted_free_throws', 'offensive_rebounds', 'defensive_rebounds',
       'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points'],
      dtype='object')

In [14]:
# Performing some basic data cleaning operations to convert all the statistics from season totals to averages

# Each per-game column is a season total divided by the games played
games_played_2024 = player_2024_data['games_played']

season_totals_by_column = {
    'FGM': player_2024_data['made_field_goals'],
    'FGA': player_2024_data['attempted_field_goals'],
    '3PM': player_2024_data['made_three_point_field_goals'],
    '3PA': player_2024_data['attempted_three_point_field_goals'],
    'FTM': player_2024_data['made_free_throws'],
    'FTA': player_2024_data['attempted_free_throws'],
    # Total rebounds are the sum of offensive and defensive rebounds
    'TRB': player_2024_data['offensive_rebounds'] + player_2024_data['defensive_rebounds'],
    'AST': player_2024_data['assists'],
    'STL': player_2024_data['steals'],
    'BLK': player_2024_data['blocks'],
    'TOV': player_2024_data['turnovers'],
    'PF': player_2024_data['personal_fouls'],
    'PTS': player_2024_data['points'],
}
for per_game_column, season_total in season_totals_by_column.items():
    player_2024_data[per_game_column] = season_total / games_played_2024

# Shooting percentages: per-game makes divided by per-game attempts
for percentage_column, made_column, attempted_column in [
    ('FG%', 'FGM', 'FGA'),
    ('3P%', '3PM', '3PA'),
    ('FT%', 'FTM', 'FTA'),
]:
    player_2024_data[percentage_column] = player_2024_data[made_column] / player_2024_data[attempted_column]

In [15]:
# Getting the advanced statistics for each player from the 2023-24 season using the basketball-references api

# Ask the client for the advanced season totals first, then load the result into a DataFrame
advanced_totals_2024 = client.players_advanced_season_totals(season_end_year=2024)
player_2024_advanced_data = pd.DataFrame(advanced_totals_2024)

In [16]:
# Display the advanced-statistics frame for a quick visual check
player_2024_advanced_data

Unnamed: 0,slug,name,positions,age,team,games_played,minutes_played,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,...,usage_percentage,offensive_win_shares,defensive_win_shares,win_shares,win_shares_per_48_minutes,offensive_box_plus_minus,defensive_box_plus_minus,box_plus_minus,value_over_replacement_player,is_combined_totals
0,achiupr01,Precious Achiuwa,[Position.CENTER],24,Team.TORONTO_RAPTORS,25,437,15.0,0.512,0.276,...,21.2,0.0,0.4,0.4,0.048,-1.4,-0.2,-1.6,0.0,False
1,achiupr01,Precious Achiuwa,[Position.POWER_FORWARD],24,Team.NEW_YORK_KNICKS,49,1187,14.5,0.564,0.167,...,14.0,1.2,1.8,3.0,0.122,-1.9,0.5,-1.4,0.2,False
2,adebaba01,Bam Adebayo,[Position.CENTER],26,Team.MIAMI_HEAT,71,2416,19.8,0.576,0.041,...,24.9,2.9,4.3,7.2,0.144,0.8,1.7,2.4,2.7,False
3,agbajoc01,Ochai Agbaji,[Position.SHOOTING_GUARD],23,Team.UTAH_JAZZ,51,1003,8.1,0.531,0.570,...,12.2,0.1,0.3,0.4,0.019,-2.5,-0.5,-3.0,-0.3,False
4,agbajoc01,Ochai Agbaji,[Position.SHOOTING_GUARD],23,Team.TORONTO_RAPTORS,27,638,7.1,0.453,0.375,...,15.4,-0.6,0.3,-0.3,-0.024,-5.0,-1.5,-6.5,-0.7,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,youngth01,Thaddeus Young,[Position.POWER_FORWARD],35,Team.PHOENIX_SUNS,10,89,15.1,0.515,0.048,...,12.8,0.1,0.1,0.3,0.137,-1.5,0.6,-0.9,0.0,False
653,youngtr01,Trae Young,[Position.POINT_GUARD],25,Team.ATLANTA_HAWKS,54,1942,20.3,0.585,0.465,...,30.5,4.0,0.6,4.6,0.114,4.9,-2.3,2.6,2.2,False
654,yurtsom01,Omer Yurtseven,[Position.CENTER],25,Team.UTAH_JAZZ,48,545,15.0,0.565,0.130,...,17.9,0.3,0.4,0.7,0.062,-1.6,-1.5,-3.0,-0.1,False
655,zelleco01,Cody Zeller,[Position.CENTER],31,Team.NEW_ORLEANS_PELICANS,43,320,12.8,0.483,0.048,...,12.9,0.4,0.4,0.8,0.124,-2.9,0.3,-2.6,0.0,False


In [17]:
# Merging the 2 datasets with each other

# Left join: every row of the season-totals frame is kept, and advanced-stat
# rows are attached where both the player name and the team match.
final_2024_data = player_2024_data.merge(
    player_2024_advanced_data,
    how='left',
    on=['name', 'team'],
)

In [18]:
# Print the distinct values found in the 'team' column of the merged frame
print(final_2024_data['team'].unique())

[<Team.TORONTO_RAPTORS: 'TORONTO RAPTORS'>
 <Team.NEW_YORK_KNICKS: 'NEW YORK KNICKS'> <Team.MIAMI_HEAT: 'MIAMI HEAT'>
 <Team.UTAH_JAZZ: 'UTAH JAZZ'>
 <Team.MEMPHIS_GRIZZLIES: 'MEMPHIS GRIZZLIES'>
 <Team.MINNESOTA_TIMBERWOLVES: 'MINNESOTA TIMBERWOLVES'>
 <Team.PHOENIX_SUNS: 'PHOENIX SUNS'>
 <Team.CLEVELAND_CAVALIERS: 'CLEVELAND CAVALIERS'>
 <Team.NEW_ORLEANS_PELICANS: 'NEW ORLEANS PELICANS'>
 <Team.MILWAUKEE_BUCKS: 'MILWAUKEE BUCKS'>
 <Team.ORLANDO_MAGIC: 'ORLANDO MAGIC'>
 <Team.WASHINGTON_WIZARDS: 'WASHINGTON WIZARDS'>
 <Team.PORTLAND_TRAIL_BLAZERS: 'PORTLAND TRAIL BLAZERS'>
 <Team.DETROIT_PISTONS: 'DETROIT PISTONS'>
 <Team.CHARLOTTE_HORNETS: 'CHARLOTTE HORNETS'>
 <Team.PHILADELPHIA_76ERS: 'PHILADELPHIA 76ERS'>
 <Team.BOSTON_CELTICS: 'BOSTON CELTICS'>
 <Team.SAN_ANTONIO_SPURS: 'SAN ANTONIO SPURS'>
 <Team.SACRAMENTO_KINGS: 'SACRAMENTO KINGS'>
 <Team.BROOKLYN_NETS: 'BROOKLYN NETS'>
 <Team.LOS_ANGELES_CLIPPERS: 'LOS ANGELES CLIPPERS'>
 <Team.OKLAHOMA_CITY_THUNDER: 'OKLAHOMA CITY THUNDER'>

In [19]:
# Performing some data cleaning operations to clean the team names column

def extract_team_name(x):
    """Return the part of ``str(x)`` after its last '.', tidied up.

    Surrounding single quotes are stripped from that part and underscores are
    turned into spaces. When ``str(x)`` contains no '.', 'Unknown' is returned.
    """
    _prefix, separator, suffix = str(x).rpartition('.')
    if not separator:
        # No dot anywhere in the text, so there is nothing to extract
        return 'Unknown'
    return suffix.strip("'").replace('_', ' ')

# Turn each team value into its spaced-out name, then title-case it
extracted_team_names = final_2024_data['team'].apply(extract_team_name)
final_2024_data['team_names'] = extracted_team_names.str.title()

final_2024_data['team_names'].unique()

array(['Toronto Raptors', 'New York Knicks', 'Miami Heat', 'Utah Jazz',
       'Memphis Grizzlies', 'Minnesota Timberwolves', 'Phoenix Suns',
       'Cleveland Cavaliers', 'New Orleans Pelicans', 'Milwaukee Bucks',
       'Orlando Magic', 'Washington Wizards', 'Portland Trail Blazers',
       'Detroit Pistons', 'Charlotte Hornets', 'Philadelphia 76Ers',
       'Boston Celtics', 'San Antonio Spurs', 'Sacramento Kings',
       'Brooklyn Nets', 'Los Angeles Clippers', 'Oklahoma City Thunder',
       'Atlanta Hawks', 'Chicago Bulls', 'Denver Nuggets',
       'Houston Rockets', 'Indiana Pacers', 'Dallas Mavericks',
       'Los Angeles Lakers', 'Golden State Warriors'], dtype=object)

In [20]:
# Title-casing capitalised the letter after the digits; restore the usual spelling
final_2024_data['team_names'] = final_2024_data['team_names'].replace(
    {'Philadelphia 76Ers': 'Philadelphia 76ers'}
)

In [21]:
regular_season_2024 = get_team_stats(2024)

# get_team_stats returns None when the download fails; stop here with a clear
# message instead of letting the next cell fail on a None value.
if regular_season_2024 is None:
    raise RuntimeError("Could not fetch the 2024 team standings; re-run this cell")

In [22]:
# Keep only the text before the first non-breaking space in each team value
# (presumably this drops a suffix appended after the name; confirm against the page)
regular_season_2024['team_names'] = regular_season_2024['team'].str.split('\xa0').str[0]

In [23]:
# The raw 'team' column has been replaced by 'team_names', so remove it
regular_season_2024 = regular_season_2024.drop(columns=['team'])

In [24]:
# Remove the columns that are no longer needed after the merge and the name cleanup
final_2024_data = final_2024_data.drop(columns=['slug_x', 'team'])

In [25]:
# Display the 2024 standings frame for a quick visual check
regular_season_2024

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,year,seed,team_names
0,64,18,0.78,—,120.6,109.2,10.75,2024,1.0,Boston Celtics
1,50,32,0.61,14.0,112.8,108.2,4.36,2024,2.0,New York Knicks
2,49,33,0.598,15.0,119.0,116.4,2.44,2024,3.0,Milwaukee Bucks
3,48,34,0.585,16.0,112.6,110.2,1.98,2024,4.0,Cleveland Cavaliers
4,47,35,0.573,17.0,110.5,108.4,1.48,2024,6.0,Orlando Magic
5,47,35,0.573,17.0,123.3,120.2,2.75,2024,6.0,Indiana Pacers
6,47,35,0.573,17.0,114.6,111.5,2.51,2024,6.0,Philadelphia 76ers
7,46,36,0.561,18.0,110.1,108.4,1.1,2024,8.0,Miami Heat
8,39,43,0.476,25.0,112.3,113.7,-1.77,2024,9.0,Chicago Bulls
9,36,46,0.439,28.0,118.3,120.5,-2.38,2024,10.0,Atlanta Hawks


In [26]:
# Mapping the wins, losses for each team to their players

# Left join on the cleaned team name: every player row is kept and receives
# the standings columns of the matching team.
final_2024_data = final_2024_data.merge(
    regular_season_2024,
    how='left',
    on=['team_names'],
)

In [27]:
# Saving the final dataset to the data folder

# Keep the CSV path in its own variable so that `player_2024_data` stays bound
# to the season-totals DataFrame and earlier cells can still be re-run.
player_2024_csv = os.path.join(data_folder, 'player_2024_data.csv')
final_2024_data.to_csv(player_2024_csv, index=False)