## Data Collection

In [19]:
# import nba_api
#!pip install nba_api
from nba_api.stats.endpoints import playercareerstats, DraftHistory, commonallplayers, leaguegamefinder, boxscoretraditionalv2, teamgamelog
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.static import teams, players
from nba_api.stats.endpoints import playerdashboardbyyearoveryear
import requests
import numpy as np
import pandas as pd
import time
from requests.exceptions import ReadTimeout

## Draft

In [15]:
# Download the draft history for every draft year from 2003 through 2024 and
# combine the results into a single frame.
# TODO: rows with ROUND_NUMBER == 2 still need to be removed; that may be
# handled later as part of data cleaning.
yearly_draft_frames = []

for year in range(2003, 2025):
    draft_history = DraftHistory(league_id='00', season_year_nullable=str(year))
    draft_df = draft_history.get_data_frames()[0]
    yearly_draft_frames.append(draft_df)

# Concatenate once after the loop: calling pd.concat on every iteration
# re-copies all previously accumulated rows each time.
all_draft_picks = pd.concat(yearly_draft_frames, ignore_index=True)

# Per-season lookup: maps each SEASON value to an independent copy of its rows.
grouped_draft_picks = all_draft_picks.groupby('SEASON')
draft_picks_by_year = {year: group.copy() for year, group in grouped_draft_picks}

# Lift the row-display limit for this and all later cells, then preview.
pd.set_option('display.max_rows', None)
display(all_draft_picks.head())

Unnamed: 0,PERSON_ID,PLAYER_NAME,SEASON,ROUND_NUMBER,ROUND_PICK,OVERALL_PICK,DRAFT_TYPE,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,ORGANIZATION,ORGANIZATION_TYPE,PLAYER_PROFILE_FLAG
0,2544,LeBron James,2003,1,1,1,Draft,1610612739,Cleveland,Cavaliers,CLE,Saint Vincent-Saint Mary,High School,1
1,2545,Darko Milicic,2003,1,2,2,Draft,1610612765,Detroit,Pistons,DET,KK Vrsac (Serbia),Other Team/Club,1
2,2546,Carmelo Anthony,2003,1,3,3,Draft,1610612743,Denver,Nuggets,DEN,Syracuse,College/University,1
3,2547,Chris Bosh,2003,1,4,4,Draft,1610612761,Toronto,Raptors,TOR,Georgia Tech,College/University,1
4,2548,Dwyane Wade,2003,1,5,5,Draft,1610612748,Miami,Heat,MIA,Marquette,College/University,1


In [16]:
#all_draft_picks.to_csv('Data/all_draft_picks.csv', index=False)

## Current player stats

In [17]:
# Request the player listing restricted to the current season.
all_players = commonallplayers.CommonAllPlayers(is_only_current_season=1)

# Take the first returned frame and discard the one column whose meaning is
# unknown before previewing the result.
players_df = all_players.get_data_frames()[0].drop(columns=['OTHERLEAGUE_EXPERIENCE_CH'])

display(players_df.head())
#players_df.to_csv('Data/current_players.csv', index=False)

Unnamed: 0,PERSON_ID,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FIRST_LAST,ROSTERSTATUS,FROM_YEAR,TO_YEAR,PLAYERCODE,PLAYER_SLUG,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,TEAM_SLUG,TEAM_CODE,GAMES_PLAYED_FLAG
0,1630173,"Achiuwa, Precious",Precious Achiuwa,1,2020,2023,precious_achiuwa,precious_achiuwa,1610612752,New York,Knicks,NYK,knicks,knicks,Y
1,203500,"Adams, Steven",Steven Adams,1,2013,2023,steven_adams,steven_adams,1610612745,Houston,Rockets,HOU,rockets,rockets,Y
2,1628389,"Adebayo, Bam",Bam Adebayo,1,2017,2023,bam_adebayo,bam_adebayo,1610612748,Miami,Heat,MIA,heat,heat,Y
3,1630534,"Agbaji, Ochai",Ochai Agbaji,1,2022,2023,ochai_agbaji,ochai_agbaji,1610612761,Toronto,Raptors,TOR,raptors,raptors,Y
4,1630583,"Aldama, Santi",Santi Aldama,1,2021,2023,santi_aldama,santi_aldama,1610612763,Memphis,Grizzlies,MEM,grizzlies,grizzlies,Y


In [21]:
# Build one table holding the 2023-24 season row of every active player.
active_players = players.get_active_players()
per_player_frames = []

for player in active_players:
    # Pause before each request (presumably to stay under the stats API's
    # rate limit -- confirm the required delay).
    time.sleep(0.6)
    player_id = player['id']
    player_name = player['full_name']

    player_season_stats = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id=player_id)

    season_stats_df = player_season_stats.get_data_frames()[1]  # Index 1 is usually the season totals
    # .copy() detaches the filtered rows from the source frame so that adding
    # PLAYER_NAME below does not raise SettingWithCopyWarning.
    season_stats_df = season_stats_df[season_stats_df['GROUP_VALUE'] == '2023-24'].copy()
    # Every dashboard column is kept; only GROUP_SET and GROUP_VALUE are
    # removed after the frames are combined.
    season_stats_df['PLAYER_NAME'] = player_name

    per_player_frames.append(season_stats_df)

# The list of frames and the combined frame use separate names so that
# re-running part of this cell cannot confuse one for the other.
all_players_season_stats = pd.concat(per_player_frames, ignore_index=True)

# Drop useless columns
all_players_season_stats = all_players_season_stats.drop(columns=['GROUP_SET', 'GROUP_VALUE'])

display(all_players_season_stats.head(5))
display(all_players_season_stats.tail(5))

# Output to csv
all_players_season_stats.to_csv('Data/all_players_season_stats_2023_24.csv', index=False)

KeyboardInterrupt: 

In [None]:
#all_players_season_stats.to_csv('Data/all_players_season_stats_2023_24.csv', index=False)

In [None]:
# Order the current-player listing by team name as a precaution; this step
# may arguably belong with data cleaning instead.
sorted_players_df = players_df.sort_values(by=['TEAM_NAME'], ascending=True)
display(sorted_players_df.head())

#sorted_players_df.to_csv('Data/current_players_sorted_by_team.csv', index=False)

Unnamed: 0,PERSON_ID,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FIRST_LAST,ROSTERSTATUS,FROM_YEAR,TO_YEAR,PLAYERCODE,PLAYER_SLUG,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,TEAM_SLUG,TEAM_CODE,GAMES_PLAYED_FLAG
311,201577,"Lopez, Robin",Robin Lopez,0,2008,2023,robin_lopez,robin_lopez,0,,,,,,Y
478,202397,"Smith, Ish",Ish Smith,0,2010,2023,ish_smith,ish_smith,0,,,,,,Y
233,1627863,"House Jr., Danuel",Danuel House Jr.,0,2016,2023,danuel_house,danuel_house_jr,0,,,,,,Y
200,1630181,"Hampton, R.J.",R.J. Hampton,0,2020,2023,rj_hampton,rj_hampton,0,,,,,,Y
383,1628373,"Ntilikina, Frank",Frank Ntilikina,0,2017,2023,frank_ntilikina,frank_ntilikina,0,,,,,,Y
66,1629717,"Brooks, Armoni",Armoni Brooks,0,2020,2023,armoni_brooks,armoni_brooks,0,,,,,,Y
93,1630608,"Cazalon, Malcolm",Malcolm Cazalon,0,2023,2023,malcolm_cazalon,malcolm_cazalon,0,,,,,,Y
64,1631167,"Brockington, Izaiah",Izaiah Brockington,0,2023,2023,izaiah_brockington,izaiah_brockington,0,,,,,,Y
382,1641806,"Nowell, Markquis",Markquis Nowell,0,2023,2023,markquis_nowell,markquis_nowell,0,,,,,,Y
113,1630622,"Crutcher, Jalen",Jalen Crutcher,0,2023,2023,jalen_crutcher,jalen_crutcher,0,,,,,,Y


## Team Stats

In [None]:
def make_api_request(endpoint, *args, **kwargs):
    """Call an endpoint with retry-on-timeout and return its first data frame.

    Parameters
    ----------
    endpoint : callable
        Invoked as ``endpoint(*args, **kwargs)``; the object it returns must
        provide ``get_data_frames()``.
    *args, **kwargs
        Forwarded to ``endpoint``. If the caller does not pass ``timeout``,
        a value of 60 is supplied.

    Returns
    -------
    pandas.DataFrame
        The first frame from ``get_data_frames()``. An empty DataFrame is
        returned when every attempt timed out or when any other exception
        was raised (a message is printed in both cases).
    """
    max_retries = 5
    backoff_factor = 1.5
    # Use 60 only as a default so a caller-supplied timeout is honoured
    # instead of colliding with a hard-coded keyword. ``kwargs`` itself is left
    # untouched so the printed messages show exactly what the caller passed.
    request_kwargs = {'timeout': 60, **kwargs}
    for retry in range(max_retries):
        try:
            response = endpoint(*args, **request_kwargs)
            return response.get_data_frames()[0]
        except ReadTimeout:
            if retry == max_retries - 1:
                # No attempt follows, so do not sleep through another back-off.
                print(f"Timeout encountered for args {args}, kwargs {kwargs}. Giving up after {max_retries} attempts.")
                break
            # Exponential back-off: 2s, 3s, 4.5s, 6.75s between attempts.
            sleep_time = (backoff_factor ** retry) * 2
            print(f"Timeout encountered for args {args}, kwargs {kwargs}. Retrying after {sleep_time} seconds.")
            time.sleep(sleep_time)
        except Exception as e:
            # Anything other than a read timeout is not retried.
            print(f"An exception occurred for args {args}, kwargs {kwargs}: {e}")
            break
    return pd.DataFrame()

# Fetch the 2023-24 game log of every team and stack the logs into one frame.
nba_teams = teams.get_teams()
team_frames = []

for team in nba_teams:
    team_id = team['id']
    team_game_stats_df = make_api_request(
        teamgamelog.TeamGameLog,
        team_id=team_id,
        season='2023-24'
    )
    if not team_game_stats_df.empty:
        # Tag each row with the team's full name before collecting it.
        team_game_stats_df['Team'] = team['full_name']
        team_frames.append(team_game_stats_df)
    else:
        print(f"Data for team {team['full_name']} could not be retrieved.")

# Concatenate once after the loop instead of re-copying the accumulated rows
# on every iteration; fall back to an empty frame when nothing was retrieved.
all_team_game_stats = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

#display(all_team_game_stats.head(5))
#display(all_team_game_stats.tail(5))

In [None]:
# Reshape the combined team game log: derive the two team abbreviations from
# MATCHUP, normalise GAME_DATE to 'YYYY-MM-DD' text, discard MATCHUP and Team,
# keep a fixed column order, and rename Team_ID to Home_Team_ID.
# NOTE(review): HOME_TEAM_ABBR is simply the first three characters of MATCHUP
# and VISITOR_TEAM_ABBR the last three. If MATCHUP always starts with the
# row's own team (e.g. "ATL @ CHI" for a road game), these labels -- and the
# Home_Team_ID rename -- are wrong for away games. Confirm against the
# TeamGameLog MATCHUP format before relying on them.
all_team_game_stats = (
    all_team_game_stats
    .assign(
        HOME_TEAM_ABBR=lambda frame: frame['MATCHUP'].str.slice(stop=3),
        VISITOR_TEAM_ABBR=lambda frame: frame['MATCHUP'].str.slice(start=-3),
        GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']).dt.strftime('%Y-%m-%d'),
    )
    .drop(columns=['MATCHUP', 'Team'])
    .loc[:, [
        'Team_ID', 'Game_ID', 'GAME_DATE', 'HOME_TEAM_ABBR', 'VISITOR_TEAM_ABBR',
        'WL', 'W', 'L', 'W_PCT', 'MIN',
        'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
        'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    ]]
    .rename(columns={'Team_ID': 'Home_Team_ID'})
)

display(all_team_game_stats.head(5))

# all_team_game_stats.to_csv('Data/all_team_game_stats.csv', index=False)

  all_team_game_stats['GAME_DATE'] = pd.to_datetime(all_team_game_stats['GAME_DATE']).dt.strftime('%Y-%m-%d')


Unnamed: 0,Home_Team_ID,Game_ID,GAME_DATE,HOME_TEAM_ABBR,VISITOR_TEAM_ABBR,WL,W,L,W_PCT,MIN,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1610612737,22301104,2024-04-03,ATL,DET,,,,,0,...,0.5,2,8,10,3,2,2,2,0,15
1,1610612737,22301091,2024-04-01,ATL,CHI,W,35.0,40.0,0.467,240,...,0.667,8,37,45,29,4,5,14,21,113
2,1610612737,22301076,2024-03-30,ATL,MIL,L,34.0,40.0,0.459,240,...,0.833,9,30,39,24,5,3,11,26,113
3,1610612737,22301060,2024-03-28,ATL,BOS,W,34.0,39.0,0.466,265,...,0.6,17,36,53,24,5,5,11,17,123
4,1610612737,22301051,2024-03-27,ATL,POR,W,33.0,39.0,0.458,240,...,0.8,8,33,41,23,12,9,16,17,120


In [None]:
# Show the dtype of every column in all_team_game_stats.
all_team_game_stats.dtypes


Home_Team_ID           int64
Game_ID               object
GAME_DATE             object
HOME_TEAM_ABBR        object
VISITOR_TEAM_ABBR     object
WL                    object
W                    float64
L                    float64
W_PCT                float64
MIN                    int64
FGM                    int64
FGA                    int64
FG_PCT               float64
FG3M                   int64
FG3A                   int64
FG3_PCT              float64
FTM                    int64
FTA                    int64
FT_PCT               float64
OREB                   int64
DREB                   int64
REB                    int64
AST                    int64
STL                    int64
BLK                    int64
TOV                    int64
PF                     int64
PTS                    int64
dtype: object