# Player Dataprep

In [92]:
from nba_api.stats.endpoints import leaguegamelog
import pandas as pd

YEAR = 2021

COUNTER = 0
DIRECTION = "ASC"
LEAGUE = "00"
PLAYER_OR_TEAM = "P"
SEASON_TYPE = "Regular Season"
SORTER = "DATE"

def fetch_season_game_logs(season):
    # Create LeagueGameLog object instance for the season
    gamelog = leaguegamelog.LeagueGameLog(
        counter=COUNTER,
        direction=DIRECTION,
        league_id=LEAGUE,
        player_or_team_abbreviation=PLAYER_OR_TEAM,
        season=season,
        season_type_all_star=SEASON_TYPE,
        sorter=SORTER
    )
    
    # Execute request and fetch data
    data = gamelog.get_data_frames()[0]
    
    # Filter columns and add Home/Away column
    columns_to_keep = [
       "GAME_ID", "TEAM_ID", "TEAM_NAME", "PLAYER_ID", "PLAYER_NAME", "MATCHUP", "WL", "GAME_DATE",
        "PTS", "OREB", "DREB", "AST", "STL", "BLK", "TOV", "FGM", "FGA", "FG_PCT",
        "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "MIN"
    ]
    filtered_data = data[columns_to_keep].copy()
    filtered_data['HOME/AWAY'] = filtered_data['MATCHUP'].apply(lambda x: 0 if '@' in x else 1)
    
    return filtered_data

def fetch_multiple_seasons(start_year, end_year):
    all_seasons_data = []  # List to store data for all seasons
    
    # Loop through each season from start_year to end_year (inclusive)
    for year in range(start_year, end_year + 1):
        season = f"{year}-{str(year + 1)[-2:]}"  # Format season string (e.g., '2010-11')
        season_data = fetch_season_game_logs(season)
        all_seasons_data.append(season_data)
    
    # Concatenate all seasons' data into a single DataFrame
    combined_data = pd.concat(all_seasons_data, ignore_index=True)
    combined_data = combined_data.sort_values(by='GAME_ID', ascending=True)
    combined_data = combined_data.reset_index(drop=True)
    
    return combined_data

def filter_top_players(data):
    # Group by GAME_ID and TEAM_ID, then sort by MIN within each group and take the top 7 players
    top_players = data.groupby(['GAME_ID', 'TEAM_ID']).apply(lambda x: x.nlargest(7, 'MIN')).reset_index(drop=True)
    return top_players

def ensure_minimum_players(data, min_players=5):
    # Ensure each game has at least `min_players` players
    def fill_missing_players(group):
        if len(group) < min_players:
            missing_count = min_players - len(group)
            for _ in range(missing_count):
                # Add a row with missing player information and zeroed stats
                missing_row = pd.Series({
                    'GAME_ID': group['GAME_ID'].iloc[0],
                    'TEAM_ID': group['TEAM_ID'].iloc[0],
                    'TEAM_NAME': group['TEAM_NAME'].iloc[0],
                    'PLAYER_ID': None,
                    'PLAYER_NAME': 'N/A',
                    'MATCHUP': group['MATCHUP'].iloc[0],
                    'WL': group['WL'].iloc[0],
                    'GAME_DATE': group['GAME_DATE'].iloc[0],
                    'PTS': 0, 'OREB': 0, 'DREB': 0, 'AST': 0, 'STL': 0, 'BLK': 0, 'TOV': 0,
                    'FGM': 0, 'FGA': 0, 'FG_PCT': 0, 'FG3M': 0, 'FG3A': 0, 'FG3_PCT': 0,
                    'FTM': 0, 'FTA': 0, 'FT_PCT': 0, 'MIN': 0, 'HOME/AWAY': group['HOME/AWAY'].iloc[0]
                })
                group = pd.concat([group, missing_row.to_frame().T], ignore_index=True)
        return group
    
    return data.groupby(['GAME_ID', 'TEAM_ID']).apply(fill_missing_players).reset_index(drop=True)

def pivot_player_stats(data):
    data = filter_top_players(data)
    data = ensure_minimum_players(data)
    
    # Separate home and away teams
    home_data = data[data['HOME/AWAY'] == 1].copy()
    away_data = data[data['HOME/AWAY'] == 0].copy()

    # Calculate team points
    home_team_points = home_data.groupby('GAME_ID')['PTS'].sum().reset_index().rename(columns={'PTS': 'HOME_TEAM_POINTS'})
    away_team_points = away_data.groupby('GAME_ID')['PTS'].sum().reset_index().rename(columns={'PTS': 'AWAY_TEAM_POINTS'})
    
    # Merge team points back into the pivoted data
    home_data = pd.merge(home_data, home_team_points, on='GAME_ID', how='left')
    away_data = pd.merge(away_data, away_team_points, on='GAME_ID', how='left')
    
    # Rank players within each game by playtime to label columns
    home_data['RANK'] = home_data.groupby('GAME_ID')['MIN'].rank("dense", ascending=False).astype(int)
    away_data['RANK'] = away_data.groupby('GAME_ID')['MIN'].rank("dense", ascending=False).astype(int)
    
    # Pivot the data
    home_pivot = home_data.pivot_table(
        index=['GAME_ID', 'GAME_DATE', 'TEAM_ID', 'TEAM_NAME', 'HOME_TEAM_POINTS'],
        columns='RANK',
        values=['PLAYER_ID', 'PLAYER_NAME', 'PTS', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TOV', 'FGM', 'FGA', 'FG_PCT', 
                'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'MIN'],
        aggfunc='first'
    )
    away_pivot = away_data.pivot_table(
        index=['GAME_ID', 'GAME_DATE', 'TEAM_ID', 'TEAM_NAME', 'AWAY_TEAM_POINTS'],
        columns='RANK',
        values=['PLAYER_ID', 'PLAYER_NAME', 'PTS', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TOV', 'FGM', 'FGA', 'FG_PCT', 
                'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'MIN'],
        aggfunc='first'
    )
    
    # Flatten the MultiIndex columns and label them appropriately
    home_pivot.columns = [f'HomeTeamPlayer{rank}_{stat}' for stat, rank in home_pivot.columns]
    away_pivot.columns = [f'AwayTeamPlayer{rank}_{stat}' for stat, rank in away_pivot.columns]
    
    # Reset index
    home_pivot = home_pivot.reset_index()
    away_pivot = away_pivot.reset_index()
    
    # Merge home and away data back together
    merged_data = pd.merge(home_pivot, away_pivot, on=['GAME_ID', 'GAME_DATE'], suffixes=('_home', '_away'))
    
    # Define the desired column order
    column_order = ['GAME_ID', 'GAME_DATE', 'TEAM_ID_home', 'TEAM_NAME_home', 'HOME_TEAM_POINTS', 'TEAM_ID_away', 'TEAM_NAME_away', 'AWAY_TEAM_POINTS']
    
    # Add player names and IDs first
    for rank in range(1, 8):
        column_order.extend([f'HomeTeamPlayer{rank}_PLAYER_NAME', f'HomeTeamPlayer{rank}_PLAYER_ID', 
                             f'AwayTeamPlayer{rank}_PLAYER_NAME', f'AwayTeamPlayer{rank}_PLAYER_ID'])
    
    # Add remaining stats in the desired order
    stats = ['PTS', 'OREB', 'DREB', 'AST', 'STL', 'BLK', 'TOV', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'MIN']
    for stat in stats:
        for rank in range(1, 8):
            column_order.extend([f'HomeTeamPlayer{rank}_{stat}', f'AwayTeamPlayer{rank}_{stat}'])

    # Ensure all columns are present in the order
    merged_data = merged_data[column_order]
    
    # Rename the columns
    merged_data.rename(columns={
        'TEAM_ID_home': 'HOME_TEAM_ID',
        'TEAM_NAME_home': 'HOME_TEAM_NAME',
        'TEAM_ID_away': 'AWAY_TEAM_ID',
        'TEAM_NAME_away': 'AWAY_TEAM_NAME'
    }, inplace=True)
    
    return merged_data

# Fetch player game logs for the 2023 season
player_game_logs = fetch_multiple_seasons(YEAR, YEAR)

# Pivot the data to get one row per game with the top 7 players from each team
pivoted_player_stats = pivot_player_stats(player_game_logs)

print("Pivoted player stats shape:", pivoted_player_stats.shape)
pivoted_player_stats


Pivoted player stats shape: (1230, 274)


Unnamed: 0,GAME_ID,GAME_DATE,HOME_TEAM_ID,HOME_TEAM_NAME,HOME_TEAM_POINTS,AWAY_TEAM_ID,AWAY_TEAM_NAME,AWAY_TEAM_POINTS,HomeTeamPlayer1_PLAYER_NAME,HomeTeamPlayer1_PLAYER_ID,...,HomeTeamPlayer3_MIN,AwayTeamPlayer3_MIN,HomeTeamPlayer4_MIN,AwayTeamPlayer4_MIN,HomeTeamPlayer5_MIN,AwayTeamPlayer5_MIN,HomeTeamPlayer6_MIN,AwayTeamPlayer6_MIN,HomeTeamPlayer7_MIN,AwayTeamPlayer7_MIN
0,0022100001,2021-10-19,1610612749,Milwaukee Bucks,106,1610612751,Brooklyn Nets,101,Giannis Antetokounmpo,203507.0,...,28.0,29.0,26.0,24.0,23.0,23.0,,21.0,,
1,0022100002,2021-10-19,1610612747,Los Angeles Lakers,101,1610612744,Golden State Warriors,101,Anthony Davis,203076.0,...,35.0,29.0,31.0,26.0,26.0,25.0,20.0,23.0,19.0,
2,0022100003,2021-10-20,1610612766,Charlotte Hornets,117,1610612754,Indiana Pacers,119,Gordon Hayward,202330.0,...,32.0,33.0,29.0,28.0,24.0,26.0,21.0,24.0,,
3,0022100004,2021-10-20,1610612765,Detroit Pistons,69,1610612741,Chicago Bulls,87,Saddiq Bey,1630180.0,...,29.0,35.0,28.0,33.0,26.0,28.0,22.0,27.0,20.0,15.0
4,0022100005,2021-10-20,1610612752,New York Knicks,130,1610612738,Boston Celtics,134,RJ Barrett,1629628.0,...,44.0,32.0,36.0,31.0,34.0,23.0,28.0,,22.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,0022101226,2022-04-10,1610612752,New York Knicks,105,1610612761,Toronto Raptors,77,Immanuel Quickley,1630193.0,...,39.0,28.0,33.0,26.0,23.0,25.0,17.0,22.0,,20.0
1226,0022101227,2022-04-10,1610612753,Orlando Magic,113,1610612748,Miami Heat,105,R.J. Hampton,1630181.0,...,29.0,36.0,28.0,35.0,25.0,34.0,,20.0,,
1227,0022101228,2022-04-10,1610612755,Philadelphia 76ers,93,1610612765,Detroit Pistons,71,Matisse Thybulle,1629680.0,...,32.0,26.0,28.0,25.0,24.0,23.0,23.0,22.0,,
1228,0022101229,2022-04-10,1610612756,Phoenix Suns,96,1610612758,Sacramento Kings,103,Aaron Holiday,1628988.0,...,27.0,30.0,26.0,27.0,23.0,18.0,,,,


In [88]:
pivoted_player_stats.to_csv(f'player_stats_{YEAR}.csv', index=False)