In [1]:
import pandas as pd

In [135]:
# Directory holding the per-season CSV files (stats_<year>.csv and
# team_stats_<year>.csv) that the cells below read and write.
data_directory='/Users/cb/src/nba_mvp_ml/data/processed/by_season'

In [78]:
def calc_team_stats(year, data_directory='/Users/cb/src/nba_mvp_ml/data/processed/by_season'):
    """
    Aggregate one season of player stats into per-team totals.

    Parameters:
    - year (int): The year of the season; selects stats_{year}.csv.
    - data_directory (str): Directory containing the per-season CSV files.
      Defaults to the project's processed-data directory.

    Returns:
    - team_stats (DataFrame): One row per (SEASON_ID, LEAGUE_ID,
      TEAM_ABBREVIATION, TEAM_ID) with the joined player names, the number
      of distinct players, and the sum of every remaining numeric column.
    """
    data = pd.read_csv(f'{data_directory}/stats_{year}.csv')

    group_keys = ['SEASON_ID', 'LEAGUE_ID', 'TEAM_ABBREVIATION', 'TEAM_ID']

    # Identifier-like / per-player columns that must not be summed.
    not_summed = {'PLAYER_ID', 'GP', 'GS', 'PLAYER_AGE', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID'}
    numeric_columns = [
        col for col in data.select_dtypes(include=['number']).columns
        if col not in not_summed
    ]

    # Named aggregation fixes the output column order directly, so no
    # positional re-ordering is needed afterwards. The dict is unpacked
    # because stat column names need not be valid Python identifiers.
    aggregations = {
        # Combine player names (missing names are skipped rather than
        # breaking the join).
        'PLAYER_FULLNAME': (
            'PLAYER_FULLNAME',
            lambda names: ', '.join(names.dropna().unique()),
        ),
        # Count distinct players directly; splitting the joined string on
        # ', ' would over-count names that themselves contain a comma.
        'PLAYER_COUNT': ('PLAYER_FULLNAME', 'nunique'),
        **{col: (col, 'sum') for col in numeric_columns},
    }

    return data.groupby(group_keys, as_index=False).agg(**aggregations)

# for year in list(range(1980, 2024)):
#     team_stats = calc_team_stats(year)
#     team_stats.to_csv(f'/Users/cb/src/nba_mvp_ml/data/processed/by_season/team_stats_{year}.csv')

In [66]:
# Load the CSV files
def load_data(year, data_directory):
    """
    Read the player-level and team-level CSV files for one season.

    Parameters:
    - year (int): The year of the season; used in both file names.
    - data_directory (str): Directory containing stats_{year}.csv and
      team_stats_{year}.csv.

    Returns:
    - player_stats (DataFrame): Contents of stats_{year}.csv.
    - team_stats (DataFrame): Contents of team_stats_{year}.csv.
    """
    player_path = f"{data_directory}/stats_{year}.csv"
    team_path = f"{data_directory}/team_stats_{year}.csv"

    players = pd.read_csv(player_path)
    teams = pd.read_csv(team_path)
    return players, teams

# Load data for 2023 (load_data requires the directory as its second argument)
player_stats_2023, team_stats_2023 = load_data(2023, data_directory)

# Display the first few rows of the dataframes
# player_stats_2023.head()
# team_stats_2023.head()

In [120]:
def aggregate_team_stats(team_df):
    """
    Build a TEAM_ID-keyed lookup of each team's points and minutes.

    Parameters:
    - team_df (DataFrame): Team statistics DataFrame with TEAM_ID, PTS and MIN columns.

    Returns:
    - team_stats_dict (dict): {TEAM_ID: {'PTS': ..., 'MIN': ...}}.
    """
    wanted = ['PTS', 'MIN']
    by_team = team_df.set_index('TEAM_ID')
    return by_team[wanted].to_dict(orient='index')
    
def calculate_team_possessions(team_df):
    """
    Add an estimated 'Possessions' column to the team DataFrame.

    The estimate is FGA + 0.44 * FTA - OREB + TOV.

    Parameters:
    - team_df (DataFrame): Team statistics DataFrame with FGA, FTA, OREB and TOV columns.

    Returns:
    - team_df (DataFrame): The same DataFrame (modified in place) with 'Possessions' added.

    Note:
    - This function assumes season totals, not per-game averages.
    """
    free_throw_term = 0.44 * team_df['FTA']
    possessions = team_df['FGA'] + free_throw_term - team_df['OREB'] + team_df['TOV']
    team_df['Possessions'] = possessions
    return team_df

def calculate_player_possessions(player_df, team_df):
    """
    Estimate player possessions based on team possessions.

    Each player is credited with the team's possessions scaled by the
    player's share of the team's minutes.

    Parameters:
    - player_df (DataFrame): Player statistics DataFrame with TEAM_ID and MIN columns.
    - team_df (DataFrame): Team statistics DataFrame with TEAM_ID, Possessions and MIN columns.

    Returns:
    - player_df (DataFrame): New DataFrame with a 'Player_Possessions' column.
      Every input player row is kept; players whose TEAM_ID is absent from
      team_df get NaN instead of being dropped.

    Limitations:
    - Assumes proportional distribution of possessions based on minutes played.
    - Does not account for individual player's specific playstyle or usage rate.
    """
    team_context = team_df[['TEAM_ID', 'Possessions', 'MIN']]

    # Left join: the default inner join would silently discard players whose
    # TEAM_ID has no matching team row.
    merged = player_df.merge(
        team_context, on='TEAM_ID', how='left', suffixes=('', '_team')
    )
    minutes_share = merged['MIN'] / merged['MIN_team']
    merged['Player_Possessions'] = merged['Possessions'] * minutes_share

    # Remove the helper columns that were only needed for the calculation.
    return merged.drop(columns=['Possessions', 'MIN_team'])

def aggregate_team_stats_for_bpm(team_df):
    """
    Build a TEAM_ID-keyed lookup of the team totals used by the BPM approximation.

    Parameters:
    - team_df (DataFrame): Team statistics DataFrame with TEAM_ID, PTS, MIN, STL, BLK and REB columns.

    Returns:
    - team_stats_dict (dict): {TEAM_ID: {'PTS': ..., 'MIN': ..., 'STL': ..., 'BLK': ..., 'REB': ...}}.
    """
    bpm_columns = ['PTS', 'MIN', 'STL', 'BLK', 'REB']
    by_team = team_df.set_index('TEAM_ID')
    return by_team[bpm_columns].to_dict(orient='index')

In [132]:
def calculate_true_shooting_percentage(df, pts_col, fga_col, fta_col, new_col='TS%'):
    """
    Add a True Shooting Percentage (TS%) column computed from season totals.

    TS% = PTS / (2 * (FGA + 0.44 * FTA))

    Parameters:
    - df (DataFrame): Player or team stats DataFrame.
    - pts_col, fga_col, fta_col (str): Column names for points, FGA, and FTA.
    - new_col (str): Name of the new TS% column.

    Returns:
    - df (DataFrame): The same DataFrame (modified in place) with TS% added.
    """
    shooting_attempts = df[fga_col] + 0.44 * df[fta_col]
    df[new_col] = df[pts_col] / (2 * shooting_attempts)
    return df

def calculate_effective_field_goal_percentage(df, fgm_col, fg3m_col, fga_col, new_col='eFG%'):
    """
    Add an Effective Field Goal Percentage (eFG%) column for the season.

    eFG% = (FGM + 0.5 * FG3M) / FGA

    Parameters:
    - df (DataFrame): Player or team stats DataFrame.
    - fgm_col, fg3m_col, fga_col (str): Column names for FGM, FG3M, and FGA.
    - new_col (str): Name of the new eFG% column.

    Returns:
    - df (DataFrame): The same DataFrame (modified in place) with eFG% added.
    """
    weighted_makes = df[fgm_col] + 0.5 * df[fg3m_col]
    df[new_col] = weighted_makes / df[fga_col]
    return df

def calculate_per(df, pts_col, reb_col, ast_col, tov_col, min_col, new_col='PER'):
    """
    Add a simplified per-minute efficiency column as a stand-in for PER.

    The value is (PTS + REB + AST - TOV) / MIN.

    Parameters:
    - df (DataFrame): Player stats DataFrame.
    - pts_col, reb_col, ast_col, tov_col, min_col (str): Column names.
    - new_col (str): Name of the new PER column.

    Returns:
    - df (DataFrame): The same DataFrame (modified in place) with PER added.
    """
    net_production = df[pts_col] + df[reb_col] + df[ast_col] - df[tov_col]
    df[new_col] = net_production / df[min_col]
    return df

def calculate_team_pace(team_df):
    """
    Add a 'Pace' column: possessions per minute scaled to 48 minutes.

    Parameters:
    - team_df (DataFrame): Team statistics DataFrame with Possessions and MIN columns.

    Returns:
    - team_df (DataFrame): The same DataFrame (modified in place) with 'Pace' added.

    Note:
    - This calculation uses season totals, assuming complete data.
    - NOTE(review): if MIN is the sum of individual player minutes rather
      than team game minutes, the result is scaled accordingly - confirm
      against the data.
    """
    possessions_per_minute = team_df['Possessions'] / team_df['MIN']
    team_df['Pace'] = possessions_per_minute * 48
    return team_df

def calculate_win_shares(player_df, team_stats_dict, pts_col, min_col, new_col='WS'):
    """
    Calculate a simplified Win Shares value for players from team-level stats.

    The value is (player points / team points) * (player minutes / team minutes).

    Parameters:
    - player_df (DataFrame): Player statistics DataFrame with a TEAM_ID column.
    - team_stats_dict (dict): Aggregated team stats (PTS and MIN) by TEAM_ID.
    - pts_col, min_col (str): Column names for player points and minutes.
    - new_col (str): Name of the new Win_Shares column.

    Returns:
    - player_df (DataFrame): The same DataFrame (modified in place) with Win_Shares.

    Notes:
    - Avoids merging by using a pre-aggregated dictionary of team stats.
    - Players whose TEAM_ID is missing from team_stats_dict get 0.
    - Players whose team has zero total points or minutes also get 0
      instead of triggering a division by zero.
    - Computed column-wise, so an empty player_df is handled as well.
    """
    team_ids = player_df['TEAM_ID']

    # Look up each player's team totals; unknown TEAM_IDs map to NaN.
    team_pts = team_ids.map({tid: stats['PTS'] for tid, stats in team_stats_dict.items()})
    team_min = team_ids.map({tid: stats['MIN'] for tid, stats in team_stats_dict.items()})

    shares = (player_df[pts_col] / team_pts) * (player_df[min_col] / team_min)

    known_team = team_ids.isin(list(team_stats_dict))
    zero_totals = (team_pts == 0) | (team_min == 0)

    # Default to 0 where the share is undefined (no team row or empty totals).
    player_df[new_col] = shares.where(known_team & ~zero_totals, 0.0)
    return player_df

def calculate_bpm(player_df, team_stats_dict, pts_col='PTS', ast_col='AST', tov_col='TOV',
                  stl_col='STL', blk_col='BLK', reb_col='REB', min_col='MIN', new_col='BPM'):
    """
    Approximate Box Plus-Minus (BPM) for players based on box score stats.

    The value is a fixed-weight sum of box score stats scaled by the
    player's share of team minutes.

    Parameters:
    - player_df (DataFrame): Player statistics DataFrame with a TEAM_ID column.
    - team_stats_dict (dict): Aggregated team stats by TEAM_ID; only 'MIN' is read.
    - pts_col, ast_col, tov_col, stl_col, blk_col, reb_col, min_col (str): Column names for stats.
    - new_col (str): Name of the new BPM column.

    Returns:
    - player_df (DataFrame): The same DataFrame (modified in place) with BPM.

    Notes:
    - This is an approximate calculation without regression-based weights or league averages.
    - Players whose TEAM_ID is missing from team_stats_dict get 0.
    - A team with non-positive total minutes contributes a minutes factor of 0.
    - Computed column-wise, so an empty player_df is handled as well.
    """
    team_ids = player_df['TEAM_ID']

    # Team-level context; unknown TEAM_IDs map to NaN.
    team_min = team_ids.map({tid: stats['MIN'] for tid, stats in team_stats_dict.items()})

    # Offensive contribution
    obpm = (
        0.1 * player_df[pts_col] +   # Points
        0.5 * player_df[ast_col] -   # Assists
        0.25 * player_df[tov_col]    # Turnovers (penalty)
    )

    # Defensive contribution
    dbpm = (
        0.3 * player_df[stl_col] +   # Steals
        0.3 * player_df[blk_col] +   # Blocks
        0.1 * player_df[reb_col]     # Rebounds (whatever reb_col holds; total REB by default)
    )

    # Scale by the player's share of team minutes, guarding empty team totals.
    minutes_factor = (player_df[min_col] / team_min).where(team_min > 0, 0)

    known_team = team_ids.isin(list(team_stats_dict))
    player_df[new_col] = ((obpm + dbpm) * minutes_factor).where(known_team, 0)
    return player_df

In [137]:
# Recompute the advanced metrics for every season and write them back to
# the same per-season CSV files they were loaded from.
for year in range(1980, 2024):

    # load_data requires the directory as its second argument
    player_stats, team_stats = load_data(year, data_directory)

    # Calculate intermediate metrics for teams
    team_stats = calculate_team_possessions(team_stats)
    team_stats = calculate_team_pace(team_stats)
    
    # Add advanced metrics for teams
    team_stats = calculate_true_shooting_percentage(team_stats, 'PTS', 'FGA', 'FTA')
    team_stats = calculate_effective_field_goal_percentage(team_stats, 'FGM', 'FG3M', 'FGA')
    
    # Add advanced metrics for players
    player_stats = calculate_true_shooting_percentage(player_stats, 'PTS', 'FGA', 'FTA')
    player_stats = calculate_effective_field_goal_percentage(player_stats, 'FGM', 'FG3M', 'FGA')
    player_stats = calculate_per(player_stats, 'PTS', 'REB', 'AST', 'TOV', 'MIN')
    
    # Aggregate team stats
    team_stats_aggregated = aggregate_team_stats(team_stats)
    
    # Calculate Win Shares for players using aggregated team stats
    player_stats = calculate_win_shares(player_stats, team_stats_aggregated, 'PTS', 'MIN')

    # # Aggregate team stats
    # team_stats_aggregated_for_bpm = aggregate_team_stats_for_bpm(team_stats)
    
    # # Calculate BPM for players
    # player_stats = calculate_bpm(player_stats, team_stats_aggregated_for_bpm,
    #                               pts_col='PTS', ast_col='AST', tov_col='TOV',
    #                               stl_col='STL', blk_col='BLK', reb_col='REB', min_col='MIN')

    # index=False: these files are read back by load_data on the next run;
    # writing the row index would add another 'Unnamed: 0' column each time.
    player_stats.to_csv(f"{data_directory}/stats_{year}.csv", index=False)
    team_stats.to_csv(f"{data_directory}/team_stats_{year}.csv", index=False)