In [1]:
# !pip install plotly jupyterlab

In [2]:
import pandas as pd

import plotly.express as px
import plotly.io as pio
# pio.renderers.default = 'notebook'
pio.renderers.default = 'iframe'

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def fix_encoding(text):
    try:
        return text.encode('latin1').decode('utf-8').upper()
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text.upper()  # Return original text if fixing fails

In [32]:
year = 2023

player_stats_path = f'/Users/cb/src/nba_mvp_ml/data/processed/by_season/players/stats_{year}.csv'
team_stats_path = f'/Users/cb/src/nba_mvp_ml/data/processed/by_season/team (basketball-reference)/team_stats_{year}_updated.csv'
mvp_votes_path = f'/Users/cb/src/nba_mvp_ml/data/processed/by_season/mvp/mvp_{year}-{str(year+1)[2:]}.csv'

player_df = pd.read_csv(player_stats_path)
team_df = pd.read_csv(team_stats_path).dropna(axis=1, how='all')
mvp_df = pd.read_csv(mvp_votes_path)

player_df['PLAYER_FULLNAME'] = player_df['PLAYER_FULLNAME'].apply(fix_encoding)
mvp_df['Player'] = mvp_df['Player'].apply(fix_encoding)


print(f'Player dataframe columns:\n{list(player_df.columns)}')
print(f'Team dataframe columns:\n{list(team_df.columns)}')
print(f'MVP dataframe columns:\n{list(mvp_df.columns)}')

print(f'MVP List: {list(mvp_df['Player'])}')

Player dataframe columns:
['Unnamed: 0', 'PLAYER_ID', 'PLAYER_FULLNAME', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'MIN_PG', 'FGM_PG', 'FGA_PG', 'FG_PCT_PG', 'FG3M_PG', 'FG3A_PG', 'FG3_PCT_PG', 'FTM_PG', 'FTA_PG', 'FT_PCT_PG', 'OREB_PG', 'DREB_PG', 'REB_PG', 'AST_PG', 'STL_PG', 'BLK_PG', 'TOV_PG', 'PF_PG', 'PTS_PG', 'TS%', 'eFG%', 'PER', 'WS']
Team dataframe columns:
['Rk_trad', 'Team', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'SEASON_ID', 'Rk_opp', 'G_opp', 'MP_opp', 'FG_opp', 'FGA_opp', 'FG%_opp', '3P_opp', '3PA_opp', '3P%_opp', '2P_opp', '2PA_opp', '2P%_opp', 'FT_opp', 'FTA_opp', 'FT%_opp', 'ORB_opp', 'DRB_opp', 'TRB_opp', 'AST_opp', 'STL_opp', 'BLK_opp', 'TOV_opp', 'PF_opp', 'PTS_opp', '

In [5]:
def per_vs_team_success(player_df, team_df, mvp_list):
    # Merge player and team data on TEAM_ID and SEASON_ID
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))
    
    # Add a column to differentiate MVP candidates and the actual MVP
    actual_mvp = mvp_list[0]  # The first player in the list is the actual MVP
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP' if x == actual_mvp else ('MVP Candidate' if x in mvp_list else 'No MVP votes')
    )
    
    # Scatterplot
    fig = px.scatter(
        merged_df, x='PER', y='W/L%',
        size='PTS_player', color='MVP_Candidate',
        hover_name='PLAYER_FULLNAME',
        title='Player Efficiency Rating (PER) vs Team Success (W/L%)',
        labels={'PER': 'Player Efficiency Rating (PER)', 'W/L%': 'Team Win/Loss Percentage'}
    )
    fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))
    fig.update_layout(legend_title_text='Legend')
    fig.show()

per_vs_team_success(player_df, team_df, mvp_list=list(mvp_df['Player']))

In [6]:
def defense_vs_opponent_scoring(player_df, team_df, mvp_list):
    # Merge player and team data
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))
    
    # Add a column to differentiate MVP candidates and the actual MVP
    actual_mvp = mvp_list[0]
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP' if x == actual_mvp else ('MVP Candidate' if x in mvp_list else 'No MVP votes')
    )
    
    # Scatterplot for defensive stats and opponent scoring
    fig = px.scatter(
        merged_df, x='BLK_player', y='PTS_opp_pg',
        size='STL_player', color='MVP_Candidate',
        hover_name='PLAYER_FULLNAME',
        title='Defensive Impact (BLK, STL) vs Opponent Points Per Game (PTS_opp_pg)',
        labels={'BLK_player': 'Player Blocks (BLK)', 'PTS_opp_pg': 'Opponent Points Per Game'}
    )
    fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))
    fig.update_layout(legend_title_text='Legend')
    fig.show()

# Example usage
defense_vs_opponent_scoring(player_df, team_df, mvp_list=list(mvp_df['Player']))

In [7]:
def scoring_vs_offensive_rating(player_df, team_df, mvp_list):
    """
    Creates a scatter plot highlighting MVP candidates and the actual MVP.

    Parameters:
    - player_df (DataFrame): The player statistics DataFrame.
    - team_df (DataFrame): The team statistics DataFrame.
    - mvp_list (list): List of player names who were MVP candidates, with the actual MVP as the first element.
    """
    # Merge player and team data
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))
    
    # Ensure non-negative values for bubble size
    merged_df['PER'] = merged_df['PER'].clip(lower=0)  # Replace negative PER with 0
    
    # Add a column to differentiate MVP candidates and the actual MVP
    actual_mvp = mvp_list[0]  # The first player in the list is the actual MVP
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP' if x == actual_mvp else ('MVP Candidate' if x in mvp_list else 'No MVP votes')
    )
    
    # Scatterplot
    fig = px.scatter(
        merged_df, x='PTS_player', y='ORtg',
        size='PER', color='MVP_Candidate',
        hover_name='PLAYER_FULLNAME',
        title='Scoring Impact (PTS) vs Team Offensive Rating (ORtg)',
        labels={'PTS_player': 'Player Points Scored (PTS)', 'ORtg': 'Team Offensive Rating (ORtg)'}
    )
    fig.update_traces(marker=dict(line=dict(width=2, color='DarkSlateGrey')))
    fig.update_layout(legend_title_text='Legend')
    fig.show()

# Example usage
scoring_vs_offensive_rating(player_df, team_df, mvp_list=list(mvp_df['Player']))

In [8]:
def top_scorers_team_success(player_df, team_df, mvp_list):
    # Merge and filter top scorers
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))
    
    # Add a column to differentiate MVP candidates and the actual MVP
    actual_mvp = mvp_list[0]
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP' if x == actual_mvp else ('MVP Candidate' if x in mvp_list else 'No MVP votes')
    )
    
    # Filter top 20 scorers
    top_scorers = merged_df.nlargest(20, 'PTS_player')  # Top 20 scorers
    
    # Bar chart
    fig = px.bar(
        top_scorers, x='PLAYER_FULLNAME', y='PTS_player',
        color='MVP_Candidate',
        title='Top Scorers and Team Success',
        labels={'PTS_player': 'Player Points Scored', 'W/L%': 'Team Win/Loss Percentage'},
        hover_name='TEAM_ABBREVIATION_player'
    )
    fig.update_xaxes(categoryorder='total descending')
    fig.show()

# Example usage
top_scorers_team_success(player_df, team_df, mvp_list=list(mvp_df['Player']))

In [34]:
def advanced_metrics_player_contribution_filtered(player_df, team_df, mvp_df, include_non_mvp=False):
    """
    Creates an advanced metrics visualization highlighting MVP candidates with granularity based on 'Pts Won'.

    Parameters:
    - player_df (DataFrame): The player statistics DataFrame.
    - team_df (DataFrame): The team statistics DataFrame.
    - mvp_df (DataFrame): The MVP voting DataFrame.
    - include_non_mvp (bool): Whether to include non-MVP candidates in the visualization.
    """
    # Merge player and team data
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))

    # Add a column to differentiate MVP candidates
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP Candidate' if x in mvp_df['Player'].values else 'Other'
    )

    # Merge MVP voting data into the player/team dataset
    merged_with_mvp = pd.merge(
        merged_df,
        mvp_df[['Player', 'Pts Won', 'WS', 'PTS', 'TRB', 'AST']],  # Select key MVP metrics
        how='left',
        left_on='PLAYER_FULLNAME',
        right_on='Player'
    )

    # Replace metrics with MVP data where available
    merged_with_mvp['WS'] = merged_with_mvp['WS_y'].fillna(merged_with_mvp['WS_x'])
    merged_with_mvp['PTS'] = merged_with_mvp['PTS'].fillna(merged_with_mvp['PTS_player'])
    merged_with_mvp['TRB'] = merged_with_mvp['TRB_y'].fillna(merged_with_mvp['REB'])
    merged_with_mvp['AST'] = merged_with_mvp['AST_y'].fillna(merged_with_mvp['AST_player'])

    # Optionally filter out non-MVP candidates
    if not include_non_mvp:
        merged_with_mvp = merged_with_mvp[merged_with_mvp['MVP_Candidate'] != 'Other']
    
    # Drop unnecessary columns and avoid confusion between suffixes
    merged_with_mvp = merged_with_mvp.rename(columns={'Pts Won': 'Pts_Won'})
    analysis_df = merged_with_mvp[['PER', 'WS', 'Pace', 'MOV', 'Pts_Won']]

    # Parallel coordinates plot for multidimensional analysis
    fig = px.parallel_coordinates(
        analysis_df,
        dimensions=['PER', 'WS', 'Pace', 'MOV'],  # Metrics to include
        color='Pts_Won',
        color_continuous_scale='Viridis',  # Granularity of 'Pts Won'
        labels={
            'PER': 'Player Efficiency Rating (PER)',
            'WS': 'Win Shares',
            'Pace': 'Team Pace',
            'MOV': 'Margin of Victory (MOV)',
            'Pts_Won': 'MVP Points Won'
        },
        title='Advanced Metrics: Player Contribution Highlighting MVP Points'
    )
    fig.show()

# Example usage
advanced_metrics_player_contribution_filtered(player_df, team_df, mvp_df)

KeyError: 'AST_y'

In [19]:
def advanced_metrics_scatter_matrix(player_df, team_df, mvp_list, include_non_mvp=False):
    # Merge player and team data
    merged_df = pd.merge(player_df, team_df, on=['TEAM_ID', 'SEASON_ID'], suffixes=('_player', '_team'))
    
    # Add a column to differentiate MVP candidates and the actual MVP
    actual_mvp = mvp_list[0]
    merged_df['MVP_Candidate'] = merged_df['PLAYER_FULLNAME'].apply(
        lambda x: 'MVP' if x == actual_mvp else ('MVP Candidate' if x in mvp_list else 'Other')
    )

    # Optionally filter out non-MVP candidates
    if not include_non_mvp:
        merged_df = merged_df[merged_df['MVP_Candidate'] != 'Other']
    
    # Scatter matrix
    fig = px.scatter_matrix(
        merged_df,
        dimensions=['PER', 'WS', 'Pace', 'MOV'],  # Metrics to include
        color='MVP_Candidate',
        title='Scatter Matrix: Advanced Metrics and MVP Highlights',
        labels={
            'PER': 'Player Efficiency Rating (PER)',
            'WS': 'Win Shares',
            'Pace': 'Team Pace',
            'MOV': 'Margin of Victory (MOV)'
        }
    )
    fig.update_traces(diagonal_visible=False)  # Hide diagonal density plots
    fig.show()

advanced_metrics_scatter_matrix(player_df, team_df, mvp_list=list(mvp_df['Player']))

In [14]:
def voting_share_vs_points(df):
    fig = px.scatter(
        df, x='Share', y='PTS',
        size='WS', color='Tm',
        hover_name='Player',
        title='MVP Voting Share vs. Points Per Game',
        labels={'Share': 'MVP Voting Share', 'PTS': 'Points Per Game', 'WS': 'Win Shares'}
    )
    fig.show()

# Example usage
voting_share_vs_points(mvp_df)

In [None]:
def voting_share_distribution(df):
    team_shares = df.groupby('Tm')['Share'].sum().reset_index()
    
    fig = px.bar(
        team_shares, x='Tm', y='Share',
        title='Distribution of MVP Votes Among Teams',
        labels={'Tm': 'Team', 'Share': 'Total MVP Voting Share'},
        text='Share'
    )
    fig.update_traces(texttemplate='%{text:.2%}', textposition='outside')
    fig.show()

# Example usage
voting_share_distribution(mvp_df)

In [None]:
def voting_share_vs_ws_per_48(df):
    fig = px.scatter(
        df, x='Share', y='WS/48',
        size='PTS', color='Tm',
        hover_name='Player',
        title='MVP Voting Share vs. WS/48',
        labels={'Share': 'MVP Voting Share', 'WS/48': 'Win Shares Per 48 Minutes', 'PTS': 'Points Per Game'}
    )
    fig.show()

# Example usage
voting_share_vs_ws_per_48(mvp_df)

In [None]:
def player_performance_parallel(df):
    fig = px.parallel_coordinates(
        df,
        dimensions=['PTS', 'TRB', 'AST', 'WS', 'WS/48'],
        color='Share',
        title='Player Performance Metrics of MVP Candidates',
        labels={'PTS': 'Points Per Game', 'TRB': 'Rebounds Per Game', 'AST': 'Assists Per Game',
                'WS': 'Win Shares', 'WS/48': 'Win Shares Per 48 Minutes', 'Share': 'MVP Voting Share'}
    )
    fig.show()

# Example usage
player_performance_parallel(mvp_df)

In [None]:
def age_vs_voting_share(df):
    fig = px.scatter(
        df, x='Age', y='Share',
        size='PTS', color='Tm',
        hover_name='Player',
        title='Age vs. MVP Voting Share',
        labels={'Age': 'Player Age', 'Share': 'MVP Voting Share', 'PTS': 'Points Per Game'}
    )
    fig.show()

# Example usage
age_vs_voting_share(mvp_df)

In [None]:
def team_representation(df):
    team_counts = df['Tm'].value_counts().reset_index()
    team_counts.columns = ['Team', 'Count']
    
    fig = px.bar(
        team_counts, x='Team', y='Count',
        title='Team Representation in MVP Voting',
        labels={'Team': 'Team', 'Count': 'Number of MVP Candidates'},
        text='Count'
    )
    fig.update_traces(textposition='outside')
    fig.show()

# Example usage
team_representation(mvp_df)

In [None]:
list(mvp_df['Player'])