In [1]:
import pandas as pd
import numpy as np

# Read in the data with the first two lines as a multi-index and the 'Player ID' column as the index
df = pd.read_csv('./interim/nba_draft_data_combined.csv', header=[0,1], index_col=0)

# replace all 0s with NaNs
df = df.replace(0, np.nan)

df.head()

Career,Career,Career,Career,Career,Career,Career,Career,Career,Career,Career,...,2021-22,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23,2022-23
Player ID,Season,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,TEAM_ID,AGE,DEF_RATING,GP,MIN,NET_RATING,OFF_RATING,PIE,TEAM_ABBREVIATION,TEAM_ID
947.0,1996,1,PHI,Allen Iverson,Georgetown,14.0,914.0,37584.0,24368.0,3394.0,...,,,,,,,,,0,
948.0,1996,2,TOR,Marcus Camby,UMass,17.0,973.0,28684.0,9262.0,9513.0,...,,,,,,,,,0,
949.0,1996,3,VAN,Shareef Abdur-Rahim,California,12.0,830.0,28882.0,15028.0,6239.0,...,,,,,,,,,0,
950.0,1996,4,MIL,Stephon Marbury,Georgia Tech,13.0,846.0,31891.0,16297.0,2516.0,...,,,,,,,,,0,
951.0,1996,5,MIN,Ray Allen,UConn,18.0,1300.0,46344.0,24505.0,5272.0,...,,,,,,,,,0,


In [2]:
# Get the position of the last career column
position_to_insert = len(df['Career'].columns)
# Get the list of columns to insert
columns_to_insert = [('Career', 'OFF_RATING'), ('Career', 'DEF_RATING'), ('Career', 'NET_RATING'), ('Career', 'PIE')]

# Insert multiple columns with NaN values
for col in columns_to_insert:
    df.insert(loc=position_to_insert, column=col, value=np.nan)

In [3]:
# create list with all seasons between 1996-97 and 2022-23
seasons = []
for i in range(1996, 2023):
    seasons.append(str(i) + '-' + str(i+1)[2:])

# for each season, insert a new column with the product of 'GP' and 'OFF_RATING'
for season in seasons:
    # find the (season, 'OFF_RATING') column as the position to insert the new column
    position_to_insert = df.columns.get_loc((season, 'OFF_RATING'))
    df.insert(loc=position_to_insert, column=(season, 'OFF_RATING_weighted'), value=np.nan)
    # fill the new column with the product of (season,'GP') and (season,'OFF_RATING')
    df[(season, 'OFF_RATING_weighted')] = df[(season, 'GP')] * df[(season, 'OFF_RATING')]
    
    position_to_insert = df.columns.get_loc((season, 'DEF_RATING'))
    df.insert(loc=position_to_insert, column=(season, 'DEF_RATING_weighted'), value=np.nan)
    df[(season, 'DEF_RATING_weighted')] = df[(season, 'GP')] * df[(season, 'DEF_RATING')]
    
    position_to_insert = df.columns.get_loc((season, 'NET_RATING'))
    df.insert(loc=position_to_insert, column=(season, 'NET_RATING_weighted'), value=np.nan)
    df[(season, 'NET_RATING_weighted')] = df[(season, 'GP')] * df[(season, 'NET_RATING')]
    
    position_to_insert = df.columns.get_loc((season, 'PIE'))
    df.insert(loc=position_to_insert, column=(season, 'PIE_weighted'), value=np.nan)
    df[(season, 'PIE_weighted')] = df[(season, 'GP')] * df[(season, 'PIE')]

  df.insert(loc=position_to_insert, column=(season, 'DEF_RATING_weighted'), value=np.nan)
  df.insert(loc=position_to_insert, column=(season, 'NET_RATING_weighted'), value=np.nan)
  df.insert(loc=position_to_insert, column=(season, 'PIE_weighted'), value=np.nan)
  df.insert(loc=position_to_insert, column=(season, 'OFF_RATING_weighted'), value=np.nan)


In [4]:
# in the ('Career', rating) column, fill the NaNs with the sum of the weighted columns divided by the ('Career', 'G') column
df[('Career', 'OFF_RATING')] = df.filter(regex='OFF_RATING_weighted').sum(axis=1) / df[('Career', 'G')]
df[('Career', 'DEF_RATING')] = df.filter(regex='DEF_RATING_weighted').sum(axis=1) / df[('Career', 'G')]
df[('Career', 'NET_RATING')] = df.filter(regex='NET_RATING_weighted').sum(axis=1) / df[('Career', 'G')]
df[('Career', 'PIE')] = df.filter(regex='PIE_weighted').sum(axis=1) / df[('Career', 'G')]
# drop the weighted columns
df = df.drop(df.filter(regex='_weighted').columns, axis=1)

In [5]:
# copy the 'Career' columns to a new dataframe
df_career = df['Career'].copy()

# export the dataframe to a csv file
df_career.to_csv('./interim/player_career_avg.csv')

df_career.head(60)


Player ID,Season,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,TRB.1,AST.1,WS,WS/48,BPM,VORP,PIE,NET_RATING,DEF_RATING,OFF_RATING
947.0,1996,1,PHI,Allen Iverson,Georgetown,14.0,914.0,37584.0,24368.0,3394.0,...,3.7,6.2,99.0,0.126,3.2,49.6,0.136685,0.846389,102.536105,103.373742
948.0,1996,2,TOR,Marcus Camby,UMass,17.0,973.0,28684.0,9262.0,9513.0,...,9.8,1.9,81.6,0.137,2.3,31.3,0.126784,0.228571,102.740699,102.970298
949.0,1996,3,VAN,Shareef Abdur-Rahim,California,12.0,830.0,28882.0,15028.0,6239.0,...,7.5,2.5,71.2,0.118,1.0,21.9,0.12588,-5.022892,105.843614,100.801928
950.0,1996,4,MIL,Stephon Marbury,Georgia Tech,13.0,846.0,31891.0,16297.0,2516.0,...,3.0,7.6,77.5,0.117,1.8,30.7,0.123372,-1.780615,105.290426,103.501064
951.0,1996,5,MIN,Ray Allen,UConn,18.0,1300.0,46344.0,24505.0,5272.0,...,4.1,3.4,145.1,0.15,2.9,57.6,0.119475,3.884615,104.047,107.94
952.0,1996,6,BOS,Antoine Walker,Kentucky,12.0,893.0,31531.0,15647.0,6891.0,...,7.7,3.5,38.1,0.058,0.3,18.0,0.106725,-2.122172,104.507055,102.397984
953.0,1996,7,LAC,Lorenzen Wright,Memphis,13.0,778.0,18535.0,6191.0,4943.0,...,6.4,0.8,30.5,0.079,-2.6,-3.0,0.079762,-3.612725,102.883676,99.27545
954.0,1996,8,NJN,Kerry Kittles,Villanova,8.0,507.0,16929.0,7165.0,1983.0,...,3.9,2.6,44.8,0.127,2.2,18.0,0.106501,1.059763,100.949901,101.997436
955.0,1996,9,DAL,Samaki Walker,Louisville,10.0,445.0,7612.0,2376.0,2089.0,...,4.7,0.6,14.2,0.089,-2.6,-1.1,0.079793,-4.483371,101.874607,97.406517
956.0,1996,10,IND,Erick Dampier,Mississippi State,16.0,987.0,24003.0,7309.0,7005.0,...,7.1,0.8,52.7,0.105,-1.3,4.1,0.082994,0.622594,102.67153,103.291084


**Select a penalty for unbounded stats**

Drafted players who never played a game in the NBA are perceived as bad draft picks but never accumulated any stats. Since excluding them from the analysis would worsen the relative performance of other players selected at that respective draft position, the empty stats should be replaced by a "bad stat". For total points, assists and rebounds, 0 is the absolute minimum and therefore suitable as a replacement stat. However, all other stats are unbounded and don't have a definite minimum value. Using the minimum value in our data would be greatly affected by outliers which is why we opted for a quantile that can be dynamically chosen. We recommend using a quantile between 1% and 5%.

In [6]:
# TODO: Allow filtering by season range

penalty_quantile = 0.02 # TODO: Replace with widget sometime

relevant_stats = ['PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP', 'PIE', 'OFF_RATING', 'DEF_RATING', 'NET_RATING']
na_fill_values = {'PTS': 0, 'TRB': 0, 'AST': 0, 'WS': df_career['WS'].quantile(penalty_quantile), 'WS/48': df_career['WS/48'].quantile(penalty_quantile), 'BPM': df_career['BPM'].quantile(penalty_quantile), 'VORP': df_career['VORP'].quantile(penalty_quantile), 'PIE': df_career['PIE'].quantile(penalty_quantile), 'OFF_RATING': df_career['OFF_RATING'].quantile(penalty_quantile), 'DEF_RATING': df_career['DEF_RATING'].quantile(1-penalty_quantile), 'NET_RATING': df_career['NET_RATING'].quantile(penalty_quantile)}

# print a list of quantiles for each relevant stat
#for stat in relevant_stats:
    #print(stat + ': ' + str(df_career[stat].quantile(penalty_quantile)))

# fill the NaNs with the respective entry in the na_fill_values dict for each column
df_career_na_filled = df_career.fillna(value=na_fill_values)

df_career_na_filled.head(60)

Player ID,Season,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,TRB.1,AST.1,WS,WS/48,BPM,VORP,PIE,NET_RATING,DEF_RATING,OFF_RATING
947.0,1996,1,PHI,Allen Iverson,Georgetown,14.0,914.0,37584.0,24368.0,3394.0,...,3.7,6.2,99.0,0.126,3.2,49.6,0.136685,0.846389,102.536105,103.373742
948.0,1996,2,TOR,Marcus Camby,UMass,17.0,973.0,28684.0,9262.0,9513.0,...,9.8,1.9,81.6,0.137,2.3,31.3,0.126784,0.228571,102.740699,102.970298
949.0,1996,3,VAN,Shareef Abdur-Rahim,California,12.0,830.0,28882.0,15028.0,6239.0,...,7.5,2.5,71.2,0.118,1.0,21.9,0.12588,-5.022892,105.843614,100.801928
950.0,1996,4,MIL,Stephon Marbury,Georgia Tech,13.0,846.0,31891.0,16297.0,2516.0,...,3.0,7.6,77.5,0.117,1.8,30.7,0.123372,-1.780615,105.290426,103.501064
951.0,1996,5,MIN,Ray Allen,UConn,18.0,1300.0,46344.0,24505.0,5272.0,...,4.1,3.4,145.1,0.15,2.9,57.6,0.119475,3.884615,104.047,107.94
952.0,1996,6,BOS,Antoine Walker,Kentucky,12.0,893.0,31531.0,15647.0,6891.0,...,7.7,3.5,38.1,0.058,0.3,18.0,0.106725,-2.122172,104.507055,102.397984
953.0,1996,7,LAC,Lorenzen Wright,Memphis,13.0,778.0,18535.0,6191.0,4943.0,...,6.4,0.8,30.5,0.079,-2.6,-3.0,0.079762,-3.612725,102.883676,99.27545
954.0,1996,8,NJN,Kerry Kittles,Villanova,8.0,507.0,16929.0,7165.0,1983.0,...,3.9,2.6,44.8,0.127,2.2,18.0,0.106501,1.059763,100.949901,101.997436
955.0,1996,9,DAL,Samaki Walker,Louisville,10.0,445.0,7612.0,2376.0,2089.0,...,4.7,0.6,14.2,0.089,-2.6,-1.1,0.079793,-4.483371,101.874607,97.406517
956.0,1996,10,IND,Erick Dampier,Mississippi State,16.0,987.0,24003.0,7309.0,7005.0,...,7.1,0.8,52.7,0.105,-1.3,4.1,0.082994,0.622594,102.67153,103.291084


In [7]:
# TODO: Maybe add toggle button that allows comparing only to players of same position

# group the df_career dataframe by 'Pk' and calculate the average for each relevant stat
df_avg = df_career_na_filled.groupby('Pk')[relevant_stats].mean(numeric_only=True)
df_avg = df_avg.reset_index()

df_avg

Player ID,Pk,PTS,TRB,AST,WS,WS/48,BPM,VORP,PIE,OFF_RATING,DEF_RATING,NET_RATING
0,1,10584.814815,4343.777778,2085.592593,58.014815,0.120926,1.42963,23.292593,0.124651,107.487503,106.777783,0.704352
1,2,7187.888889,3181.740741,1377.740741,35.960222,0.083764,-0.691111,10.307407,0.100532,103.784086,107.38225,-3.207812
2,3,10229.111111,3584.666667,2222.814815,52.744444,0.116148,1.274074,19.259259,0.117689,107.041839,107.019079,0.020454
3,4,8261.62963,3193.777778,2149.666667,42.859259,0.094815,-0.620741,13.607407,0.101631,106.379093,107.328083,-0.949534
4,5,8369.962963,2950.62963,1954.444444,39.886148,0.083815,-0.514815,13.133333,0.097731,105.009115,106.95509,-1.945165
5,6,4981.259259,1965.925926,977.444444,23.578741,0.08043,-0.666667,6.881481,0.090968,104.194691,106.552246,-2.3535
6,7,7493.222222,2754.37037,1558.444444,32.992593,0.084407,-1.20963,7.862963,0.09511,105.717086,107.377567,-1.664906
7,8,5479.666667,2033.259259,1219.111111,22.874074,0.076296,-1.665185,4.966667,0.085489,104.309812,106.343669,-2.033688
8,9,7624.037037,3266.888889,1507.222222,42.896296,0.095296,-0.533333,14.081481,0.100264,105.06628,106.602401,-1.530061
9,10,6987.0,2425.444444,1464.074074,32.655556,0.080259,-1.177778,9.422222,0.087978,105.069798,106.50283,-1.429809


In [8]:
# for each relevant stat, create a new column in the df_career_na_filled dataframe with the difference between the player's stat and the average for that stat for their draft position
for stat in relevant_stats:
    df_career_na_filled[stat + '_diff'] = df_career_na_filled[stat] - df_career_na_filled['Pk'].map(df_avg[stat])

df_career_na_filled.head(60)

Player ID,Season,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,TRB_diff,AST_diff,WS_diff,WS/48_diff,BPM_diff,VORP_diff,PIE_diff,OFF_RATING_diff,DEF_RATING_diff,NET_RATING_diff
947.0,1996,1,PHI,Allen Iverson,Georgetown,14.0,914.0,37584.0,24368.0,3394.0,...,212.259259,4246.259259,63.039778,0.042236,3.891111,39.292593,0.036153,-0.410344,-4.846145,4.054202
948.0,1996,2,TOR,Marcus Camby,UMass,17.0,973.0,28684.0,9262.0,9513.0,...,5928.333333,-385.814815,28.855556,0.020852,1.025926,12.040741,0.009096,-4.071541,-4.27838,0.208118
949.0,1996,3,VAN,Shareef Abdur-Rahim,California,12.0,830.0,28882.0,15028.0,6239.0,...,3045.222222,-40.666667,28.340741,0.023185,1.620741,8.292593,0.024248,-5.577166,-1.484468,-4.073358
950.0,1996,4,MIL,Stephon Marbury,Georgia Tech,13.0,846.0,31891.0,16297.0,2516.0,...,-434.62963,4516.555556,37.613852,0.033185,2.314815,17.566667,0.025641,-1.508051,-1.664665,0.16455
951.0,1996,5,MIN,Ray Allen,UConn,18.0,1300.0,46344.0,24505.0,5272.0,...,3306.074074,3383.555556,121.521259,0.06957,3.566667,50.718519,0.028507,3.745309,-2.505246,6.238115
952.0,1996,6,BOS,Antoine Walker,Kentucky,12.0,893.0,31531.0,15647.0,6891.0,...,4136.62963,1611.555556,5.107407,-0.026407,1.50963,10.137037,0.011615,-3.319101,-2.870512,-0.457266
953.0,1996,7,LAC,Lorenzen Wright,Memphis,13.0,778.0,18535.0,6191.0,4943.0,...,2909.740741,-597.111111,7.625926,0.002704,-0.934815,-7.966667,-0.005727,-5.034363,-3.459993,-1.579037
954.0,1996,8,NJN,Kerry Kittles,Villanova,8.0,507.0,16929.0,7165.0,1983.0,...,-1283.888889,-212.222222,1.903704,0.031704,2.733333,3.918519,0.006237,-3.068844,-5.6525,2.589824
955.0,1996,9,DAL,Samaki Walker,Louisville,10.0,445.0,7612.0,2376.0,2089.0,...,-336.444444,-1212.074074,-18.455556,0.008741,-1.422222,-10.522222,-0.008185,-7.663281,-4.628223,-3.053562
956.0,1996,10,IND,Erick Dampier,Mississippi State,16.0,987.0,24003.0,7309.0,7005.0,...,5612.777778,146.962963,36.502741,0.035125,0.372593,0.859259,0.00041,0.247801,-2.836452,2.706073


**Calculate ranks for all players above a minimum amount of games played**

This section calculates the ranking of the performance above/below the average player at that draft position for each stat. Some of the stats that are normalized per 48 minutes can be affected by players that only played very little but performed well in this limited time (e.g., a player only ever played two minutes at the end of a blowout but scored four points). To avoid such players being ranked very highly, a minimum amount of games can be set here and all players that do not meet this requirement are excluded from the ranking.

In [9]:
min_games = 82 # TODO: Replace with widget sometime

# create a new column for each relevant stat with the rank of the player's stat for their draft position, only using player with min_games
for stat in relevant_stats:
    if stat == 'DEF_RATING': # lower DEF_RATING is better
        df_career_na_filled[stat + '_rank'] = df_career_na_filled[df_career_na_filled['G'] >= min_games][stat + '_diff'].rank(ascending=True, method='min')
    else:
        df_career_na_filled[stat + '_rank'] = df_career_na_filled[df_career_na_filled['G'] >= min_games][stat + '_diff'].rank(ascending=False, method='min')

**Show Performance of Selected Player**

In [10]:
import ipywidgets as widgets
from IPython.display import display, HTML
from tabulate import tabulate

# create a combobox widget with all 'Player' values
player_widget = widgets.Combobox(
    placeholder='Choose a Player',
    options=df_career_na_filled['Player'].unique().tolist(),
    description='Player:',
    ensure_option=True,
    disabled=False
)

# Define the output area to display additional information
output_player_ranks = widgets.Output()

# Function to update the output area based on the selected player
def on_value_change_player_ranks(change):
    output_player_ranks.clear_output()
    selected_player = change['new']
    with output_player_ranks:
        player_data = df_career_na_filled[df_career_na_filled['Player'] == selected_player]
        display(HTML(f"<h3>Player: {player_data['Player'].values[0]}</h3>"))
        display(HTML(f"<p>Year: {player_data['Season'].values[0]} Pick: {player_data['Pk'].values[0]} - Drafted by: {player_data['Tm'].values[0]}</p>"))
        display(HTML(f"<p>Played {int(player_data['G'].values[0])} games in {int(player_data['Yrs'].values[0])} years</p>"))
        stats_table = [relevant_stats]
        row = []
        for stat in relevant_stats:
            rank = int(player_data[f'{stat}_rank'].values[0])
            actual_stat = round(player_data[stat].values[0], 1)
            diff_stat = round(player_data[f'{stat}_diff'].values[0], 1)
            if diff_stat > 0:
                diff_stat = "+" + str(diff_stat)
            row += f"{rank}. - {actual_stat} ({diff_stat})",
        stats_table.append(row)
        display(HTML(tabulate(stats_table, tablefmt="html")))

# Observe changes in the value of the combobox and call the function
player_widget.observe(on_value_change_player_ranks, names='value')

# Display the widgets
display(player_widget)
display(output_player_ranks)

Combobox(value='', description='Player:', ensure_option=True, options=('Allen Iverson', 'Marcus Camby', 'Share…

Output()

In [11]:
import plotly.express as px
from IPython.display import clear_output

# Filter players based on the minimum games played
filtered_df = df_career_na_filled[df_career_na_filled['G'] >= min_games]

# create a combobox widget with all 'Player' values
player_widget_diff_scatter = widgets.Combobox(
    placeholder='Choose a Player',
    options=df_career_na_filled['Player'].unique().tolist(),
    description='Player:',
    ensure_option=True,
    disabled=False
)

# create a dropdown widget with all relevant stats
stat_dropdown_diff_scatter = widgets.Dropdown(
    options=relevant_stats,
    description='Select Stat:',
    disabled=False,
)

# Define the output area to display the scatter plot
output_diff_scatter = widgets.Output()

# Function to update the output area based on the selected stat
def on_value_change_diff_scatter(change):
    clear_output()
    output_diff_scatter.clear_output()
    selected_stat = stat_dropdown_diff_scatter.value
    selected_player = player_widget_diff_scatter.value

    display(player_widget_diff_scatter)
    display(stat_dropdown_diff_scatter)
    with output_diff_scatter:
        fig = px.scatter(filtered_df, x='Pk', y=f'{selected_stat}_diff', hover_name='Player',
                         hover_data={'Pk': True, f'{selected_stat}_diff': True, 'Player': False, selected_stat: True, f'{selected_stat}_rank': True})
        if selected_player in filtered_df['Player'].values:
            highlighted_player = filtered_df[filtered_df['Player'] == selected_player]
            fig.add_trace(px.scatter(highlighted_player, x='Pk', y=f'{selected_stat}_diff', hover_name='Player',
                                     hover_data={'Pk': True, f'{selected_stat}_diff': True, 'Player': False, selected_stat: True, f'{selected_stat}_rank': True},
                                     color_discrete_sequence=['red']).data[0])
        fig.update_traces(marker=dict(size=12), showlegend=False)
        fig.update_layout(title=f'{selected_stat} Difference vs Draft Pick for Players with at least {min_games} Games Played',
                          xaxis_title='Draft Pick', yaxis_title=f'{selected_stat} Difference')
        fig.show()
    
# Observe changes in the value of the dropdown and call the function
stat_dropdown_diff_scatter.observe(on_value_change_diff_scatter, names='value')
player_widget_diff_scatter.observe(on_value_change_diff_scatter, names='value')

# Display the dropdown and the output area
display(player_widget_diff_scatter)
display(stat_dropdown_diff_scatter)
display(output_diff_scatter)

Combobox(value='', description='Player:', ensure_option=True, options=('Allen Iverson', 'Marcus Camby', 'Share…

Dropdown(description='Select Stat:', options=('PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP', 'PIE', 'OFF_R…

Output()

In [12]:
# create a combobox widget with all 'Player' values
player_widget_total_scatter = widgets.Combobox(
    placeholder='Choose a Player',
    options=df_career_na_filled['Player'].unique().tolist(),
    description='Player:',
    ensure_option=True,
    disabled=False
)

# create a dropdown widget with all relevant stats
stat_dropdown_total_scatter = widgets.Dropdown(
    options=relevant_stats,
    description='Select Stat:',
    disabled=False,
)

# Define the output area to display the scatter plot
output_total_scatter = widgets.Output()

# Function to update the output area based on the selected stat
def on_value_change_total_scatter(change):
    clear_output()
    output_total_scatter.clear_output()
    selected_stat = stat_dropdown_total_scatter.value
    selected_player = player_widget_total_scatter.value

    display(player_widget_total_scatter)
    display(stat_dropdown_total_scatter)
    with output_total_scatter:
        fig = px.scatter(filtered_df, x='Pk', y=selected_stat, hover_name='Player',
                         hover_data={'Pk': True, selected_stat: True, 'Player': False, f'{selected_stat}_diff': True, f'{selected_stat}_rank': True})
        
        # add a yellow dot for each average in the df_avg dataframe
        fig.add_trace(px.scatter(df_avg, x='Pk', y=selected_stat, hover_name='Pk',
                                 hover_data={'Pk': True, selected_stat: True}, 
                                 color_discrete_sequence=['yellow']).data[0])
        
        if selected_player in filtered_df['Player'].values:
            highlighted_player = filtered_df[filtered_df['Player'] == selected_player]
            fig.add_trace(px.scatter(highlighted_player, x='Pk', y=selected_stat, hover_name='Player',
                                    hover_data={'Pk': True, selected_stat: True, 'Player': False, f'{selected_stat}_diff': True, f'{selected_stat}_rank': True},
                                    color_discrete_sequence=['red']).data[0])
        fig.update_traces(marker=dict(size=12), showlegend=False)
        fig.update_layout(title=f'{selected_stat} vs Draft Pick for Players with at least {min_games} Games Played',
                          xaxis_title='Draft Pick', yaxis_title=selected_stat)
        fig.show()
    
# Observe changes in the value of the dropdown and call the function
stat_dropdown_total_scatter.observe(on_value_change_total_scatter, names='value')
player_widget_total_scatter.observe(on_value_change_total_scatter, names='value')

# Display the dropdown and the output area
display(player_widget_total_scatter)
display(stat_dropdown_total_scatter)
display(output_total_scatter)

Combobox(value='', description='Player:', ensure_option=True, options=('Allen Iverson', 'Marcus Camby', 'Share…

Dropdown(description='Select Stat:', options=('PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP', 'PIE', 'OFF_R…

Output()

# **Analysis by Team over Seasons**

In [13]:
# function to replace outdated abbreviations with the current ones
def clean_team_names(df):
    team_dict = {'CHH': 'CHO', 'CHA': 'CHO', 'NJN': 'BRK', 'NOH': 'NOP', 'NOK': 'NOP', 'SEA': 'OKC', 'VAN': 'MEM', 'WSB': 'WAS'}
    df['Tm'] = df['Tm'].replace(team_dict)
    return df

In [14]:
# TODO: give option to weight by minutes played

# group the df_career_na_filled by team and season and calculate the average
df_team_avg_by_season = clean_team_names(df_career_na_filled).groupby(['Tm', 'Season']).mean(numeric_only=True)
df_team_avg_by_season = df_team_avg_by_season.reset_index()

df_team_avg_by_season

Player ID,Tm,Season,Pk,Yrs,G,MP,PTS,TRB,AST,FG%,...,TRB_rank,AST_rank,WS_rank,WS/48_rank,BPM_rank,VORP_rank,PIE_rank,OFF_RATING_rank,DEF_RATING_rank,NET_RATING_rank
0,ATL,1996,28.000000,2.0,74.00,525.00,255.000000,143.000000,33.000000,0.47200,...,,,,,,,,,,
1,ATL,1997,41.000000,4.5,156.00,2397.00,675.666667,206.666667,66.666667,0.38800,...,388.0,381.0,372.00,127.00,187.0,348.0,223.0,205.00,320.0,211.00
2,ATL,1998,34.500000,2.0,77.50,1377.00,494.000000,178.500000,95.000000,0.36900,...,847.0,900.0,888.00,916.00,901.0,833.0,707.0,916.00,158.0,741.00
3,ATL,1999,18.500000,9.0,579.25,14832.25,6259.250000,1607.500000,1576.750000,0.42075,...,494.5,575.5,513.25,586.75,517.0,483.5,563.0,731.25,329.5,601.25
4,ATL,2000,34.333333,4.5,249.50,4145.00,945.333333,403.666667,126.333333,0.42650,...,773.0,796.5,805.00,812.50,815.5,822.5,872.5,869.00,167.0,713.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,WAS,2018,29.500000,5.0,297.00,5951.00,974.500000,572.000000,224.000000,0.42900,...,635.0,682.0,683.00,594.00,558.0,721.0,541.0,204.00,924.0,520.00
741,WAS,2019,9.000000,4.0,210.00,5650.00,2620.000000,1049.000000,275.000000,0.48000,...,906.0,934.0,931.00,815.00,862.0,941.0,622.0,445.00,944.0,877.00
742,WAS,2020,23.000000,2.5,135.50,3058.00,973.500000,650.500000,257.500000,0.41900,...,877.0,924.0,937.00,845.00,830.0,936.0,798.0,358.00,891.0,673.00
743,WAS,2021,15.000000,2.0,151.00,3894.00,1454.000000,416.000000,172.000000,0.47800,...,835.0,834.0,723.00,511.00,586.0,751.0,662.0,120.00,965.0,511.00


In [15]:
# create a scatter plot with the average of each team for each season. The seasons should be the x axis and the selected relevant stat differences should be the y axis
def create_team_scatter(selected_stat, selected_team, show_players):
    fig = px.scatter(df_team_avg_by_season, x='Season', y=f'{selected_stat}_diff', hover_name='Tm',
                     hover_data={'Tm': True, selected_stat: True, f'{selected_stat}_diff': True})
    
    # add a red dot for all stats of the selected team
    fig.add_trace(px.scatter(df_team_avg_by_season[df_team_avg_by_season['Tm'] == selected_team], x='Season', y=f'{selected_stat}_diff', hover_name='Tm',
                                hover_data={'Tm': True, selected_stat: True, f'{selected_stat}_diff': True},
                                color_discrete_sequence=['red']).data[0])
    
    if show_players:
        # add a yellow dot for each player of the selected team
        fig.add_trace(px.scatter(df_career_na_filled[df_career_na_filled['Tm'] == selected_team], x='Season', y=f'{selected_stat}_diff', hover_name='Player',
                                    hover_data={'Player': True, selected_stat: True, f'{selected_stat}_diff': True},
                                    color_discrete_sequence=['yellow']).data[0])

    fig.update_traces(marker=dict(size=12), showlegend=False)
    fig.update_layout(title=f'{selected_stat} Difference vs Season for Teams',
                      xaxis_title='Season', yaxis_title=f'{selected_stat} Difference')
    fig.show()

# create a dropdown widget with all relevant stats
stat_dropdown_team_scatter = widgets.Dropdown(
    options=relevant_stats,
    description='Select Stat:',
    disabled=False,
)

# create a dropdown widget with all teams
team_dropdown_team_scatter = widgets.Dropdown(
    options=df_team_avg_by_season['Tm'].unique().tolist(),
    description='Select Team:',
    disabled=False,
)

# create a checkbox widget to toggle between showing players or not
player_checkbox_team_scatter = widgets.Checkbox(
    value=True,
    description='Show Players selected by Team',
    disabled=False,
    indent=False
)

# Define the output area to display the scatter plot
output_team_scatter = widgets.Output()

# Observe changes in the value of the dropdown and call the function
def on_value_change_team_scatter(change):
    clear_output()
    output_team_scatter.clear_output()
    selected_stat = stat_dropdown_team_scatter.value
    selected_team = team_dropdown_team_scatter.value
    show_players = player_checkbox_team_scatter.value

    display(stat_dropdown_team_scatter)
    display(team_dropdown_team_scatter)
    display(player_checkbox_team_scatter)
    with output_team_scatter:
        create_team_scatter(selected_stat, selected_team, show_players)

stat_dropdown_team_scatter.observe(on_value_change_team_scatter, names='value')
team_dropdown_team_scatter.observe(on_value_change_team_scatter, names='value')
player_checkbox_team_scatter.observe(on_value_change_team_scatter, names='value')

# Display the dropdown and the output area
display(stat_dropdown_team_scatter)
display(team_dropdown_team_scatter)
display(player_checkbox_team_scatter)
display(output_team_scatter)



Dropdown(description='Select Stat:', options=('PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP', 'PIE', 'OFF_R…

Dropdown(description='Select Team:', options=('ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', '…

Checkbox(value=True, description='Show Players selected by Team', indent=False)

Output()

# **Analysis by Team**

In [16]:
df_team_avg = clean_team_names(df_career_na_filled).groupby('Tm').mean(numeric_only=True)
df_team_avg = df_team_avg.reset_index()

df_team_avg

Player ID,Tm,Season,Pk,Yrs,G,MP,PTS,TRB,AST,FG%,...,TRB_rank,AST_rank,WS_rank,WS/48_rank,BPM_rank,VORP_rank,PIE_rank,OFF_RATING_rank,DEF_RATING_rank,NET_RATING_rank
0,ATL,2009.261538,29.907692,5.625,314.803571,7577.267857,2774.107692,1146.446154,656.692308,0.436982,...,551.414634,536.121951,549.439024,544.317073,529.97561,529.634146,518.463415,544.536585,548.121951,589.02439
1,BOS,2010.184615,30.030769,5.864407,318.033898,8091.542373,3318.892308,1130.384615,661.446154,0.439897,...,535.487179,496.846154,500.307692,487.897436,469.923077,475.512821,509.512821,465.871795,477.0,437.846154
2,BRK,2009.918367,30.714286,5.466667,292.444444,6881.377778,2616.55102,1224.040816,406.346939,0.450523,...,481.53125,532.78125,500.6875,519.28125,503.875,475.34375,499.46875,488.0,532.0625,510.03125
3,CHI,2007.229508,27.229508,6.568966,354.275862,8547.473684,3303.852459,1465.967213,676.442623,0.438982,...,476.195652,486.76087,494.282609,531.891304,550.521739,515.521739,559.065217,543.217391,443.826087,496.021739
4,CHO,2010.823529,25.372549,6.702128,376.021277,9200.0,3873.156863,1410.666667,879.666667,0.463911,...,520.361111,501.027778,519.555556,509.111111,501.944444,562.444444,455.416667,534.194444,517.861111,554.666667
5,CLE,2008.44,24.52,7.266667,415.244444,10926.222222,4487.34,1615.64,1076.14,0.428556,...,545.277778,497.166667,503.75,528.611111,527.861111,530.166667,547.583333,499.472222,466.638889,477.944444
6,DAL,2007.065217,36.869565,5.121212,234.030303,4523.242424,1362.347826,611.891304,285.326087,0.425812,...,561.782609,559.695652,562.608696,512.347826,524.217391,539.086957,550.0,590.869565,415.304348,511.130435
7,DEN,2007.98,31.56,6.069767,333.325581,8211.139535,3196.56,1251.02,705.2,0.443884,...,459.206897,432.241379,412.724138,422.689655,431.517241,456.551724,435.034483,416.344828,544.586207,438.37931
8,DET,2009.472727,31.909091,6.020408,328.714286,7692.306122,2751.981818,1297.963636,537.981818,0.429041,...,494.028571,521.857143,490.085714,505.828571,504.142857,505.085714,517.057143,510.657143,447.8,493.257143
9,GSW,2008.875,27.979167,6.395349,368.093023,9373.372093,3877.020833,1466.0625,777.916667,0.460093,...,463.6,492.366667,458.933333,469.066667,505.933333,469.533333,496.1,503.333333,499.433333,493.4


In [17]:
# create a scatter plot with the difference of the selected stat for each team. The y axis should be the selected stat and the x axis should be the average Pk
def teams_vs_avg_pick_scatter(selected_stat):
    fig = px.scatter(df_team_avg, x='Pk', y=f'{selected_stat}_diff', hover_name='Tm',
                     hover_data={'Tm': True, selected_stat: True, f'{selected_stat}_diff': True, 'Pk': True})
    fig.update_traces(marker=dict(size=12), showlegend=False)
    fig.update_layout(title=f'{selected_stat} Difference vs Average Draft Position for Teams',
                      xaxis_title='Average Draft Position', yaxis_title=f'{selected_stat} Difference')
    fig.show()

# create a dropdown widget with all relevant stats
stat_dropdown_avg_position_scatter = widgets.Dropdown(
    options=relevant_stats,
    description='Select Stat:',
    disabled=False,
)

# Define the output area to display the scatter plot
output_avg_position_scatter = widgets.Output()

# Observe changes in the value of the dropdown and call the function
def on_value_change_avg_position_scatter(change):
    clear_output()
    output_avg_position_scatter.clear_output()
    selected_stat = stat_dropdown_avg_position_scatter.value

    display(stat_dropdown_avg_position_scatter)
    with output_avg_position_scatter:
        teams_vs_avg_pick_scatter(selected_stat)

stat_dropdown_avg_position_scatter.observe(on_value_change_avg_position_scatter, names='value')

# Display the dropdown and the output area
display(stat_dropdown_avg_position_scatter)
display(output_avg_position_scatter)

Dropdown(description='Select Stat:', options=('PTS', 'TRB', 'AST', 'WS', 'WS/48', 'BPM', 'VORP', 'PIE', 'OFF_R…

Output()

In [18]:
# for each stat, print the team with the highest average difference for that stat
for stat in relevant_stats:
    print(f'Best team by {stat}: {df_team_avg[df_team_avg[f"{stat}_diff"] == df_team_avg[f"{stat}_diff"].max()]["Tm"].values[0]}')

Best team by PTS: SAS
Best team by TRB: TOR
Best team by AST: SAS
Best team by WS: SAS
Best team by WS/48: IND
Best team by BPM: IND
Best team by VORP: SAS
Best team by PIE: PHO
Best team by OFF_RATING: PHO
Best team by DEF_RATING: PHO
Best team by NET_RATING: MIL
