<h1 style="color:black; font-weight:bold">NBA Draft Analysis</h1>   


<h2 style="color:red; font-weight:bold">Import Libraries</h2>

In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px


<h2 style="color:red; font-weight:bold">Load the 1989-2021 Draft Dataset</h2>

In [76]:
df = pd.read_csv('nbaplayersdraft.csv')

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1922 entries, 0 to 1921
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         1922 non-null   int64  
 1   year                       1922 non-null   int64  
 2   rank                       1922 non-null   int64  
 3   overall_pick               1922 non-null   int64  
 4   team                       1922 non-null   object 
 5   player                     1922 non-null   object 
 6   college                    1585 non-null   object 
 7   years_active               1669 non-null   float64
 8   games                      1669 non-null   float64
 9   minutes_played             1669 non-null   float64
 10  points                     1669 non-null   float64
 11  total_rebounds             1669 non-null   float64
 12  assists                    1669 non-null   float64
 13  field_goal_percentage      1665 non-null   float

In [78]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1922.0,961.5,554.977927,1.0,481.25,961.5,1441.75,1922.0
year,1922.0,2005.317378,9.456946,1989.0,1997.0,2005.0,2013.0,2021.0
rank,1922.0,29.694589,16.912454,1.0,15.0,30.0,44.0,60.0
overall_pick,1922.0,29.694589,16.912454,1.0,15.0,30.0,44.0,60.0
years_active,1669.0,6.332534,4.656321,1.0,2.0,5.0,10.0,22.0
games,1669.0,348.04254,324.897567,1.0,72.0,235.0,584.0,1541.0
minutes_played,1669.0,8399.055722,9845.871529,0.0,838.0,4204.0,13246.0,52139.0
points,1669.0,3580.413421,4826.142847,0.0,265.0,1552.0,5150.0,37062.0
total_rebounds,1669.0,1497.009587,2003.686388,0.0,128.0,656.0,2139.0,15091.0
assists,1669.0,774.300779,1284.602969,0.0,46.0,257.0,910.0,12091.0


<h2 style="color:red; font-weight:bold"> Check for NA Values</h2>

In [79]:
df.isna().sum()

id                             0
year                           0
rank                           0
overall_pick                   0
team                           0
player                         0
college                      337
years_active                 253
games                        253
minutes_played               253
points                       253
total_rebounds               253
assists                      253
field_goal_percentage        257
3_point_percentage           377
free_throw_percentage        289
average_minutes_played       253
points_per_game              253
average_total_rebounds       253
average_assists              253
win_shares                   253
win_shares_per_48_minutes    254
box_plus_minus               254
value_over_replacement       253
dtype: int64

<h2 style="color:red; font-weight:bold">Replacing All NA Values in Numerical Columns with 0 </h2>

In [80]:
na_cols = ['years_active', 'games', 'minutes_played', 'points', 'total_rebounds',
       'assists', 'field_goal_percentage', '3_point_percentage',
       'free_throw_percentage', 'average_minutes_played', 'points_per_game',
       'average_total_rebounds', 'average_assists', 'win_shares',
       'win_shares_per_48_minutes', 'box_plus_minus',
       'value_over_replacement']

for col in na_cols:
    df[col]= df[col].fillna(0)

### All players with missing college values fall into three categories:   
* Foreign Players   
* Entered the NBA straight out of High School   
* Entered the NBA straight out of the G-League    

With 337 total players that fall into these categories, I created one category to represent them together: HS/G-Lea/Foreign

In [81]:
df['college']= df['college'].fillna("HS/G-Lea/Foreign")

In [82]:
df.isna().sum()

id                           0
year                         0
rank                         0
overall_pick                 0
team                         0
player                       0
college                      0
years_active                 0
games                        0
minutes_played               0
points                       0
total_rebounds               0
assists                      0
field_goal_percentage        0
3_point_percentage           0
free_throw_percentage        0
average_minutes_played       0
points_per_game              0
average_total_rebounds       0
average_assists              0
win_shares                   0
win_shares_per_48_minutes    0
box_plus_minus               0
value_over_replacement       0
dtype: int64

With the inclusion of team relocations and expansion teams, I made the necessary adjustments to the team abbreviations in the NBA draft picks dataset. This step aims to ensure the accuracy and consistency of the team abbreviations, aligning them with the current state of the NBA teams.

In [83]:
condition = (df['team'] == 'CHH') & (df['year'] == 2014)
df.loc[condition, 'team'] = 'CHO'
team_replace = {"SEA" : "OKC", "VAN" : "MEM", "NJN" : "BRK", "WSB" : "WAS",
               "CHA" : "CHO", "NOK" : "NOP", "NOH" : "NOP", "CHH" : "NOP"}
df['team'] = df['team'].replace(team_replace)

# Exploratory Data Analysis
***
<h2 style="color:red; font-weight:bold">EDA for each NBA Team </h2>
<h3 style="color:blue; font-weight:bold">Points Per Game by Team and Overall Pick</h3>

In [84]:
sorted_df = df.sort_values(by = "team", ascending = True)

fig = px.scatter(sorted_df, x = "overall_pick", y = "points_per_game",
                    range_x = (0, 60), range_y = (0, 30),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (0, 25),
                    color = "points_per_game", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [85]:
df.groupby(by="team")["points"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,2430.9125,0.0,20894.0
BOS,3190.853333,0.0,26397.0
BRK,2760.131148,0.0,14247.0
CHI,2968.173333,0.0,20558.0
CHO,2921.358974,0.0,14414.0
CLE,4397.763636,0.0,37062.0
DAL,1935.0,0.0,17529.0
DEN,3426.140625,0.0,28289.0
DET,3034.753846,0.0,17137.0
GSW,3856.530303,0.0,25728.0


<h3 style="color:blue; font-weight:bold">Total Career Games by Team and Overall Pick</h3>

In [86]:
sorted_df = df.sort_values(by = "team", ascending = True)

fig = px.scatter(sorted_df, x = "overall_pick", y = "games",
                    range_x = (0, 60), range_y = (0, 1600),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (0, 1600),
                    color = "games", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [87]:
df.groupby(by="team")["games"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,256.3875,0.0,1410.0
BOS,295.306667,0.0,1343.0
BRK,275.819672,0.0,1232.0
CHI,318.613333,0.0,1076.0
CHO,298.461538,0.0,976.0
CLE,381.454545,0.0,1366.0
DAL,219.903226,0.0,1391.0
DEN,312.90625,0.0,1260.0
DET,316.246154,0.0,1026.0
GSW,329.727273,0.0,1541.0


<h3 style="color:blue; font-weight:bold">Total Career Rebounds by Team and Overall Pick</h3>

In [88]:
fig = px.scatter(sorted_df, x = "overall_pick", y = "average_total_rebounds",
                    range_x = (0, 60), range_y = (0, 18),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (0, 15),
                    color = "average_total_rebounds", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [89]:
df.groupby(by="team")["total_rebounds"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,1047.4875,0.0,11305.0
BOS,1118.64,0.0,7690.0
BRK,1294.311475,0.0,8409.0
CHI,1324.0,0.0,9040.0
CHO,1170.589744,0.0,5967.0
CLE,1596.309091,0.0,10210.0
DAL,871.16129,0.0,8725.0
DEN,1406.703125,0.0,12359.0
DET,1346.215385,0.0,9519.0
GSW,1513.060606,0.0,6854.0


<h3 style="color:blue; font-weight:bold">Total Career Assists by Team and Overall Pick</h3>

In [90]:
fig = px.scatter(sorted_df, x = "overall_pick", y = "average_assists",
                    range_x = (0, 60), range_y = (0, 10),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (0, 10),
                    color = "average_assists", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [91]:
df.groupby(by="team")["assists"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,577.45,0.0,5415.0
BOS,657.746667,0.0,5636.0
BRK,564.278689,0.0,5972.0
CHI,631.68,0.0,4245.0
CHO,665.410256,0.0,5096.0
CLE,1097.018182,0.0,10045.0
DAL,517.564516,0.0,12091.0
DEN,741.359375,0.0,4508.0
DET,589.2,0.0,4252.0
GSW,802.060606,0.0,7095.0


<h3 style="color:blue; font-weight:bold">Career Span by Team and Overall Pick</h3>

In [92]:
fig = px.scatter(sorted_df, x = "overall_pick", y = "years_active",
                    range_x = (0, 60), range_y = (0, 25),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (0, 25),
                    color = "years_active", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [93]:
df.groupby(by="team")["years_active"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,4.725,0.0,19.0
BOS,5.4,0.0,19.0
BRK,5.032787,0.0,17.0
CHI,5.946667,0.0,17.0
CHO,5.358974,0.0,14.0
CLE,6.745455,0.0,20.0
DAL,4.33871,0.0,19.0
DEN,5.515625,0.0,19.0
DET,5.892308,0.0,18.0
GSW,5.924242,0.0,22.0


<h3 style="color:blue; font-weight:bold">Career Box Plus Minus by Team and Overall Pick</h3>

In [94]:
fig = px.scatter(sorted_df, x = "overall_pick", y = "box_plus_minus",
                    range_x = (0, 60), range_y = (-50, 50),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team", range_color = (-50, 50),
                    color = "box_plus_minus", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [95]:
df.groupby(by="team")["box_plus_minus"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,-2.50875,-16.3,6.8
BOS,-2.325333,-25.9,5.3
BRK,-2.695082,-52.0,3.0
CHI,-2.466667,-11.1,4.7
CHO,-1.728205,-21.8,6.0
CLE,-1.903636,-15.9,8.9
DAL,-2.270968,-25.5,3.8
DEN,-1.2625,-8.8,8.9
DET,-1.821538,-14.0,9.4
GSW,-2.536364,-32.4,6.5


<h3 style="color:blue; font-weight:bold">Value over Replacement by Team and Overall Pick</h3>

In [96]:
fig = px.scatter(sorted_df, x = "overall_pick", y = "value_over_replacement",
                    range_x = (0, 60), range_y = (-10, 60),
                    hover_data = ["college"],
                    hover_name="player",
                    animation_frame = "team",
                    color = "value_over_replacement", color_continuous_scale = "YlOrRd")
                    #title = "Points Per Game Overall Pick 1-60 Animation by Year and Games")

fig.update_traces(marker = dict(size = 5)) # scaling down the markers
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

In [97]:
df.groupby(by="team")["value_over_replacement"].agg(["mean","min","max"])

Unnamed: 0_level_0,mean,min,max
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ATL,2.9775,-6.4,57.5
BOS,3.662667,-5.2,65.5
BRK,3.77541,-2.4,39.2
CHI,3.037333,-4.3,38.3
CHO,2.864103,-3.2,29.8
CLE,6.665455,-5.9,142.6
DAL,1.985484,-4.0,73.5
DEN,4.390625,-2.2,44.0
DET,3.250769,-3.5,43.6
GSW,5.35,-2.4,60.9


<h2 style="color:red; font-weight:bold">EDA for each College</h2>
<h3 style="color:blue; font-weight:bold">Top 5 Colleges with the Most Players Drafted</h3>

In [98]:
group_counts = df['college'].value_counts().reset_index()
group_counts.columns = ['college', 'Count']
print(group_counts.loc[1:5])


    college  Count
1  Kentucky     58
2      Duke     57
3   Arizona     43
4       UNC     43
5      UCLA     42


In [99]:
colleges_avg = df.drop(['id', 'year', 'rank', 'overall_pick', 'team'], axis=1)
colleges_avg['count'] = colleges_avg.groupby('college')['college'].transform('count')
colleges_avg = colleges_avg.groupby('college').mean().reset_index()
top25 = colleges_avg.nlargest(26, 'count')
top25 = top25.drop(67)
top25.reset_index(drop=True, inplace=True)


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



<h3 style="color:blue; font-weight:bold">Years Active for the Top 25 Colleges</h3>


In [100]:
fig = px.bar(top25, x='college', y='years_active', hover_data = ["count"], color='college', 
                 title = "Years Active for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h3 style="color:blue; font-weight:bold">Points Per Game for the Top 25 Colleges</h3>

In [101]:
fig = px.bar(top25, x='college', y='points_per_game', hover_data = ["count"], color='college', 
                 title = "Points Per Game for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h3 style="color:blue; font-weight:bold">Rebounds Per Game for the Top 25 Colleges</h3>

In [102]:
fig = px.bar(top25, x='college', y='average_total_rebounds', hover_data = ["count"], color='college', 
                 title = "Rebounds Per Game for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h3 style="color:blue; font-weight:bold">Assists Per Game for the Top 25 Colleges</h3>

In [103]:
fig = px.bar(top25, x='college', y='average_assists', hover_data = ["count"], color='college', 
                 title = "Assists Per Game for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h3 style="color:blue; font-weight:bold">Box Plus Minus for the Top 25 Colleges</h3>

In [104]:
fig = px.bar(top25, x='college', y='box_plus_minus', hover_data = ["count"], color='college', 
                 title = "Box Plus Minus for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h3 style="color:blue; font-weight:bold">Value Over Replacement for the Top 25 Colleges</h3>

In [105]:
fig = px.bar(top25, x='college', y='value_over_replacement', hover_data = ["count"], color='college', 
                 title = "Value Over Replacement for the Top 25 Colleges")
fig.update_layout(template = "plotly_dark", font = dict(family = "PT Sans", size = 12))
fig.show()

<h2 style="color:red; font-weight:bold">Success of Draft Picks</h2>
<h3 style="color:blue; font-weight:bold">Create a method for valuing each draft slot in the NBA Draft</h3>

I devised a method to evaluate the success of a draft pick by comparing a player's performance to the historical average of players chosen at the same draft position. This involved calculating average statistics across all relevant categories in the NBA draft dataset, grouping them by overall pick position. By comparing a player's stats to these averages, I generated a count of how many statistical categories the player exceeded the average. If a player surpassed the average in 9 or more statistical categories, they were classified as a "hit." As a disclamer, these stats do not reflect seasons after the 2021 meaning some players maybe considered a miss if playing was not high in their rookie/sophomore year.

Let's begin by identifying the draft positions that have yielded the most successful players based on their statistics, and then we'll rank them accordingly.

In [106]:
df.fillna(0, inplace=True)
# drop columns that are not needed
draft = df.drop(['id', 'year', 'rank', 'player', 'team', 'college'], axis=1)
# group all rows by overall pick value and get the mean for each row
draft = draft.groupby('overall_pick').mean().reset_index()
c_names = draft.columns.tolist()
c_names.pop(0)
# go through each column and create a new column that will contain it's rank for that statistic
for col in c_names:
    name = col + "_rank"
    draft[name] = draft[col].rank(ascending=False)
draft = draft.drop(columns = c_names)
# create the picks final rank by getting the average of all statistical rankings together
draft['rank']= draft.mean(axis=1)
temp = draft.sort_values(by=['rank'])
rank_list = temp["overall_pick"].values.tolist()
print("Rank: ")
print(*rank_list, sep = ", ") 

Rank: 
3, 4, 1, 5, 2, 7, 10, 9, 13, 8, 6, 11, 12, 17, 14, 24, 15, 21, 18, 23, 27, 16, 19, 26, 22, 20, 25, 28, 30, 29, 35, 45, 37, 31, 38, 40, 32, 42, 34, 33, 36, 43, 47, 41, 46, 39, 48, 56, 44, 57, 49, 50, 55, 60, 52, 51, 54, 58, 53, 59


<h3 style="color:blue; font-weight:bold">How many teams hit on their Picks</h3>

In [107]:
draft_avg = df.drop(['id', 'year', 'rank', 'player', 'team', 'college'], axis=1)
draft_avg = draft_avg.groupby('overall_pick').mean().reset_index()
team = df['team'].unique()
team = sorted(team)
temp = {
    'team': ['ATL','BOS','BRK','CHI','CHO','CLE','DAL','DEN',
 'DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN',
 'NOP','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','TOR','UTA','WAS'],
 'hits': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
 'total': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}
team_success = pd.DataFrame(temp)

### Categories used for comparison:
* 'years_active'
* 'games'
* 'minutes_played'
* 'points'
* 'total_rebounds'
* 'assists'
* 'field_goal_percentage'
* '3_point_percentage' 
* 'free_throw_percentage'
* 'average_minutes_played'
* 'points_per_game'
* 'average_total_rebounds'
* 'average_assists' 
* 'win_shares'
* 'win_shares_per_48_minutes'
* 'box_plus_minus'
* 'value_over_replacement'

If a players is has 9 statistical categories that are better than the average value, he will be considered a "hit". 




In [108]:
# for every team go through each pick and compare the player's stats to the pick average
for t in team:
    temp = df[df['team'] == t]
    # total_pick will be used to keep track of the total number of picks for the given team
    total_picks = 0
    # count will be used to keep track of the "hits"
    count = 0
    for index, player in temp.iterrows():
        avg_pick = player['overall_pick']
        avg_row = draft_avg[draft_avg['overall_pick'] == avg_pick]
        total = 0
        for c in c_names:
            a = float(player[c])
            b = float(avg_row[c])
            if (a >= b):
                total += 1
        total_picks +=1
        if total >= 9:
            count += 1
    # update the team's values after evaluationg each pick in the team_success df
    team_success.loc[team_success['team'] == t, 'hits'] = count
    team_success.loc[team_success['team'] == t, 'total'] = total_picks

# create a new column in the team_success df, it wull be the percentage of picks that are considered hits
team_success['pct_hit'] = team_success['hits']/team_success['total']

In [109]:
cols = ['pct_hit']
team_success.style.background_gradient(cmap='seismic',subset=cols).hide(axis="index")


team,hits,total,pct_hit
ATL,22,80,0.275
BOS,29,75,0.386667
BRK,16,61,0.262295
CHI,30,75,0.4
CHO,12,39,0.307692
CLE,22,55,0.4
DAL,16,62,0.258065
DEN,24,64,0.375
DET,29,65,0.446154
GSW,24,66,0.363636


The San Antonio Spurs are the most successful franchise when it comes to drafting players, while the Portland Trail Blazers are the least successful in this regard.