# Group 3 - Final Project
### Evan S, Joey G, Ryan S, Jack V

## Introduction
The data we will be working on is a comprehensive dataset of NBA player, game, and season statistics from 1947-present. The dataset originates from kaggle, which was scraped from basketball-reference.com. Advanced statistics were not calculated until 1980, so we will filter for data between the years 1980 and 2024.

Link: https://www.kaggle.com/datasets/sumitrodatta/nba-aba-baa-stats?select=Advanced.csv

In [2]:
# Import data & clean
import numpy as np
import pandas as pd
# load in data
end_of_season_teams = pd.read_csv('nba_data/End of Season Teams (Voting).csv')
player_awards = pd.read_csv('nba_data/Player Award Shares.csv')
player_career = pd.read_csv('nba_data/Player Career Info.csv')
player_season = pd.read_csv('nba_data/Player Season Info.csv')
player_shooting = pd.read_csv('nba_data/Player Shooting.csv')
player_totals = pd.read_csv('nba_data/Player Totals.csv')
team_summary = pd.read_csv('nba_data/Team Summaries.csv')
# check to make sure it loaded in
#end_of_season_teams.head(50)
#player_awards.head()   # get rid of useless awards
#player_career.head(50)   # used for num_seasons
#player_season.head()   # only experience == 1 for roty data
#player_shooting.head() #
#player_totals.head()   #
#team_summary.head()    #
#team_summary.columns

In [3]:
# cleaning end_of_season_teams
all_nba_players = end_of_season_teams[
                        (end_of_season_teams['type'] == 'All-NBA')][
                            ['player', 'season', 'number_tm', 'pts_won']]

all_nba_players.rename(columns={'number_tm': 'all_nba_team', 
                             'pts_won' : 'all_nba_voting_pts'}, inplace=True)
#all_nba_players.head()


# cleaning player_awards into 3 different dfs
dpoy_players = player_awards[(player_awards['award'] == 'dpoy')][
                            ['player', 'season', 'pts_won', 'winner']]

dpoy_players.rename(columns={'pts_won' : 'dpoy_voting_pts', 
                             'winner':'won_dpoy'}, inplace=True)

mvp_players = player_awards[(player_awards['award'] == 'nba mvp')][
                            ['player', 'season', 'pts_won', 'winner']]

mvp_players.rename(columns={'pts_won' : 'mvp_voting_pts', 
                             'winner':'won_mvp'}, inplace=True)


roty_players = player_awards[(player_awards['award'] == 'nba roy')][
                            ['player', 'season', 'pts_won', 'winner']]

roty_players.rename(columns={'pts_won' : 'roty_voting_pts', 
                             'winner':'won_roty'}, inplace=True)


#dpoy.head(10)
#mvp.head(10)
#roty.head(10)

# cleaning player_career
player_career = player_career[['player', 'num_seasons']]
#player_career.head(10)


# cleaning player_season
player_season = player_season[['player', 'season', 'seas_id', 'pos', 'age', 'tm', 'experience']]
player_season['team_count'] = (
    player_season
    .groupby(['player', 'season'])['tm'] 
    .transform('nunique') # adds a  column with number of teams
    # so we can remove all people on more than 1 team in a season
)

player_season = player_season.query("team_count == 1")

player_season.rename(columns={'tm': 'abbreviation'}, inplace=True)
#player_season.head(10)

# cleaning player_shooting
player_shooting['team_count'] = (
    player_shooting
    .groupby(['player', 'season'])['tm'] 
    .transform('nunique') # adds a  column with number of teams
    # so we can remove all people on more than 1 team in a season
)


# selects only players with 1 team in a season
# an NBA MVP, 1st Team Member, and ROTY have never won the award
# while playing for 2 teams in the year they won
# There has been 1 NBA DPOY that was on 2 teams the year, he won the award
# That player is Dikembe Motumbo and we will still be removing him for consistency
player_shooting = player_shooting.query("team_count == 1")[['player', 'season', 'seas_id', 'avg_dist_fga',
       'fg_percent_from_x2p_range', 'fg_percent_from_x3p_range',
       'percent_assisted_x2p_fg', 'percent_assisted_x3p_fg', 'corner_3_point_percent']]
#player_shooting.head(10)


# cleaning player_shooting
player_totals['team_count'] = (
    player_totals
    .groupby(['player', 'season'])['tm'] 
    .transform('nunique') # adds a  column with number of teams
    # so we can remove all people on more than 1 team in a season
)

# selects only players with 1 team in a season
# an NBA MVP, 1st Team Member, and ROTY have never won the award
# while playing for 2 teams in the year they won
# There has been 1 NBA DPOY that was on 2 teams the year, he won the award
# That player is Dikembe Motumbo and we will still be removing him for consistency
player_totals = player_totals.query("team_count == 1")[['player', 'season', 'seas_id', 'g', 'gs', 'mp', 'fg', 'fga', 
                                                        'fg_percent', 'x3p', 'x3pa', 'x2p', 'x2pa', 
                                                        'e_fg_percent', 'ft', 'fta', 'ft_percent', 'orb', 
                                                        'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pts']]


# cleaning team_summary
team_summary = team_summary[['season', 'team', 'abbreviation', 'playoffs', 'w', 'l', 'o_rtg', 'd_rtg', 'n_rtg', 'pace']]
#team_summary.head()


In [4]:
# combining into one df
# player_season -> player_shooting -> player_totals --> player_career --> team summary 
# then add awards df individually
nba = (
    player_season
    .merge(player_shooting, on = ['player', 'season'], how = 'left')
    .merge(player_totals, on = ['player', 'season'], how = 'left')
    .merge(player_career, on = 'player', how = 'left')
    .merge(team_summary, on = ['abbreviation', 'season'], how = 'left')
).query("season >= 1980").rename(
           columns = {
                'pos': 'position', 
                'abbreviation' : 'team_abbr',
                'fg_percent_from_x2p_range' : 'fg_percent_2p', 
               'fg_percent_from_x3p_range' : 'fg_percent_3p', 
               'g' : 'games_played', 
               'gs' : 'games_started', 
               'mp' : 'minutes_played',
               'orb' : 'off_reb', 
               'drb' : 'def_reb', 
               'trb' : 'tot_reb',
               'num_seasons' : 'total_seasons_played',
               'playoffs' : 'team_made_playoffs',
               'w' : 'team_wins',
               'l' : 'team_losses',
               'o_rtg' : 'team_off_rtg',
               'd_rtg' : 'team_def_rtg',
               'n_rtg' : 'team_net_rtg',
               'pace' : 'team_pace'               
           })

# selecting and ordering columns
nba = nba[['seas_id','player', 'season', 'team', 'team_abbr', 'position', 'age',
               'experience', 'avg_dist_fga', 'fg_percent_2p', 'fg_percent_3p',
               'percent_assisted_x2p_fg', 'percent_assisted_x3p_fg',
               'corner_3_point_percent', 'games_played', 'games_started',
               'minutes_played', 'fg', 'fga', 'fg_percent', 'x3p', 'x3pa', 'x2p',
               'x2pa', 'e_fg_percent', 'ft', 'fta', 'ft_percent', 'off_reb', 'def_reb', 'tot_reb',
               'ast', 'stl', 'blk', 'tov', 'pts', 'total_seasons_played',
               'team_made_playoffs', 'team_wins', 'team_losses', 'team_off_rtg',
               'team_def_rtg', 'team_net_rtg', 'team_pace']]


nba.head(10)

Unnamed: 0,seas_id,player,season,team,team_abbr,position,age,experience,avg_dist_fga,fg_percent_2p,...,tov,pts,total_seasons_played,team_made_playoffs,team_wins,team_losses,team_off_rtg,team_def_rtg,team_net_rtg,team_pace
5747,7991.0,Abdul Jeelani,1980,Portland Trail Blazers,POR,SF,25.0,1,,,...,117.0,737.0,2,True,38.0,44.0,103.4,104.3,-0.9,98.6
5748,7992.0,Adrian Dantley,1980,Utah Jazz,UTA,SF,24.0,4,,,...,233.0,1903.0,15,False,24.0,58.0,104.2,110.4,-6.2,97.6
5749,7993.0,Al Skinner,1980,Philadelphia 76ers,PHI,SG,27.0,6,,,...,2.0,2.0,6,True,59.0,23.0,105.0,101.0,4.0,103.0
5750,7997.0,Allan Bristow,1980,Utah Jazz,UTA,SF,28.0,7,,,...,179.0,953.0,10,False,24.0,58.0,104.2,110.4,-6.2,97.6
5751,7998.0,Allen Leavell,1980,Houston Rockets,HOU,PG,22.0,1,,,...,205.0,843.0,10,True,41.0,41.0,108.1,108.0,0.1,101.2
5752,7999.0,Alonzo Bradley,1980,Houston Rockets,HOU,SF,26.0,3,,,...,8.0,41.0,3,True,41.0,41.0,108.1,108.0,0.1,101.2
5753,8000.0,Alvan Adams,1980,Phoenix Suns,PHO,C,25.0,5,,,...,218.0,1118.0,13,True,55.0,27.0,105.6,102.2,3.4,104.8
5754,8001.0,Alvin Scott,1980,Phoenix Suns,PHO,SF,24.0,3,,,...,92.0,350.0,8,True,55.0,27.0,105.6,102.2,3.4,104.8
5755,8002.0,Andre Wakefield,1980,Utah Jazz,UTA,SG,25.0,2,,,...,8.0,15.0,2,False,24.0,58.0,104.2,110.4,-6.2,97.6
5756,8003.0,Anthony Roberts,1980,Denver Nuggets,DEN,SF,24.0,3,,,...,28.0,177.0,5,False,30.0,52.0,103.4,107.6,-4.2,103.9


## Data Summary

In [6]:
# Feature descriptions

## Motivating Questions

## EDA

- Everyone makes 1-2 EDA graphs (can be really simple just showing the distribution of data)
- Make 1-2 Model specific Visualizations (ex: whoever works on MVP model, show distribution of pts by MVPS and non MVPs)
- If possible, visually show how good our model is (mostly for regression)

In [9]:
# EDA

## Model Building & Implementation

- For each method, finish data cleaning by doing the following:
    - adding the necessary awards data (**DONE**)
    - selecting necesary columns/rows (label df as award name)
- Set aside the 2025 Data, store in a df called 'awardName_25' (ex: mvp_25)
- Perform any test/train or CV splits (in a dfs in the form awardName_X_train, awardName_X_test, etc)
- Make Model
- Show results/accuracy (visually if possible or explain)
- Make 2025 predictions

### MVP

Regression (linear or logistic): predicting pts_won
    
    Before training the model, it may be beneficial to remove some players (ex: players who have played less than 15 minutes played per or started at least 40 games)

    Features 
        Team wins, Typical per game stats (fg%, points, rebounds, assists, etc), Age / experience,
    Go back and find the best MVP season ever (run only on MVP winners)



In [12]:
mvp_players.head()

Unnamed: 0,player,season,mvp_voting_pts,won_mvp
42,Nikola Jokić,2024,926.0,True
43,Shai Gilgeous-Alexander,2024,640.0,False
44,Luka Dončić,2024,566.0,False
45,Giannis Antetokounmpo,2024,192.0,False
46,Jalen Brunson,2024,142.0,False


In [13]:
mvp = (
    nba
    .merge(mvp_players, on = ['player', 'season'], how = 'left')
)

mvp['mvp_voting_pts'] = mvp['mvp_voting_pts'].fillna(0)
mvp['won_mvp'] = mvp['won_mvp'].fillna(False)
mvp.head()
#mvp.query("won_mvp == True")

  mvp['won_mvp'] = mvp['won_mvp'].fillna(False)


Unnamed: 0,seas_id,player,season,team,team_abbr,position,age,experience,avg_dist_fga,fg_percent_2p,...,total_seasons_played,team_made_playoffs,team_wins,team_losses,team_off_rtg,team_def_rtg,team_net_rtg,team_pace,mvp_voting_pts,won_mvp
0,7991.0,Abdul Jeelani,1980,Portland Trail Blazers,POR,SF,25.0,1,,,...,2,True,38.0,44.0,103.4,104.3,-0.9,98.6,0.0,False
1,7992.0,Adrian Dantley,1980,Utah Jazz,UTA,SF,24.0,4,,,...,15,False,24.0,58.0,104.2,110.4,-6.2,97.6,0.0,False
2,7993.0,Al Skinner,1980,Philadelphia 76ers,PHI,SG,27.0,6,,,...,6,True,59.0,23.0,105.0,101.0,4.0,103.0,0.0,False
3,7997.0,Allan Bristow,1980,Utah Jazz,UTA,SF,28.0,7,,,...,10,False,24.0,58.0,104.2,110.4,-6.2,97.6,0.0,False
4,7998.0,Allen Leavell,1980,Houston Rockets,HOU,PG,22.0,1,,,...,10,True,41.0,41.0,108.1,108.0,0.1,101.2,0.0,False


### DPOY

    Tree
    
    Features
        Typical per game defensive stats
        Team points allowed per game

In [15]:
dpoy = (
    nba
    .merge(dpoy_players, on = ['player', 'season'], how = 'left')
)

dpoy['dpoy_voting_pts'] = dpoy['dpoy_voting_pts'].fillna(0)
dpoy['won_dpoy'] = dpoy['won_dpoy'].fillna(False)
dpoy.head()
#dpoy.query("won_dpoy == True")

  dpoy['won_dpoy'] = dpoy['won_dpoy'].fillna(False)


Unnamed: 0,seas_id,player,season,team,team_abbr,position,age,experience,avg_dist_fga,fg_percent_2p,...,total_seasons_played,team_made_playoffs,team_wins,team_losses,team_off_rtg,team_def_rtg,team_net_rtg,team_pace,dpoy_voting_pts,won_dpoy
0,7991.0,Abdul Jeelani,1980,Portland Trail Blazers,POR,SF,25.0,1,,,...,2,True,38.0,44.0,103.4,104.3,-0.9,98.6,0.0,False
1,7992.0,Adrian Dantley,1980,Utah Jazz,UTA,SF,24.0,4,,,...,15,False,24.0,58.0,104.2,110.4,-6.2,97.6,0.0,False
2,7993.0,Al Skinner,1980,Philadelphia 76ers,PHI,SG,27.0,6,,,...,6,True,59.0,23.0,105.0,101.0,4.0,103.0,0.0,False
3,7997.0,Allan Bristow,1980,Utah Jazz,UTA,SF,28.0,7,,,...,10,False,24.0,58.0,104.2,110.4,-6.2,97.6,0.0,False
4,7998.0,Allen Leavell,1980,Houston Rockets,HOU,PG,22.0,1,,,...,10,True,41.0,41.0,108.1,108.0,0.1,101.2,0.0,False


### ROTY

    Logistic Regression
    
    Only keep where the player experience is 1 for training
    
    Features
    
        Typical per game stats - fg%, points, rebounds, assists, etc
        
        Minutes played

In [17]:
roty = (
    nba.query("experience == 1") # only keeping players with 1 season of experience to train the model better
    .merge(roty_players, on = ['player', 'season'], how = 'left')
)

roty['roty_voting_pts'] = roty['roty_voting_pts'].fillna(0)
roty['won_roty'] = roty['won_roty'].fillna(False)
roty.head()
#roty.query("won_roty == True")

  roty['won_roty'] = roty['won_roty'].fillna(False)


Unnamed: 0,seas_id,player,season,team,team_abbr,position,age,experience,avg_dist_fga,fg_percent_2p,...,total_seasons_played,team_made_playoffs,team_wins,team_losses,team_off_rtg,team_def_rtg,team_net_rtg,team_pace,roty_voting_pts,won_roty
0,7991.0,Abdul Jeelani,1980,Portland Trail Blazers,POR,SF,25.0,1,,,...,2,True,38.0,44.0,103.4,104.3,-0.9,98.6,0.0,False
1,7998.0,Allen Leavell,1980,Houston Rockets,HOU,PG,22.0,1,,,...,10,True,41.0,41.0,108.1,108.0,0.1,101.2,0.0,False
2,8006.0,Arvid Kramer,1980,Denver Nuggets,DEN,C,23.0,1,,,...,1,False,30.0,52.0,103.4,107.6,-4.2,103.9,0.0,False
3,8010.0,Bernard Toone,1980,Philadelphia 76ers,PHI,PF,23.0,1,,,...,1,True,59.0,23.0,105.0,101.0,4.0,103.0,0.0,False
4,8011.0,Bill Cartwright,1980,New York Knicks,NYK,C,22.0,1,,,...,15,False,39.0,43.0,106.5,107.5,-1.0,106.5,0.0,False


### All NBA 1st Team

    Regression (linear or logistic) or trees by position- 2 guards, 3 forwards/centers

    Features: Similar to mvp


In [40]:
#all_nba_players.head()

all_nba = nba.merge(all_nba_players, on = ['player', 'season'], how = 'left')
#all_nba.head()
all_nba['all_nba_voting_pts'] = all_nba['all_nba_voting_pts'].fillna(0)
all_nba['all_nba_team'] = all_nba['all_nba_team'].fillna('No Votes')
all_nba.head()
all_nba.query("all_nba_team == '1T'")

# whoever does this one has some options, could predict total votes or team number

Unnamed: 0,seas_id,player,season,team,team_abbr,position,age,experience,avg_dist_fga,fg_percent_2p,...,total_seasons_played,team_made_playoffs,team_wins,team_losses,team_off_rtg,team_def_rtg,team_net_rtg,team_pace,all_nba_team,all_nba_voting_pts
16115,29847.0,Devin Booker,2022,Phoenix Suns,PHO,SG,25.0,7,16.1,0.508,...,10,True,64.0,18.0,114.8,107.3,7.5,99.8,1T,460.0
16166,29918.0,Giannis Antetokounmpo,2022,Milwaukee Bucks,MIL,PF,27.0,9,10.0,0.616,...,12,True,51.0,31.0,115.1,111.8,3.3,99.9,1T,500.0
16231,30011.0,Jayson Tatum,2022,Boston Celtics,BOS,SF,23.0,5,15.2,0.524,...,8,True,51.0,31.0,114.4,106.9,7.5,96.6,1T,390.0
16315,30158.0,Luka Dončić,2022,Dallas Mavericks,DAL,PG,22.0,4,15.9,0.528,...,7,True,52.0,30.0,112.8,109.4,3.4,95.4,1T,476.0
16370,30247.0,Nikola Jokić,2022,Denver Nuggets,DEN,C,26.0,7,10.2,0.652,...,10,True,48.0,34.0,114.5,112.1,2.4,97.8,1T,476.0
16644,30665.0,Giannis Antetokounmpo,2023,Milwaukee Bucks,MIL,PF,28.0,10,8.1,0.596,...,12,False,58.0,24.0,115.4,111.9,3.5,100.5,1T,500.0
16709,30751.0,Jayson Tatum,2023,Boston Celtics,BOS,PF,24.0,6,14.8,0.558,...,8,False,57.0,25.0,118.0,111.5,6.5,98.5,1T,484.0
16723,30765.0,Joel Embiid,2023,Philadelphia 76ers,PHI,C,28.0,7,11.3,0.587,...,9,False,54.0,28.0,117.7,113.3,4.4,96.9,1T,474.0
16800,30880.0,Luka Dončić,2023,Dallas Mavericks,DAL,PG,23.0,5,14.7,0.588,...,7,False,38.0,44.0,116.8,116.7,0.1,96.6,1T,403.0
16900,31046.0,Shai Gilgeous-Alexander,2023,Oklahoma City Thunder,OKC,PG,24.0,5,9.8,0.533,...,7,False,40.0,42.0,115.2,114.2,1.0,101.1,1T,407.0


## Conclusions

## Limitations & Recommendations for Future Work