In [1]:
# Import libraries
import pandas as pd
import os

# Define file paths and load data
data_path = "../unprocessed-player-data"
season_file = "nba_player_stats_2023-24.csv" # 23-24 season for intial testing
file_path = os.path.join(data_path, season_file)
season_data = pd.read_csv(file_path)
season_data.head() # Test successful load

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Joel Embiid,29.0,PHI,C,39.0,39.0,33.6,11.5,21.8,0.529,...,8.6,11.0,5.6,1.2,1.7,3.8,2.9,34.7,AS,2023-24
1,Luka Dončić,24.0,DAL,PG,70.0,70.0,37.5,11.5,23.6,0.487,...,8.4,9.2,9.8,1.4,0.5,4.0,2.1,33.9,"MVP-3,CPOY-6,AS,NBA1",2023-24
2,Giannis Antetokounmpo,29.0,MIL,PF,73.0,73.0,35.2,11.5,18.8,0.611,...,8.8,11.5,6.5,1.2,1.1,3.4,2.9,30.4,"MVP-4,DPOY-9,CPOY-12,AS,NBA1",2023-24
3,Shai Gilgeous-Alexander,25.0,OKC,PG,75.0,75.0,34.0,10.6,19.8,0.535,...,4.7,5.5,6.2,2.0,0.9,2.2,2.5,30.1,"MVP-2,DPOY-7,CPOY-3,AS,NBA1",2023-24
4,Jalen Brunson,27.0,NYK,PG,77.0,77.0,35.4,10.3,21.4,0.479,...,3.1,3.6,6.7,0.9,0.2,2.4,1.9,28.7,"MVP-5,CPOY-5,AS,NBA2",2023-24


In [2]:
# Combine season data into a single data structure
season_files = [file for file in os.listdir(data_path) if file.endswith('.csv')]
all_season_data = []

# Iterate through datasets and concatenate
for season_data_file in season_files:
    file_path = os.path.join(data_path, season_data_file)
    season_df = pd.read_csv(file_path)  # Renamed variable to avoid conflict
    all_season_data.append(season_df)

# Concatenate all the dataframes into one large dataframe
combined_data = pd.concat(all_season_data, ignore_index=True)

# Display the first/last few rows of the combined dataset
combined_data

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Adrian Dantley,25.0,UTA,SF,80.0,,42.7,11.4,20.3,0.559,...,4.0,6.4,4.0,1.4,0.2,3.5,3.1,30.7,"MVP-12,AS,NBA2",1980-81
1,Moses Malone,25.0,HOU,C,80.0,,40.6,10.1,19.3,0.522,...,8.8,14.8,1.8,1.0,1.9,3.9,2.8,27.8,"MVP-4,AS,NBA2",1980-81
2,George Gervin,28.0,SAS,SG,82.0,82.0,33.7,10.4,21.1,0.492,...,3.6,5.1,3.2,1.1,0.7,3.1,2.6,27.1,"MVP-5,AS,NBA1",1980-81
3,Kareem Abdul-Jabbar,33.0,LAL,C,80.0,,37.2,10.5,18.2,0.574,...,7.8,10.3,3.4,0.7,2.9,3.1,3.1,26.2,"MVP-3,AS,NBA1",1980-81
4,David Thompson,26.0,DEN,SG,77.0,,34.0,9.5,18.8,0.506,...,2.3,3.7,3.0,0.7,0.8,3.2,3.0,25.5,,1980-81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24060,Justin Minaya,25.0,POR,SF,1.0,0.0,6.0,0.0,0.0,,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,,2024-25
24061,Riley Minix,24.0,SAS,SF,1.0,0.0,7.0,0.0,1.0,0.000,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,,2024-25
24062,Jalen Pickett,25.0,DEN,SG,2.0,0.0,2.0,0.0,0.5,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2024-25
24063,Cole Swider,25.0,DET,SF,2.0,0.0,6.5,0.0,2.5,0.000,...,1.0,1.0,0.5,0.0,0.0,0.0,0.5,0.0,,2024-25


In [3]:
# Show empty values in combined data
combined_data.isnull().sum()

Player        0
Age          45
Team         45
Pos          45
G            45
GS          345
MP           45
FG           45
FGA          45
FG%         130
3P           45
3PA          45
3P%        3793
2P           45
2PA          45
2P%         226
eFG%        130
FT           45
FTA          45
FT%        1154
ORB          45
DRB          45
TRB          45
AST          45
STL          45
BLK          45
TOV          45
PF           45
PTS          45
Awards    21821
Season        0
dtype: int64

In [4]:
# Calculate percentages from the raw values
combined_data['FG%'] = (combined_data['FG'] / combined_data['FGA']) * 100
combined_data['3P%'] = (combined_data['3P'] / combined_data['3PA']) * 100
combined_data['2P%'] = (combined_data['2P'] / combined_data['2PA']) * 100
combined_data['FT%'] = (combined_data['FT'] / combined_data['FTA']) * 100

# Calculate eFG% (Effective Field Goal Percentage)
combined_data['eFG%'] = ((combined_data['FG'] + 0.5 * combined_data['3P']) / combined_data['FGA']) * 100

# Handle division by zero (replace NaN values with 0 or another value)
combined_data['FG%'].fillna(0, inplace=True)
combined_data['3P%'].fillna(0, inplace=True)
combined_data['2P%'].fillna(0, inplace=True)
combined_data['FT%'].fillna(0, inplace=True)
combined_data['eFG%'].fillna(0, inplace=True)

# Show updated empty values in combined data
combined_data.isnull().sum()

Player        0
Age          45
Team         45
Pos          45
G            45
GS          345
MP           45
FG           45
FGA          45
FG%           0
3P           45
3PA          45
3P%           0
2P           45
2PA          45
2P%           0
eFG%          0
FT           45
FTA          45
FT%           0
ORB          45
DRB          45
TRB          45
AST          45
STL          45
BLK          45
TOV          45
PF           45
PTS          45
Awards    21821
Season        0
dtype: int64

In [5]:
# Drop lines representing 'League Average'
combined_data = combined_data[combined_data['Player'] != 'League Average']

# Show updated empty values in combined data
combined_data.isnull().sum()

Player        0
Age           0
Team          0
Pos           0
G             0
GS          300
MP            0
FG            0
FGA           0
FG%           0
3P            0
3PA           0
3P%           0
2P            0
2PA           0
2P%           0
eFG%          0
FT            0
FTA           0
FT%           0
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Awards    21776
Season        0
dtype: int64

In [62]:
# Most NaN for GS come from players in the 1980-81 season, drop values since data is from long ago
combined_data = combined_data.dropna(subset=['GS'])

# Add 'No Award' value for 'Awards' column where Awards is NaN
combined_data.loc[combined_data['Awards'].isna(), 'Awards'] = "No Awards"

# Save combined dataframe to processed data directory
processed_data_path = "../processed-player-data/combined_data.csv"
combined_data.to_csv(processed_data_path, index=False)
print(f"Data saved to: {processed_data_path}") # Confirmation

# Show updated empty values in combined data
combined_data.isnull().sum()

Data saved to: ../processed-player-data/combined_data.csv


Player    0
Age       0
Team      0
Pos       0
G         0
GS        0
MP        0
FG        0
FGA       0
FG%       0
3P        0
3PA       0
3P%       0
2P        0
2PA       0
2P%       0
eFG%      0
FT        0
FTA       0
FT%       0
ORB       0
DRB       0
TRB       0
AST       0
STL       0
BLK       0
TOV       0
PF        0
PTS       0
Awards    0
Season    0
dtype: int64

In [63]:
# Filter players who have exactly '6MOY-1' in the Awards column
smoy_winners = combined_data[combined_data['Awards'] == '6MOY-1']

# Sort by 'MP' (minutes played) and select the lowest 5
lowest_mp_smoy_winners = smoy_winners.sort_values(by='MP').head(5)

# Display relevant columns for these players
lowest_mp_smoy_winners[['Player', 'MP', 'Awards', 'Season']].head()

# Lowest SMOY Winner had 19.3, My cutoff will be 20 MP


Unnamed: 0,Player,MP,Awards,Season
2029,Bill Walton,19.3,6MOY-1,1985-86
9502,Corliss Williamson,21.8,6MOY-1,2001-02
22963,Naz Reid,24.2,6MOY-1,2023-24
16846,Lou Williams,25.2,6MOY-1,2014-15
7417,Danny Manning,25.6,6MOY-1,1997-98


In [55]:
# Filter players who have '6MOY-1' in their Awards column
smoy_players = combined_data[combined_data['Awards'].str.contains(r'6MOY-[1]\b', na=False)]

# Sort by 'G' (Games Played)
top_g_smoy_players = smoy_players[['Player', 'G', 'GS', 'MP', 'Awards', 'Season']].sort_values(by=['G'], ascending=False)

# Display the result
top_g_smoy_players

Unnamed: 0,Player,G,GS,MP,Awards,Season
10477,Antawn Jamison,82.0,2.0,29.0,6MOY-1,2003-04
4849,Clifford Robinson,82.0,12.0,31.4,6MOY-1,1992-93
14500,Lamar Odom,82.0,35.0,32.2,6MOY-1,2010-11
11068,Ben Gordon,82.0,3.0,24.4,"ROY-2,6MOY-1",2004-05
1166,Kevin McHale,82.0,10.0,31.4,"6MOY-1,AS",1983-84
8461,Rodney Rogers,82.0,7.0,27.9,6MOY-1,1999-00
5325,Dell Curry,82.0,0.0,26.5,6MOY-1,1993-94
3996,Detlef Schrempf,82.0,3.0,32.1,6MOY-1,1990-91
6310,Toni Kukoč,81.0,20.0,26.0,6MOY-1,1995-96
22963,Naz Reid,81.0,14.0,24.2,6MOY-1,2023-24


In [56]:
# Filter players who have '6MOY-1' in their Awards column
smoy_players = combined_data[combined_data['Awards'].str.contains(r'6MOY-[1]\b', na=False)]

# Sort by 'GS' (Games Played)
top_gs_smoy_players = smoy_players[['Player', 'G', 'GS', 'MP', 'Awards', 'Season']].sort_values(by=['GS'], ascending=False)

# Display the result
top_gs_smoy_players

Unnamed: 0,Player,G,GS,MP,Awards,Season
14500,Lamar Odom,82.0,35.0,32.2,6MOY-1,2010-11
8984,Aaron McKie,76.0,33.0,31.5,6MOY-1,2000-01
1504,Kevin McHale,79.0,31.0,33.6,6MOY-1,1984-85
2246,Ricky Pierce,79.0,31.0,31.7,6MOY-1,1986-87
9986,Bobby Jackson,59.0,26.0,28.4,6MOY-1,2002-03
16197,Jamal Crawford,69.0,24.0,30.3,6MOY-1,2013-14
12693,Manu Ginóbili,74.0,23.0,31.1,"MVP-10,6MOY-1,NBA3",2007-08
6310,Toni Kukoč,81.0,20.0,26.0,6MOY-1,1995-96
18624,Lou Williams,79.0,19.0,32.8,6MOY-1,2017-18
12182,Leandro Barbosa,80.0,18.0,32.7,6MOY-1,2006-07


In [59]:
# Save CSV, only including players that fit preliminary SMOY reqs (i.e: MP >= 20, GS <= 40, G >= 50 )
smoy_eligible_data = combined_data[
    (combined_data['GS'] <= 40) &
    (combined_data['MP'] >= 20) &
    (combined_data['G'] >= 50)
]

# Save the filtered data to a new CSV file
output_path = "../processed-player-data/smoy_eligible_players.csv"
smoy_eligible_data.to_csv(output_path, index=False)

# Confirm the data has been saved
print(f"Filtered SMOY data saved to {output_path}")


Filtered SMOY data saved to ../processed-player-data/smoy_eligible_players.csv
