In [1]:
# Import libraries
import pandas as pd
import os

# Define file paths and load data
data_path = "../unprocessed-player-data"
season_file = "nba_player_stats_2023-24.csv" # 23-24 season for intial testing
file_path = os.path.join(data_path, season_file)
season_data = pd.read_csv(file_path)
season_data.head() # Test successful load

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Joel Embiid,29.0,PHI,C,39.0,39.0,33.6,11.5,21.8,0.529,...,8.6,11.0,5.6,1.2,1.7,3.8,2.9,34.7,AS,2023-24
1,Luka Dončić,24.0,DAL,PG,70.0,70.0,37.5,11.5,23.6,0.487,...,8.4,9.2,9.8,1.4,0.5,4.0,2.1,33.9,"MVP-3,CPOY-6,AS,NBA1",2023-24
2,Giannis Antetokounmpo,29.0,MIL,PF,73.0,73.0,35.2,11.5,18.8,0.611,...,8.8,11.5,6.5,1.2,1.1,3.4,2.9,30.4,"MVP-4,DPOY-9,CPOY-12,AS,NBA1",2023-24
3,Shai Gilgeous-Alexander,25.0,OKC,PG,75.0,75.0,34.0,10.6,19.8,0.535,...,4.7,5.5,6.2,2.0,0.9,2.2,2.5,30.1,"MVP-2,DPOY-7,CPOY-3,AS,NBA1",2023-24
4,Jalen Brunson,27.0,NYK,PG,77.0,77.0,35.4,10.3,21.4,0.479,...,3.1,3.6,6.7,0.9,0.2,2.4,1.9,28.7,"MVP-5,CPOY-5,AS,NBA2",2023-24


In [2]:
# Combine season data into a single data structure
season_files = [file for file in os.listdir(data_path) if file.endswith('.csv')]
all_season_data = []

# Iterate through datasets and concatenate
for season_data_file in season_files:
    file_path = os.path.join(data_path, season_data_file)
    season_df = pd.read_csv(file_path)  # Renamed variable to avoid conflict
    all_season_data.append(season_df)

# Concatenate all the dataframes into one large dataframe
combined_data = pd.concat(all_season_data, ignore_index=True)

# Display the first/last few rows of the combined dataset
combined_data

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Adrian Dantley,25.0,UTA,SF,80.0,,42.7,11.4,20.3,0.559,...,4.0,6.4,4.0,1.4,0.2,3.5,3.1,30.7,"MVP-12,AS,NBA2",1980-81
1,Moses Malone,25.0,HOU,C,80.0,,40.6,10.1,19.3,0.522,...,8.8,14.8,1.8,1.0,1.9,3.9,2.8,27.8,"MVP-4,AS,NBA2",1980-81
2,George Gervin,28.0,SAS,SG,82.0,82.0,33.7,10.4,21.1,0.492,...,3.6,5.1,3.2,1.1,0.7,3.1,2.6,27.1,"MVP-5,AS,NBA1",1980-81
3,Kareem Abdul-Jabbar,33.0,LAL,C,80.0,,37.2,10.5,18.2,0.574,...,7.8,10.3,3.4,0.7,2.9,3.1,3.1,26.2,"MVP-3,AS,NBA1",1980-81
4,David Thompson,26.0,DEN,SG,77.0,,34.0,9.5,18.8,0.506,...,2.3,3.7,3.0,0.7,0.8,3.2,3.0,25.5,,1980-81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24060,Justin Minaya,25.0,POR,SF,1.0,0.0,6.0,0.0,0.0,,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,,2024-25
24061,Riley Minix,24.0,SAS,SF,1.0,0.0,7.0,0.0,1.0,0.000,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,,2024-25
24062,Jalen Pickett,25.0,DEN,SG,2.0,0.0,2.0,0.0,0.5,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2024-25
24063,Cole Swider,25.0,DET,SF,2.0,0.0,6.5,0.0,2.5,0.000,...,1.0,1.0,0.5,0.0,0.0,0.0,0.5,0.0,,2024-25


In [3]:
# Show empty values in combined data
combined_data.isnull().sum()

Player        0
Age          45
Team         45
Pos          45
G            45
GS          345
MP           45
FG           45
FGA          45
FG%         130
3P           45
3PA          45
3P%        3793
2P           45
2PA          45
2P%         226
eFG%        130
FT           45
FTA          45
FT%        1154
ORB          45
DRB          45
TRB          45
AST          45
STL          45
BLK          45
TOV          45
PF           45
PTS          45
Awards    21821
Season        0
dtype: int64

In [4]:
# Calculate percentages from the raw values
combined_data['FG%'] = (combined_data['FG'] / combined_data['FGA']) * 100
combined_data['3P%'] = (combined_data['3P'] / combined_data['3PA']) * 100
combined_data['2P%'] = (combined_data['2P'] / combined_data['2PA']) * 100
combined_data['FT%'] = (combined_data['FT'] / combined_data['FTA']) * 100

# Calculate eFG% (Effective Field Goal Percentage)
combined_data['eFG%'] = ((combined_data['FG'] + 0.5 * combined_data['3P']) / combined_data['FGA']) * 100

# Handle division by zero (replace NaN values with 0 or another value)
combined_data['FG%'].fillna(0, inplace=True)
combined_data['3P%'].fillna(0, inplace=True)
combined_data['2P%'].fillna(0, inplace=True)
combined_data['FT%'].fillna(0, inplace=True)
combined_data['eFG%'].fillna(0, inplace=True)

# Show updated empty values in combined data
combined_data.isnull().sum()

Player        0
Age          45
Team         45
Pos          45
G            45
GS          345
MP           45
FG           45
FGA          45
FG%           0
3P           45
3PA          45
3P%           0
2P           45
2PA          45
2P%           0
eFG%          0
FT           45
FTA          45
FT%           0
ORB          45
DRB          45
TRB          45
AST          45
STL          45
BLK          45
TOV          45
PF           45
PTS          45
Awards    21821
Season        0
dtype: int64

In [5]:
# Drop lines representing 'League Average'
combined_data = combined_data[combined_data['Player'] != 'League Average']

# Show updated empty values in combined data
combined_data.isnull().sum()

Player        0
Age           0
Team          0
Pos           0
G             0
GS          300
MP            0
FG            0
FGA           0
FG%           0
3P            0
3PA           0
3P%           0
2P            0
2PA           0
2P%           0
eFG%          0
FT            0
FTA           0
FT%           0
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Awards    21776
Season        0
dtype: int64

In [10]:
# Most NaN for GS come from players in the 1980-81 season, add 0 values since data is from long ago
combined_data['GS'] = combined_data['GS'].fillna(0)

# Add 'No Award' value for 'Awards' column where Awards is NaN
combined_data.loc[combined_data['Awards'].isna(), 'Awards'] = "No Awards"

# Save combined dataframe to processed data directory
processed_data_path = "../processed-player-data/combined_data.csv"
combined_data.to_csv(processed_data_path, index=False)
print(f"Data saved to: {processed_data_path}") # Confirmation

# Show updated empty values in combined data
combined_data.isnull().sum()

Data saved to: ../processed-player-data/combined_data.csv


Player    0
Age       0
Team      0
Pos       0
G         0
         ..
TOV       0
PF        0
PTS       0
Awards    0
Season    0
Length: 33, dtype: int64

In [11]:
# Add TS% column to the combined dataset
combined_data['TS%'] = combined_data.apply(
    lambda row: (row['PTS'] / (2 * (row['FGA'] + 0.44 * row['FTA'])) * 100)
    if (row['FGA'] + 0.44 * row['FTA']) > 0 else 0, axis=1
)

# Display the updated DataFrame
combined_data.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,TS%,EFF,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Adrian Dantley,25.0,UTA,SF,80.0,0.0,42.7,11.4,20.3,56.157635,0.0,0.1,0.0,11.3,20.3,55.665025,56.157635,7.9,9.8,80.612245,62.367951,35.5,2.4,4.0,6.4,4.0,1.4,0.2,3.5,3.1,30.7,"MVP-12,AS,NBA2",1980-81
1,Moses Malone,25.0,HOU,C,80.0,0.0,40.6,10.1,19.3,52.331606,0.0,0.0,0.0,10.1,19.3,52.331606,52.331606,7.6,10.1,75.247525,58.541105,39.625,5.9,8.8,14.8,1.8,1.0,1.9,3.9,2.8,27.8,"MVP-4,AS,NBA2",1980-81
2,George Gervin,28.0,SAS,SG,82.0,82.0,33.7,10.4,21.1,49.2891,0.1,0.4,25.0,10.3,20.7,49.758454,49.526066,6.2,7.6,81.578947,55.432826,26.829268,1.5,3.6,5.1,3.2,1.1,0.7,3.1,2.6,27.1,"MVP-5,AS,NBA1",1980-81
3,Kareem Abdul-Jabbar,33.0,LAL,C,80.0,0.0,37.2,10.5,18.2,57.692308,0.0,0.0,0.0,10.5,18.2,57.692308,57.692308,5.3,6.9,76.811594,61.6877,38.875,2.5,7.8,10.3,3.4,0.7,2.9,3.1,3.1,26.2,"MVP-3,AS,NBA1",1980-81
4,David Thompson,26.0,DEN,SG,77.0,0.0,34.0,9.5,18.8,50.531915,0.1,0.5,20.0,9.4,18.3,51.36612,50.797872,6.4,8.0,80.0,57.123656,25.454545,1.4,2.3,3.7,3.0,0.7,0.8,3.2,3.0,25.5,No Awards,1980-81


In [12]:
# Add EFF column to the combined dataset
combined_data['EFF'] = combined_data.apply(
    lambda row: (
        (row['PTS'] + row['TRB'] + row['AST'] + row['STL'] + row['BLK'] -
         (row['FGA'] - row['FG']) - (row['FTA'] - row['FT']) - row['TOV']) / row['G'] * 100
    ) if row['G'] > 0 else 0, axis=1
)

# Display the updated DataFrame
combined_data.head()

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,TS%,EFF,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Season
0,Adrian Dantley,25.0,UTA,SF,80.0,0.0,42.7,11.4,20.3,56.157635,0.0,0.1,0.0,11.3,20.3,55.665025,56.157635,7.9,9.8,80.612245,62.367951,35.5,2.4,4.0,6.4,4.0,1.4,0.2,3.5,3.1,30.7,"MVP-12,AS,NBA2",1980-81
1,Moses Malone,25.0,HOU,C,80.0,0.0,40.6,10.1,19.3,52.331606,0.0,0.0,0.0,10.1,19.3,52.331606,52.331606,7.6,10.1,75.247525,58.541105,39.625,5.9,8.8,14.8,1.8,1.0,1.9,3.9,2.8,27.8,"MVP-4,AS,NBA2",1980-81
2,George Gervin,28.0,SAS,SG,82.0,82.0,33.7,10.4,21.1,49.2891,0.1,0.4,25.0,10.3,20.7,49.758454,49.526066,6.2,7.6,81.578947,55.432826,26.829268,1.5,3.6,5.1,3.2,1.1,0.7,3.1,2.6,27.1,"MVP-5,AS,NBA1",1980-81
3,Kareem Abdul-Jabbar,33.0,LAL,C,80.0,0.0,37.2,10.5,18.2,57.692308,0.0,0.0,0.0,10.5,18.2,57.692308,57.692308,5.3,6.9,76.811594,61.6877,38.875,2.5,7.8,10.3,3.4,0.7,2.9,3.1,3.1,26.2,"MVP-3,AS,NBA1",1980-81
4,David Thompson,26.0,DEN,SG,77.0,0.0,34.0,9.5,18.8,50.531915,0.1,0.5,20.0,9.4,18.3,51.36612,50.797872,6.4,8.0,80.0,57.123656,25.454545,1.4,2.3,3.7,3.0,0.7,0.8,3.2,3.0,25.5,No Awards,1980-81


In [13]:
print(combined_data.columns)
# Reorder Features
columns = ['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
           '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
           'TS%', 'EFF', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 
           'Awards', 'Season']

# Apply the new column order
combined_data = combined_data[columns]
# Display all columns in the terminal
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 10)      # Limit the number of rows shown for readability

# Print the DataFrame
print(combined_data)

Index(['Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'TS%', 'EFF', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'Awards', 'Season'],
      dtype='object')
                    Player   Age Team Pos     G    GS    MP    FG   FGA  \
0           Adrian Dantley  25.0  UTA  SF  80.0   0.0  42.7  11.4  20.3   
1             Moses Malone  25.0  HOU   C  80.0   0.0  40.6  10.1  19.3   
2            George Gervin  28.0  SAS  SG  82.0  82.0  33.7  10.4  21.1   
3      Kareem Abdul-Jabbar  33.0  LAL   C  80.0   0.0  37.2  10.5  18.2   
4           David Thompson  26.0  DEN  SG  77.0   0.0  34.0   9.5  18.8   
...                    ...   ...  ...  ..   ...   ...   ...   ...   ...   
24059          Mac McClung  26.0  ORL  SG   1.0   0.0   5.0   0.0   0.0   
24060        Justin Minaya  25.0  POR  SF   1.0   0.0   6.0   0.0   0.0   
24061          Riley Minix  24.0  SAS  SF