In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
df = pd.read_csv("data/player_logs_2023-24_with_bio.csv")
df.head()

Unnamed: 0,SEASON_ID,PLAYER_ID,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,PLAYER_NAME,SEASON,PLAYER_NAME_BIO,POSITION,HEIGHT,HEIGHT_IN,WEIGHT,WEIGHT_LB,BIRTHDATE,TEAM
0,22023,1630173,22301190,"Apr 14, 2024",NYK vs. CHI,W,19,2,3,0.667,...,Precious Achiuwa,2023-24,Precious Achiuwa,Forward,6-8,80,243.0,243.0,1999-09-19T00:00:00,Knicks
1,22023,1630173,22301175,"Apr 12, 2024",NYK vs. BKN,W,8,2,2,1.0,...,Precious Achiuwa,2023-24,Precious Achiuwa,Forward,6-8,80,243.0,243.0,1999-09-19T00:00:00,Knicks
2,22023,1630173,22301167,"Apr 11, 2024",NYK @ BOS,W,16,1,6,0.167,...,Precious Achiuwa,2023-24,Precious Achiuwa,Forward,6-8,80,243.0,243.0,1999-09-19T00:00:00,Knicks
3,22023,1630173,22301139,"Apr 07, 2024",NYK @ MIL,W,5,0,1,0.0,...,Precious Achiuwa,2023-24,Precious Achiuwa,Forward,6-8,80,243.0,243.0,1999-09-19T00:00:00,Knicks
4,22023,1630173,22301119,"Apr 05, 2024",NYK @ CHI,L,19,0,2,0.0,...,Precious Achiuwa,2023-24,Precious Achiuwa,Forward,6-8,80,243.0,243.0,1999-09-19T00:00:00,Knicks


In [3]:
# Calculate per 36 minutes stats
# List of columns to calculate per 36 minutes stats for
per_game_stats = ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

for stat in per_game_stats:
    # Create a new column name for the per 36 minutes stat
    per_36_stat_name = f'{stat}_PER_36_MIN'
    # Calculate the per 36 minutes stat, handling division by zero by filling with 0
    df[per_36_stat_name] = (df[stat] / df['MIN'] * 36).fillna(0)

display(df.head())

Unnamed: 0,SEASON_ID,PLAYER_ID,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,FTA_PER_36_MIN,OREB_PER_36_MIN,DREB_PER_36_MIN,REB_PER_36_MIN,AST_PER_36_MIN,STL_PER_36_MIN,BLK_PER_36_MIN,TOV_PER_36_MIN,PF_PER_36_MIN,PTS_PER_36_MIN
0,22023,1630173,22301190,"Apr 14, 2024",NYK vs. CHI,W,19,2,3,0.667,...,0.0,1.894737,7.578947,9.473684,3.789474,0.0,1.894737,3.789474,5.684211,7.578947
1,22023,1630173,22301175,"Apr 12, 2024",NYK vs. BKN,W,8,2,2,1.0,...,0.0,0.0,13.5,13.5,0.0,0.0,0.0,0.0,0.0,22.5
2,22023,1630173,22301167,"Apr 11, 2024",NYK @ BOS,W,16,1,6,0.167,...,0.0,4.5,6.75,11.25,0.0,0.0,2.25,2.25,0.0,4.5
3,22023,1630173,22301139,"Apr 07, 2024",NYK @ MIL,W,5,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.2,0.0
4,22023,1630173,22301119,"Apr 05, 2024",NYK @ CHI,L,19,0,2,0.0,...,0.0,1.894737,5.684211,7.578947,1.894737,1.894737,0.0,1.894737,7.578947,0.0


In [4]:
# Group by player and aggregate the numerical stats by calculating the mean
# Also include the 'POSITION' and take the first value
aggregation_functions = {col: 'mean' for col in df.select_dtypes(include=['number']).columns if col not in ['SEASON_ID', 'GAME_ID', 'VIDEO_AVAILABLE', 'PLAYER_ID']}
aggregation_functions['POSITION'] = 'first'


df_aggregated = df.groupby(['PLAYER_ID', 'PLAYER_NAME']).agg(aggregation_functions).reset_index()

display(df_aggregated.head())
display(df_aggregated.info())

Unnamed: 0,PLAYER_ID,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,OREB_PER_36_MIN,DREB_PER_36_MIN,REB_PER_36_MIN,AST_PER_36_MIN,STL_PER_36_MIN,BLK_PER_36_MIN,TOV_PER_36_MIN,PF_PER_36_MIN,PTS_PER_36_MIN,POSITION
0,2544,LeBron James,35.323944,9.647887,17.873239,0.542239,2.098592,5.112676,0.404592,4.267606,...,0.862764,6.587453,7.450217,8.552123,1.240106,0.523435,3.508694,1.091847,26.195756,Forward
1,101108,Chris Paul,26.431034,3.551724,8.051724,0.425069,1.344828,3.62069,0.331966,0.741379,...,0.414074,4.881236,5.29531,9.144667,1.778351,0.118572,1.792716,2.603366,12.688309,Guard
2,200768,Kyle Lowry,28.183333,2.716667,6.283333,0.409717,1.633333,4.166667,0.38095,1.05,...,0.719022,3.370198,4.08922,5.339844,1.256857,0.454109,1.821471,3.144748,10.002223,Guard
3,200782,P.J. Tucker,15.709677,0.580645,1.612903,0.197839,0.419355,1.129032,0.241968,0.096774,...,1.672023,3.763643,5.435666,1.170992,1.267622,0.361255,0.553392,4.497375,3.275039,Forward
4,201142,Kevin Durant,37.2,10.013333,19.146667,0.530987,2.24,5.426667,0.422333,4.826667,...,0.516608,5.851974,6.368582,4.869153,0.908258,1.173239,3.134328,1.697489,26.38506,Forward


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 41 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PLAYER_ID        448 non-null    int64  
 1   PLAYER_NAME      448 non-null    object 
 2   MIN              448 non-null    float64
 3   FGM              448 non-null    float64
 4   FGA              448 non-null    float64
 5   FG_PCT           448 non-null    float64
 6   FG3M             448 non-null    float64
 7   FG3A             448 non-null    float64
 8   FG3_PCT          448 non-null    float64
 9   FTM              448 non-null    float64
 10  FTA              448 non-null    float64
 11  FT_PCT           448 non-null    float64
 12  OREB             448 non-null    float64
 13  DREB             448 non-null    float64
 14  REB              448 non-null    float64
 15  AST              448 non-null    float64
 16  STL              448 non-null    float64
 17  BLK             

None

In [5]:
# Calculate Assist-to-Turnover Ratio, handling division by zero
df_aggregated['AST_TOV_RATIO'] = np.where(
    df_aggregated['TOV'] == 0,
    0,  # Or a small number, or NaN, depending on desired handling
    df_aggregated['AST'] / df_aggregated['TOV']
)

# Calculate True Shooting Percentage, handling division by zero
df_aggregated['TS_PCT'] = np.where(
    (2 * (df_aggregated['FGA'] + 0.44 * df_aggregated['FTA'])) == 0,
    0,  # Or NaN
    df_aggregated['PTS'] / (2 * (df_aggregated['FGA'] + 0.44 * df_aggregated['FTA']))
)

# Calculate Offensive Rebounding Ratio
df_aggregated['OREB_RATIO'] = np.where(
    df_aggregated['REB'] == 0,
    0, # Or NaN
    df_aggregated['OREB'] / df_aggregated['REB']
)

# Calculate Defensive Rebounding Ratio
df_aggregated['DREB_RATIO'] = np.where(
    df_aggregated['REB'] == 0,
    0, # Or NaN
    df_aggregated['DREB'] / df_aggregated['REB']
)

display(df_aggregated.head())

Unnamed: 0,PLAYER_ID,PLAYER_NAME,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,STL_PER_36_MIN,BLK_PER_36_MIN,TOV_PER_36_MIN,PF_PER_36_MIN,PTS_PER_36_MIN,POSITION,AST_TOV_RATIO,TS_PCT,OREB_RATIO,DREB_RATIO
0,2544,LeBron James,35.323944,9.647887,17.873239,0.542239,2.098592,5.112676,0.404592,4.267606,...,1.240106,0.523435,3.508694,1.091847,26.195756,Forward,2.404082,0.629683,0.117761,0.882239
1,101108,Chris Paul,26.431034,3.551724,8.051724,0.425069,1.344828,3.62069,0.331966,0.741379,...,1.778351,0.118572,1.792716,2.603366,12.688309,Guard,5.171053,0.544011,0.08,0.92
2,200768,Kyle Lowry,28.183333,2.716667,6.283333,0.409717,1.633333,4.166667,0.38095,1.05,...,1.256857,0.454109,1.821471,3.144748,10.002223,Guard,2.976471,0.593902,0.175258,0.824742
3,200782,P.J. Tucker,15.709677,0.580645,1.612903,0.197839,0.419355,1.129032,0.241968,0.096774,...,1.267622,0.361255,0.553392,4.497375,3.275039,Forward,2.0,0.506625,0.329412,0.670588
4,201142,Kevin Durant,37.2,10.013333,19.146667,0.530987,2.24,5.426667,0.422333,4.826667,...,0.908258,1.173239,3.134328,1.697489,26.38506,Forward,1.54918,0.626341,0.082828,0.917172


In [6]:
df_aggregated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 45 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PLAYER_ID        448 non-null    int64  
 1   PLAYER_NAME      448 non-null    object 
 2   MIN              448 non-null    float64
 3   FGM              448 non-null    float64
 4   FGA              448 non-null    float64
 5   FG_PCT           448 non-null    float64
 6   FG3M             448 non-null    float64
 7   FG3A             448 non-null    float64
 8   FG3_PCT          448 non-null    float64
 9   FTM              448 non-null    float64
 10  FTA              448 non-null    float64
 11  FT_PCT           448 non-null    float64
 12  OREB             448 non-null    float64
 13  DREB             448 non-null    float64
 14  REB              448 non-null    float64
 15  AST              448 non-null    float64
 16  STL              448 non-null    float64
 17  BLK             

In [7]:
X = df_aggregated.select_dtypes(include=['number']).drop('PLAYER_ID', axis=1)

# Clean the 'POSITION' column to keep only the first position listed
# Assuming positions are separated by '/'
df_aggregated['POSITION'] = df_aggregated['POSITION'].str.split('/').str[0].str.strip()
y = df_aggregated['POSITION']

display(X.head())
display(y.head())

Unnamed: 0,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,...,AST_PER_36_MIN,STL_PER_36_MIN,BLK_PER_36_MIN,TOV_PER_36_MIN,PF_PER_36_MIN,PTS_PER_36_MIN,AST_TOV_RATIO,TS_PCT,OREB_RATIO,DREB_RATIO
0,35.323944,9.647887,17.873239,0.542239,2.098592,5.112676,0.404592,4.267606,5.690141,0.730859,...,8.552123,1.240106,0.523435,3.508694,1.091847,26.195756,2.404082,0.629683,0.117761,0.882239
1,26.431034,3.551724,8.051724,0.425069,1.344828,3.62069,0.331966,0.741379,0.896552,0.259034,...,9.144667,1.778351,0.118572,1.792716,2.603366,12.688309,5.171053,0.544011,0.08,0.92
2,28.183333,2.716667,6.283333,0.409717,1.633333,4.166667,0.38095,1.05,1.25,0.38195,...,5.339844,1.256857,0.454109,1.821471,3.144748,10.002223,2.976471,0.593902,0.175258,0.824742
3,15.709677,0.580645,1.612903,0.197839,0.419355,1.129032,0.241968,0.096774,0.096774,0.064516,...,1.170992,1.267622,0.361255,0.553392,4.497375,3.275039,2.0,0.506625,0.329412,0.670588
4,37.2,10.013333,19.146667,0.530987,2.24,5.426667,0.422333,4.826667,5.64,0.802547,...,4.869153,0.908258,1.173239,3.134328,1.697489,26.38506,1.54918,0.626341,0.082828,0.917172


0    Forward
1      Guard
2      Guard
3    Forward
4    Forward
Name: POSITION, dtype: object

In [8]:
# Clean the 'POSITION' column to keep only the first position listed
# Assuming positions are separated by '-'
df_aggregated['POSITION'] = df_aggregated['POSITION'].str.split('-').str[0].str.strip()

# Select all numerical columns from the aggregated DataFrame, excluding the player ID
X = df_aggregated.select_dtypes(include=['number']).drop('PLAYER_ID', axis=1)
y = df_aggregated['POSITION']

display(X.head())
display(y.head())

Unnamed: 0,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,...,AST_PER_36_MIN,STL_PER_36_MIN,BLK_PER_36_MIN,TOV_PER_36_MIN,PF_PER_36_MIN,PTS_PER_36_MIN,AST_TOV_RATIO,TS_PCT,OREB_RATIO,DREB_RATIO
0,35.323944,9.647887,17.873239,0.542239,2.098592,5.112676,0.404592,4.267606,5.690141,0.730859,...,8.552123,1.240106,0.523435,3.508694,1.091847,26.195756,2.404082,0.629683,0.117761,0.882239
1,26.431034,3.551724,8.051724,0.425069,1.344828,3.62069,0.331966,0.741379,0.896552,0.259034,...,9.144667,1.778351,0.118572,1.792716,2.603366,12.688309,5.171053,0.544011,0.08,0.92
2,28.183333,2.716667,6.283333,0.409717,1.633333,4.166667,0.38095,1.05,1.25,0.38195,...,5.339844,1.256857,0.454109,1.821471,3.144748,10.002223,2.976471,0.593902,0.175258,0.824742
3,15.709677,0.580645,1.612903,0.197839,0.419355,1.129032,0.241968,0.096774,0.096774,0.064516,...,1.170992,1.267622,0.361255,0.553392,4.497375,3.275039,2.0,0.506625,0.329412,0.670588
4,37.2,10.013333,19.146667,0.530987,2.24,5.426667,0.422333,4.826667,5.64,0.802547,...,4.869153,0.908258,1.173239,3.134328,1.697489,26.38506,1.54918,0.626341,0.082828,0.917172


0    Forward
1      Guard
2      Guard
3    Forward
4    Forward
Name: POSITION, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (358, 42)
Shape of X_test: (90, 42)
Shape of y_train: (358,)
Shape of y_test: (90,)


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the RandomForest Classifier model
rf_model_cleaned_pos = RandomForestClassifier(random_state=42)

rf_model_cleaned_pos.fit(X_train_imputed, y_train)

NameError: name 'X_train_imputed' is not defined