In [40]:
# Dependencies
import pandas as pd
import numpy as np

In [41]:
# Initialize variables
cutoff_year = 2000

test_data_cutoff_year = 2022
prediction_data_cutoff_year = test_data_cutoff_year + 1

# DATA CLEANUP

In [42]:
# Read in CSV file 
team_summaries = pd.read_csv("Resources\Team Summaries.csv")
team_summaries = team_summaries[['season','abbreviation','playoffs','w','l']]
team_summaries.rename(columns={'abbreviation':'tm'}, inplace=True)
team_summaries['playoffs'] = team_summaries['playoffs'].astype(int)
# team_summaries

In [43]:
# Read in CSV file
all_nba_df = pd.read_csv("Resources\End of Season Teams.csv")
# all_nba_df.head()

In [44]:
# Filter All-NBA players after year 2010. Keep only columns: all_nba, seas_id
modern_all_nba = all_nba_df[(all_nba_df['season'] >= cutoff_year) & (all_nba_df['type'] == 'All-NBA')][['type','seas_id']]

# print(modern_all_nba.info())
# modern_all_nba.head()

In [45]:
# Read in CSV file
player_totals_df = pd.read_csv("Resources\Player Totals.csv")

# Filter data after cutoff_year
player_totals_df =  player_totals_df[player_totals_df['season'] >= cutoff_year]

# player_totals_df.head()

In [46]:
# Find traded players current team
traded_players = player_totals_df.sort_values(by='seas_id', ascending=False).drop_duplicates(['player','season'])[['seas_id','season','player','tm']]

player_totals = pd.merge(player_totals_df, traded_players, on=['player','season'], how='left')

player_totals.rename(columns={'seas_id_x':'seas_id','tm_y':'tm'}, inplace=True)
player_totals.drop('seas_id_y', axis=1, inplace=True)

player_totals = pd.merge(player_totals, team_summaries, on=['season','tm'], how='left')

player_totals = player_totals.sort_values(by='seas_id').drop_duplicates(['season','player'])
#player_totals

In [47]:
# Filter data after 2010
modern_player_totals = player_totals.copy()

# Categorize positions into 3 categories
modern_player_totals['forward'] = modern_player_totals['pos'].str.contains('f', case=False).astype(int)
modern_player_totals['guard'] = modern_player_totals['pos'].str.contains('g', case=False).astype(int)
modern_player_totals['center'] = modern_player_totals['pos'].str.contains('c', case=False).astype(int)

# Drop unnecessary columns
modern_player_totals.drop(['birth_year','player_id','player','lg','tm_x','pos','fg_percent','x3p_percent','x2p_percent','ft_percent','e_fg_percent','tm'], axis=1, inplace=True)


# print(modern_player_totals.info())
# modern_player_totals.head()

In [48]:
# Merge ALL NBA members with players
merged_nba = pd.merge(modern_all_nba, modern_player_totals, on='seas_id', how='right')
merged_nba['all_nba'] = merged_nba['type'].notnull().astype(int)

merged_nba.drop('type', axis=1, inplace=True)

# print(merged_nba.info())
# merged_nba.head()

# MACHINE LEARNING

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [50]:
test_data = merged_nba[merged_nba['season'] <= test_data_cutoff_year]

# Split the data into X_train, X_test, y_train, y_test
X = test_data.drop("all_nba", axis=1)
y = test_data["all_nba"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale the features using a standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# By positions
Xg = X[X['guard'] == 1].drop(['forward','center'], axis=1)
yg = test_data[test_data['guard'] == 1]['all_nba']
Xg_train, Xg_test, yg_train, yg_test = train_test_split(Xg, yg, random_state=1)

Xf = X[X['forward'] == 1].drop(['guard','center'], axis=1)
yf = test_data[test_data['forward'] == 1]['all_nba']
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, random_state=1)

Xc = X[X['center'] == 1].drop(['guard','forward'], axis=1)
yc = test_data[test_data['center'] == 1]['all_nba']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, random_state=1)

scalerg = StandardScaler()
Xg_train_scaled = scalerg.fit_transform(Xg_train)
Xg_test_scaled = scalerg.transform(Xg_test)

scalerf = StandardScaler()
Xf_train_scaled = scalerf.fit_transform(Xf_train)
Xf_test_scaled = scalerf.transform(Xf_test)

scalerc = StandardScaler()
Xc_train_scaled = scalerc.fit_transform(Xc_train)
Xc_test_scaled = scalerc.transform(Xc_test)

In [51]:
# Train a Logistic Regression model and print the model score
classifier = LogisticRegression(max_iter=10000)

classifier.fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")
print('-------------------------------------')

# By positions
classifierg = LogisticRegression(max_iter=10000)
classifierg.fit(Xg_train_scaled, yg_train)

print(f"Training Data Score: {classifierg.score(Xg_train_scaled, yg_train)}")
print(f"Testing Data Score: {classifierg.score(Xg_test_scaled, yg_test)}")
print('-------------------------------------')

classifierf = LogisticRegression(max_iter=10000)
classifierf.fit(Xf_train_scaled, yf_train)

print(f"Training Data Score: {classifierf.score(Xf_train_scaled, yf_train)}")
print(f"Testing Data Score: {classifierf.score(Xf_test_scaled, yf_test)}")
print('-------------------------------------')

classifierc = LogisticRegression(max_iter=10000)
classifierc.fit(Xc_train_scaled, yc_train)

print(f"Training Data Score: {classifierc.score(Xc_train_scaled, yc_train)}")
print(f"Testing Data Score: {classifierc.score(Xc_test_scaled, yc_test)}")

Training Data Score: 0.9894379021488406
Testing Data Score: 0.9898033503277495
-------------------------------------
Training Data Score: 0.9892697466467958
Testing Data Score: 0.9874888293118856
-------------------------------------
Training Data Score: 0.9932556713672593
Testing Data Score: 0.9944852941176471
-------------------------------------
Training Data Score: 0.9923708920187794
Testing Data Score: 0.9824253075571178


In [52]:
# Make predictions on new data
prediction_data = merged_nba[merged_nba['season'] == prediction_data_cutoff_year].drop('all_nba', axis=1)
prediction_data_scaled = scaler.transform(prediction_data)
proba = classifier.predict_proba(prediction_data_scaled)

# Add the All-NBA probability to the prediction_data dataframe as percentage
prediction_data['All-NBA Probability'] = np.around(proba[:, 1] * 100 , 2)
prediction_data.sort_values(by='All-NBA Probability', ascending=False, inplace=True)

# By positions
prediction_gdata = merged_nba[(merged_nba['season'] == prediction_data_cutoff_year) & (merged_nba['guard']==1)].drop(['all_nba','forward','center'], axis=1)
prediction_gdata_scaled = scalerg.transform(prediction_gdata)
proba_g = classifierg.predict_proba(prediction_gdata_scaled)

# Add the All-NBA probability to the prediction_data dataframe as percentage
prediction_gdata['All-NBA G Probability'] = np.around(proba_g[:, 1] * 100 , 2)
prediction_gdata.sort_values(by='All-NBA G Probability', ascending=False, inplace=True)

prediction_fdata = merged_nba[(merged_nba['season'] == prediction_data_cutoff_year) & (merged_nba['forward']==1)].drop(['all_nba','guard','center'], axis=1)
prediction_fdata_scaled = scalerf.transform(prediction_fdata)
proba_f = classifierf.predict_proba(prediction_fdata_scaled)

# Add the All-NBA probability to the prediction_data dataframe as percentage
prediction_fdata['All-NBA F Probability'] = np.around(proba_f[:, 1] * 100 , 2)
prediction_fdata.sort_values(by='All-NBA F Probability', ascending=False, inplace=True)

prediction_cdata = merged_nba[(merged_nba['season'] == prediction_data_cutoff_year) & (merged_nba['center']==1)].drop(['all_nba','guard','forward'], axis=1)
prediction_cdata_scaled = scalerc.transform(prediction_cdata)
proba_c = classifierc.predict_proba(prediction_cdata_scaled)

# Add the All-NBA probability to the prediction_data dataframe as percentage
prediction_cdata['All-NBA C Probability'] = np.around(proba_c[:, 1] * 100 , 2)
prediction_cdata.sort_values(by='All-NBA C Probability', ascending=False, inplace=True)

In [53]:
# Merge in player names with correct Team name.  Correcting erroneous players with TOT as team name
player_info = player_totals.sort_values(by='seas_id', ascending=False)[['seas_id','player','tm']]

predicted_players = pd.merge(player_info, prediction_data, on='seas_id', how='right')

predicted_g_players = pd.merge(player_info, prediction_gdata, on='seas_id', how='right')
predicted_f_players = pd.merge(player_info, prediction_fdata, on='seas_id', how='right')
predicted_c_players = pd.merge(player_info, prediction_cdata, on='seas_id', how='right')

In [54]:
# Separate players by positions
top_guards = predicted_players[predicted_players['guard'] == 1].head(20)
top_forwards = predicted_players[predicted_players['forward'] == 1].head(10)
top_centers = predicted_players[predicted_players['center'] == 1].head(10)

# By positions
top_gguards = predicted_g_players[predicted_g_players['guard'] == 1].head(20)
top_fforwards = predicted_f_players[predicted_f_players['forward'] == 1].head(10)
top_ccenters = predicted_c_players[predicted_c_players['center'] == 1].head(10)


# OUTPUT

In [55]:
print("Top Guards")
print(top_guards.head(10).to_markdown())
print('-----------------------------------------------------------------------------------------------------')
print("Top Forwards")
print(top_forwards.head(10).to_markdown())
print('-----------------------------------------------------------------------------------------------------')
print("Top Centers")
print(top_centers.head(6).to_markdown())

Top Guards
|    |   seas_id | player                  | tm   |   season |   age |   experience |   g |   gs |   mp |   fg |   fga |   x3p |   x3pa |   x2p |   x2pa |   ft |   fta |   orb |   drb |   trb |   ast |   stl |   blk |   tov |   pf |   pts |   playoffs |   w |   l |   forward |   guard |   center |   All-NBA Probability |
|---:|----------:|:------------------------|:-----|---------:|------:|-------------:|----:|-----:|-----:|-----:|------:|------:|-------:|------:|-------:|-----:|------:|------:|------:|------:|------:|------:|------:|------:|-----:|------:|-----------:|----:|----:|----------:|--------:|---------:|----------------------:|
|  3 |     30847 | Luka Dončić             | DAL  |     2023 |    23 |            5 |  53 |   53 | 1930 |  594 |  1175 |   145 |    415 |   449 |    760 |  426 |   583 |    46 |   419 |   465 |   428 |    79 |    27 |   192 |  140 |  1759 |          0 |  32 |  31 |         0 |       1 |        0 |                 93.87 |
|  5 |     30680 | J

In [56]:
# print("Top Guards")
# print(top_gguards.head(10).to_markdown())
# print('-----------------------------------------------------------------------------------------------------')
# print("Top Forwards")
# print(top_fforwards.head(10).to_markdown())
# print('-----------------------------------------------------------------------------------------------------')
# print("Top Centers")
# print(top_ccenters.head(6).to_markdown())

Top Guards
|    |   seas_id | player                  | tm   |   season |   age |   experience |   g |   gs |   mp |   fg |   fga |   x3p |   x3pa |   x2p |   x2pa |   ft |   fta |   orb |   drb |   trb |   ast |   stl |   blk |   tov |   pf |   pts |   playoffs |   w |   l |   guard |   All-NBA G Probability |
|---:|----------:|:------------------------|:-----|---------:|------:|-------------:|----:|-----:|-----:|-----:|------:|------:|-------:|------:|-------:|-----:|------:|------:|------:|------:|------:|------:|------:|------:|-----:|------:|-----------:|----:|----:|--------:|------------------------:|
|  0 |     30847 | Luka Dončić             | DAL  |     2023 |    23 |            5 |  53 |   53 | 1930 |  594 |  1175 |   145 |    415 |   449 |    760 |  426 |   583 |    46 |   419 |   465 |   428 |    79 |    27 |   192 |  140 |  1759 |          0 |  32 |  31 |       1 |                   90.92 |
|  1 |     30680 | Ja Morant               | MEM  |     2023 |    23 |            4