<a href="https://colab.research.google.com/github/DrakeSorensen1/Prediction-Model/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Collection**

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('prediction_model/all_seasons.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.5,86.18248,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.03,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.2,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.2,102.0582,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.5,0.064,1996-97


# **Data Cleaning**

In [2]:
df.rename(columns={'Unnamed: 0': 'player_id'}, inplace=True)

df['age'] = df['age'].astype(int)

df.drop(columns=['college', 'country', 'draft_year', 'draft_round', 'draft_number'], inplace=True)

null_values = df.isnull().sum()

#print("Null values in each column:")
#print(null_values)

#print("\nTypes of data after conversion:\n", df.dtypes)

df.sample(10)

Unnamed: 0,player_id,player_name,team_abbreviation,age,player_height,player_weight,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
6671,6671,Al Jefferson,UTA,26,208.28,127.00576,82,18.6,9.7,1.8,-3.2,0.094,0.228,0.242,0.528,0.09,2010-11
8652,8652,Eric Gordon,NOP,27,193.04,97.52228,45,15.2,2.2,2.7,-2.6,0.008,0.067,0.205,0.565,0.129,2015-16
6379,6379,Cartier Martin,WAS,26,200.66,99.79024,52,4.0,1.4,0.3,2.3,0.043,0.121,0.176,0.524,0.05,2010-11
3618,3618,Latrell Sprewell,MIN,34,195.58,88.45044,80,12.8,3.2,2.2,-1.2,0.032,0.083,0.22,0.489,0.119,2004-05
750,750,Chris Mullin,IND,34,200.66,97.52228,82,11.3,3.0,2.3,10.6,0.022,0.107,0.189,0.607,0.135,1997-98
6907,6907,Donte Greene,SAC,24,210.82,102.511792,53,5.4,2.5,0.6,-5.8,0.024,0.154,0.182,0.491,0.069,2011-12
2704,2704,Kedrick Brown,BOS,22,200.66,100.697424,51,2.8,2.7,0.4,-3.9,0.072,0.162,0.147,0.392,0.052,2002-03
10388,10388,Spencer Dinwiddie,BKN,26,198.12,95.25432,68,16.8,2.4,4.6,-2.7,0.013,0.069,0.242,0.58,0.272,2018-19
1492,1492,Robert Horry,LAL,29,208.28,106.59412,76,5.7,4.8,1.6,12.3,0.088,0.143,0.126,0.528,0.103,1999-00
9699,9699,Trey McKinney-Jones,IND,27,195.58,99.79024,1,0.0,0.0,0.0,-33.3,0.0,0.0,0.0,0.0,0.0,2017-18


# **Data Visualization**

In [None]:
import matplotlib.pyplot as plt

################
# FIRST GRAPH #
###############

# Filter data for a specific player
player_name = 'LeBron James'
player_data = df[df['player_name'] == player_name]

# Plot points
plt.plot(player_data['season'], player_data['pts'], marker='o', label='Points')

# Plot rebounds
plt.plot(player_data['season'], player_data['reb'], marker='o', label='Rebounds')

# Plot assists
plt.plot(player_data['season'], player_data['ast'], marker='o', label='Assists')

# Adding labels and title
plt.xlabel('Year')
plt.ylabel('Stats')
plt.title(f'{player_name} NBA Player Stats by Year')
plt.xticks(player_data['season'])  # Set x-ticks to be the years
plt.xticks(player_data['season'], rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

################
# SECOND GRAPH #
###############

# Filter the DataFrame for a specific team
team_name = 'ATL'
team_df = df[df['team_abbreviation'] == team_name]

# Group the data by season for the specific team
grouped_data = team_df.groupby('season')

# Calculate the mean of each statistic for each season
mean_stats = grouped_data.agg({'pts': 'mean', 'reb': 'mean', 'ast': 'mean', 'net_rating': 'mean'}).reset_index()

# Plot average points
plt.plot(mean_stats['season'], mean_stats['pts'], marker='o', label='Average Points')

# Plot average rebounds
plt.plot(mean_stats['season'], mean_stats['reb'], marker='o', label='Average Rebounds')

# Plot average assists
plt.plot(mean_stats['season'], mean_stats['ast'], marker='o', label='Average Assists')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Average Stats')
plt.title(f'{team_name} Average Performance by Season')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

################
# Third GRAPH #
###############

# Filter the DataFrame for a specific team
team_name = 'GSW'
team_df = df[df['team_abbreviation'] == team_name]

# Group the data by season for the specific team
grouped_data = team_df.groupby('season')

# Calculate the mean of each statistic for each season
mean_stats = grouped_data.agg({'net_rating': 'mean'}).reset_index()

# Plot average net rating
plt.plot(mean_stats['season'], mean_stats['net_rating'], marker='o', label='Average Net Rating')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Average Net Rating')
plt.title(f'{team_name} Average Net Rating by Season')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# **Splitting Data**

In [4]:
# Selecting features
features = ['age', 'pts', 'reb', 'ast', 'usg_pct', 'ts_pct']

# Function to split data for each player
def split_data_by_player(data):
    train_data = []
    test_data = []
    for player_name, player_df in data.groupby('player_name'):

        # Sort player's data chronologically
        player_df = player_df.sort_values(by='season')

        # Determine the index to split the data
        split_index = int(0.8 * len(player_df))

        # Split data into training and testing sets
        train_data.append(player_df.iloc[:split_index])
        test_data.append(player_df.iloc[split_index:])

    return pd.concat(train_data), pd.concat(test_data)

# Split data for each player
train_data, test_data = split_data_by_player(df)

# Extract selected features for training and testing sets
X_train = train_data[features]
y_train = train_data[['pts', 'reb', 'ast']]

X_test = test_data[features]
y_test = test_data[['pts', 'reb', 'ast']]

# Print the shapes of the training and testing sets to verify
print("Training set - X:", X_train.shape, " y:", y_train.shape)
print("Testing set - X:", X_test.shape, " y:", y_test.shape)

# Check the first few rows of the training data for a specific player (e.g., LeBron James)
print("First few rows of training data for LeBron James:")
print(X_train[train_data['player_name'] == 'LeBron James'].head())

# Check the first few rows of the testing data for the same player
print("First few rows of testing data for LeBron James:")
print(X_test[test_data['player_name'] == 'LeBron James'].head())

6
Training set - X: (8997, 6)  y: (8997, 3)
Testing set - X: (3847, 6)  y: (3847, 3)
First few rows of training data for LeBron James:
      age   pts  reb  ast  usg_pct  ts_pct
3448   19  20.9  5.5  5.9    0.280   0.488
3616   20  27.2  7.4  7.2    0.297   0.554
4103   21  31.4  7.0  6.6    0.336   0.568
4807   22  27.3  6.7  6.0    0.309   0.552
5133   23  30.0  7.9  7.2    0.333   0.568
First few rows of testing data for LeBron James:
       age   pts  reb   ast  usg_pct  ts_pct
10958   35  25.3  7.8  10.2    0.308   0.577
11595   36  25.0  7.7   7.8    0.310   0.602
12108   37  30.3  8.2   6.2    0.317   0.619
12733   38  28.9  8.3   6.8    0.322   0.583


# **Training**

In [5]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [42]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error

param_grid = {
    'depth': [4, 6],                 # Different depths of the trees
    'learning_rate': [0.05, 0.1],  # Different learning rates
    'iterations': [100, 200],            # Number of boosting iterations
}

# Initialize CatBoostRegressor
model_pts = GridSearchCV(CatBoostRegressor(silent=True), param_grid)
model_reb = GridSearchCV(CatBoostRegressor(silent=True), param_grid)
model_ast = GridSearchCV(CatBoostRegressor(silent=True), param_grid)

player_name = 'Brandon Ingram'

# Train the model based on the player
# points
new_X_train_pts = X_train[train_data['player_name'] == player_name]
new_y_train_pts = y_train[train_data['player_name'] == player_name]['pts']

# rebounds
new_X_train_reb = X_train[train_data['player_name'] == player_name]
new_y_train_reb = y_train[train_data['player_name'] == player_name]['reb']

# assists
new_X_train_ast = X_train[train_data['player_name'] == player_name]
new_y_train_ast = y_train[train_data['player_name'] == player_name]['ast']

# Train the model
model_pts.fit(new_X_train_pts, new_y_train_pts)
model_reb.fit(new_X_train_reb, new_y_train_reb)
model_ast.fit(new_X_train_ast, new_y_train_ast)

# points
new_X_test_pts = X_test[test_data['player_name'] == player_name]
new_y_test_pts = y_test[test_data['player_name'] == player_name]['pts']

# rebounds
new_X_test_reb = X_test[test_data['player_name'] == player_name]
new_y_test_reb = y_test[test_data['player_name'] == player_name]['reb']

# assists
new_X_test_ast = X_test[test_data['player_name'] == player_name]
new_y_test_ast = y_test[test_data['player_name'] == player_name]['ast']

# Make predictions on the data
# points
test_pred_pts = model_pts.predict(new_X_test_pts)
train_pred_pts = model_pts.predict(new_X_train_pts)

# rebounds
test_pred_reb = model_reb.predict(new_X_test_reb)
train_pred_reb = model_reb.predict(new_X_train_reb)

# assists
test_pred_ast = model_ast.predict(new_X_test_ast)
train_pred_ast = model_ast.predict(new_X_train_ast)

# Calculate MAE for the test predictions
mae_pts = mean_absolute_error(new_y_test_pts, test_pred_pts)
mae_reb = mean_absolute_error(new_y_test_reb, test_pred_reb)
mae_ast = mean_absolute_error(new_y_test_ast, test_pred_ast)

# Calculate MAE for the training predictions
mae_pts_train = mean_absolute_error(new_y_train_pts, train_pred_pts)
mae_reb_train = mean_absolute_error(new_y_train_reb, train_pred_reb)
mae_ast_train = mean_absolute_error(new_y_train_ast, train_pred_ast)

# Calculate MAPE for the training predictions
mape_pts_train = mean_absolute_percentage_error(new_y_train_pts, train_pred_pts)
mape_reb_train = mean_absolute_error(new_y_train_reb, train_pred_reb)
mape_ast_train = mean_absolute_error(new_y_train_ast, train_pred_ast)

# Calculate MAPE for the testing predictions
mape_pts_test = mean_absolute_percentage_error(new_y_test_pts, test_pred_pts)
mape_reb_test = mean_absolute_percentage_error(new_y_test_reb, test_pred_reb)
mape_ast_test = mean_absolute_percentage_error(new_y_test_ast, test_pred_ast)

print("MAE for points (Training):", mae_pts_train)
print("MAE for points (Test):", mae_pts)
print(f"MAPE for points (Training): {100 - mape_pts_train * 100:.2f}%")
print(f"MAPE for points (Test): {100 - mape_pts_test * 100:.2f}%\n")
print("MAE for rebounds (Training):", mae_reb_train)
print("MAE for rebounds (Test):", mae_reb)
print(f"MAPE for rebounds (Training): {100 - mape_reb_train * 100:.2f}%")
print(f"MAPE for rebounds (Test): {100 - mape_reb_test * 100:.2f}%\n")
print("MAE for assists (Training):", mae_ast_train)
print("MAE for assists (Test):", mae_ast)
print(f"MAPE for assists (Training): {100 - mape_ast_train * 100:.2f}%")
print(f"MAPE for assists (Test): {100 - mape_ast_test * 100:.2f}%\n")

  return 1 - residual_sum_of_squares / total_sum_of_squares
  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


MAE for points (Training): 1.0687563212017668
MAE for points (Test): 1.668328559021889
MAPE for points (Training): 92.05%
MAPE for points (Test): 93.06%

MAE for rebounds (Training): 0.15281349583110798
MAE for rebounds (Test): 0.256650998634981
MAPE for rebounds (Training): 84.72%
MAPE for rebounds (Test): 95.49%

MAE for assists (Training): 0.2331675350851592
MAE for assists (Test): 1.4676052543937312
MAPE for assists (Training): 76.68%
MAPE for assists (Test): 74.26%



# **Trained Data Visualization**

In [None]:
# Filter data for a specific player
player_data = df[df['player_name'] == player_name]

#################
# Points Graph #
#################

# Get seasons for test data
train_seasons = player_data.loc[new_X_train_pts.index, 'season']


# Plot actual points for training data
plt.plot(train_seasons, new_y_train_pts, marker='o', label='Actual Points (Training)')

# Plot predicted points for training data
plt.plot(train_seasons, train_pred_pts, marker='o', label='Predicted Points (Training)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Points')
plt.title(f'{player_name} NBA Player Stats by Season (Training Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()


#################
# Rebounds Graph #
#################

# Get seasons for test data
train_seasons = player_data.loc[new_X_train_reb.index, 'season']


# Plot actual points for training data
plt.plot(train_seasons, new_y_train_reb, marker='o', label='Actual Rebounds (Training)')

# Plot predicted points for training data
plt.plot(train_seasons, train_pred_reb, marker='o', label='Predicted Rebounds (Training)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Rebounds')
plt.title(f'{player_name} NBA Player Stats by Season (Training Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

#################
# Assists Graph #
#################

# Get seasons for test data
train_seasons = player_data.loc[new_X_train_ast.index, 'season']

# Plot actual points for training data
plt.plot(train_seasons, new_y_train_ast, marker='o', label='Actual Assists (Training)')

# Plot predicted points for training data
plt.plot(train_seasons, train_pred_ast, marker='o', label='Predicted Assists (Training)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Assists')
plt.title(f'{player_name} NBA Player Stats by Season (Training Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# **Test Data Visualization**

In [None]:
# Get seasons for test data
test_seasons = player_data.loc[new_X_test_pts.index, 'season']

# Plot actual points for test data
plt.plot(test_seasons, new_y_test_pts, marker='o', label='Actual Points (Testing)')

# Plot predicted points for test data
plt.plot(test_seasons, test_pred_pts, marker='o', label='Predicted Points (Testing)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Points')
plt.title(f'{player_name} NBA Player Stats by Season (Testing Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# Get seasons for test data
test_seasons = player_data.loc[new_X_test_reb.index, 'season']

# Plot actual points for test data
plt.plot(test_seasons, new_y_test_reb, marker='o', label='Actual Rebounds (Testing)')

# Plot predicted points for test data
plt.plot(test_seasons, test_pred_reb, marker='o', label='Predicted Rebounds (Testing)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Rebounds')
plt.title(f'{player_name} NBA Player Stats by Season (Testing Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# Get seasons for test data
test_seasons = player_data.loc[new_X_test_ast.index, 'season']

# Plot actual points for test data
plt.plot(test_seasons, new_y_test_ast, marker='o', label='Actual Assists (Testing)')

# Plot predicted points for test data
plt.plot(test_seasons, test_pred_ast, marker='o', label='Predicted Assists (Testing)')

# Adding labels and title
plt.xlabel('Season')
plt.ylabel('Assists')
plt.title(f'{player_name} NBA Player Stats by Season (Testing Data)')
plt.xticks(rotation=45)  # Rotate x-tick labels by 45 degrees
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# **Predict Future Seasons**

In [39]:
# Get the last season in your past data
last_season = df['season'].max()
num_future_seasons = 3

# Extract the year part from the last known season
last_year = int(last_season.split("-")[0])

# Generate future season entries
future_seasons = [f"{last_year + i}-{last_year + i + 1}" for i in range(1, num_future_seasons + 1)]

print(future_seasons)

extracted_col = df['player_name']

# Calculate means of features from past data
feature_means = df.groupby('player_name')[features].mean().reset_index()

future_data = pd.concat([future_data, extracted_col], axis=1)
print(future_data)
# # Merge means with future_data
# future_data = pd.merge(future_data, feature_means, on='player_name', how='left')


# # Extract features from future data for prediction
# X_future = future_data[features]

# # Make predictions for future seasons
# predictions = model_pts.predict(X_future)

# # Combine predictions with future seasons for analysis
# predictions_df = pd.DataFrame({"season": future_seasons, "predicted_stat": predictions})

# # Print or analyze predictions_df as needed
# print(predictions_df)

['2023-2024', '2024-2025', '2025-2026']
          season       player_name       player_name
0      2023-2024  Randy Livingston  Randy Livingston
1      2024-2025  Gaylon Nickerson  Gaylon Nickerson
2      2025-2026      George Lynch      George Lynch
3            NaN    George McCloud    George McCloud
4            NaN      George Zidek      George Zidek
...          ...               ...               ...
12839        NaN       Joel Embiid       Joel Embiid
12840        NaN   John Butler Jr.   John Butler Jr.
12841        NaN      John Collins      John Collins
12842        NaN      Jericho Sims      Jericho Sims
12843        NaN    JaMychal Green    JaMychal Green

[12844 rows x 3 columns]
