<a href="https://colab.research.google.com/github/DrakeSorensen1/Prediction-Model/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Collection**

In [None]:
import pandas as pd
import numpy as np

# Location of the season-level CSV, relative to the notebook's working directory.
DATA_PATH = 'prediction_model/all_seasons.csv'

# Load the table into memory.
df = pd.read_csv(DATA_PATH)

# Preview the first rows.
df.head()

# **Data Cleaning**

In [None]:
# Clean the loaded frame with a non-mutating chain so this cell can be re-run:
# an in-place drop raises KeyError on a second run because the columns are
# already gone.
df = (
    df.rename(columns={'Unnamed: 0': 'player_id'})  # give the unnamed first column a real name
      .astype({'age': int})                         # store age as an integer
      .drop(
          columns=['college', 'country', 'draft_year', 'draft_round', 'draft_number'],
          errors='ignore',                          # tolerate columns that were already removed
      )
)

# Count of missing values per column, kept available for ad-hoc inspection.
null_values = df.isnull().sum()

# Show ten randomly chosen rows of the cleaned frame.
df.sample(10)

# **Data Visualization**

In [None]:
import matplotlib.pyplot as plt

################
# FIRST GRAPH #
###############

# Season-by-season stat lines for a single player.
player_name = 'LeBron James'

# Sort chronologically: the seasons are used directly as x values, so the row
# order determines both the tick order and how the line segments connect.
player_data = df[df['player_name'] == player_name].sort_values(by='season')

# One line per stat: (column in the frame, legend label).
for stat_column, stat_label in (('pts', 'Points'), ('reb', 'Rebounds'), ('ast', 'Assists')):
    plt.plot(player_data['season'], player_data[stat_column], marker='o', label=stat_label)

# Adding labels and title
plt.xlabel('Year')
plt.ylabel('Stats')
plt.title(f'{player_name} NBA Player Stats by Year')
# A single call both pins one tick to each season and rotates the labels.
plt.xticks(player_data['season'], rotation=45)
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

################
# SECOND GRAPH #
###############

# Team whose per-season averages are drawn
team_name = 'ATL'
is_team_row = df['team_abbreviation'] == team_name
team_df = df[is_team_row]

# Bucket the team's rows by season, then average each statistic per bucket
grouped_data = team_df.groupby('season')
mean_stats = grouped_data.agg(
    dict.fromkeys(['pts', 'reb', 'ast', 'net_rating'], 'mean')
).reset_index()

# Draw one line per averaged statistic: (column, legend label)
for avg_column, avg_label in (
    ('pts', 'Average Points'),
    ('reb', 'Average Rebounds'),
    ('ast', 'Average Assists'),
):
    plt.plot(mean_stats['season'], mean_stats[avg_column], marker='o', label=avg_label)

# Axis labels, title and legend
plt.xlabel('Season')
plt.ylabel('Average Stats')
plt.title(f'{team_name} Average Performance by Season')
plt.xticks(rotation=45)  # slanted season labels are easier to read
plt.legend()

# Render the figure
plt.tight_layout()
plt.show()

################
# Third GRAPH #
###############

# Team whose per-season average net rating is drawn
team_name = 'GSW'
is_team_row = df['team_abbreviation'] == team_name
team_df = df[is_team_row]

# Bucket the team's rows by season and average the net rating in each bucket
grouped_data = team_df.groupby('season')
mean_stats = grouped_data.agg(dict(net_rating='mean')).reset_index()

# Single line: mean net rating against season
season_axis = mean_stats['season']
rating_axis = mean_stats['net_rating']
plt.plot(season_axis, rating_axis, marker='o', label='Average Net Rating')

# Axis labels, title and legend
plt.xlabel('Season')
plt.ylabel('Average Net Rating')
plt.title(f'{team_name} Average Net Rating by Season')
plt.xticks(rotation=45)  # slanted season labels are easier to read
plt.legend()

# Render the figure
plt.tight_layout()
plt.show()

# **Splitting Data**

In [None]:
# Selecting features: the columns used as model inputs (X_train / X_test).
# NOTE(review): 'pts', 'reb' and 'ast' are also the prediction targets
# (y_train / y_test select exactly those columns from the same rows), so each
# model is handed the value it is asked to predict. Confirm this is intended;
# otherwise drop them here or replace them with previous-season values.
features = ['age', 'pts', 'reb', 'ast', 'usg_pct', 'ts_pct']

# Function to split data for each player
def split_data_by_player(data, train_frac=0.8):
    """Split every player's rows chronologically into a train and a test part.

    Rows are grouped by ``player_name``; each group is sorted by ``season`` and
    the first ``int(train_frac * n)`` rows go to the training set while the
    remaining rows go to the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``player_name`` and ``season``.
    train_frac : float, default 0.8
        Fraction of each player's rows assigned to training. Must lie in
        ``[0, 1]``. Because the cut index is truncated, a player with a single
        row ends up entirely in the test set at the default value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, each the concatenation of the per-player parts in
        group order. For an input without rows both frames are empty and keep
        the input's columns.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    train_parts = []
    test_parts = []
    for _player, player_df in data.groupby('player_name'):
        # Sort player's data chronologically
        player_df = player_df.sort_values(by='season')

        # Position at which the player's seasons are cut into train / test
        split_index = int(train_frac * len(player_df))

        train_parts.append(player_df.iloc[:split_index])
        test_parts.append(player_df.iloc[split_index:])

    # pd.concat raises on an empty list, so handle "no players" explicitly.
    if not train_parts:
        return data.iloc[0:0], data.iloc[0:0]

    return pd.concat(train_parts), pd.concat(test_parts)

# Run the per-player chronological split over the whole table
train_data, test_data = split_data_by_player(df)

# Columns the models are trained to predict
target_columns = ['pts', 'reb', 'ast']

# Inputs and targets for each side of the split
X_train, y_train = train_data[features], train_data[target_columns]
X_test, y_test = test_data[features], test_data[target_columns]

# Report the shapes of both sides as a quick sanity check
print("Training set - X:", X_train.shape, " y:", y_train.shape)
print("Testing set - X:", X_test.shape, " y:", y_test.shape)

# Peek at one player's rows on the training side (e.g., LeBron James)
print("First few rows of training data for LeBron James:")
lebron_in_train = train_data['player_name'] == 'LeBron James'
print(X_train[lebron_in_train].head())

# ...and the same player's rows on the testing side
print("First few rows of testing data for LeBron James:")
lebron_in_test = test_data['player_name'] == 'LeBron James'
print(X_test[lebron_in_test].head())

# **Training**

In [None]:
# Install CatBoost into the running kernel's environment; the training cell
# imports CatBoostRegressor from it. No version is pinned here.
%pip install catboost

In [None]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error

# Hyper-parameter grid searched for every target
param_grid = dict(
    depth=[4, 6],                # tree depths to try
    learning_rate=[0.05, 0.1],   # learning rates to try
    iterations=[100, 200],       # numbers of boosting iterations to try
)

# One independent grid search (wrapping a silent CatBoost regressor) per target
model_pts, model_reb, model_ast = (
    GridSearchCV(CatBoostRegressor(silent=True), param_grid) for _ in range(3)
)

player_name = 'Brandon Ingram'

# Rows of the training split that belong to the chosen player
player_train_rows = train_data['player_name'] == player_name

# Per-target training inputs / targets (the inputs are the same rows each time)
# points
new_X_train_pts = X_train[player_train_rows]
new_y_train_pts = y_train.loc[player_train_rows, 'pts']

# rebounds
new_X_train_reb = X_train[player_train_rows]
new_y_train_reb = y_train.loc[player_train_rows, 'reb']

# assists
new_X_train_ast = X_train[player_train_rows]
new_y_train_ast = y_train.loc[player_train_rows, 'ast']

# Fit the searches in the order points, rebounds, assists
for search, fit_inputs, fit_targets in (
    (model_pts, new_X_train_pts, new_y_train_pts),
    (model_reb, new_X_train_reb, new_y_train_reb),
    (model_ast, new_X_train_ast, new_y_train_ast),
):
    search.fit(fit_inputs, fit_targets)

# Rows of the testing split that belong to the chosen player
player_test_rows = test_data['player_name'] == player_name

# Per-target testing inputs / targets (the inputs are the same rows each time)
# points
new_X_test_pts = X_test[player_test_rows]
new_y_test_pts = y_test.loc[player_test_rows, 'pts']

# rebounds
new_X_test_reb = X_test[player_test_rows]
new_y_test_reb = y_test.loc[player_test_rows, 'reb']

# assists
new_X_test_ast = X_test[player_test_rows]
new_y_test_ast = y_test.loc[player_test_rows, 'ast']

# Predict on the held-out rows first, then on the rows each model was fitted on
# points
test_pred_pts, train_pred_pts = (
    model_pts.predict(new_X_test_pts),
    model_pts.predict(new_X_train_pts),
)

# rebounds
test_pred_reb, train_pred_reb = (
    model_reb.predict(new_X_test_reb),
    model_reb.predict(new_X_train_reb),
)

# assists
test_pred_ast, train_pred_ast = (
    model_ast.predict(new_X_test_ast),
    model_ast.predict(new_X_train_ast),
)

# Error metrics for the three per-target models, on the player's training rows
# and on the held-out test rows.

# Mean absolute error on the test predictions
mae_pts = mean_absolute_error(new_y_test_pts, test_pred_pts)
mae_reb = mean_absolute_error(new_y_test_reb, test_pred_reb)
mae_ast = mean_absolute_error(new_y_test_ast, test_pred_ast)

# Mean absolute error on the training predictions
mae_pts_train = mean_absolute_error(new_y_train_pts, train_pred_pts)
mae_reb_train = mean_absolute_error(new_y_train_reb, train_pred_reb)
mae_ast_train = mean_absolute_error(new_y_train_ast, train_pred_ast)

# Mean absolute percentage error on the training predictions.
# All three use the percentage metric; rebounds and assists previously called
# mean_absolute_error here, which made their "MAPE" lines report a rescaled MAE.
mape_pts_train = mean_absolute_percentage_error(new_y_train_pts, train_pred_pts)
mape_reb_train = mean_absolute_percentage_error(new_y_train_reb, train_pred_reb)
mape_ast_train = mean_absolute_percentage_error(new_y_train_ast, train_pred_ast)

# Mean absolute percentage error on the test predictions
mape_pts_test = mean_absolute_percentage_error(new_y_test_pts, test_pred_pts)
mape_reb_test = mean_absolute_percentage_error(new_y_test_reb, test_pred_reb)
mape_ast_test = mean_absolute_percentage_error(new_y_test_ast, test_pred_ast)

# Report.
# NOTE(review): the lines labelled "MAPE" print 100 - MAPE * 100, i.e. the
# complement of the percentage error rather than the error itself -- confirm
# whether the label or the formula should change.
print("MAE for points (Training):", mae_pts_train)
print("MAE for points (Test):", mae_pts)
print(f"MAPE for points (Training): {100 - mape_pts_train * 100:.2f}%")
print(f"MAPE for points (Test): {100 - mape_pts_test * 100:.2f}%\n")
print("MAE for rebounds (Training):", mae_reb_train)
print("MAE for rebounds (Test):", mae_reb)
print(f"MAPE for rebounds (Training): {100 - mape_reb_train * 100:.2f}%")
print(f"MAPE for rebounds (Test): {100 - mape_reb_test * 100:.2f}%\n")
print("MAE for assists (Training):", mae_ast_train)
print("MAE for assists (Test):", mae_ast)
print(f"MAPE for assists (Training): {100 - mape_ast_train * 100:.2f}%")
print(f"MAPE for assists (Test): {100 - mape_ast_test * 100:.2f}%\n")

# **Training Data Visualization**

In [None]:
# All rows of the player the models were trained on
player_data = df[df['player_name'] == player_name]

# One figure per target: (stat label, training inputs, actual values, fitted values)
training_views = (
    ('Points', new_X_train_pts, new_y_train_pts, train_pred_pts),
    ('Rebounds', new_X_train_reb, new_y_train_reb, train_pred_reb),
    ('Assists', new_X_train_ast, new_y_train_ast, train_pred_ast),
)

for stat_label, train_inputs, actual_values, fitted_values in training_views:
    # Seasons of the training rows, looked up through their row index
    train_seasons = player_data.loc[train_inputs.index, 'season']

    # Actual values vs. the model's predictions on the same training rows
    plt.plot(train_seasons, actual_values, marker='o', label=f'Actual {stat_label} (Training)')
    plt.plot(train_seasons, fitted_values, marker='o', label=f'Predicted {stat_label} (Training)')

    # Axis labels, title and legend
    plt.xlabel('Season')
    plt.ylabel(stat_label)
    plt.title(f'{player_name} NBA Player Stats by Season (Training Data)')
    plt.xticks(rotation=45)  # slant the season labels by 45 degrees
    plt.legend()

    # Render this figure before starting the next one
    plt.tight_layout()
    plt.show()

# **Test Data Visualization**

In [None]:
# One figure per target: (stat label, testing inputs, actual values, predicted values)
testing_views = (
    ('Points', new_X_test_pts, new_y_test_pts, test_pred_pts),
    ('Rebounds', new_X_test_reb, new_y_test_reb, test_pred_reb),
    ('Assists', new_X_test_ast, new_y_test_ast, test_pred_ast),
)

for stat_label, test_inputs, actual_values, predicted_values in testing_views:
    # Seasons of the held-out rows, looked up through their row index
    test_seasons = player_data.loc[test_inputs.index, 'season']

    # Actual values vs. the model's predictions on the held-out rows
    plt.plot(test_seasons, actual_values, marker='o', label=f'Actual {stat_label} (Testing)')
    plt.plot(test_seasons, predicted_values, marker='o', label=f'Predicted {stat_label} (Testing)')

    # Axis labels, title and legend
    plt.xlabel('Season')
    plt.ylabel(stat_label)
    plt.title(f'{player_name} NBA Player Stats by Season (Testing Data)')
    plt.xticks(rotation=45)  # slant the season labels by 45 degrees
    plt.legend()

    # Render this figure before starting the next one
    plt.tight_layout()
    plt.show()