In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn import metrics
from sklearn.decomposition import PCA
import math
from keras.models import Sequential
from tensorflow.keras.layers import LSTM
from keras.layers import Dense
from gurobipy import Model, GRB, quicksum

In [2]:
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas or
# Keras in later cells are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
# Load the dataframes: restore variables that were persisted with IPython's
# `%store` magic (presumably by an earlier notebook -- it must have been run first).
# Only df_player_statistics and df_top_200 are referenced by the cells shown below.
%store -r df_player_statistics
%store -r df_2021_2022
%store -r df_2022_2023
%store -r df_2023_2024
%store -r df_yahoo
%store -r df_top_200

In [4]:
# Helper method 1
def most_recent_played(row, start_idx, end_idx):
    """Return the latest recorded value in a column window for short histories.

    Parameters
    ----------
    row : pandas.Series
        One row of the wide per-player statistics frame.
    start_idx, end_idx : int
        Positional, end-exclusive bounds of the window of columns to inspect.

    Returns
    -------
    scalar or None
        The last non-NaN value inside the window when the window holds
        between one and three non-NaN values. ``None`` when the window holds
        no data at all, or more than three values.
    """
    # The row is indexed by column labels, so slice by position explicitly.
    recorded = row.iloc[start_idx:end_idx].dropna()

    # An empty window has no "most recent" value; indexing it with .iloc[-1]
    # would raise IndexError, so it is treated like the "no fallback" case.
    if 0 < len(recorded) <= 3:
        return recorded.iloc[-1]
    return None

In [5]:
# Helper method 2
# Positional, end-exclusive column windows of df_stats that hold the season
# history of each statistic, paired with the column receiving the forecast.
_FORECAST_WINDOWS = (
    (12, 23, "Forecasted PTS"),
    (23, 34, "Forecasted TRB"),
    (34, 45, "Forecasted AST"),
    (45, 56, "Forecasted 3P"),
    (56, 67, "Forecasted STL"),
)


def _split_sequence(sequence, n_steps):
    """Split a univariate sequence into (n_steps inputs -> next value) samples."""
    X, y = [], []
    # The last usable start is the one whose target index is still in range.
    for start in range(len(sequence) - n_steps):
        end = start + n_steps
        X.append(sequence[start:end])
        y.append(sequence[end])
    return np.array(X), np.array(y)


def _build_lstm_model(name, n_steps, n_features):
    """Build and compile the LSTM regressor called `name`."""
    if name not in ('Vanilla_LSTM', 'Stacked_LSTM'):
        raise ValueError(
            "Unknown model name %r; expected 'Vanilla_LSTM' or 'Stacked_LSTM'" % (name,)
        )

    model = Sequential(name=name)
    if name == 'Vanilla_LSTM':
        model.add(LSTM(50, activation='relu', input_shape=(n_steps, n_features)))
    else:
        model.add(LSTM(50, activation='relu', return_sequences=True,
                       input_shape=(n_steps, n_features)))
        model.add(LSTM(50, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    return model


def forecast(initial, final, df_stats, model_name='Stacked_LSTM'):
    """Fill the "Forecasted ..." columns of `df_stats` with LSTM predictions.

    For every row position in ``[initial, final)`` and every statistic window,
    the non-NaN history is taken, and when it holds more than three values a
    fresh LSTM is trained on sliding windows of three values to predict the
    next one. Histories with three values or fewer are left untouched.

    Parameters
    ----------
    initial, final : int
        Positional row range to process (end-exclusive). `final` is clamped
        to the number of rows so a short frame does not raise IndexError.
    df_stats : pandas.DataFrame
        Wide statistics frame. The statistic histories are read from column
        positions 12-66 and the results are written to the columns
        "Forecasted PTS", "Forecasted TRB", "Forecasted AST", "Forecasted 3P"
        and "Forecasted STL", which must already exist.
    model_name : str, optional
        'Stacked_LSTM' (default) or 'Vanilla_LSTM'.

    Returns
    -------
    pandas.DataFrame
        The same `df_stats` object, modified in place.

    Raises
    ------
    ValueError
        If a model has to be built and `model_name` is not recognised.

    Notes
    -----
    No random seed is set here, so predictions are presumably not
    reproducible from run to run. Model summaries are not printed.
    """
    n_steps = 3
    n_features = 1

    last_row = min(final, len(df_stats))

    for i in range(initial, last_row):
        for start, stop, target in _FORECAST_WINDOWS:
            # Read only the window that is needed instead of converting the
            # whole frame to a list on every iteration.
            window = df_stats.iloc[i, start:stop].tolist()
            history = [float(value) for value in window]
            history = [value for value in history if not math.isnan(value)]

            # At least n_steps + 1 points are needed for one training sample.
            if len(history) <= n_steps:
                continue

            X, y = _split_sequence(history, n_steps)
            # reshape from [samples, timesteps] into [samples, timesteps, features]
            X = X.reshape((X.shape[0], X.shape[1], n_features))

            model = _build_lstm_model(model_name, n_steps, n_features)
            model.fit(X, y, epochs=200, verbose=0)

            # Predict the next value from the most recent n_steps observations.
            x_input = np.array(history[-n_steps:]).reshape((1, n_steps, n_features))
            yhat = model.predict(x_input, verbose=0)

            # Single positional assignment: chained `df[col][i] = ...` is not
            # guaranteed to write through to the frame.
            df_stats.iloc[i, df_stats.columns.get_loc(target)] = yhat.item()

    return df_stats

In [6]:
# Statistics tracked per season, in the order their column groups appear in
# the final wide frame (all seasons of G, then all seasons of PTS, ...).
_STAT_ORDER = ("G", "PTS", "TRB", "AST", "3P", "STL")

# Positional row bounds (end-exclusive, None = to the end) of each season
# block inside the trimmed past-statistics frame.
_PAST_SEASON_ROWS = (
    ("13/14", 0, 602),
    ("14/15", 602, 1244),
    ("15/16", 1244, 1813),
    ("16/17", 1813, 2400),
    ("17/18", 2400, 3055),
    ("18/19", 3055, 3754),
    ("19/20", 3754, 4396),
    ("20/21", 4396, None),
)

# Seasons whose columns come from df_player_statistics.
_RECENT_SEASONS = ("21/22", "22/23", "23/24")

# Spellings with diacritics mapped to the ASCII spellings used for merging.
_PLAYER_NAME_ASCII = {
    "Luka Dončić": "Luka Doncic",
    "Nikola Jokić": "Nikola Jokic",
    "Alperen Şengün": "Alperen Sengun",
    "Nikola Vučević": "Nikola Vucevic",
    "Jusuf Nurkić": "Jusuf Nurkic",
    "Bogdan Bogdanović": "Bogdan Bogdanovic",
    "Kristaps Porziņģis": "Kristaps Porzingis",
    "Dennis Schröder": "Dennis Schroder",
    "Jonas Valančiūnas": "Jonas Valanciunas",
}


def _season_frame(past_stats, label, start, stop):
    """Return one season block, one row per player, columns suffixed with `label`."""
    # keep='first': presumably the first row is the one wanted for players
    # listed several times in a season -- verify against the CSV.
    season = past_stats.iloc[start:stop].drop_duplicates(subset=['Player'], keep='first')
    season = season[["Player"] + list(_STAT_ORDER)]
    return season.rename(columns={stat: "%s %s" % (stat, label) for stat in _STAT_ORDER})


def forecasting(df_top_200, df_player_statistics):
    """Build the per-player forecast table and write it to CSV.

    Steps:
      1. Read 'NBA_Player_Stats.csv', keep rows 8679-13769, normalise player
         names to ASCII and split the rows into the 13/14-20/21 seasons.
      2. Left-join the recent seasons (from `df_player_statistics`) and the
         past seasons onto the players listed in `df_top_200`.
      3. Seed each "Forecasted <stat>" column with the latest recorded value
         for players with at most three recorded seasons, then let
         `forecast` overwrite it for players with longer histories.
      4. Add "Forecasted G" from 'Yahoo.csv' ("Average Games Played" * 82,
         rounded; presumably a fraction of an 82-game season).
      5. Write the result to 'Forecasted_Player_Stats.csv'.

    Parameters
    ----------
    df_top_200 : pandas.DataFrame
        Must contain a "Player" column; defines the rows of the output.
    df_player_statistics : pandas.DataFrame
        Must contain "Player" and the "<stat> 21/22|22/23|23/24" columns for
        G, 3P, PTS, TRB, AST and STL.

    Returns
    -------
    None
        The result is only written to disk.
    """
    # Past seasons
    df_past_stats = pd.read_csv('NBA_Player_Stats.csv').iloc[8679:13770]
    df_past_stats = df_past_stats.replace(_PLAYER_NAME_ASCII)
    df_past_stats = df_past_stats[["Player", "G", "3P", "PTS", "TRB", "AST", "STL", "Year"]]

    season_frames = [
        _season_frame(df_past_stats, label, start, stop)
        for label, start, stop in _PAST_SEASON_ROWS
    ]

    # Combining dataframes
    recent_columns = ["Player"] + [
        "%s %s" % (stat, season) for stat in _STAT_ORDER for season in _RECENT_SEASONS
    ]
    df_stats = pd.merge(df_top_200["Player"], df_player_statistics[recent_columns],
                        on=['Player'], how="left")
    for season_frame in season_frames:
        df_stats = pd.merge(df_stats, season_frame, on=['Player'], how="left")

    # Reorder columns: 'Player', then for each statistic its seasons in
    # chronological order. The positional windows used below and inside
    # `forecast` rely on exactly this layout.
    all_seasons = [label for label, _, _ in _PAST_SEASON_ROWS] + list(_RECENT_SEASONS)
    df_stats = df_stats.reindex(
        columns=['Player'] + ["%s %s" % (stat, season)
                              for stat in _STAT_ORDER for season in all_seasons]
    )

    # Adding the new columns. Column 0 is 'Player' and each statistic spans
    # len(all_seasons) columns; the games block ('G') is skipped here.
    width = len(all_seasons)
    for position, stat in enumerate(_STAT_ORDER):
        if stat == "G":
            continue
        start = 1 + position * width
        df_stats['Forecasted ' + stat] = df_stats.apply(
            most_recent_played, axis=1, args=(start, start + width)
        )

    # Forecast every row that is actually present rather than assuming the
    # frame has exactly 200 rows.
    df_stats = forecast(0, len(df_stats), df_stats)

    # Forecasted games
    df_forecasted_g = pd.read_csv('Yahoo.csv')[["Player", "Average Games Played"]]
    df_stats = pd.merge(df_stats, df_forecasted_g, on=['Player'], how="left")
    df_stats = df_stats.rename(columns={'Average Games Played': 'Forecasted G'})
    df_stats['Forecasted G'] = (df_stats['Forecasted G'] * 82).round()

    # Write to csv
    df_stats.to_csv('Forecasted_Player_Stats.csv', index=False)

    return

In [7]:
# Run the full pipeline; the result is written to 'Forecasted_Player_Stats.csv'
# and the call returns None, so this cell displays no output.
forecasting(df_top_200, df_player_statistics)



