In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 25)

In [2]:
stats = pd.read_csv("stats.csv")

def get_min_age(player):
    return stats[stats['Player'] == player]['Age'].min()

stats['min_age'] = stats['Player'].apply(get_min_age)

stats.dropna(subset=['TOTAL_MP'], inplace = True) # removes 609 seasons where the player didn't play at all
stats = stats[stats['TOTAL_MP'] >= 100]

def get_total_seasons(player):
    return stats[stats['Player'] == player].shape[0]

stats['TOTAL_seasons'] = stats['Player'].apply(get_total_seasons)

def index_seasons(player, year):
    season_list = list(stats[stats['Player'] == player]['Year'])
    season_list.sort()
    return season_list.index(year)

stats['season_index'] = [index_seasons(player, year) for player, year in zip(stats['Player'], stats['Year'])]

In [3]:
# convert minutes played to % of available minutes played
# helps to account for seasons shortened by injury or lockout

stats['min_proportion'] = stats['TOTAL_MP'] / (stats['G'] * 40)

In [4]:
stats['3P%'].fillna(value = 0.12, inplace = True)
def fix_low_3pa(attempts, rate):
    if attempts <= 5:
        return 0.15
    
    elif attempts <= 15:
        if rate >= 0.25:
            return 0.25
        else:
            return np.max([0.12, rate])
        
    elif attempts <= 25:
        if rate >= 0.35:
            return 0.35
        else:
            return np.max([0.1, rate])
        
    else:
        return rate

stats['3P%'] = [fix_low_3pa(attempts, rate) for attempts,
                                  rate in zip(stats['TOTAL_3PA'], stats['3P%'])]
def fix_low_fta(attempts, rate):
    if attempts <= 10:
        return 0.75
    
    elif attempts <= 25:
        if rate >= 0.8:
            return 0.8
        else:
            return np.max([0.7, rate])
        
    elif attempts <= 50:
        if rate >= 0.85:
            return 0.85
        else:
            return np.max([0.60, rate])
        
    else:
        return rate

stats['FT%'] = [fix_low_fta(attempts, rate) for attempts,
                                  rate in zip(stats['TOTAL_FTA'], stats['FT%'])]

def fix_low_2pa(attempts, rate):
    if attempts <= 25:
        if rate >= 0.6:
            return 0.6
        else:
            return np.max([0.3, rate])
    else:
        return rate

stats['2P%'] = [fix_low_fta(attempts, rate) for attempts,
                                  rate in zip(stats['TOTAL_2PA'], stats['2P%'])]

In [5]:
stats['two_point_attempts'] = stats['2PA_per_100']/75
stats['three_point_attempts'] = stats['3PA_per_100']/75
stats['free_throw_attempts'] = stats['FTA_per_100']/75
stats['defensive_rebounds'] = stats['DRB_per_100']/75
stats['offensive_rebounds'] = stats['ORB_per_100']/75
stats['assists'] = stats['AST_per_100']/75
stats['steals'] = stats['STL_per_100']/75
stats['blocks'] = stats['BLK_per_100']/75
stats['turnovers'] = stats['TOV_per_100']/75
stats['personal_fouls'] = stats['PF_per_100']/75

In [6]:
stats['3P%'] = stats['3P%'] * 1.8

In [7]:
def logit(p):
    if p > 0.95:
        p = 0.95
    if p < 0.05:
        p = 0.05
    return np.log(p/(1-p))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [8]:
stats['minutes_proportion'] = stats['min_proportion'].apply(logit)
stats['two_point_percentage'] = stats['2P%'].apply(logit)
stats['three_point_percentage'] = stats['3P%'].apply(logit)
stats['free_throw_percentage'] = stats['FT%'].apply(logit)

In [9]:
stats['Age2'] = stats['Age'] ** 2
stats['Age3'] = stats['Age'] ** 3

In [10]:
class MultiModelContainer:
    
    '''
    Object to function as a container for multiple linear models.
    Initialize with a dictionary where keys are targets and values are lists of features.
    '''
    
    def __init__(self, feature_dict, data):
        
        self.model_dict = {}
        self.feature_dict = feature_dict
        
        # initialize and fit models
        for target in feature_dict.keys():
            features = feature_dict[target]
            self.model_dict[target] = RandomForestRegressor(n_estimators = 75, max_depth = 6).fit(data[features].values, data[target].values)     

    def predict(self, targets, data, output_df = False, input_df = True, feature_map = None):
        
        switch = False
        
        for target in targets:
            # select features for target
            features = self.feature_dict[target]
            
            if input_df:
                # select model and input data for prediction
                preds = self.model_dict[target].predict(data[features])
            else:
                # if not using a df use feature map to select data from an array
                X = np.hstack(tuple([data[:,feature_map[feature]] for feature in features]))
                preds = self.model_dict[target].predict(X.reshape(-1,len(features)))
            
            if not switch:
                predictions = preds.reshape(-1,1)
                switch = True
                
            else:
                predictions = np.hstack((predictions, preds.reshape(-1,1)))
        
        if not output_df:
            return predictions
        
        else:
            return pd.DataFrame(predictions, columns = targets, index = data.index)

In [12]:
import pickle

with open('Models/FinalModelRFper75adv.pkl', 'rb') as file:
    FinalModel = pickle.load(file)
    
with open('Models/MinutesModelRFper75adv.pkl', 'rb') as file:
    MinutesModel = pickle.load(file)

In [15]:
features = ['Age', 'two_point_percentage', 'two_point_attempts', 'three_point_percentage',
       'three_point_attempts', 'free_throw_percentage', 'free_throw_attempts',
       'defensive_rebounds', 'offensive_rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'personal_fouls' ,'OBPM', 'DBPM', 'BPM', 
            'VORP', 'OWS', 'DWS','PER','Age2', 'Age3']

# map p1 features to column index of current version
p1_features = ['Age', 'two_point_percentage_p1', 'two_point_attempts_p1',
       'three_point_percentage_p1', 'three_point_attempts_p1',
       'free_throw_percentage_p1', 'free_throw_attempts_p1',
       'defensive_rebounds_p1', 'offensive_rebounds_p1', 'assists_p1',
       'steals_p1', 'blocks_p1', 'turnovers_p1', 'personal_fouls_p1','OBPM_p1', 'DBPM_p1', 'BPM_p1', 
            'VORP_p1', 'OWS_p1', 'DWS_p1','PER_p1','Age2', 'Age3']

feature_map = {}

for feat in p1_features:
    feature_map[feat] = p1_features.index(feat)
    
targets = ['two_point_percentage', 'two_point_attempts', 'three_point_percentage',
       'three_point_attempts', 'free_throw_percentage', 'free_throw_attempts',
       'defensive_rebounds', 'offensive_rebounds', 'assists', 'steals',
       'blocks', 'turnovers', 'personal_fouls' ,'OBPM', 'DBPM', 'BPM', 
            'VORP', 'OWS', 'DWS','PER']

In [16]:
mask = np.array(stats['Year'] == 2021)
new_players = stats[mask]
new_players

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,offensive_rebounds,assists,steals,blocks,turnovers,personal_fouls,minutes_proportion,two_point_percentage,three_point_percentage,free_throw_percentage,Age2,Age3
630,630,Aaron Gordon,PF,25,DEN,50,50,27.7,4.6,10.0,0.463,1.2,...,0.036000,0.076000,0.016000,0.016000,0.045333,0.042667,0.809486,0.132192,0.417981,0.623438,625,15625
631,631,Austin Rivers,SG,28,DEN,36,7,23.5,2.9,6.8,0.424,1.5,...,0.006667,0.064000,0.024000,0.001333,0.028000,0.050667,0.350775,0.040005,0.682068,0.895384,784,21952
632,632,Bol Bol,PF,21,DEN,32,2,5.0,0.8,1.8,0.431,0.3,...,0.000000,0.029333,0.012000,0.041333,0.053333,0.057333,-1.945910,0.405465,0.532217,0.847298,441,9261
633,633,Facundo Campazzo,PG,29,DEN,65,19,21.9,1.8,4.8,0.381,1.2,...,0.010667,0.106667,0.036000,0.006667,0.033333,0.061333,0.192904,-0.224944,0.547692,1.982994,841,24389
635,635,JaMychal Green,PF,30,DEN,58,5,19.3,3.0,6.4,0.463,1.4,...,0.050667,0.030667,0.014667,0.013333,0.030667,0.085333,-0.068993,0.136210,0.935550,1.430633,900,27000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13897,13897,Patty Mills,PG,32,SAS,68,1,24.8,3.7,9.0,0.412,2.4,...,0.006667,0.062667,0.016000,0.001333,0.025333,0.030667,0.487364,0.000000,0.730888,2.313635,1024,32768
13898,13898,Quinndary Weatherspoon,SG,24,SAS,20,0,6.1,0.8,1.8,0.457,0.1,...,0.016000,0.042667,0.042667,0.010667,0.053333,0.096000,-1.724831,0.405465,-0.844442,1.386294,576,13824
13899,13899,Rudy Gay,PF,34,SAS,63,1,21.6,4.2,10.0,0.420,1.7,...,0.020000,0.041333,0.021333,0.018667,0.030667,0.054667,0.155870,-0.196631,0.780556,1.411485,1156,39304
13900,13900,Tre Jones,PG,21,SAS,37,1,7.3,1.0,2.1,0.474,0.1,...,0.018667,0.093333,0.018667,0.000000,0.030667,0.030667,-1.504490,-0.136210,-0.994623,1.386294,441,9261


In [17]:
print(list(new_players['Player']))

['Aaron Gordon', 'Austin Rivers', 'Bol Bol', 'Facundo Campazzo', 'JaMychal Green', 'JaVale McGee', 'Jamal Murray', 'Markus Howard', 'Michael Porter Jr.', 'Monte Morris', 'Nikola Jokić', 'PJ Dozier', 'Paul Millsap', 'Shaquille Harrison', 'Vlatko Čančar', 'Will Barton', 'Zeke Nnaji', 'Aaron Holiday', 'Caris LeVert', 'Domantas Sabonis', 'Doug McDermott', 'Edmond Sumner', 'Goga Bitadze', 'JaKarr Sampson', 'Jeremy Lamb', 'Justin Holiday', 'Kelan Martin', 'Malcolm Brogdon', 'Myles Turner', 'Oshae Brissett', 'T.J. McConnell', 'T.J. Warren', 'Aaron Nesmith', 'Carsen Edwards', 'Evan Fournier', 'Grant Williams', 'Jabari Parker', 'Jaylen Brown', 'Jayson Tatum', 'Kemba Walker', 'Luke Kornet', 'Marcus Smart', 'Payton Pritchard', 'Robert Williams', 'Romeo Langford', 'Semi Ojeleye', 'Tacko Fall', 'Tremont Waters', 'Tristan Thompson', 'Abdel Nader', 'Cameron Johnson', 'Cameron Payne', 'Chris Paul', 'Dario Šarić', 'Deandre Ayton', 'Devin Booker', "E'Twaun Moore", 'Frank Kaminsky', 'Jae Crowder', 'Jalen

In [18]:
new_players.shape

(468, 119)

In [19]:
def predict_career(player_id, n_years, start_year, stats_model, minutes_model):
    
    # get the players actual data and create seed for predictions
    mask = np.array(stats['Player'] == player_id) & np.array(stats['Year'] == start_year)
    seed = np.array(stats[mask][features]).reshape(1,-1)
    mseed = np.array(stats[mask]['minutes_proportion']).reshape(-1,1)
    
    # iterate through total seasons and generate prediction based on each past prediction
    for i in range(0, n_years):
        
        # get inputs for prediction, previous season stats and CURRENT age
        pred_inputs = seed[-1,1:-2].reshape(1,-1)
        current_age = np.array(seed[i,0] + 1).reshape(1,-1)
        pred_inputs = np.hstack((current_age, pred_inputs, current_age**2, current_age**3))
        preds = stats_model.predict(targets, pred_inputs, input_df = False, feature_map = feature_map)
        
        # append age features to prediction
        preds = np.hstack((current_age, preds, current_age**2, current_age**3))
        
        # append predictions to seed
        seed = np.vstack((seed, preds))
    
    # predict minutes in each season
    for i in range(0, n_years):
        pred_inputs = np.hstack((seed[i+1,:], mseed[-1,:])).reshape(1,-1)
        minutes = minutes_model.predict(pred_inputs)
        mseed = np.vstack((mseed, minutes.reshape(-1,1)))
        
    seed = np.hstack((mseed, seed))
    
    # Convert logits back to percentages
    seed[:,0] = sigmoid(seed[:,0])
    
    for i, feat in enumerate(features):
        if feat[-10:] == 'percentage':
            seed[:,i + 1] = sigmoid(seed[:,i + 1])
    
    # Remove age2 and age3 features
    seed = seed[:,:-2]
    
    # Output Formatting
    cols = ['minutes_proportion'] + features[:-2]
    seed = pd.DataFrame(np.round(seed, 3), columns = cols)
    seed['Player'] = player_id
    #seed['name'] = seasons[mask]['name'][0]
    seed['Year'] = [start_year + i for i in range(n_years + 1)]
    seed['index_col'] = [str(player_id) + str(year) for player_id, year in zip(seed['Player'], seed['Year'])]
    seed.set_index('index_col', inplace = True)
    
    # Adjust 3pt percentage by dividing by 1.8
    seed['three_point_percentage'] = seed['three_point_percentage'] / 1.8
    
    #order columns for output
    final_cols = ['Player', 'Year'] + cols
    
    return seed[final_cols]

In [20]:
career_preds = {}

for player_id in list(new_players['Player']):
    career_preds[player_id] = predict_career(player_id, 5, 2021, FinalModel, MinutesModel)









































































































In [21]:
from copy import deepcopy

In [22]:
def per_game_averages(player_df):
    pg = deepcopy(player_df)
    
    pg['minutes_per_game'] = pg['minutes_proportion'] * 40
    pg.drop(columns = 'minutes_proportion', inplace = True)
    
    pg['two_pointers'] = pg['two_point_attempts'] * pg['two_point_percentage'] * 75 * pg['minutes_per_game'] / 36
    
    pg['three_pointers'] = pg['three_point_attempts'] * pg['three_point_percentage'] * 75 * pg['minutes_per_game'] / 36
    
    pg['free_throws'] = pg['free_throw_attempts'] * pg['free_throw_percentage'] * 75 * pg['minutes_per_game'] / 36

    pg['defensive_rebounds'] = pg['defensive_rebounds'] * 75 * pg['minutes_per_game'] / 36
    
    pg['offensive_rebounds'] = pg['offensive_rebounds'] * 75 * pg['minutes_per_game'] / 36
    
    pg['total_rebounds'] = pg['defensive_rebounds'] + pg['offensive_rebounds'] 
    
    pg['assists'] = pg['assists'] * 75 * pg['minutes_per_game'] / 36
    
    pg['steals'] = pg['steals'] * 75 * pg['minutes_per_game'] / 36
    
    pg['blocks'] = pg['blocks'] * 75 * pg['minutes_per_game'] / 36
    
    pg['turnovers'] = pg['turnovers'] * 75 * pg['minutes_per_game'] / 36
    
    pg['personal_fouls'] = pg['personal_fouls'] * 75 * pg['minutes_per_game'] / 36
    
    pg['points'] = 2 * pg['two_pointers'] + 3 * pg['three_pointers'] + 1 * pg['free_throws']
    
    pg['field_goals'] = pg['two_pointers'] + pg['three_pointers']
    
    pg['field_goal_percentage'] = pg['field_goals'] / ((pg['two_point_attempts'] + pg['three_point_attempts']) * 75 * pg['minutes_per_game'] / 36)
    
    final_columns = ['Player', 'Year', 'Age', 'minutes_per_game', 'field_goals', 'field_goal_percentage',
                     'three_pointers', 'three_point_percentage', 'two_pointers', 'two_point_percentage',  'free_throws',
                     'free_throw_percentage', 'total_rebounds', 'offensive_rebounds', 'defensive_rebounds',
                     'assists', 'steals', 'blocks', 'turnovers', 'personal_fouls', 'points']
    
    pg = pg.round(2)
    
    return pg[final_columns]

In [23]:
per_game_preds = {}

for player_id in career_preds.keys():

    per_game_preds[player_id] = per_game_averages(career_preds[player_id])

In [24]:
# Combine all the dataframes

switch = 0

for player_id in per_game_preds.keys():
    
    if not switch:
        per_game_df = per_game_preds[player_id]
        switch = 1
    
    else:
        per_game_df = pd.concat([per_game_df, per_game_preds[player_id]], axis = 0)

In [25]:
def stat_leaders(year, stat, N):
    
    stat_df = per_game_df[per_game_df['Year'] == year][['Player', stat]]
    return stat_df.sort_values(stat, ascending = False).head(N)

In [26]:
per_game_df[per_game_df['Year'] == 2022][['Player','Age','points']]

Unnamed: 0_level_0,Player,Age,points
index_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aaron Gordon2022,Aaron Gordon,26.0,15.93
Austin Rivers2022,Austin Rivers,29.0,8.57
Bol Bol2022,Bol Bol,22.0,8.19
Facundo Campazzo2022,Facundo Campazzo,30.0,7.63
JaMychal Green2022,JaMychal Green,31.0,9.74
...,...,...,...
Patty Mills2022,Patty Mills,33.0,11.81
Quinndary Weatherspoon2022,Quinndary Weatherspoon,25.0,6.64
Rudy Gay2022,Rudy Gay,35.0,14.37
Tre Jones2022,Tre Jones,22.0,6.00


In [27]:
stat_leaders(2022, 'minutes_per_game', 10)

Unnamed: 0_level_0,Player,minutes_per_game
index_col,Unnamed: 1_level_1,Unnamed: 2_level_1
Julius Randle2022,Julius Randle,36.84
James Harden2022,James Harden,36.24
Jayson Tatum2022,Jayson Tatum,36.2
Domantas Sabonis2022,Domantas Sabonis,36.16
Russell Westbrook2022,Russell Westbrook,36.16
Bradley Beal2022,Bradley Beal,36.04
Damian Lillard2022,Damian Lillard,35.84
Pascal Siakam2022,Pascal Siakam,35.4
Nikola Jokić2022,Nikola Jokić,35.0
Trae Young2022,Trae Young,35.0


In [28]:
stat_leaders(2022, 'points', 10)

Unnamed: 0_level_0,Player,points
index_col,Unnamed: 1_level_1,Unnamed: 2_level_1
Bradley Beal2022,Bradley Beal,38.52
Joel Embiid2022,Joel Embiid,38.48
Giannis Antetokounmpo2022,Giannis Antetokounmpo,36.27
Luka Dončić2022,Luka Dončić,35.47
Damian Lillard2022,Damian Lillard,34.98
Stephen Curry2022,Stephen Curry,34.63
Nikola Jokić2022,Nikola Jokić,34.53
Zion Williamson2022,Zion Williamson,33.81
Donovan Mitchell2022,Donovan Mitchell,33.66
Devin Booker2022,Devin Booker,33.58


In [29]:
stat_leaders(2022, 'BPM', 10)

KeyError: "['BPM'] not in index"

In [None]:
stat_leaders(2022, 'VORP', 10)