In [1]:
import pandas as pd
import numpy as np
# Show up to 25 columns when a DataFrame is rendered in cell output.
pd.options.display.max_columns = 25

In [2]:
# Load the per-season stats table; the cells below read columns such as
# 'Player', 'Age', 'Year', 'G' and 'TOTAL_MP' from it.
stats = pd.read_csv("stats.csv")

def get_min_age(player):
    """Return the smallest 'Age' recorded for `player` in the global `stats` table.

    Gives NaN when `stats` has no rows for `player`.
    """
    is_player = stats['Player'] == player
    return stats.loc[is_player, 'Age'].min()

# Youngest recorded age per player, broadcast onto every row of that player.
# One grouped pass replaces re-filtering the whole table once per row.
stats['min_age'] = stats.groupby('Player')['Age'].transform('min')

# Drop seasons with no recorded minutes (removes 609 seasons where the player
# didn't play at all), then keep only seasons with at least 100 total minutes.
# The explicit copy makes `stats` an independent frame, so the column
# assignments in later cells do not write into a slice of the original table.
stats = stats.dropna(subset=['TOTAL_MP'])
stats = stats[stats['TOTAL_MP'] >= 100].copy()

def get_total_seasons(player):
    """Count the rows of the global `stats` table that belong to `player`."""
    is_player = stats['Player'] == player
    return int(is_player.sum())

# Number of remaining (post-filter) rows per player, broadcast onto each of
# that player's rows. One grouped pass replaces a full-table scan per row.
stats['TOTAL_seasons'] = stats.groupby('Player')['Player'].transform('size')

def index_seasons(player, year):
    """Return the 0-based position of `year` among `player`'s sorted 'Year' values.

    Looks the player up in the global `stats` table. Raises ValueError when
    `year` is not one of that player's years.
    """
    player_years = sorted(stats.loc[stats['Player'] == player, 'Year'])
    return player_years.index(year)

# 0-based position of each row's year within its player's sorted years.
# The 'min' rank minus one equals the first index of the year in the sorted
# list (ties share the lowest position), computed in one grouped pass instead
# of filtering and sorting the whole table for every row.
stats['season_index'] = (stats.groupby('Player')['Year'].rank(method='min') - 1).astype('int64')

In [3]:
# Express minutes played as a share of the minutes available in the games the
# player appeared in (games * 40). Working with a share rather than a raw total
# helps to account for seasons shortened by injury or lockout.

stats['min_proportion'] = stats['TOTAL_MP'].div(stats['G'].mul(40))

In [4]:
# Seasons with a missing 3P% get a 0.15 placeholder.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which
# does not update the parent frame when Copy-on-Write is enabled.
stats['3P%'] = stats['3P%'].fillna(0.15)

def fix_low_3pa(attempts, rate):
    """Temper a three-point percentage that rests on very few attempts.

    - attempts <= 5:  always 0.15, the observed rate is ignored.
    - attempts <= 15: rate limited to the range [0.12, 0.25].
    - attempts <= 25: rate limited to the range [0.10, 0.35].
    - otherwise:      rate returned unchanged.
    """
    if attempts <= 5:
        return 0.15

    # Pick the allowed range for the sample-size bucket.
    if attempts <= 15:
        floor, ceiling = 0.12, 0.25
    elif attempts <= 25:
        floor, ceiling = 0.1, 0.35
    else:
        return rate

    return ceiling if rate >= ceiling else np.max([floor, rate])

# Replace each season's 3P% with its small-sample-adjusted value.
stats['3P%'] = list(map(fix_low_3pa, stats['TOTAL_3PA'], stats['3P%']))

def fix_low_fta(attempts, rate):
    """Temper a free-throw percentage that rests on very few attempts.

    - attempts <= 10: always 0.75, the observed rate is ignored.
    - attempts <= 25: rate limited to the range [0.70, 0.80].
    - attempts <= 50: rate limited to the range [0.60, 0.85].
    - otherwise:      rate returned unchanged.
    """
    if attempts <= 10:
        return 0.75

    # Pick the allowed range for the sample-size bucket.
    if attempts <= 25:
        floor, ceiling = 0.7, 0.8
    elif attempts <= 50:
        floor, ceiling = 0.60, 0.85
    else:
        return rate

    return ceiling if rate >= ceiling else np.max([floor, rate])

# Replace each season's FT% with its small-sample-adjusted value.
stats['FT%'] = list(map(fix_low_fta, stats['TOTAL_FTA'], stats['FT%']))
def fix_low_2pa(attempts, rate):
    """Temper a two-point percentage that rests on very few attempts.

    With 25 attempts or fewer the rate is limited to the range [0.3, 0.6];
    with more attempts it is returned unchanged.
    """
    small_sample = attempts <= 25
    if not small_sample:
        return rate
    if rate >= 0.6:
        return 0.6
    return np.max([0.3, rate])

# Adjust small-sample 2P% with the two-point rule. The free-throw rule
# (fix_low_fta) has different attempt cut-offs and bounds and must not be used
# on two-point data.
stats['2P%'] = [fix_low_2pa(attempts, rate) for attempts,
                                  rate in zip(stats['TOTAL_2PA'], stats['2P%'])]


In [5]:
# Convert counting stats to a per-36-minute rate: season total / minutes played * 36.
# Keys are the new rate columns, values the season-total columns they come from.
PER_36_SOURCES = {
    'two_point_attempts': 'TOTAL_2PA',
    'three_point_attempts': 'TOTAL_3PA',
    'free_throw_attempts': 'TOTAL_FTA',
    'defensive_rebounds': 'TOTAL_DRB',
    'offensive_rebounds': 'TOTAL_ORB',
    'assists': 'TOTAL_AST',
    'steals': 'TOTAL_STL',
    'blocks': 'TOTAL_BLK',
    'turnovers': 'TOTAL_TOV',
    'personal_fouls': 'TOTAL_PF',
}

for rate_col, total_col in PER_36_SOURCES.items():
    stats[rate_col] = stats[total_col] / stats['TOTAL_MP'] * 36

In [6]:
# Rescale 3P% by a constant factor of 1.8.
# NOTE(review): the notebook does not say where 1.8 comes from. Because the
# column is overwritten in place, re-running this cell applies the factor again.
stats['3P%'] = stats['3P%'].mul(1.8)

In [7]:

def logit(p):
    """Return the log-odds of `p`, with `p` first limited to [0.05, 0.95].

    Limiting the input keeps the result finite for proportions at or near 0
    and 1. Accepts a scalar, a NumPy array or a pandas Series; array-like
    inputs are transformed element-wise. NaN inputs give NaN.
    """
    clipped = np.clip(p, 0.05, 0.95)
    return np.log(clipped / (1 - clipped))

def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)); works element-wise on arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

In [8]:
# Move the bounded rate columns onto the log-odds scale.
# Each pair is (new logit column, source proportion column).
for logit_col, source_col in (
    ('minutes_proportion', 'min_proportion'),
    ('two_point_percentage', '2P%'),
    ('three_point_percentage', '3P%'),
    ('free_throw_percentage', 'FT%'),
):
    stats[logit_col] = stats[source_col].apply(logit)

In [9]:
# Squared and cubed age columns.
stats['Age2'] = stats['Age'].pow(2)
stats['Age3'] = stats['Age'].pow(3)

In [10]:
def predict_career(player_id, train_test, model, df = False):
    """Roll `model` forward over one player's seasons and compare with the actual values.

    The player's first season is taken as given. Every later season is predicted
    from the previous *predicted* row combined with the player's actual age for
    that season, so errors can compound along the career.

    Reads these names from the notebook's global scope: `train`, `test`,
    `features`, `targets`, `feature_map` and `sigmoid`.

    NOTE(review): the slicing below assumes `features` is ordered as
    [age, <model outputs...>, age**2, age**3] and that `model.predict` returns
    one row holding exactly the middle columns -- confirm against the cell
    where `features` and `targets` are defined.

    Parameters
    ----------
    player_id
        Value matched against the 'Player' column.
    train_test : str
        'train' reads the player's rows from `train`; any other value reads
        them from `test`.
    model
        Object whose `predict(targets, inputs, input_df=False, feature_map=...)`
        returns the predicted columns for one input row.
    df : bool, default False
        If True, both results are returned as DataFrames rounded to 3 decimals,
        labelled with `features` and the player's original row index.

    Returns
    -------
    seed, errors
        `seed` holds the first actual season followed by the predicted seasons.
        `errors` is `seed` minus the actual seasons. Columns whose name ends in
        'percentage' are converted from logits back to proportions in both.

    Raises
    ------
    IndexError
        If the selected table has no rows for `player_id`.
    """
    # Actual seasons for this player, in career order.
    source = train if train_test == 'train' else test
    player_rows = source[source['Player'] == player_id].sort_values(by = 'season_index')

    ix = player_rows.index
    actual = np.array(player_rows[features])
    total_seasons = actual.shape[0]
    if total_seasons == 0:
        raise IndexError("no seasons found for player {!r}".format(player_id))

    # Copy the first season so `seed` never shares memory with `actual`.
    # Without the copy, a single-season player (loop below never runs) would
    # have the sigmoid conversion applied twice to the same buffer.
    seed = actual[0, :].reshape(1, -1).copy()

    # Predict each later season from the previous prediction and the CURRENT age.
    for season in range(1, total_seasons):
        current_age = np.array(actual[season, 0]).reshape(1, -1)
        # Previous row without its age column (first) and age polynomials (last two).
        prev_stats = seed[-1, 1:-2].reshape(1, -1)
        pred_inputs = np.hstack((current_age, prev_stats, current_age**2, current_age**3))
        preds = model.predict(targets, pred_inputs, input_df = False, feature_map = feature_map)

        # Re-attach the age features so the row lines up with `features`.
        preds = np.hstack((current_age, preds, current_age**2, current_age**3))
        seed = np.vstack((seed, preds))

    # Convert logits back to percentages.
    # NOTE(review): only names ending in 'percentage' are converted; a
    # logit-scaled column such as 'minutes_proportion' would stay in logit
    # space here -- confirm that is intended.
    for i, feat in enumerate(features):
        if feat.endswith('percentage'):
            seed[:, i] = sigmoid(seed[:, i])
            actual[:, i] = sigmoid(actual[:, i])

    # Take the difference before any rounding so the df=True errors are the
    # rounded true errors, not errors of already-rounded predictions.
    errors = seed - actual

    if df:
        seed = pd.DataFrame(np.round(seed, 3), columns = features, index = ix)
        errors = pd.DataFrame(np.round(errors, 3), columns = features, index = ix)

    return seed, errors

In [11]:
# Gather absolute errors and predictions for every training player. Row 0 of
# each player's result is the seed season (actual values, not a prediction),
# so it is dropped before aggregating.
error_blocks = []
pred_blocks = []

for player in train['Player'].unique():
    pred, error = predict_career(player, 'train', MultiModel)
    error = np.abs(error)[1:,:]

    error_blocks.append(error)
    pred_blocks.append(pred[1:,:])

# Stack once at the end instead of re-copying the growing arrays on every iteration.
train_error = np.vstack(error_blocks)
train_pred = np.vstack(pred_blocks)

total_seasons = train_error.shape[0]

# Mean absolute error per feature across all predicted seasons.
train_maes = np.sum(train_error, axis = 0) / total_seasons

train_maes = pd.DataFrame(train_maes.reshape(1,-1), columns = features)