In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

# Load the feature table and drop the leftover CSV index column.
stats = pd.read_csv("features.csv").drop(columns="Unnamed: 0")

# Keep rows whose TOTAL_seasons is at least 3. The explicit copy makes `data`
# an independent frame, so adding columns to it later does not write into a
# slice of `stats` (which raises SettingWithCopyWarning).
data = stats[stats['TOTAL_seasons'] >= 3].copy()
player_list = data['Player'].unique()

# Split players (not rows) into training and test sets, so that all rows of a
# given player end up on the same side of the split.
train_players_list, test_players_list = train_test_split(player_list, random_state = 42)


In [2]:
# Set of training players: membership checks are constant-time instead of a
# linear scan over the NumPy array for every row.
_train_player_set = set(train_players_list)

def train_or_test(player):
    '''Return 'train' if `player` belongs to the training split, else 'test'.'''
    return 'train' if player in _train_player_set else 'test'

# assign() builds a new frame rather than writing a column into what may be a
# filtered slice of another frame (avoids SettingWithCopyWarning).
data = data.assign(train_test = data['Player'].apply(train_or_test))

In [3]:
# Materialise the two partitions from the per-row 'train_test' label.
is_train_row = data['train_test'] == 'train'
is_test_row = data['train_test'] == 'test'
train = data[is_train_row]
test = data[is_test_row]

In [4]:
def logit(p, lower = 0.05, upper = 0.95):
    '''
    Log-odds of a probability, clipped so the result stays finite.

    p: a scalar, NumPy array or pandas Series of probabilities.
    lower, upper: bounds `p` is clipped to before the transform
        (defaults 0.05 and 0.95).

    Returns log(p / (1 - p)) of the clipped values, with the same shape as `p`.
    '''
    # np.clip works element-wise, so array and Series inputs are supported;
    # plain `if p > ...` comparisons only work for scalars.
    clipped = np.clip(p, lower, upper)
    return np.log(clipped / (1 - clipped))

def sigmoid(x):
    '''Logistic function 1 / (1 + e^(-x)); the inverse of the log-odds transform.'''
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [5]:
# Statistic columns whose correlations are examined below.
features = [
    'minutes_proportion',
    'two_point_percentage', 'two_point_attempts',
    'three_point_percentage', 'three_point_attempts',
    'free_throw_percentage', 'free_throw_attempts',
    'defensive_rebounds', 'offensive_rebounds',
    'assists', 'steals', 'blocks',
    'turnovers', 'personal_fouls',
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns shown in the heatmap: Age plus the shooting, rebounding and other
# box-score statistics.
cfeatures = [
    'Age',
    'two_point_percentage', 'two_point_attempts',
    'three_point_percentage', 'three_point_attempts',
    'free_throw_percentage', 'free_throw_attempts',
    'defensive_rebounds', 'offensive_rebounds',
    'assists', 'steals', 'blocks',
    'turnovers', 'personal_fouls',
]

# Draw on an explicit Axes instead of relying on pyplot's current-figure state.
fig, ax = plt.subplots(figsize=(11,9))
ax.set_title('Feature Correlations')
correlation_matrix = data[cfeatures].corr()
sns.heatmap(correlation_matrix, annot = True, cmap = 'RdYlBu', ax = ax);

In [None]:
from collections import defaultdict

# For each feature identify strongly correlated features

# Restrict the correlation to numeric/bool columns: `data` also holds string
# columns (e.g. 'Player', 'train_test'), and recent pandas versions raise on
# them instead of silently dropping them.
corrs = data.select_dtypes(include=['number', 'bool']).corr()
corr_dict = defaultdict(list)

for feature in features:
    feat_corrs = corrs[feature]
    for feat, corr in feat_corrs.items():
        # Keep only other columns whose second-to-last character is 'p'
        # (e.g. the '*_p1' columns). The slice, unlike feat[-2], does not
        # raise on a one-character column name.
        if feat != feature and feat[-2:-1] == 'p' and np.abs(corr) > 0.3:
            corr_dict[feature].append((feat, corr))

# Order each feature's correlates from most positive to most negative.
for key in corr_dict:
    corr_dict[key] = sorted(corr_dict[key], key=lambda pair: pair[1], reverse=True)

In [None]:
# Every target is modelled from the same '_p1' columns plus three age terms.
lagged_inputs = [
    'two_point_percentage_p1', 'two_point_attempts_p1',
    'three_point_percentage_p1', 'three_point_attempts_p1',
    'free_throw_percentage_p1', 'free_throw_attempts_p1',
    'defensive_rebounds_p1', 'offensive_rebounds_p1',
    'assists_p1', 'steals_p1', 'blocks_p1',
    'turnovers_p1', 'personal_fouls_p1',
]
age_terms = ['Age', 'Age2', 'Age3']

# The first key of corr_dict is skipped.
# NOTE(review): presumably this drops 'minutes_proportion' -- confirm; it
# relies on dict insertion order.
feat_dict = {
    target: lagged_inputs + age_terms
    for target in list(corr_dict.keys())[1:]
}
targets = list(feat_dict.keys())

In [None]:
# Single-target experiment: predict 'two_point_attempts'.
# The inputs must be the predictor columns registered in feat_dict, NOT
# `targets`: `targets` are the statistics being predicted (and include
# 'two_point_attempts' itself), so using them as X leaks the label.
single_target = 'two_point_attempts'
single_target_inputs = feat_dict[single_target]

# Only rows with season_index >= 1 are used.
# NOTE(review): presumably because the '_p1' inputs are undefined for a
# player's first season -- confirm.
train_rows = train[train['season_index'] >= 1]
test_rows = test[test['season_index'] >= 1]

X_train = train_rows[single_target_inputs]
y_train = train_rows[single_target]
X_test = test_rows[single_target_inputs]
y_test = test_rows[single_target]

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
def evaluate_general_model(model, X_data, y_data, sig = False, mae = True, r2 = False):
    '''
    Score a fitted model on (X_data, y_data) with a single metric.

    model: object exposing predict(X).
    sig: when True, both predictions and y_data go through sigmoid() first.
    mae: when truthy, return the mean absolute error (takes precedence).
    r2: consulted only when `mae` is falsy -- False returns the mean squared
        error, anything else returns the R^2 score.
    '''
    predicted = model.predict(X_data)
    actual = y_data

    if sig == True:
        predicted, actual = sigmoid(predicted), sigmoid(actual)

    # Choose the metric first, then apply it once.
    if mae:
        metric = mean_absolute_error
    elif r2 == False:
        metric = mean_squared_error
    else:
        metric = r2_score

    return metric(actual, predicted)

In [None]:
from sklearn.neural_network import MLPRegressor
# Fit a seeded multi-layer perceptron and report its MAE on the training data.
MLP_model = MLPRegressor(max_iter=500, random_state=4).fit(X_train, y_train)
evaluate_general_model(MLP_model, X_data=X_train, y_data=y_train)

In [None]:
# MAE of the fitted MLP on the held-out test data.
evaluate_general_model(MLP_model, X_data=X_test, y_data=y_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Seed the booster so the reported score is reproducible across runs, as is
# already done for the player split and the MLP.
grad_model = GradientBoostingRegressor(n_estimators=75, random_state=42)
grad_model.fit(X_train, y_train)

# MAE on the training data.
evaluate_general_model(grad_model, X_train, y_train)

In [None]:
class MultiModelContainer:

    '''
    Container that fits and stores one gradient boosting regressor per target.
    Initialize with a dictionary where keys are targets and values are lists of features,
    plus the DataFrame holding every target and feature column.
    '''

    def __init__(self, feature_dict, data, n_estimators = 75, random_state = None):
        '''
        Fit one GradientBoostingRegressor per key of `feature_dict`.

        feature_dict: dict mapping target column -> list of input columns.
        data: DataFrame containing all target and input columns.
        n_estimators: boosting stages per model (default 75).
        random_state: optional seed passed to every model; None leaves the
            models unseeded.
        '''
        self.model_dict = {}
        self.feature_dict = feature_dict

        # initialize and fit models
        for target, features in feature_dict.items():
            model = GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state)
            self.model_dict[target] = model.fit(data[features], data[target])

    def predict(self, targets, data, output_df = False, input_df = True, feature_map = None):
        '''
        Predict every target in `targets` and return the predictions column-wise.

        targets: iterable of target names (keys of the feature dictionary).
        data: DataFrame when `input_df` is True, otherwise a 2-D array.
        output_df: return a DataFrame (columns = targets) instead of an array.
        input_df: whether `data` is a DataFrame.
        feature_map: required when `input_df` is False; maps each feature name
            to its column position in `data`.

        Returns an (n_rows, n_targets) array, or a DataFrame when `output_df`
        is set (indexed like `data` if it is a DataFrame).
        '''
        per_target = []

        for target in targets:
            # select features for target
            features = self.feature_dict[target]

            if input_df:
                X = data[features]
            else:
                # column_stack places each selected column side by side.
                # hstack on 1-D columns followed by reshape mixed values from
                # different rows whenever `data` had more than one row.
                X = np.column_stack([data[:, feature_map[feature]] for feature in features])

            per_target.append(self.model_dict[target].predict(X))

        if per_target:
            predictions = np.column_stack(per_target)
        else:
            # no targets requested: empty result instead of an unbound local
            predictions = np.empty((len(data), 0))

        if not output_df:
            return predictions

        # Arrays carry no index; fall back to the default RangeIndex.
        index = data.index if input_df else None
        return pd.DataFrame(predictions, columns = targets, index = index)

In [None]:
# Keep whole rows (targets and inputs together) with season_index >= 1.
# Note that this rebinds X_train / X_test to full frames.
X_train = train.loc[train['season_index'] >= 1]
X_test = test.loc[test['season_index'] >= 1]

In [None]:
# Fit one model per target on the training rows.
MultiModel = MultiModelContainer(feature_dict=feat_dict, data=X_train)

In [None]:
def _split_metrics(model, X, y, sig):
    '''Return (MAE, RMSE, R^2) of `model` on one split via evaluate_general_model.'''
    mae = evaluate_general_model(model, X, y, sig = sig)
    rmse = np.sqrt(evaluate_general_model(model, X, y, sig = sig, mae = False))
    r2 = evaluate_general_model(model, X, y, sig = sig, mae = False, r2 = True)
    return mae, rmse, r2

# One dict of rounded metrics per target; turned into a DataFrame at the end.
metric_rows = []

for key in targets:

    feats = feat_dict[key]
    model = MultiModel.model_dict[key]

    # Targets whose name ends in 'percentage' are scored after a sigmoid
    # transform of both predictions and true values.
    # NOTE(review): presumably these columns are stored in logit space -- confirm.
    is_percentage = key[-10:] == 'percentage'

    train_mae, train_rmse, train_r2 = _split_metrics(model, X_train[feats], X_train[key], is_percentage)
    test_mae, test_rmse, test_r2 = _split_metrics(model, X_test[feats], X_test[key], is_percentage)

    # Spread of the target over the whole of `data`, on the same scale as the
    # errors; the '*_zscore' columns are MAE expressed in these units.
    target_values = sigmoid(data[key]) if is_percentage else data[key]
    stddev = target_values.std()

    metric_rows.append({
        'train_mae': np.round(train_mae, 3),
        'train_zscore': np.round(train_mae/stddev, 3),
        'train_RMSE': np.round(train_rmse, 3),
        'train_R^2': np.round(train_r2, 3),
        'test_mae': np.round(test_mae, 3),
        'test_zscore': np.round(test_mae/stddev, 3),
        'test_RMSE': np.round(test_rmse, 3),
        'test_R^2': np.round(test_r2, 3),
    })

results = pd.DataFrame(metric_rows, index = list(feat_dict.keys()))

In [None]:
# Display the per-target train/test metrics table.
results

In [None]:
# Total of the standard-deviation-scaled MAE across all targets, per split.
results[['train_zscore', 'test_zscore']].sum()