## MODELLING

 Predicting All-NBA Teams for the 2015–16 and 2016–17 seasons using logistic regression, support vector classifier, and random forest methods.


In [14]:
# Imports and global configuration for the modelling section
# (logistic regression first; later cells reuse these names).
import time
import warnings

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG; the cells below pass no random_state of their own.
np.random.seed(100)
# Suppress every warning raised from here on (this also hides pandas and
# scikit-learn diagnostics, so remove this line when debugging).
warnings.filterwarnings("ignore")

In [15]:
# Read the training data; missing values are replaced by 0.
train_allNBA = pd.read_csv('train_allNBA.csv').fillna(0)
# Every column whose name does not mention the target is treated as a predictor.
predictors = [col for col in train_allNBA.columns if "allNBA" not in col]


def _read_season(csv_path):
    """Load one season's CSV and insert position dummy columns.

    The result keeps 'Player', 'playerID' and 'pos', followed by the
    one-hot encoded position (first level dropped), followed by every
    column of the file from the fourth one onwards.
    """
    season = pd.read_csv(csv_path).fillna(0)
    identifiers = season[['Player','playerID','pos']]
    position_dummies = pd.get_dummies(season['pos'],prefix='pos',drop_first=True)
    remaining = season.iloc[:,3:]
    return pd.concat((identifiers, position_dummies, remaining), axis=1)


# test data from 2016-17 NBA season
test2016 = _read_season('nba2016_17.csv')
# test data from 2015-16 NBA season
test2015 = _read_season('nba2015_16.csv')

In [16]:
# Standardise predictors.
# The first two predictors are passed through unscaled (the preview of
# X_train shows them to be the pos_F / pos_G dummies); the rest are scaled
# with statistics learned from the training data only.
passthrough_cols = predictors[:2]
scaled_cols = predictors[2:]

scaler = StandardScaler()
scaler.fit(train_allNBA[scaled_cols])


def _standardise(frame):
    """Return `frame`'s predictors with the non-dummy columns standardised.

    The scaled block gets a fresh 0..n-1 index, so the column-wise concat
    lines up only if `frame` carries the same default index.
    """
    scaled = pd.DataFrame(scaler.transform(frame[scaled_cols]), columns = scaled_cols)
    return pd.concat((frame[passthrough_cols], scaled), axis = 1)


X_train = _standardise(train_allNBA)
X_test2016 = _standardise(test2016)
X_test2015 = _standardise(test2015)

In [17]:
# Split the standardised training rows 50/50 into a training set and a
# calibration set.
# No random_state is passed, so the shuffle draws from NumPy's global RNG
# (seeded in the imports cell). This cell also rebinds X_train: re-running
# it without first re-running the standardisation cell would pass a
# half-sized X_train alongside the full-length target column.
X_train, X_cal, y_train, y_cal = train_test_split(
    X_train,
    train_allNBA['allNBA'],
    test_size = 0.5,
)

In [18]:
# Preview the first five rows of the training split (index values are the
# shuffled row labels kept from the original training frame).
X_train.head(n = 5)

Unnamed: 0,pos_F,pos_G,GP,oRebounds,dRebounds,assists,steals,blocks,turnovers,PF,fgAttempted,fgMade,ftAttempted,ftMade,threeAttempted,threeMade,ftPct,efgPct
3910,1,0,1.002096,1.204517,1.114875,0.812644,0.370645,-0.043359,2.327722,1.97778,1.716819,2.469952,1.684566,1.425118,-0.654973,-0.60351,-0.141536,1.545525
5897,0,0,1.002096,0.030198,0.612669,-0.347481,-0.5678,0.271857,-0.869547,0.955305,0.364968,0.41732,-0.989057,-0.928499,-0.507513,-0.52884,0.197519,0.157722
3665,1,0,-0.530166,-0.715043,-0.98225,-0.849156,-0.935018,-0.785042,-1.135986,-0.854594,-1.081935,-1.011728,-0.749466,-0.82527,-0.563688,-0.60351,-1.165912,-0.01089
9151,0,0,1.043509,3.44024,1.241807,-0.666254,-0.506598,3.220049,-0.469888,1.507676,-0.310957,-0.431448,-0.117815,-0.508702,-0.704127,-0.640845,-1.836807,-0.853948
11060,0,1,0.587971,-0.692459,-0.165475,1.789866,1.10508,-0.562537,1.068192,-0.008408,0.498041,0.508259,-0.204939,-0.013203,1.114549,1.487238,1.15697,0.767318


In [19]:
# Use GridSearchCV to determine best parameters for logitReg
# (3-fold stratified cross-validation, candidates ranked by F1 score).
t0 = time.time()
# The grid below searches both 'l1' and 'l2' penalties. Pin the liblinear
# solver, which supports both: scikit-learn releases whose default solver is
# lbfgs reject 'l1', and with warnings silenced those failed fits would go
# unnoticed, so half of the grid would never really be evaluated.
logit = LogisticRegression(solver = 'liblinear')
parameters = {'C': [0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced']}
stratKFolds = StratifiedKFold(n_splits = 3)
logitGrid = GridSearchCV(logit, parameters, scoring='f1', cv=stratKFolds)
logitGrid.fit(X_train, y_train)
t1 = time.time()
print(logitGrid.best_params_)
print("Time taken:", str(t1-t0), "seconds")

{'C': 1, 'class_weight': None, 'penalty': 'l2'}
Time taken: 4.187865972518921 seconds


In [20]:
# Calibrate the tuned logistic regression on the held-out calibration set.
t0 = time.time()
logit = logitGrid.best_estimator_
# NOTE(review): because a CV splitter is passed as `cv`, the calibrator
# refits copies of `logit` on folds of X_cal, so only the hyper-parameters
# chosen by the grid search carry over, not the coefficients fitted on
# X_train. Confirm this is intended (as opposed to calibrating the
# already-fitted estimator).
logitCV = CalibratedClassifierCV(logit, cv = stratKFolds).fit(X_cal, y_cal)
t1 = time.time()
elapsed = t1 - t0
print("Time taken:", str(elapsed), "seconds")

Time taken: 0.28489089012145996 seconds


In [21]:
# NBA 2016-17 All-NBA Team Predictions
# After the concat the columns are 'Player', 'pos', 0 and 1, where column 1
# holds the calibrated probability of the positive (All-NBA) class.
# The concat aligns on the index, so test2016 must keep its default 0..n-1 index.
probs2016 = pd.DataFrame(logitCV.predict_proba(X_test2016))
probs2016 = pd.concat((test2016[['Player','pos']], probs2016), axis = 1)
pred2016 = probs2016.sort_values(1, ascending = False).iloc[:15,:] # select only the first 15 observations with the highest probability of being classified positive
actual2016 = test2016.loc[test2016['allNBA'] == 1, ['Player','pos']]

# Decision threshold = probability of the lowest-ranked selected player.
threshold2016 = pred2016[1].iloc[-1]
# Label every player at or above the threshold in one vectorised step.
# The previous per-row `probs2016['Label'].iloc[i] = 1` was chained
# assignment, which leaves the frame untouched under pandas Copy-on-Write
# and would silently report an F1 score of 0.
probs2016['Label'] = (probs2016[1] >= threshold2016).astype(int)

print("\nPredicted 2016-17 All-NBA Team winners:\n")
print(pred2016[['Player','pos']].sort_index())
print("\nActual 2016-17 All-NBA Team winners:\n")
print(actual2016)
print("\nNumber of correct All-NBA predictions:", len(set(pred2016['Player']) & set(actual2016['Player'])))
# Built-in round() works for both NumPy scalars and plain Python floats.
print("Decision threshold:", str(round(threshold2016, 3)))
print("Classifier F1 score:", round(f1_score(test2016['allNBA'], probs2016['Label']), 3))



Predicted 2016-17 All-NBA Team winners:

                    Player pos
15   Giannis Antetokounmpo   F
65            Jimmy Butler   F
89        DeMarcus Cousins   C
97           Stephen Curry   G
99           Anthony Davis   C
107          DeMar DeRozan   G
118           Kevin Durant   F
171           James Harden   G
219           LeBron James   F
260          Kawhi Leonard   F
265         Damian Lillard   G
423          Isaiah Thomas   G
432     Karl-Anthony Towns   C
451              John Wall   G
457      Russell Westbrook   G

Actual 2016-17 All-NBA Team winners:

                    Player pos
15   Giannis Antetokounmpo   F
65            Jimmy Butler   F
97           Stephen Curry   G
99           Anthony Davis   C
107          DeMar DeRozan   G
118           Kevin Durant   F
154            Rudy Gobert   C
163         Draymond Green   F
171           James Harden   G
219           LeBron James   F
239         DeAndre Jordan   C
260          Kawhi Leonard   F
423          Isaiah 

In [22]:
# NBA 2015-16 All-NBA Team Predictions
# After the concat the columns are 'Player', 'pos', 0 and 1, where column 1
# holds the calibrated probability of the positive (All-NBA) class.
# The concat aligns on the index, so test2015 must keep its default 0..n-1 index.
probs2015 = pd.DataFrame(logitCV.predict_proba(X_test2015))
probs2015 = pd.concat((test2015[['Player','pos']], probs2015), axis = 1)
pred2015 = probs2015.sort_values(1, ascending = False).iloc[:15,:] # select only the first 15 observations with the highest probability of being classified positive
actual2015 = test2015.loc[test2015['allNBA'] == 1, ['Player','pos']]

# Decision threshold = probability of the lowest-ranked selected player.
threshold2015 = pred2015[1].iloc[-1]
# Label every player at or above the threshold in one vectorised step.
# The previous per-row `probs2015['Label'].iloc[i] = 1` was chained
# assignment, which leaves the frame untouched under pandas Copy-on-Write
# and would silently report an F1 score of 0.
probs2015['Label'] = (probs2015[1] >= threshold2015).astype(int)

print("\nPredicted 2015-16 All-NBA Team winners:\n")
print(pred2015[['Player','pos']].sort_index())
print("\nActual 2015-16 All-NBA Team winners:\n")
print(actual2015)
print("\nNumber of correct All-NBA predictions:", len(set(pred2015['Player']) & set(actual2015['Player'])))
# Built-in round() works for both NumPy scalars and plain Python floats.
print("Decision threshold:", str(round(threshold2015, 3)))
print("Classifier F1 score:", round(f1_score(test2015['allNBA'], probs2015['Label']), 3))


Predicted 2015-16 All-NBA Team winners:

                Player pos
71        Jimmy Butler   G
96    DeMarcus Cousins   C
104      Stephen Curry   G
106      Anthony Davis   C
125       Kevin Durant   F
176       James Harden   G
220       LeBron James   F
240     DeAndre Jordan   C
262      Kawhi Leonard   F
265     Damian Lillard   G
272         Kyle Lowry   G
347         Chris Paul   G
444       Kemba Walker   G
445          John Wall   G
451  Russell Westbrook   G

Actual 2015-16 All-NBA Team winners:

                Player pos
6    LaMarcus Aldridge   F
96    DeMarcus Cousins   C
104      Stephen Curry   G
120     Andre Drummond   C
125       Kevin Durant   F
152        Paul George   F
165     Draymond Green   F
220       LeBron James   F
240     DeAndre Jordan   C
262      Kawhi Leonard   F
265     Damian Lillard   G
272         Kyle Lowry   G
347         Chris Paul   G
424      Klay Thompson   G
451  Russell Westbrook   G

Number of correct All-NBA predictions: 10
Decision thr

In [24]:
# Determine the optimal penalty parameter C (and class weighting) for a
# support vector classifier with a linear kernel. This is svm.SVC with
# kernel='linear', not svm.LinearSVC; the recorded run of this search took
# about six minutes.
from sklearn import svm
t0 = time.time()
svClassifier = svm.SVC(kernel = 'linear')
parameters = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'class_weight': [None, 'balanced'],
}
# 3-fold stratified CV, candidates ranked by F1 score.
stratKFolds = StratifiedKFold(n_splits = 3)
svcGrid = GridSearchCV(svClassifier, parameters, scoring = 'f1', cv = stratKFolds)
svcGrid.fit(X_train, y_train)
t1 = time.time()
print(svcGrid.best_params_)
elapsed = t1 - t0
print("Time taken:", str(elapsed), "seconds")

{'C': 1, 'class_weight': None}
Time taken: 364.43068623542786 seconds


In [26]:
# Calibrate the tuned SVC on the calibration set using CalibratedClassifierCV
# with the stratified 3-fold splitter.
t0 = time.time()
svClassifier = svcGrid.best_estimator_
# set_params modifies the estimator in place, so svcGrid.best_estimator_
# also ends up with probability=True.
# NOTE(review): CalibratedClassifierCV presumably calibrates on
# decision_function when the estimator provides one, in which case
# probability=True only adds fitting cost. Confirm before removing it.
svClassifier.set_params(probability = True)
svClassifierCV = CalibratedClassifierCV(svClassifier, cv = stratKFolds).fit(X_cal, y_cal)
t1 = time.time()
elapsed = t1 - t0
print("Time taken:", str(elapsed), "seconds")


Time taken: 2.6397151947021484 seconds


In [27]:
def report_allNBA_predictions(classifier, X_test, test_data, season, team_size = 15):
    """Print predicted vs. actual All-NBA winners for one season and score them.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``; its second
        output column is used as the probability of the positive class.
    X_test : predictors for the season, rows in the same order as ``test_data``.
    test_data : DataFrame with 'Player', 'pos' and 'allNBA' columns. It is
        concatenated column-wise with the probabilities, so it is assumed to
        carry a default 0..n-1 index.
    season : text used in the printed headings, e.g. '2016-17'.
    team_size : number of highest-probability players treated as predicted
        winners (15 by default).

    Returns
    -------
    (probs, pred) : ``probs`` is the per-player table ('Player', 'pos', 0, 1,
        'Label'); ``pred`` holds its ``team_size`` highest-probability rows,
        taken before the 'Label' column is added.
    """
    probs = pd.DataFrame(classifier.predict_proba(X_test))
    probs = pd.concat((test_data[['Player','pos']], probs), axis = 1)
    # select only the first `team_size` observations with the highest
    # probability of being classified positive
    pred = probs.sort_values(1, ascending = False).iloc[:team_size,:]
    actual = test_data.loc[test_data['allNBA'] == 1, ['Player','pos']]

    # Decision threshold = probability of the lowest-ranked selected player.
    threshold = pred[1].iloc[-1]
    # Vectorised labelling. The previous per-row `probs['Label'].iloc[i] = 1`
    # was chained assignment, which leaves the frame untouched under pandas
    # Copy-on-Write and would silently report an F1 score of 0.
    probs['Label'] = (probs[1] >= threshold).astype(int)

    print("\nPredicted " + season + " All-NBA Team winners:\n")
    print(pred[['Player','pos']].sort_index())
    print("\nActual " + season + " All-NBA Team winners:\n")
    print(actual)
    print("\nNumber of correct All-NBA predictions:", len(set(pred['Player']) & set(actual['Player'])))
    # Built-in round() works for both NumPy scalars and plain Python floats.
    print("Decision threshold:", str(round(threshold, 3)))
    # The F1 score is rounded for both seasons (the 2016-17 one used to be
    # printed unrounded, unlike every other report).
    print("Classifier F1 score:", round(f1_score(test_data['allNBA'], probs['Label']), 3))
    return probs, pred


# NBA 2016-17 All-NBA Team Predictions
probs2016, pred2016 = report_allNBA_predictions(svClassifierCV, X_test2016, test2016, '2016-17')

# NBA 2015-16 All-NBA Team Predictions
probs2015, pred2015 = report_allNBA_predictions(svClassifierCV, X_test2015, test2015, '2015-16')


Predicted 2016-17 All-NBA Team winners:

                    Player pos
15   Giannis Antetokounmpo   F
65            Jimmy Butler   F
89        DeMarcus Cousins   C
97           Stephen Curry   G
99           Anthony Davis   C
107          DeMar DeRozan   G
118           Kevin Durant   F
171           James Harden   G
219           LeBron James   F
260          Kawhi Leonard   F
265         Damian Lillard   G
423          Isaiah Thomas   G
432     Karl-Anthony Towns   C
451              John Wall   G
457      Russell Westbrook   G

Actual 2016-17 All-NBA Team winners:

                    Player pos
15   Giannis Antetokounmpo   F
65            Jimmy Butler   F
97           Stephen Curry   G
99           Anthony Davis   C
107          DeMar DeRozan   G
118           Kevin Durant   F
154            Rudy Gobert   C
163         Draymond Green   F
171           James Harden   G
219           LeBron James   F
239         DeAndre Jordan   C
260          Kawhi Leonard   F
423          Isaiah 

In [28]:
#train with random forest