In [1]:
import numpy as np
import pandas as pd
import time

def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', attriToDrop=None):
    """Build the feature frame X and the label frame Y used for model training.

    @param dfFile: path / file-like object readable by ``pd.read_csv``
        (e.g. 'nba_preprocessed.csv'), or an already loaded pandas.DataFrame.
    @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD'. Both bounds
        are EXCLUSIVE: only rows with dateStart < Date_A < dateEnd are kept.
        The comparison is made against the Date_A column as stored; no date
        parsing is done here.
    @param attriToDrop: list[str] (or a single str) of extra columns to leave
        out of X, on top of the identifier / outcome columns dropped below.
    @return X, Y: pandas.DataFrame. Y has one column named 'Label' (taken from
        'W/L_A'); X holds every remaining column. Both are re-indexed from 0.
    @raise KeyError: raised by pandas if a column to drop is not in the table.
    """
    # Accept an in-memory table as well as anything pd.read_csv understands.
    df = dfFile if isinstance(dfFile, pd.DataFrame) else pd.read_csv(dfFile)

    # Date selection. .loc + reset_index build a new frame, so a DataFrame
    # passed in by the caller is never modified.
    inRange = (df['Date_A'] > dateStart) & (df['Date_A'] < dateEnd)
    df = df.loc[inRange, :].reset_index(drop=True)

    # Get label Y
    Y = df[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Get attributes X: drop identifiers and outcome columns.
    colToDrop = ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A',
                 'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B']
    if attriToDrop:
        # A bare string would otherwise be extended character by character.
        extra = [attriToDrop] if isinstance(attriToDrop, str) else list(attriToDrop)
        colToDrop.extend(extra)
    X = df.drop(columns=colToDrop)

    return X, Y

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.svm import SVC
import numpy as np
import operator
%matplotlib inline

def CrossValidationGridSearchNested(X_data, Y_data, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    """Repeat grid search + cross-validated scoring and keep the best trial.

    For each trial ``i`` in ``range(num_trials)``:
      1. run ``GridSearchCV`` over ``tuned_param`` on the full data, using a
         shuffled ``StratifiedKFold`` seeded with ``i``;
      2. score the winning estimator with ``cross_val_score`` on the same data,
         using a second shuffled ``StratifiedKFold`` seeded with ``i + 1``;
      3. keep the estimator whose mean score is the highest seen so far.

    @param X_data, Y_data: features and labels passed to scikit-learn as-is.
    @param num_trials: number of repetitions (each uses different CV seeds).
    @param fold_num: number of folds for both the inner and the outer splitter.
    @param est_classifcation: estimator to tune.
    @param tuned_param: parameter grid for ``GridSearchCV``.
    @param scoring: scikit-learn scoring name/callable used in both steps.
    @return (max_score, best_estimator): best mean outer-CV score and the
        corresponding fitted ``best_estimator_``. When no trial runs
        (``num_trials == 0``) or no score compares greater than ``-inf`` (e.g.
        every score is NaN), returns ``(-inf, est_classifcation)`` with the
        estimator untouched.

    NOTE(review): the hyper-parameters are selected on ALL of X_data before the
    outer cross-validation, so the outer folds were already seen during tuning
    and the reported score is not a truly nested (unbiased) estimate. Passing
    the GridSearchCV object itself to cross_val_score would fix that, at the
    cost of roughly ``fold_num`` times more fitting -- left unchanged here.
    """
    # Start below every possible score: scorers such as 'neg_log_loss' or
    # 'neg_mean_squared_error' can legitimately return values <= -1.
    max_score = float('-inf')
    best_estimator = est_classifcation

    for i in range(num_trials):
        inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)

        # Parameter search on the full data set (inner splitter).
        clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)
        clf.fit(X_data, Y_data)

        # Mean CV score of the selected configuration (outer splitter).
        param_score = cross_val_score(clf.best_estimator_, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()
        if param_score > max_score:
            max_score = param_score
            best_estimator = clf.best_estimator_

        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')

    return (max_score, best_estimator)

# Execution

## -Feature Extraction

In [3]:
# Input table and date window for this run (featureExtraction treats both
# dates as exclusive bounds on Date_A).
dfFile = '../NBA_Ino_part/nba_preprocessed.csv'
dateStart, dateEnd = '2017-10-01', '2018-04-30'

# PTS_A / PTS_B are removed from X on top of the default dropped columns.
# Other variants tried earlier: no extra drops, or no date window at all.
X, Y = featureExtraction(
    dfFile,
    dateStart,
    dateEnd,
    ['PTS_A', 'PTS_B'],
)

In [4]:
# Show the feature frame returned by featureExtraction.
X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTM_B,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,15,16,36,6,30,28,9,5,15,18
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,9,13,39,12,27,15,6,3,13,21
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,26,28,38,10,28,24,13,3,15,24
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,21,26,49,9,40,18,5,5,14,16
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,12,12,43,11,32,23,7,4,12,21
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,21,31,56,17,39,24,5,4,15,18
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,21,23,49,11,38,27,8,10,22,27
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,16,25,38,9,29,20,7,4,13,25
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,7,11,41,8,33,23,9,6,23,24
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,9,17,41,7,34,38,13,4,13,23


In [5]:
# Plain NumPy view of the feature frame; this array is what gets passed to
# the cross-validation routine further down.
X_val = X.values
X_val

array([[ 0.   ,  0.468, 36.   , ...,  5.   , 15.   , 18.   ],
       [ 1.   ,  0.435, 37.   , ...,  3.   , 13.   , 21.   ],
       [ 1.   ,  0.488, 39.   , ...,  3.   , 15.   , 24.   ],
       ...,
       [ 1.   ,  0.544, 49.   , ...,  6.   ,  8.   , 16.   ],
       [ 0.   ,  0.522, 47.   , ...,  0.   , 13.   , 16.   ],
       [ 0.   ,  0.349, 30.   , ...,  6.   , 15.   , 20.   ]])

In [6]:
# Peek at the first five labels.
Y.head(5)

Unnamed: 0,Label
0,0
1,1
2,0
3,0
4,1


In [7]:
# .values on the single-column label frame gives a 2-D column array
# (one inner list per row, as the output below shows).
Y_val = Y.values
Y_val[0:5]

array([[0],
       [1],
       [0],
       [0],
       [1]])

In [8]:
# Flatten the labels to 1-D; this overwrites the 2-D Y_val from the previous cell.
Y_val = Y.values.ravel()
Y_val[0:5]

array([0, 1, 0, 0, 1])

## -Cross Validation Grid Search

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from time import time

# Hyper-parameter grid explored by the grid search.
# 2 * 10 * 8 * 5 * 3 * 3 = 7200 candidate combinations, each fitted once per
# CV fold -- expect a very long run.
# NOTE(review): 'deviance' (loss) and 'auto' (max_features) are deprecated or
# removed in recent scikit-learn releases -- confirm against the installed version.
tuned_max_features = ['auto', 'sqrt', 'log2']
tuned_parameters = {
                     "loss": ['deviance', 'exponential'],
                     "n_estimators": range(100, 1001, 100),
                     "learning_rate": [1, 0.5, 0.25, 0.1, 0.05, 0.01, 0.005, 0.001],
                     "max_depth": [3, 5, 8, 9, 11],
                     "subsample": [0.1, 0.5, 1],
                     "max_features": tuned_max_features
                   }

# Number of random trials
NUM_TRIALS = 1

# Gradient boosting classifier to be tuned. The seed is fixed because
# subsample < 1 and max_features make each fit stochastic; without it two runs
# of this cell would not give the same scores.
gbm = GradientBoostingClassifier(random_state=0)

# Wall-clock timing around the whole search (10-fold CV, ROC AUC scoring).
start = time()
(max_score, gbm_best_estimator) = CrossValidationGridSearchNested(X_val, Y_val.ravel(), NUM_TRIALS, 10, gbm, tuned_parameters, 'roc_auc')
gbm_best_parameter = gbm_best_estimator.get_params()

print("CrossValidationGridSearchNested of GradientBoostingClassifier wih NUM_TRIALS = %2.0d took %.2f seconds."%(NUM_TRIALS, (time() - start)))
print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {gbm_best_estimator}\n')
print(f'\nbest_parameter = {gbm_best_parameter}\n')