In [1]:
import numpy as np
import pandas as pd
import time

def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', attriToDrop=None):
    """Build the feature frame X and the label frame Y used for model training.

    Parameters
    ----------
    dfFile : str, path-like, file-like or pandas.DataFrame
        Source of the preprocessed games table (e.g. 'nba_preprocessed.csv').
        Anything that is not already a DataFrame is handed to ``pandas.read_csv``.
    dateStart, dateEnd : str
        Date window in 'YYYY-MM-DD' format. Both bounds are EXCLUSIVE: only rows
        with ``dateStart < Date_A < dateEnd`` are kept. The comparison is done on
        the values stored in the 'Date_A' column as they are (no date parsing
        happens here).
    attriToDrop : list[str], optional
        Extra column names to remove from X, on top of the identifier/outcome
        columns that are always removed.

    Returns
    -------
    X : pandas.DataFrame
        Every remaining column after the drops, re-indexed from 0.
    Y : pandas.DataFrame
        Single column 'Label' holding the values of 'W/L_A' for the same rows.
    """
    # Accept an in-memory frame as well as a CSV source. The input frame is never
    # modified: every step below produces a new object.
    df = dfFile if isinstance(dfFile, pd.DataFrame) else pd.read_csv(dfFile)

    # Date selection (strict inequalities on both ends)
    inWindow = (df['Date_A'] > dateStart) & (df['Date_A'] < dateEnd)
    df = df.loc[inWindow, :].reset_index(drop=True)

    # Get label Y
    Y = df[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Get attributes X: identifiers, dates, scores and outcomes are never features
    colToDrop = ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A',
                 'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B']
    if attriToDrop:
        colToDrop.extend(attriToDrop)
    X = df.drop(columns=colToDrop)

    return X, Y

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn.svm import SVC
import numpy as np
import operator
%matplotlib inline

def CrossValidationGridSearchNested(X_data, Y_data, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    """Repeat a grid search ``num_trials`` times and keep the best-scoring estimator.

    Each trial ``i``:
      1. runs ``GridSearchCV`` over ``tuned_param`` on the full data, using a
         shuffled ``StratifiedKFold`` seeded with ``i``;
      2. cross-validates the resulting ``best_estimator_`` on the same data with
         a second shuffled ``StratifiedKFold`` seeded with ``i + 1`` and takes
         the mean score.
    The estimator with the highest mean score over all trials is returned.

    NOTE(review): step 2 scores an estimator whose hyper-parameters were chosen
    on the very same samples, so the reported score is optimistically biased.
    A fully nested estimate would pass ``clf`` itself to ``cross_val_score``
    (at roughly ``fold_num`` times the cost).

    Parameters
    ----------
    X_data, Y_data : array-like
        Features and labels, forwarded unchanged to scikit-learn.
    num_trials : int
        Number of search/score repetitions.
    fold_num : int
        ``n_splits`` for both the search and the scoring splitter.
    est_classifcation : estimator
        Base estimator handed to ``GridSearchCV``.
    tuned_param : dict or list of dict
        Parameter grid for ``GridSearchCV``.
    scoring : str or callable
        Scoring spec used for both the search and the scoring step.

    Returns
    -------
    (max_score, best_estimator) : tuple
        Best mean score and the matching ``best_estimator_``. When no trial
        yields a comparable score (``num_trials == 0`` or only NaN scores) the
        result is ``(-1, est_classifcation)``, i.e. the untouched input estimator.
    """
    # Start below every possible score. A fixed floor of -1 can never be beaten
    # by "neg_*" scorers (e.g. neg_mean_squared_error), whose values are often
    # far below -1, which would silently return the unfitted input estimator.
    best_score = float('-inf')
    best_estimator = est_classifcation

    for i in range(num_trials):
        # Different seeds so the search and scoring splits are not the same folds
        inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)

        # Non-nested parameter search on the full data
        clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)
        clf.fit(X_data, Y_data)

        # Mean CV score of the selected configuration
        param_score = cross_val_score(clf.best_estimator_, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()
        if param_score > best_score:
            best_score = param_score
            best_estimator = clf.best_estimator_

        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')

    # Keep the historical -1 sentinel for the "nothing was scored" case
    max_score = best_score if best_score != float('-inf') else -1
    return (max_score, best_estimator)

In [3]:
# Input CSV and the date window to extract
dfFile = '../NBA_Ino_part/nba_preprocessed.csv'
dateStart, dateEnd = '2017-10-01', '2018-04-30'

# Features/labels for that window, with the PTS_A / PTS_B columns left out of X
X, Y = featureExtraction(
    dfFile,
    dateStart=dateStart,
    dateEnd=dateEnd,
    attriToDrop=['PTS_A', 'PTS_B'],
)

# NumPy arrays for scikit-learn
X_val, Y_val = X.values, Y.values

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from time import time

# Hyper-parameter grid explored by the grid search
tuned_max_features = ['auto', 'sqrt', 'log2']
tuned_parameters = {
                     "loss": ['deviance', 'exponential'],
                     "n_estimators": [600],
                     "learning_rate": [0.25],
                     "max_depth": [3, 9, 11],
                     "subsample": [0.5],
                     "max_features": tuned_max_features
                   }

# Number of random trials
NUM_TRIALS = 1

# Base estimator: a gradient boosting classifier with default settings;
# the grid above supplies the values that are actually tried.
gbm = GradientBoostingClassifier()

# Time the whole search; 10 is the number of CV folds, 'roc_auc' the scoring metric
start = time()
(max_score, gbm_best_estimator) = CrossValidationGridSearchNested(X_val, Y_val.ravel(), NUM_TRIALS, 10, gbm, tuned_parameters, 'roc_auc')
gbm_best_parameter = gbm_best_estimator.get_params()

print("CrossValidationGridSearchNested of GradientBoostingClassifier wih NUM_TRIALS = %2.0d took %.2f seconds."%(NUM_TRIALS, (time() - start)))
print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {gbm_best_estimator}\n')
print(f'\nbest_parameter = {gbm_best_parameter}\n')

> progress = 100.0%
CrossValidationGridSearchNested of GradientBoostingClassifier wih NUM_TRIALS =  1 took 376.25 seconds.

max_score = 0.994460968999934


best_estimator = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.25, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=600,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False)


best_parameter = {'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.25, 'loss': 'deviance', 'max_depth': 3, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 600, 'presort': 'auto', 'rand

In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from time import time

# Candidate max_features values: the three named strategies followed by
# explicit feature counts 2, 5, 8, ... below the number of columns in X
tuned_max_features = ['auto', 'sqrt', 'log2', *range(2, len(X.columns), 3)]

# Grid handed to the search; single-value lists pin that parameter
tuned_parameters = dict(
    loss=['deviance', 'exponential'],
    n_estimators=[600],
    learning_rate=[0.25],
    max_depth=[3, 11, 12, 13],
    subsample=[0.5],
    max_features=tuned_max_features,
)

# Number of random trials
NUM_TRIALS = 1

# Base estimator: gradient boosting classifier with default settings
gbm = GradientBoostingClassifier()

# Timed search: 10 CV folds, scored by ROC AUC
start = time()
max_score, gbm_best_estimator = CrossValidationGridSearchNested(
    X_val, Y_val.ravel(), NUM_TRIALS, 10, gbm, tuned_parameters, 'roc_auc'
)
gbm_best_parameter = gbm_best_estimator.get_params()

print("CrossValidationGridSearchNested of GradientBoostingClassifier wih NUM_TRIALS = %2.0d took %.2f seconds." % (NUM_TRIALS, time() - start))
print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {gbm_best_estimator}\n')
print(f'\nbest_parameter = {gbm_best_parameter}\n')

> progress = 100.0%
CrossValidationGridSearchNested of GradientBoostingClassifier wih NUM_TRIALS =  1 took 3882.37 seconds.

max_score = 0.9946262145548286


best_estimator = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.25, loss='exponential', max_depth=3,
              max_features=14, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=600,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False)


best_parameter = {'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.25, 'loss': 'exponential', 'max_depth': 3, 'max_features': 14, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 600, 'presort': 'auto', 'rand