In [1]:
import numpy as np
import pandas as pd
import time

# @param dfFile: pandas.DataFrame ('nba_preprocessed.csv')
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD'
# @param attriToDrop: list[str]
# @return X, Y: pandas.DataFrame
# featureExtraction() outputs X, Y for model training.
# Game date can be assigned
# Attribute to be dropped can be assigned
def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', attriToDrop=None):
    df = pd.read_csv(dfFile)
    
    # Date selection
    df = df.loc[(df.Date_A > dateStart) & (df.Date_A < dateEnd), :].reset_index(drop=True)
    
    # Get label Y
    Y = df[['W/L_A']]
    Y = Y.rename(columns={'W/L_A': 'Label'})
    
    # Get attributes X
    attriToDrop = [x + '_A' for x in attriToDrop] + [x + '_B' for x in attriToDrop] if attriToDrop else []
    colToDrop = ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A', 'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B']
    colToDrop += attriToDrop if attriToDrop else []
    X = df.drop(columns = colToDrop)
    
    return X, Y

In [2]:
import random
import sys
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from datetime import datetime

warnings.filterwarnings(action='ignore', category=DeprecationWarning)

def StackingMethod(origin_df_X, origin_df_Y, kfold=10, is_debug=0, **all_basic_classifiers):
    X_train = origin_df_X.values
    Y_train = origin_df_Y.values.ravel()
    random.seed(datetime.now())
    skf = StratifiedKFold(n_splits=kfold, random_state=random.randint(0, 2**32-1), shuffle=True)
    iteration = 0
    len_y = 0
    new_feature_columns = ['Label_'+x for x in all_basic_classifiers.keys()]
    new_feature_arr     = np.zeros([len(X_train), len(new_feature_columns)])
    model_dict = {}
    
    #Initialize the model list for every classifier
    for k in all_basic_classifiers.keys():
        model_dict[k] = []
    
    for train_index, test_index in skf.split(X_train, Y_train):
        X_cv_train = X_train[train_index]
        Y_cv_train = Y_train[train_index]
        X_cv_test  = X_train[test_index]
        Y_cv_test  = Y_train[test_index]
        column_label_index = 0
        
        if(is_debug):
            print(f"-----iteration {iteration}-----")
            print(f'test_index = {test_index}')
        for k, v in all_basic_classifiers.items():
            classifier_cv = v
            classifier_cv.fit(X_cv_train, Y_cv_train)
            model_dict[k].append(classifier_cv)
            Y_cv_test_result = classifier_cv.predict(X_cv_test)
            count_result_index = 0
            for index in test_index:
                new_feature_arr[index][column_label_index] = Y_cv_test_result[count_result_index]
                count_result_index += 1
                
            column_label_index += 1
            
            if(is_debug):
                len_y += len(Y_cv_test_result)
                print(f'key = {k}, val = {v}')
                print(f'Y_cv_test_result = {Y_cv_test_result}')
                print(f'len(Y_cv_test_result) = {len(Y_cv_test_result)}')
                print(type(Y_cv_test_result))
                print('-------')
        iteration += 1
        
    new_feature_df = pd.DataFrame(data = new_feature_arr, columns = new_feature_columns)
    output_df_all = pd.concat([origin_df_X, new_feature_df], axis=1, ignore_index=False)
    
    if(is_debug):
        print(f'total len_y = {len_y}')
        print(f'new_feature_columns = {new_feature_columns}')
        count_index_fin = 0
        for x in new_feature_arr:
            print(f'index = {count_index_fin}, label = {x}')
            count_index_fin += 1
        
    return (output_df_all, model_dict)

In [3]:
def CrossValidationGridSearchNested(origin_df_X, origin_df_Y, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    X_data = origin_df_X.values
    Y_data = origin_df_Y.values.ravel()
    max_score = -1
    best_estimator = est_classifcation
    is_tuned_param_empty = (tuned_param == []) | (tuned_param == None)
    
    for i in range(num_trials):
        inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)
        
        if(is_tuned_param_empty):
            param_score = cross_val_score(est_classifcation, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()
        else:
            # Non_nested parameter search and scoring
            clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)
            clf.fit(X_data, Y_data)
        
            # CV with parameter optimization
            param_score = cross_val_score(clf.best_estimator_, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()
            
        if(param_score > max_score):
            max_score = param_score
            if(is_tuned_param_empty):
                best_estimator = est_classifcation
            else:
                best_estimator = clf.best_estimator_
            
        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')
    
    return (max_score, best_estimator)

In [4]:
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

#Only support roc_auc scoring now
def CrossValidationGetModelsScores(origin_df_X, origin_df_Y, fold_num, est_classifier):
    X_data = origin_df_X.values
    Y_data = origin_df_Y.values.ravel()
    random.seed(datetime.now())
    skf = StratifiedKFold(n_splits=fold_num, random_state=random.randint(0, 2**32-1), shuffle=True)
    ret_n_estimator = []
    ret_n_score     = []
    
    for train_index, test_index in skf.split(X_data, Y_data):
        X_cv_train = X_data[train_index]
        Y_cv_train = Y_data[train_index]
        X_cv_test  = X_data[test_index]
        Y_cv_test  = Y_data[test_index]
        
        ret_n_estimator.append(est_classifier)
        ret_n_estimator[-1].fit(X_cv_train, Y_cv_train)
        ret_n_score.append(roc_auc_score(Y_cv_test, ret_n_estimator[-1].predict(X_cv_test)))
    
    return(ret_n_estimator, ret_n_score)

In [5]:
#x_1d_test_array is the 1-dimension test array
def PredictFunctionAggregation(x_1d_test_array, n_estimator, voting=1, is_debug=0):
    ret_val = 0
    if(voting == 1):
        y_predict = [x.predict(x_1d_test_array) for x in n_estimator]
        count_0 = y_predict.count(0)
        count_1 = y_predict.count(1)
        
        if(is_debug):
            print(f'y_predict = {y_predict}')
            print(f'count_0   = {count_0}')
            print(f'count_1   = {count_1}')

        if(count_0 > count_1):
            ret_val = 0
        else:
            ret_val = 1
    else:
        y_predict = [x.predict_proba(x_1d_test_array) for x in n_estimator]
        if(is_debug):
            print(f'y_predict = {y_predict}')
        #return the probability that is 1(left wins)     
        total_1_prob = [x[0][1] for x in y_predict]
        ret_val = np.mean(total_1_prob)
    
    return ret_val

In [6]:
dfFile = '../../python_ex/NBA_Ino_Part/nba_preprocessed.csv'
dateStart = '2017-08-01'
dateEnd = '2018-05-01'
# X, Y = featureExtraction(dfFile, dateStart, dateEnd)
X, Y = featureExtraction(dfFile, attriToDrop=['PTS'], dateStart=dateStart, dateEnd=dateEnd)
# X, Y = featureExtraction(dfFile)

In [7]:
X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTM_B,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,15,16,36,6,30,28,9,5,15,18
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,9,13,39,12,27,15,6,3,13,21
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,26,28,38,10,28,24,13,3,15,24
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,21,26,49,9,40,18,5,5,14,16
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,12,12,43,11,32,23,7,4,12,21
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,21,31,56,17,39,24,5,4,15,18
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,21,23,49,11,38,27,8,10,22,27
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,16,25,38,9,29,20,7,4,13,25
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,7,11,41,8,33,23,9,6,23,24
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,9,17,41,7,34,38,13,4,13,23


In [8]:
Y

Unnamed: 0,Label
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,1
9,0


# Stage1

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

level_1_classifier = {'logistic_classifier'             : LogisticRegression(),
                      'svm_classifier'                  : SVC(),
                      'naive_bayse_gaussian_classifier' : GaussianNB()}

level_1_out_X, model_dict = StackingMethod(X, Y, 10, 0, **level_1_classifier)
#level_1_out_X = StackingMethod(X, Y, 10, 0, **level_1_classifier)

In [10]:
level_1_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,6,30,28,9,5,15,18,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,12,27,15,6,3,13,21,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,10,28,24,13,3,15,24,0.0,1.0,1.0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,9,40,18,5,5,14,16,0.0,0.0,0.0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,11,32,23,7,4,12,21,1.0,1.0,0.0
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,17,39,24,5,4,15,18,1.0,1.0,1.0
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,11,38,27,8,10,22,27,0.0,1.0,0.0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,9,29,20,7,4,13,25,1.0,1.0,1.0
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,8,33,23,9,6,23,24,1.0,0.0,0.0
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,7,34,38,13,4,13,23,0.0,0.0,0.0


In [11]:
level_1_out_X['Label'] = Y['Label']
level_1_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,30,28,9,5,15,18,0.0,0.0,0.0,0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,27,15,6,3,13,21,1.0,1.0,1.0,1
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,28,24,13,3,15,24,0.0,1.0,1.0,0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,40,18,5,5,14,16,0.0,0.0,0.0,0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,32,23,7,4,12,21,1.0,1.0,0.0,1
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,39,24,5,4,15,18,1.0,1.0,1.0,1
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,38,27,8,10,22,27,0.0,1.0,0.0,0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,29,20,7,4,13,25,1.0,1.0,1.0,1
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,33,23,9,6,23,24,1.0,0.0,0.0,1
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,34,38,13,4,13,23,0.0,0.0,0.0,0


In [12]:
level_1_out_X = level_1_out_X.drop(['Label'], axis=1)
level_1_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,6,30,28,9,5,15,18,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,12,27,15,6,3,13,21,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,10,28,24,13,3,15,24,0.0,1.0,1.0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,9,40,18,5,5,14,16,0.0,0.0,0.0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,11,32,23,7,4,12,21,1.0,1.0,0.0
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,17,39,24,5,4,15,18,1.0,1.0,1.0
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,11,38,27,8,10,22,27,0.0,1.0,0.0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,9,29,20,7,4,13,25,1.0,1.0,1.0
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,8,33,23,9,6,23,24,1.0,0.0,0.0
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,7,34,38,13,4,13,23,0.0,0.0,0.0


# Stage2

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

level_2_classifier = {'rf_classifer'    : RandomForestClassifier(),
                      'gbdt_classifier' : GradientBoostingClassifier(),
                      'xgb_classifier'  : XGBClassifier()}

level_2_out_X, model_dict = StackingMethod(level_1_out_X, Y, 10, 0, **level_2_classifier)
level_2_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,9,5,15,18,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,6,3,13,21,1.0,1.0,1.0,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,13,3,15,24,0.0,1.0,1.0,0.0,0.0,0.0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,5,5,14,16,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,7,4,12,21,1.0,1.0,0.0,1.0,1.0,1.0
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,5,4,15,18,1.0,1.0,1.0,1.0,1.0,1.0
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,8,10,22,27,0.0,1.0,0.0,0.0,0.0,0.0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,7,4,13,25,1.0,1.0,1.0,1.0,1.0,1.0
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,9,6,23,24,1.0,0.0,0.0,1.0,1.0,1.0
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,13,4,13,23,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
level_2_out_X['Label'] = Y['Label']
level_2_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier,Label
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,5,15,18,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,3,13,21,1.0,1.0,1.0,1.0,1.0,1.0,1
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,3,15,24,0.0,1.0,1.0,0.0,0.0,0.0,0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,5,14,16,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,4,12,21,1.0,1.0,0.0,1.0,1.0,1.0,1
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,4,15,18,1.0,1.0,1.0,1.0,1.0,1.0,1
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,10,22,27,0.0,1.0,0.0,0.0,0.0,0.0,0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,4,13,25,1.0,1.0,1.0,1.0,1.0,1.0,1
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,6,23,24,1.0,0.0,0.0,1.0,1.0,1.0,1
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,4,13,23,0.0,0.0,0.0,0.0,0.0,0.0,0


In [15]:
level_2_out_X = level_2_out_X.drop(['Label'], axis=1)
level_2_out_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,9,5,15,18,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.600,12,20,...,6,3,13,21,1.0,1.0,1.0,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,13,3,15,24,0.0,1.0,1.0,0.0,0.0,0.0
3,0,0.384,38,99,0.258,8,31,0.600,9,15,...,5,5,14,16,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.462,42,91,0.200,4,20,0.739,17,23,...,7,4,12,21,1.0,1.0,0.0,1.0,1.0,1.0
5,0,0.505,47,93,0.550,11,20,0.864,19,22,...,5,4,15,18,1.0,1.0,1.0,1.0,1.0,1.0
6,0,0.412,35,85,0.439,18,41,0.710,22,31,...,8,10,22,27,0.0,1.0,0.0,0.0,0.0,0.0
7,1,0.534,47,88,0.500,16,32,0.655,19,29,...,7,4,13,25,1.0,1.0,1.0,1.0,1.0,1.0
8,1,0.450,36,80,0.391,9,23,0.560,14,25,...,9,6,23,24,1.0,0.0,0.0,1.0,1.0,1.0
9,1,0.460,40,87,0.250,6,24,0.815,22,27,...,13,4,13,23,0.0,0.0,0.0,0.0,0.0,0.0


# Stage3

In [16]:
from lightgbm import LGBMClassifier 

# Set the parameters by cross-validation
tuned_parameters = []

# Number of random trials
NUM_TRIALS = 1

clf = LGBMClassifier()

(max_score, best_estimator) = CrossValidationGridSearchNested(level_2_out_X, Y, NUM_TRIALS, 10, clf, tuned_parameters, 'roc_auc')
best_parameter = best_estimator.get_params()

print(f'\nmax_score = {max_score}\n')
print(f'\nbest_estimator = {best_estimator}\n')
print(f'\nbest_parameter = {best_parameter}\n')

> progress = 100.0%

max_score = 1.0


best_estimator = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)


best_parameter = {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 1}



In [17]:
level_2_out_X.head()

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,9,5,15,18,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.6,12,20,...,6,3,13,21,1.0,1.0,1.0,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,13,3,15,24,0.0,1.0,1.0,0.0,0.0,0.0
3,0,0.384,38,99,0.258,8,31,0.6,9,15,...,5,5,14,16,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.462,42,91,0.2,4,20,0.739,17,23,...,7,4,12,21,1.0,1.0,0.0,1.0,1.0,1.0


In [18]:
(n_estimator, n_scoring) = CrossValidationGetModelsScores(level_2_out_X, Y, 10, best_estimator)
n_scoring

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [19]:
n_estimator

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         learning_rate=0.1, max_depth=-1, min_child_samples=20,
         min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
         n_jobs=-1, num_leaves=31, objective=None, random_state=None,
         reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
         subsample_for_bin=200000, subsample_freq=1),
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         learning_rate=0.1, max_depth=-1, min_child_samples=20,
         min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
         n_jobs=-1, num_leaves=31, objective=None, random_state=None,
         reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
         subsample_for_bin=200000, subsample_freq=1),
 LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
         learning_rate=0.1, max_depth=-1, min_child_samples=20,
         min_child_weight=0.001, min_split_ga

In [20]:
Y.head(15)

Unnamed: 0,Label
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,1
9,0


In [21]:
level_2_out_X.head(10)

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,0,0.468,36,77,0.481,13,27,0.917,11,12,...,9,5,15,18,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.435,37,85,0.333,10,30,0.6,12,20,...,6,3,13,21,1.0,1.0,1.0,1.0,1.0,1.0
2,1,0.488,39,80,0.348,8,23,0.783,18,23,...,13,3,15,24,0.0,1.0,1.0,0.0,0.0,0.0
3,0,0.384,38,99,0.258,8,31,0.6,9,15,...,5,5,14,16,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.462,42,91,0.2,4,20,0.739,17,23,...,7,4,12,21,1.0,1.0,0.0,1.0,1.0,1.0
5,0,0.505,47,93,0.55,11,20,0.864,19,22,...,5,4,15,18,1.0,1.0,1.0,1.0,1.0,1.0
6,0,0.412,35,85,0.439,18,41,0.71,22,31,...,8,10,22,27,0.0,1.0,0.0,0.0,0.0,0.0
7,1,0.534,47,88,0.5,16,32,0.655,19,29,...,7,4,13,25,1.0,1.0,1.0,1.0,1.0,1.0
8,1,0.45,36,80,0.391,9,23,0.56,14,25,...,9,6,23,24,1.0,0.0,0.0,1.0,1.0,1.0
9,1,0.46,40,87,0.25,6,24,0.815,22,27,...,13,4,13,23,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
x_1d_test_array = np.array([list(level_2_out_X.iloc[4]), list(level_2_out_X.iloc[5]), list(level_2_out_X.iloc[6])])
original_df_X = pd.DataFrame(data = x_1d_test_array, columns=level_2_out_X.columns)
original_df_X

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,0.0,0.462,42.0,91.0,0.2,4.0,20.0,0.739,17.0,23.0,...,7.0,4.0,12.0,21.0,1.0,1.0,0.0,1.0,1.0,1.0
1,0.0,0.505,47.0,93.0,0.55,11.0,20.0,0.864,19.0,22.0,...,5.0,4.0,15.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,0.412,35.0,85.0,0.439,18.0,41.0,0.71,22.0,31.0,...,8.0,10.0,22.0,27.0,0.0,1.0,0.0,0.0,0.0,0.0


In [40]:
x_1d_test_array2 = np.array([list(level_2_out_X.iloc[7]), list(level_2_out_X.iloc[8]), list(level_2_out_X.iloc[9])])
original_df_X2 = pd.DataFrame(data = x_1d_test_array2, columns=level_2_out_X.columns)
original_df_X2

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,STL_B,BLK_B,TOV_B,PF_B,Label_logistic_classifier,Label_svm_classifier,Label_naive_bayse_gaussian_classifier,Label_rf_classifer,Label_gbdt_classifier,Label_xgb_classifier
0,1.0,0.534,47.0,88.0,0.5,16.0,32.0,0.655,19.0,29.0,...,7.0,4.0,13.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,0.45,36.0,80.0,0.391,9.0,23.0,0.56,14.0,25.0,...,9.0,6.0,23.0,24.0,1.0,0.0,0.0,1.0,1.0,1.0
2,1.0,0.46,40.0,87.0,0.25,6.0,24.0,0.815,22.0,27.0,...,13.0,4.0,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
a = {(str(x)+'shit'):2 for x in ['123', 456]}
a

{'123shit': 2, '456shit': 2}

In [99]:
Y.values

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [134]:
aa = [[x] for x in range(1, 4, 1)]
aa = np.array(aa)
aa.ravel()

array([1, 2, 3])

In [130]:
y_pred = np.array([original_df_X2['Label_logistic_classifier']])
y_pred

array([[0., 1., 0.]])

In [138]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

y_pred = np.array(original_df_X2['Label_logistic_classifier'])
y_pred[0] = 0
y_pred
y_true = np.array(Y['Label'][7:10])
#print(f'y_pred = \n{y_pred}')
#print(f'y_true = \n{y_true}')
accuracy_score(y_true, y_pred)

0.6666666666666666

In [79]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

y_pred = original_df_X2['Label_logistic_classifier']
y_pred[0] = 1
y_true = Y['Label'][7:10]
print(f'y_pred = \n{y_pred}')
print(f'y_true = \n{y_true}')
accuracy_score(y_true, y_pred)

y_pred = 
0    1.0
1    1.0
2    0.0
Name: Label_logistic_classifier, dtype: float64
y_true = 
7    1
8    1
9    0
Name: Label, dtype: int64


1.0

In [80]:
roc_auc_score(y_true, y_pred)

1.0

In [24]:
original_df_X.values

array([[ 0.   ,  0.462, 42.   , 91.   ,  0.2  ,  4.   , 20.   ,  0.739,
        17.   , 23.   , 46.   , 11.   , 35.   , 24.   ,  8.   ,  4.   ,
        11.   , 12.   ,  0.416, 37.   , 89.   ,  0.424, 14.   , 33.   ,
         1.   , 12.   , 12.   , 43.   , 11.   , 32.   , 23.   ,  7.   ,
         4.   , 12.   , 21.   ,  1.   ,  1.   ,  0.   ,  1.   ,  1.   ,
         1.   ],
       [ 0.   ,  0.505, 47.   , 93.   ,  0.55 , 11.   , 20.   ,  0.864,
        19.   , 22.   , 46.   ,  4.   , 42.   , 23.   ,  7.   ,  7.   ,
         6.   , 21.   ,  0.406, 39.   , 96.   ,  0.387, 12.   , 31.   ,
         0.677, 21.   , 31.   , 56.   , 17.   , 39.   , 24.   ,  5.   ,
         4.   , 15.   , 18.   ,  1.   ,  1.   ,  1.   ,  1.   ,  1.   ,
         1.   ],
       [ 0.   ,  0.412, 35.   , 85.   ,  0.439, 18.   , 41.   ,  0.71 ,
        22.   , 31.   , 36.   , 13.   , 23.   , 20.   , 13.   ,  5.   ,
        13.   , 22.   ,  0.5  , 41.   , 82.   ,  0.406, 13.   , 32.   ,
         0.913, 21.   , 23.   

In [41]:
original_df_X2.values

array([[ 1.   ,  0.534, 47.   , 88.   ,  0.5  , 16.   , 32.   ,  0.655,
        19.   , 29.   , 43.   ,  9.   , 34.   , 35.   ,  9.   ,  4.   ,
        12.   , 20.   ,  0.494, 42.   , 85.   ,  0.355, 11.   , 31.   ,
         0.64 , 16.   , 25.   , 38.   ,  9.   , 29.   , 20.   ,  7.   ,
         4.   , 13.   , 25.   ,  1.   ,  1.   ,  1.   ,  1.   ,  1.   ,
         1.   ],
       [ 1.   ,  0.45 , 36.   , 80.   ,  0.391,  9.   , 23.   ,  0.56 ,
        14.   , 25.   , 41.   , 13.   , 28.   , 17.   ,  9.   ,  6.   ,
        17.   , 16.   ,  0.468, 36.   , 77.   ,  0.395, 15.   , 38.   ,
         0.636,  7.   , 11.   , 41.   ,  8.   , 33.   , 23.   ,  9.   ,
         6.   , 23.   , 24.   ,  1.   ,  0.   ,  0.   ,  1.   ,  1.   ,
         1.   ],
       [ 1.   ,  0.46 , 40.   , 87.   ,  0.25 ,  6.   , 24.   ,  0.815,
        22.   , 27.   , 46.   ,  9.   , 37.   , 26.   ,  9.   ,  6.   ,
        17.   , 18.   ,  0.549, 50.   , 91.   ,  0.45 , 18.   , 40.   ,
         0.529,  9.   , 17.   

In [140]:
total_list = []
a = [[x] for x in range(10, 31, 10)]
b = [[x] for x in range(40, 61, 10)]

total_list.append(a)
total_list.append(b)
X2_featuer = original_df_X2.values
for list_test in total_list:
    #a_arr = np.array(list_test)
    a_arr = list_test
#    print(a_arr.shape)
    X2_featuer = np.concatenate((X2_featuer, a_arr), axis=1)
X2_featuer

array([[ 1.   ,  0.534, 47.   , 88.   ,  0.5  , 16.   , 32.   ,  0.655,
        19.   , 29.   , 43.   ,  9.   , 34.   , 35.   ,  9.   ,  4.   ,
        12.   , 20.   ,  0.494, 42.   , 85.   ,  0.355, 11.   , 31.   ,
         0.64 , 16.   , 25.   , 38.   ,  9.   , 29.   , 20.   ,  7.   ,
         4.   , 13.   , 25.   ,  0.   ,  1.   ,  1.   ,  1.   ,  1.   ,
         1.   , 10.   , 40.   ],
       [ 1.   ,  0.45 , 36.   , 80.   ,  0.391,  9.   , 23.   ,  0.56 ,
        14.   , 25.   , 41.   , 13.   , 28.   , 17.   ,  9.   ,  6.   ,
        17.   , 16.   ,  0.468, 36.   , 77.   ,  0.395, 15.   , 38.   ,
         0.636,  7.   , 11.   , 41.   ,  8.   , 33.   , 23.   ,  9.   ,
         6.   , 23.   , 24.   ,  1.   ,  0.   ,  0.   ,  1.   ,  1.   ,
         1.   , 20.   , 50.   ],
       [ 1.   ,  0.46 , 40.   , 87.   ,  0.25 ,  6.   , 24.   ,  0.815,
        22.   , 27.   , 46.   ,  9.   , 37.   , 26.   ,  9.   ,  6.   ,
        17.   , 18.   ,  0.549, 50.   , 91.   ,  0.45 , 18.   , 40.   

In [27]:
np.concatenate((original_df_X.values, original_df_X2.values), axis=0)

array([[ 0.   ,  0.462, 42.   , 91.   ,  0.2  ,  4.   , 20.   ,  0.739,
        17.   , 23.   , 46.   , 11.   , 35.   , 24.   ,  8.   ,  4.   ,
        11.   , 12.   ,  0.416, 37.   , 89.   ,  0.424, 14.   , 33.   ,
         1.   , 12.   , 12.   , 43.   , 11.   , 32.   , 23.   ,  7.   ,
         4.   , 12.   , 21.   ,  1.   ,  1.   ,  0.   ,  1.   ,  1.   ,
         1.   ],
       [ 0.   ,  0.505, 47.   , 93.   ,  0.55 , 11.   , 20.   ,  0.864,
        19.   , 22.   , 46.   ,  4.   , 42.   , 23.   ,  7.   ,  7.   ,
         6.   , 21.   ,  0.406, 39.   , 96.   ,  0.387, 12.   , 31.   ,
         0.677, 21.   , 31.   , 56.   , 17.   , 39.   , 24.   ,  5.   ,
         4.   , 15.   , 18.   ,  1.   ,  1.   ,  1.   ,  1.   ,  1.   ,
         1.   ],
       [ 0.   ,  0.412, 35.   , 85.   ,  0.439, 18.   , 41.   ,  0.71 ,
        22.   , 31.   , 36.   , 13.   , 23.   , 20.   , 13.   ,  5.   ,
        13.   , 22.   ,  0.5  , 41.   , 82.   ,  0.406, 13.   , 32.   ,
         0.913, 21.   , 23.   

In [28]:
#a = [PredictFunctionAggregation([x], n_estimator, voting=1, is_debug=0) for x in original_df_X.values]
total = []
a = original_df_X.values[0].tolist()
b = original_df_X.values[1].tolist()
total.append(a)
total.append(b)
total = np.array(total)
total

array([[ 0.   ,  0.462, 42.   , 91.   ,  0.2  ,  4.   , 20.   ,  0.739,
        17.   , 23.   , 46.   , 11.   , 35.   , 24.   ,  8.   ,  4.   ,
        11.   , 12.   ,  0.416, 37.   , 89.   ,  0.424, 14.   , 33.   ,
         1.   , 12.   , 12.   , 43.   , 11.   , 32.   , 23.   ,  7.   ,
         4.   , 12.   , 21.   ,  1.   ,  1.   ,  0.   ,  1.   ,  1.   ,
         1.   ],
       [ 0.   ,  0.505, 47.   , 93.   ,  0.55 , 11.   , 20.   ,  0.864,
        19.   , 22.   , 46.   ,  4.   , 42.   , 23.   ,  7.   ,  7.   ,
         6.   , 21.   ,  0.406, 39.   , 96.   ,  0.387, 12.   , 31.   ,
         0.677, 21.   , 31.   , 56.   , 17.   , 39.   , 24.   ,  5.   ,
         4.   , 15.   , 18.   ,  1.   ,  1.   ,  1.   ,  1.   ,  1.   ,
         1.   ]])

In [56]:
x_1d_test_array = np.array([list(level_2_out_X.iloc[4]), list(level_2_out_X.iloc[5]), list(level_2_out_X.iloc[6])])
cc = [x for x in x_1d_test_array]
cc[0].shape
ccc = np.array([cc[0]])
ccc.shape

(1, 41)

In [29]:
x_1d_test_array = np.array([list(level_2_out_X.iloc[4]), list(level_2_out_X.iloc[5]), list(level_2_out_X.iloc[6])])

a = [PredictFunctionAggregation(np.array([x]), n_estimator, voting=1, is_debug=0) for x in x_1d_test_array]
a

[1, 1, 0]

In [30]:
x_1d_test_array = np.array([list(level_2_out_X.iloc[4])])
final_class = PredictFunctionAggregation(x_1d_test_array, n_estimator, voting=1, is_debug=1)
final_class_1_prob = PredictFunctionAggregation(x_1d_test_array, n_estimator, voting=0, is_debug=1)

y_predict = [array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1])]
count_0   = 0
count_1   = 10
y_predict = [array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]]), array([[2.19017310e-05, 9.99978098e-01]])]


In [31]:
final_class

1

In [32]:
final_class_1_prob

0.9999780982689798

In [33]:
1-final_class_1_prob

2.1901731020235182e-05

# Feature Importance

In [34]:
import numpy as np
import matplotlib.pyplot as plt

feature_name = list(level_2_out_X.columns)
best_estimator.fit(level_2_out_X.values, Y.values.ravel())
importances = best_estimator.feature_importances_
indices = np.argsort(importances)[::-1]
std = np.array([0.5 for x in importances])
#std = np.std([tree.feature_importances_ for tree in best_estimator.], axis=0)

# Print the feature ranking
print("Feature ranking:")

for f in range(level_2_out_X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, feature_name[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(level_2_out_X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(level_2_out_X.shape[1]), indices)
plt.xlim([-1, level_2_out_X.shape[1]])
plt.show()

Feature ranking:
1. feature FG%_A (217.000000)
2. feature Label_logistic_classifier (100.000000)
3. feature 3P%_A (33.000000)
4. feature Home/Away_A (22.000000)
5. feature FT%_A (9.000000)
6. feature FG%_B (8.000000)
7. feature FGM_A (4.000000)
8. feature FGA_A (4.000000)
9. feature 3P%_B (3.000000)
10. feature 3PA_A (3.000000)
11. feature AST_A (2.000000)
12. feature FT%_B (2.000000)
13. feature TOV_A (1.000000)
14. feature BLK_A (1.000000)
15. feature OREB_A (1.000000)
16. feature DREB_A (1.000000)
17. feature STL_A (0.000000)
18. feature FTA_A (0.000000)
19. feature REB_A (0.000000)
20. feature FTM_A (0.000000)
21. feature 3PM_A (0.000000)
22. feature PF_A (0.000000)
23. feature Label_xgb_classifier (0.000000)
24. feature FGM_B (0.000000)
25. feature Label_gbdt_classifier (0.000000)
26. feature Label_rf_classifer (0.000000)
27. feature Label_naive_bayse_gaussian_classifier (0.000000)
28. feature Label_svm_classifier (0.000000)
29. feature PF_B (0.000000)
30. feature TOV_B (0.000000)

<Figure size 640x480 with 1 Axes>