In [1]:
import numpy as np
import pandas as pd
import time

def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', attriToDrop=None):
    """Load the preprocessed game table and split it into features and label.

    @param dfFile: anything `pandas.read_csv` accepts (e.g. the path to
        'nba_preprocessed.csv')
    @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD'; only rows whose
        `Date_A` is strictly after `dateStart` and strictly before `dateEnd` are kept
    @param attriToDrop: list[str] of attribute base names; for each name both the
        '<name>_A' and the '<name>_B' column are removed from X
    @return X, Y: pandas.DataFrame -- X holds the remaining attribute columns,
        Y holds the single column 'Label' (copied from 'W/L_A')
    """
    games = pd.read_csv(dfFile)

    # Date selection (both bounds are exclusive); renumber rows from 0 afterwards.
    in_window = (games.Date_A > dateStart) & (games.Date_A < dateEnd)
    games = games.loc[in_window, :].reset_index(drop=True)

    # Label Y
    Y = games[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Attributes X: identifier / outcome columns are always removed ...
    alwaysDropped = ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A',
                     'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B']
    # ... plus the caller-selected attributes, for side A first and then side B.
    requestedDrops = []
    if attriToDrop:
        for side in ('_A', '_B'):
            requestedDrops.extend(name + side for name in attriToDrop)
    X = games.drop(columns=alwaysDropped + requestedDrops)

    return X, Y

In [2]:
import random
import sys
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from datetime import datetime

# Install a global filter that hides every DeprecationWarning from this point on
# (it affects the whole kernel session, not only the code in this cell).
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

def StackingMethod(origin_df_X, origin_df_Y, kfold, is_debug, **all_basic_classifiers):
    """Append out-of-fold predictions of every base classifier as new feature columns.

    The data is split with a shuffled StratifiedKFold. In each fold every
    classifier is fitted on the training part and predicts the held-out part;
    those predictions fill one column per classifier, so each row's stacked
    feature comes from a model that did not see that row during fitting.

    @param origin_df_X: pandas.DataFrame of input features
    @param origin_df_Y: pandas.DataFrame/Series of labels (flattened with ravel())
    @param kfold: int, number of folds; a falsy value (None/0) falls back to 10
    @param is_debug: truthy to print per-fold / per-classifier diagnostics
    @param all_basic_classifiers: name=estimator pairs; each estimator must offer
        fit(X, y) and predict(X) with numeric predictions. The estimators are
        refitted in place on every fold.
    @return pandas.DataFrame: origin_df_X plus one 'Label_<name>' column per
        classifier, sharing origin_df_X's index
    """
    X_train = origin_df_X.values
    Y_train = origin_df_Y.values.ravel()

    # Honour the requested fold count (it used to be ignored in favour of a
    # hard-coded 10, which is kept as the fallback for None/0).
    n_splits = kfold if kfold else 10
    # The `random` module is already seeded from OS entropy, so the split stays
    # different on every call without seeding from datetime.now(), which
    # random.seed() rejects with TypeError on Python 3.11+.
    skf = StratifiedKFold(n_splits=n_splits, random_state=random.randint(0, 2**32-1), shuffle=True)

    len_y = 0
    new_feature_columns = ['Label_'+x for x in all_basic_classifiers.keys()]
    new_feature_arr     = np.zeros([len(X_train), len(new_feature_columns)])

    for iteration, (train_index, test_index) in enumerate(skf.split(X_train, Y_train)):
        X_cv_train = X_train[train_index]
        Y_cv_train = Y_train[train_index]
        X_cv_test  = X_train[test_index]

        if(is_debug):
            print(f"-----iteration {iteration}-----")
            print(f'test_index = {test_index}')
        for column_label_index, (k, v) in enumerate(all_basic_classifiers.items()):
            v.fit(X_cv_train, Y_cv_train)
            Y_cv_test_result = v.predict(X_cv_test)
            # Write the whole fold's predictions into this classifier's column at once.
            new_feature_arr[test_index, column_label_index] = Y_cv_test_result

            if(is_debug):
                len_y += len(Y_cv_test_result)
                print(f'key = {k}, val = {v}')
                print(f'Y_cv_test_result = {Y_cv_test_result}')
                print(f'len(Y_cv_test_result) = {len(Y_cv_test_result)}')
                print(type(Y_cv_test_result))
                print('-------')

    # Reuse the caller's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign (and NaN-pad) any frame that is not indexed 0..n-1.
    new_feature_df = pd.DataFrame(data=new_feature_arr, columns=new_feature_columns,
                                  index=origin_df_X.index)
    output_df_all = pd.concat([origin_df_X, new_feature_df], axis=1, ignore_index=False)

    if(is_debug):
        print(f'total len_y = {len_y}')
        print(f'new_feature_columns = {new_feature_columns}')
        for count_index_fin, x in enumerate(new_feature_arr):
            print(f'index = {count_index_fin}, label = {x}')

    return output_df_all

In [3]:
def CrossValidationGridSearchNested(origin_df_X, origin_df_Y, num_trials, fold_num, est_classifcation, tuned_param, scoring):
    """Repeat (nested) cross-validation and keep the best-scoring trial.

    For trial i, StratifiedKFold splitters are built with random_state=i (inner)
    and random_state=i+1 (outer). Without a parameter grid the estimator is simply
    scored on the outer folds. With a grid, the GridSearchCV object itself is
    scored on the outer folds, so the parameter search is repeated inside every
    outer training split and the held-out fold never influences the selection.

    @param origin_df_X: pandas.DataFrame of features
    @param origin_df_Y: pandas.DataFrame/Series of labels (flattened with ravel())
    @param num_trials: int, number of repetitions
    @param fold_num: int, n_splits for both the inner and the outer splitter
    @param est_classifcation: the estimator to evaluate / tune
    @param tuned_param: param_grid for GridSearchCV; None, [] or {} disables tuning
    @param scoring: scoring argument forwarded to sklearn
    @return (max_score, best_estimator): best mean outer-fold score and the matching
        estimator. With a grid this is GridSearchCV's refitted best_estimator_ on all
        data; without one it is est_classifcation itself (not fitted here).
        max_score is -1 if no trial produced a comparable score (e.g. num_trials == 0).
    """
    X_data = origin_df_X.values
    Y_data = origin_df_Y.values.ravel()
    # Start below every possible score so that scorers which return values under -1
    # (e.g. the neg_* error metrics) can still register a best trial.
    max_score = float('-inf')
    best_estimator = est_classifcation
    is_tuned_param_empty = not tuned_param

    for i in range(num_trials):
        inner_cv = StratifiedKFold(n_splits=fold_num, random_state=i, shuffle=True)
        outer_cv = StratifiedKFold(n_splits=fold_num, random_state=i+1, shuffle=True)

        if(is_tuned_param_empty):
            param_score = cross_val_score(est_classifcation, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()
        else:
            clf = GridSearchCV(estimator=est_classifcation, param_grid=tuned_param, cv=inner_cv, scoring=scoring)

            # Nested CV with parameter optimization: score the search object, not an
            # estimator whose parameters were already chosen using the test folds.
            param_score = cross_val_score(clf, X=X_data, y=Y_data, cv=outer_cv, scoring=scoring).mean()

        if(param_score > max_score):
            max_score = param_score
            if(is_tuned_param_empty):
                best_estimator = est_classifcation
            else:
                # Only now run the search on the full data to obtain the model to return;
                # skipping this for non-improving trials avoids a wasted grid search.
                best_estimator = clf.fit(X_data, Y_data).best_estimator_

        progress = (i+1)/num_trials*100
        print(f'> progress = {progress}%')

    if(max_score == float('-inf')):
        # Keep the historical sentinel for "nothing was scored".
        max_score = -1

    return (max_score, best_estimator)

In [4]:
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

#Only support roc_auc scoring now
def CrossValidationGetModelsScores(origin_df_X, origin_df_Y, num_trials, fold_num, est_classifier):
    """Fit one independent copy of `est_classifier` per CV fold and score each copy.

    @param origin_df_X: pandas.DataFrame of features
    @param origin_df_Y: pandas.DataFrame/Series of labels (flattened with ravel())
    @param num_trials: accepted for interface compatibility; not used by this function
    @param fold_num: int, number of StratifiedKFold splits
    @param est_classifier: template estimator offering fit(X, y) and predict(X);
        it is copied for every fold and is itself left untouched
    @return (ret_n_estimator, ret_n_score): list of the per-fold fitted estimators
        and the list of their roc_auc_score values on the matching held-out fold
    """
    import copy

    X_data = origin_df_X.values
    Y_data = origin_df_Y.values.ravel()
    # The `random` module is already seeded from OS entropy; seeding it with
    # datetime.now() raises TypeError on Python 3.11+ and also relied on a
    # `datetime` import made in a different cell.
    skf = StratifiedKFold(n_splits=fold_num, random_state=random.randint(0, 2**32-1), shuffle=True)
    ret_n_estimator = []
    ret_n_score     = []

    for train_index, test_index in skf.split(X_data, Y_data):
        X_cv_train = X_data[train_index]
        Y_cv_train = Y_data[train_index]
        X_cv_test  = X_data[test_index]
        Y_cv_test  = Y_data[test_index]

        # A fresh copy per fold: appending the same object every time left the list
        # holding fold_num references to one model fitted on the last fold only.
        fold_estimator = copy.deepcopy(est_classifier)
        fold_estimator.fit(X_cv_train, Y_cv_train)
        ret_n_estimator.append(fold_estimator)
        # NOTE(review): the AUC is computed from hard predict() labels rather than
        # probabilities / decision scores -- confirm this is intended.
        ret_n_score.append(roc_auc_score(Y_cv_test, fold_estimator.predict(X_cv_test)))

    return(ret_n_estimator, ret_n_score)

In [5]:
#x_1d_test_array is the test sample handed unchanged to every estimator
def PredictFunctionAggregation(x_1d_test_array, n_estimator, voting=1, is_debug=0):
    """Combine the predictions of several fitted estimators for one sample.

    @param x_1d_test_array: input forwarded as-is to predict / predict_proba
    @param n_estimator: iterable of estimators
    @param voting: 1 -> majority vote over predict(); anything else -> mean of the
        class-1 probability taken from predict_proba()
    @param is_debug: truthy to print the raw predictions (and the vote counts)
    @return 0 or 1 for voting == 1 (1 also wins a tie), otherwise the averaged
        probability read from position [0][1] of each predict_proba() result
    """
    if(voting == 1):
        y_predict = [model.predict(x_1d_test_array) for model in n_estimator]
        count_0 = y_predict.count(0)
        count_1 = y_predict.count(1)

        if(is_debug):
            print(f'y_predict = {y_predict}')
            print(f'count_0   = {count_0}')
            print(f'count_1   = {count_1}')

        # Class 0 needs a strict majority; everything else resolves to 1.
        return 0 if count_0 > count_1 else 1

    y_predict = [model.predict_proba(x_1d_test_array) for model in n_estimator]
    if(is_debug):
        print(f'y_predict = {y_predict}')
    #probability that the label is 1 (left wins), one value per estimator
    class_1_probabilities = [proba[0][1] for proba in y_predict]
    return np.mean(class_1_probabilities)

In [6]:
def TotalModel(**all_classifiers_dic):
    """Print every classifier of every group as '---><name> = <classifier>'.

    @param all_classifiers_dic: group_name=dict pairs, each dict mapping a classifier
        name to a classifier object. The last group given is treated as the final
        classifier and is printed after all other groups.
    @raise IndexError: when called without any group
    @return None
    """
    def _show(group):
        # One output line per classifier in the group.
        for label, model in group.items():
            print(f'--->{label} = {model}')

    groups = list(all_classifiers_dic.values())
    final_classifier = groups[-1]

    for earlier_group in groups[:-1]:
        _show(earlier_group)

    _show(final_classifier)

In [7]:
# Input CSV and the (exclusive) date window used for this run.
dfFile = '../../python_ex/NBA_Ino_Part/nba_preprocessed.csv'
dateStart, dateEnd = '2017-08-01', '2018-05-01'

# Other ways to call it:
#   featureExtraction(dfFile, dateStart, dateEnd)  -> same window, no extra attribute dropped
#   featureExtraction(dfFile)                      -> default date bounds, no extra attribute dropped
X, Y = featureExtraction(
    dfFile,
    dateStart=dateStart,
    dateEnd=dateEnd,
    attriToDrop=['PTS'],
)

# Example

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 

# Classifier groups, each mapping a display name to a default-constructed estimator.
level_1_classifier = {
    'logistic_classifier': LogisticRegression(),
    'svm_classifier': SVC(),
    'naive_bayse_gaussian_classifier': GaussianNB(),
}

level_2_classifier = {
    'rf_classifer': RandomForestClassifier(),
    'gbdt_classifier': GradientBoostingClassifier(),
    'xgb_classifier': XGBClassifier(),
}

final_classifier = {
    'lgb_classifier': LGBMClassifier(),
}

# Order matters: TotalModel treats the last group it receives as the final classifier.
all_classifiers_dic = dict(
    level_1_classifier=level_1_classifier,
    level_2_classifier=level_2_classifier,
    final_classifier=final_classifier,
)

TotalModel(**all_classifiers_dic)


--->logistic_classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
--->svm_classifier = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
--->naive_bayse_gaussian_classifier = GaussianNB(priors=None)
--->rf_classifer = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, ve