# Data Mining Final Project - NBA Game Winning Forecasting
## Learning Rate Evaluation (Prediction Accuracy vs. Training Data Volume)

In [1]:
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

## Function - featureEng()

In [2]:
# featureEng() adds derived columns to / removes raw columns from a feature table.
# @param X: pandas.DataFrame (never modified; a new frame is returned whenever columns change)
# @param featureSel: int
#     None or 0 -> X is returned as-is
#     1 -> add PTS_DIFF (= PTS_A - PTS_B)
#     2 -> drop PTS_A, PTS_B
#     3 -> add PTS_DIFF, drop PTS_A, PTS_B
#     4 -> add PTS_DIFF, STL+BLK_A, STL+BLK_B; drop the raw point, steal, block,
#          shooting, rebound and foul columns of both teams
#     any other value -> X is returned as-is
# @return X: pandas.DataFrame
def featureEng(X, featureSel=None):
    # Nothing to engineer for None / 0 or for a selector this function does not know.
    if not featureSel or featureSel not in (1, 2, 3, 4):
        return X

    if featureSel == 2:
        # drop() already hands back a new frame, so the input stays untouched.
        return X.drop(columns=['PTS_A', 'PTS_B'])

    # Every remaining mode adds columns: work on a copy so the caller's frame
    # is not silently extended.
    X = X.copy()
    X['PTS_DIFF'] = X['PTS_A'] - X['PTS_B']
    if featureSel == 1:
        return X

    attriToDrop = ['PTS_A', 'PTS_B']
    if featureSel == 4:
        X['STL+BLK_A'] = X['STL_A'] + X['BLK_A']
        X['STL+BLK_B'] = X['STL_B'] + X['BLK_B']
        attriToDrop += [
            'STL_A', 'STL_B', 'BLK_A', 'BLK_B',
            'FGM_A', 'FGA_A', '3PM_A', '3PA_A', 'FTM_A', 'FTA_A', 'OREB_A', 'DREB_A', 'PF_A',
            'FGM_B', 'FGA_B', '3PM_B', '3PA_B', 'FTM_B', 'FTA_B', 'OREB_B', 'DREB_B', 'PF_B',
        ]
    return X.drop(columns=attriToDrop)

## Function - featureExtraction()

In [3]:
# _recentGames() picks the rows of one team that feed the rolling average of a game.
# @param df_sel: pandas.DataFrame (all games dated on or before the game being processed)
# @param team: str (matched against the Team_A column)
# @param period: int (Number of previous games to be considered)
# @return pandas.DataFrame (first 24 columns of the selected rows, newest first)
def _recentGames(df_sel, team, period):
    games = df_sel.loc[df_sel.Team_A == team, :].sort_values(by=['Date_A'], ascending=False)
    if len(games) == 1:
        # The team's only row is the current game itself: there is no history yet,
        # so that row is used as the fallback.
        window = games.iloc[0:1, 0:24]
    else:
        # Skip row 0 (the newest row, i.e. the current game) and keep up to `period`
        # earlier games; iloc clips the slice when fewer rows are available.
        window = games.iloc[1:period + 1, 0:24]
    return window.reset_index(drop=True)


# featureExtraction() outputs X, Y for model training.
# @param dfFile: str (path of 'nba_preprocessed.csv'; handed to pandas.read_csv)
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD' (inclusive bounds on Date_A)
# @param period: int (Number of previous games averaged per team)
# @param featureSel: int (forwarded to featureEng())
# @return X, Y: pandas.DataFrame (one row per selected game; Y holds a single 'Label' column)
# @raise ValueError: no game lies inside [dateStart, dateEnd]
def featureExtraction(dfFile, dateStart='1000-01-01', dateEnd='2999-12-31', period=5, featureSel=None):
    df = pd.read_csv(dfFile)

    # Date selection (dates are compared as strings)
    df = df.loc[(df.Date_A >= dateStart) & (df.Date_A <= dateEnd), :].reset_index(drop=True)
    if df.empty:
        raise ValueError('No games found between {} and {}'.format(dateStart, dateEnd))

    # Get label Y
    Y = df[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Identifier / outcome columns that must not be averaged into the features
    colToDrop = ['Home/Away_A'] + ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']

    # Get averaged attributes X: one single-row frame per game, concatenated once at the
    # end (growing a DataFrame inside the loop re-copies every earlier row each time).
    rows = []
    for _, row in df.iterrows():
        df_sel = df.loc[df.Date_A <= row['Date_A'], :]

        X_A = _recentGames(df_sel, row['Team_A'], period).drop(columns=colToDrop)
        X_B = _recentGames(df_sel, row['Team_B'], period).drop(columns=colToDrop)

        # Rename X_B's columns: swap the trailing '_A' for '_B'
        X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

        # Get X_single = [Home/Away_A + mean(X_A) + mean(X_B)]
        X_single = pd.DataFrame(data=pd.concat([X_A.mean(), X_B.mean()])).transpose()
        X_single = pd.concat([pd.DataFrame(data={'Home/Away_A': [row['Home/Away_A']]}), X_single], axis=1)
        rows.append(X_single)

    X = pd.concat(rows, ignore_index=True)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X, Y

## Function - attriGen()

In [4]:
# attriGen() builds the one-row feature frame of a single game from games played before it.
# @param df: pandas.DataFrame (from 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str
# @param homeAway: int (None for played game prediction: the value is read from df)
# @param featureSel: int (forwarded to featureEng())
# @return X: pandas.DataFrame
# @raise ValueError: homeAway is None and df does not hold exactly one matching game
def attriGen(df, date, period, Team_A, Team_B, homeAway=None, featureSel=None):
    # True Home/Away at the game day
    if homeAway is None:
        df_gameDay = df.loc[(df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B), :]
        if len(df_gameDay) != 1:
            raise ValueError(
                'Expected exactly one game for {} vs {} on {}, found {}'.format(
                    Team_A, Team_B, date, len(df_gameDay)))
        # Read the scalar explicitly; int() on a whole Series is deprecated in pandas.
        homeAway = int(df_gameDay['Home/Away_A'].iloc[0])

    # Date selections: only games strictly before the game day may be used
    df = df.loc[df.Date_A < date, :].reset_index(drop=True)
    X_A = df.loc[(df.Team_A == Team_A), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)
    X_B = df.loc[(df.Team_A == Team_B), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)

    # Drop unnecessary attributes
    colToDrop = ['Home/Away_A'] + ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']
    X_A = X_A.drop(columns=colToDrop)
    X_B = X_B.drop(columns=colToDrop)

    # Rename X_B's columns: swap the trailing '_A' for '_B'
    X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

    # Get X = [Home/Away_A + mean(X_A) + mean(X_B)]
    # NOTE(review): a team without any earlier game yields an empty frame here, whose
    # mean is presumably all-NaN -- confirm the models downstream can cope with that.
    X = pd.DataFrame(data=pd.concat([X_A.mean(), X_B.mean()])).transpose()
    X = pd.concat([pd.DataFrame(data={'Home/Away_A': [homeAway]}), X], axis=1)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X

## Function - groundTruthGen()

In [5]:
# groundTruthGen() fetches the recorded row(s) of one game and splits them into
# attributes and label.
# @param df: pandas.DataFrame (from 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param Team_A, Team_B: str
# @param featureSel: int (forwarded to featureEng())
# @return X_groundTruth, Y_groundTruth: pandas.DataFrame
def groundTruthGen(df, date, Team_A, Team_B, featureSel=None):
    # Rows of the requested game: same day, same Team_A / Team_B pairing
    isRequestedGame = (df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B)
    game = df.loc[isRequestedGame, :].reset_index(drop=True)

    # Label: the W/L_A column, exposed under the name 'Label'
    Y_groundTruth = game[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Attributes: everything except the identifier / outcome columns of both sides
    nonAttributes = ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']
    nonAttributes += ['Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B']
    X_groundTruth = featureEng(game.drop(columns=nonAttributes), featureSel)

    return X_groundTruth, Y_groundTruth

## Function - gameAttriGen()

In [6]:
# gameAttriGen() outputs X_attri, Y_truth for game prediction.
# @param dfFile: str (path of 'nba_preprocessed.csv'; handed to pandas.read_csv)
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD' (inclusive bounds on Date_A)
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str (If both are None, predict all games within the date range)
# @param featureSel: int
# @return X_attri, Y_truth: pandas.DataFrame (both None when no game is selected)
def gameAttriGen(dfFile, dateStart, dateEnd, period=5, Team_A=None, Team_B=None, featureSel=None):
    df = pd.read_csv(dfFile)
    gameCols = ['Date_A', 'Team_A', 'Opponent_A']

    # Date selections
    df_sel = df.loc[(df.Date_A >= dateStart) & (df.Date_A <= dateEnd), :].reset_index(drop=True)

    # Generate df_sel which includes [date, Team_A, Team_B] columns
    if Team_A and Team_B:
        df_sel = df_sel.loc[(df_sel.Team_A == Team_A) & (df_sel.Opponent_A == Team_B), gameCols]
    elif Team_A:
        df_sel = df_sel.loc[df_sel.Team_A == Team_A, gameCols]
    elif Team_B:
        df_sel = df_sel.loc[df_sel.Opponent_A == Team_B, gameCols]
    else:
        # Delete duplicates: (Team_A vs Team_B) is the same as (Team_B vs Team_A).
        # The first row seen for a game only registers both orientations; a row is
        # kept when its game has already been registered, so a game listed from both
        # sides is counted once and a game listed from one side only is left out.
        seen = set()
        kept = []
        for date, x, y in zip(df_sel['Date_A'], df_sel['Team_A'], df_sel['Opponent_A']):
            # Tuple keys cannot collide the way concatenated strings could.
            if (date, x, y) in seen:
                kept.append((date, x, y))
            else:
                seen.add((date, x, y))
                seen.add((date, y, x))
        df_sel = pd.DataFrame(kept, columns=gameCols)

    # W/L prediction: collect the per-game frames and concatenate once at the end
    X_parts, Y_parts = [], []
    for date, teamA, teamB in zip(df_sel['Date_A'], df_sel['Team_A'], df_sel['Opponent_A']):
        X_parts.append(attriGen(df, date, period, teamA, teamB, None, featureSel))
        # Only the label of the ground truth is needed here.
        _, Y_groundTruth = groundTruthGen(df, date, teamA, teamB, featureSel)
        Y_parts.append(Y_groundTruth)

    if not X_parts:
        return None, None

    X_attri = pd.concat(X_parts, ignore_index=True)
    Y_truth = pd.concat(Y_parts, ignore_index=True)
    return X_attri, Y_truth

## Function - gamePrediction()

In [6]:
# gamePrediction() prints the prediction accuracy of every model on the selected games.
# @param dfFile: str (path of 'nba_preprocessed.csv')
# @param modelsLUT: dict in the format of {'modelName': model} (fitted models exposing predict())
# @param dateStart, dateEnd: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str (If both are None, predict all games within the date range)
# @param featureSel: int
# @return None
def gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period=5, Team_A=None, Team_B=None, featureSel=None):
    X_attri, Y_truth = gameAttriGen(dfFile, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

    # Predict once per model and score that single prediction.
    accuLUT = {}
    for name, model in modelsLUT.items():
        accuLUT[name] = accuracy_score(Y_truth, model.predict(X_attri))

    print('---------- Prediction Accuracy ----------')
    print('featureSel =', featureSel)
    for x in accuLUT:
        print(x, '=', accuLUT[x]*100, '%')
    print('------------------------------------')

## 1-Year Training Data

In [7]:
# Feature Extraction (1 season of training data)
dfFile = 'nba_preprocessed.csv'
dateStart = '2017-08-01'
dateEnd = '2018-04-13'
period = 5
featureSel = 3
X, Y = featureExtraction(dfFile, dateStart, dateEnd, period, featureSel)
# Estimators expect a 1-D label vector; Y is a single-column DataFrame, so flatten it
# once instead of handing every fit() a column vector.
Y_train = Y.values.ravel()

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier

# Each family appears with library defaults and with a tuned "CVGS" configuration
# (presumably from a cross-validated grid search -- confirm against the tuning notebook).
logiRegr = LogisticRegression()
logiRegrCVGS = LogisticRegression(C=10, max_iter=300)

supVecMachine = SVC(kernel='linear', probability=True)
supVecMachineCVGS = SVC(C=1, kernel='linear', probability=True)

xgbc = XGBClassifier()
xgbcCVGS = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, min_child_weight=3, gamma=0.3)

naiveBayes = GaussianNB()

randomForest = RandomForestClassifier()
randomForestCVGS = RandomForestClassifier(n_estimators=1000, criterion='entropy', bootstrap=True, max_depth=10, max_features='sqrt')

gbdt = GradientBoostingClassifier()
# max_features='auto' was removed from scikit-learn; for classifiers it meant 'sqrt'.
gbdtCVGS = GradientBoostingClassifier(loss='exponential', n_estimators=800, learning_rate=0.1, max_depth=10, subsample=0.5, max_features='sqrt')

lgbm = LGBMClassifier()
lgbmCVGS = LGBMClassifier(learning_rate=0.1, n_estimators=1000, max_depth=-1, subsample=0.5)

adaBoost = AdaBoostClassifier()
adaBoostCVGS = AdaBoostClassifier(learning_rate=0.2, n_estimators=50)

modelsLUT = {
    'logiRegr': logiRegr,
    'logiRegrCVGS': logiRegrCVGS,
    'supVecMachine': supVecMachine,
    'supVecMachineCVGS': supVecMachineCVGS,
    'xgbc': xgbc,
    'xgbcCVGS': xgbcCVGS,
    'naiveBayes': naiveBayes,
    'randomForest': randomForest,
    'randomForestCVGS': randomForestCVGS,
    'gbdt': gbdt,
    'gbdtCVGS': gbdtCVGS,
    'lgbm': lgbm,
    'lgbmCVGS': lgbmCVGS,
    'adaBoost': adaBoost,
    'adaBoostCVGS': adaBoostCVGS
}

# Fit every model on the same training set, in the order listed above.
for model in modelsLUT.values():
    model.fit(X, Y_train)

# Prediction window
dateStart = '2018-04-14'
dateEnd = '2018-06-01'
Team_A = None
Team_B = None

# Training Data Volume
print('---------- Training Data Volume ----------')
print('# of data =', len(X))
print('------------------------------------')

# W/L prediction
gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

---------- Training Data Volume ----------
# of data = 2460
------------------------------------
---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 69.62025316455697 %
logiRegrCVGS = 69.62025316455697 %
supVecMachine = 70.88607594936708 %
supVecMachineCVGS = 70.88607594936708 %
xgbc = 74.68354430379746 %
xgbcCVGS = 74.68354430379746 %
naiveBayes = 60.75949367088608 %
randomForest = 50.63291139240506 %
randomForestCVGS = 68.35443037974683 %
gbdt = 72.15189873417721 %
gbdtCVGS = 65.82278481012658 %
lgbm = 68.35443037974683 %
lgbmCVGS = 58.22784810126582 %
adaBoost = 68.35443037974683 %
adaBoostCVGS = 73.41772151898735 %
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


## 2-Year Training Data

In [8]:
# Feature Extraction (2 seasons of training data)
dfFile = 'nba_preprocessed.csv'
dateStart = '2016-08-01'
dateEnd = '2018-04-13'
period = 5
featureSel = 3
X, Y = featureExtraction(dfFile, dateStart, dateEnd, period, featureSel)
# Estimators expect a 1-D label vector; Y is a single-column DataFrame, so flatten it
# once instead of handing every fit() a column vector.
Y_train = Y.values.ravel()

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier

# Each family appears with library defaults and with a tuned "CVGS" configuration
# (presumably from a cross-validated grid search -- confirm against the tuning notebook).
logiRegr = LogisticRegression()
logiRegrCVGS = LogisticRegression(C=1000, max_iter=300)

supVecMachine = SVC(kernel='linear', probability=True)
supVecMachineCVGS = SVC(C=0.1, kernel='linear', probability=True)

xgbc = XGBClassifier()
xgbcCVGS = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, min_child_weight=1, gamma=0.4)

naiveBayes = GaussianNB()

randomForest = RandomForestClassifier()
# max_features='auto' was removed from scikit-learn; for classifiers it meant 'sqrt'.
randomForestCVGS = RandomForestClassifier(n_estimators=1000, criterion='entropy', bootstrap=True, max_depth=None, max_features='sqrt')

gbdt = GradientBoostingClassifier()
gbdtCVGS = GradientBoostingClassifier(loss='exponential', n_estimators=800, learning_rate=0.1, max_depth=10, subsample=0.5, max_features='sqrt')

lgbm = LGBMClassifier()
lgbmCVGS = LGBMClassifier(learning_rate=0.1, n_estimators=800, max_depth=10, subsample=0.5)

adaBoost = AdaBoostClassifier()
adaBoostCVGS = AdaBoostClassifier(learning_rate=0.3, n_estimators=50)

modelsLUT = {
    'logiRegr': logiRegr,
    'logiRegrCVGS': logiRegrCVGS,
    'supVecMachine': supVecMachine,
    'supVecMachineCVGS': supVecMachineCVGS,
    'xgbc': xgbc,
    'xgbcCVGS': xgbcCVGS,
    'naiveBayes': naiveBayes,
    'randomForest': randomForest,
    'randomForestCVGS': randomForestCVGS,
    'gbdt': gbdt,
    'gbdtCVGS': gbdtCVGS,
    'lgbm': lgbm,
    'lgbmCVGS': lgbmCVGS,
    'adaBoost': adaBoost,
    'adaBoostCVGS': adaBoostCVGS
}

# Fit every model on the same training set, in the order listed above.
for model in modelsLUT.values():
    model.fit(X, Y_train)

# Prediction window
dateStart = '2018-04-14'
dateEnd = '2018-06-01'
Team_A = None
Team_B = None

# Training Data Volume
print('---------- Training Data Volume ----------')
print('# of data =', len(X))
print('------------------------------------')

# W/L prediction
gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

---------- Training Data Volume ----------
# of data = 5078
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:


---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 70.88607594936708 %
logiRegrCVGS = 69.62025316455697 %
supVecMachine = 72.15189873417721 %
supVecMachineCVGS = 72.15189873417721 %
xgbc = 70.88607594936708 %
xgbcCVGS = 72.15189873417721 %
naiveBayes = 59.49367088607595 %
randomForest = 55.69620253164557 %
randomForestCVGS = 69.62025316455697 %
gbdt = 69.62025316455697 %
gbdtCVGS = 68.35443037974683 %
lgbm = 74.68354430379746 %
lgbmCVGS = 62.0253164556962 %
adaBoost = 73.41772151898735 %
adaBoostCVGS = 68.35443037974683 %
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:


## 3-Year Training Data

In [9]:
# Feature Extraction (3 seasons of training data)
dfFile = 'nba_preprocessed.csv'
dateStart = '2015-08-01'
dateEnd = '2018-04-13'
period = 5
featureSel = 3
X, Y = featureExtraction(dfFile, dateStart, dateEnd, period, featureSel)
# Estimators expect a 1-D label vector; Y is a single-column DataFrame, so flatten it
# once instead of handing every fit() a column vector.
Y_train = Y.values.ravel()

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier

# Each family appears with library defaults and with a tuned "CVGS" configuration
# (presumably from a cross-validated grid search -- confirm against the tuning notebook).
logiRegr = LogisticRegression()
logiRegrCVGS = LogisticRegression(C=100, max_iter=400)

supVecMachine = SVC(kernel='linear', probability=True)
supVecMachineCVGS = SVC(C=10, kernel='linear', probability=True)

xgbc = XGBClassifier()
xgbcCVGS = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, min_child_weight=3, gamma=0.2)

naiveBayes = GaussianNB()

randomForest = RandomForestClassifier()
randomForestCVGS = RandomForestClassifier(n_estimators=600, criterion='entropy', bootstrap=True, max_depth=None, max_features='sqrt')

gbdt = GradientBoostingClassifier()
gbdtCVGS = GradientBoostingClassifier(loss='exponential', n_estimators=600, learning_rate=0.1, max_depth=3, subsample=0.5, max_features='sqrt')

lgbm = LGBMClassifier()
lgbmCVGS = LGBMClassifier(learning_rate=0.1, n_estimators=800, max_depth=-1, subsample=0.5)

adaBoost = AdaBoostClassifier()
adaBoostCVGS = AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)

modelsLUT = {
    'logiRegr': logiRegr,
    'logiRegrCVGS': logiRegrCVGS,
    'supVecMachine': supVecMachine,
    'supVecMachineCVGS': supVecMachineCVGS,
    'xgbc': xgbc,
    'xgbcCVGS': xgbcCVGS,
    'naiveBayes': naiveBayes,
    'randomForest': randomForest,
    'randomForestCVGS': randomForestCVGS,
    'gbdt': gbdt,
    'gbdtCVGS': gbdtCVGS,
    'lgbm': lgbm,
    'lgbmCVGS': lgbmCVGS,
    'adaBoost': adaBoost,
    'adaBoostCVGS': adaBoostCVGS
}

# Fit every model on the same training set, in the order listed above.
for model in modelsLUT.values():
    model.fit(X, Y_train)

# Prediction window
dateStart = '2018-04-14'
dateEnd = '2018-06-01'
Team_A = None
Team_B = None

# Training Data Volume
print('---------- Training Data Volume ----------')
print('# of data =', len(X))
print('------------------------------------')

# W/L prediction
gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

---------- Training Data Volume ----------
# of data = 7234
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 70.88607594936708 %
logiRegrCVGS = 69.62025316455697 %
supVecMachine = 74.68354430379746 %
supVecMachineCVGS = 72.15189873417721 %
xgbc = 72.15189873417721 %
xgbcCVGS = 74.68354430379746 %
naiveBayes = 60.75949367088608 %
randomForest = 64.55696202531645 %
randomForestCVGS = 70.88607594936708 %
gbdt = 73.41772151898735 %
gbdtCVGS = 64.55696202531645 %
lgbm = 68.35443037974683 %
lgbmCVGS = 67.08860759493672 %
adaBoost = 70.88607594936708 %
adaBoostCVGS = 75.9493670886076 %
------------------------------------


## 4-Year Training Data

In [10]:
# Feature Extraction (4 seasons of training data)
dfFile = 'nba_preprocessed.csv'
dateStart = '2014-08-01'
dateEnd = '2018-04-13'
period = 5
featureSel = 3
X, Y = featureExtraction(dfFile, dateStart, dateEnd, period, featureSel)
# Estimators expect a 1-D label vector; Y is a single-column DataFrame, so flatten it
# once instead of handing every fit() a column vector.
Y_train = Y.values.ravel()

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier

# Each family appears with library defaults and with a tuned "CVGS" configuration
# (presumably from a cross-validated grid search -- confirm against the tuning notebook).
logiRegr = LogisticRegression()
logiRegrCVGS = LogisticRegression(C=100, max_iter=400)

supVecMachine = SVC(kernel='linear', probability=True)
supVecMachineCVGS = SVC(C=0.01, kernel='linear', probability=True)

xgbc = XGBClassifier()
xgbcCVGS = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, min_child_weight=1, gamma=0.4)

naiveBayes = GaussianNB()

randomForest = RandomForestClassifier()
randomForestCVGS = RandomForestClassifier(n_estimators=1000, criterion='entropy', bootstrap=True, max_depth=None, max_features='log2')

gbdt = GradientBoostingClassifier()
gbdtCVGS = GradientBoostingClassifier(loss='exponential', n_estimators=600, learning_rate=0.1, max_depth=3, subsample=0.5, max_features='sqrt')

lgbm = LGBMClassifier()
lgbmCVGS = LGBMClassifier(learning_rate=0.1, n_estimators=800, max_depth=5, subsample=0.5)

adaBoost = AdaBoostClassifier()
adaBoostCVGS = AdaBoostClassifier(learning_rate=0.1, n_estimators=600)

modelsLUT = {
    'logiRegr': logiRegr,
    'logiRegrCVGS': logiRegrCVGS,
    'supVecMachine': supVecMachine,
    'supVecMachineCVGS': supVecMachineCVGS,
    'xgbc': xgbc,
    'xgbcCVGS': xgbcCVGS,
    'naiveBayes': naiveBayes,
    'randomForest': randomForest,
    'randomForestCVGS': randomForestCVGS,
    'gbdt': gbdt,
    'gbdtCVGS': gbdtCVGS,
    'lgbm': lgbm,
    'lgbmCVGS': lgbmCVGS,
    'adaBoost': adaBoost,
    'adaBoostCVGS': adaBoostCVGS
}

# Fit every model on the same training set, in the order listed above.
for model in modelsLUT.values():
    model.fit(X, Y_train)

# Prediction window
dateStart = '2018-04-14'
dateEnd = '2018-06-01'
Team_A = None
Team_B = None

# Training Data Volume
print('---------- Training Data Volume ----------')
print('# of data =', len(X))
print('------------------------------------')

# W/L prediction
gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

---------- Training Data Volume ----------
# of data = 9370
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 69.62025316455697 %
logiRegrCVGS = 69.62025316455697 %
supVecMachine = 72.15189873417721 %
supVecMachineCVGS = 70.88607594936708 %
xgbc = 70.88607594936708 %
xgbcCVGS = 72.15189873417721 %
naiveBayes = 59.49367088607595 %
randomForest = 59.49367088607595 %
randomForestCVGS = 72.15189873417721 %
gbdt = 70.88607594936708 %
gbdtCVGS = 69.62025316455697 %
lgbm = 73.41772151898735 %
lgbmCVGS = 63.29113924050633 %
adaBoost = 72.15189873417721 %
adaBoostCVGS = 73.41772151898735 %
------------------------------------


## 5-Year Training Data

In [11]:
# Feature Extraction (5 seasons of training data)
dfFile = 'nba_preprocessed.csv'
dateStart = '2013-08-01'
dateEnd = '2018-04-13'
period = 5
featureSel = 3
X, Y = featureExtraction(dfFile, dateStart, dateEnd, period, featureSel)
# Estimators expect a 1-D label vector; Y is a single-column DataFrame, so flatten it
# once instead of handing every fit() a column vector.
Y_train = Y.values.ravel()

# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier

# Each family appears with library defaults and with a tuned "CVGS" configuration
# (presumably from a cross-validated grid search -- confirm against the tuning notebook).
logiRegr = LogisticRegression()
logiRegrCVGS = LogisticRegression(C=1000, max_iter=500)

supVecMachine = SVC(kernel='linear', probability=True)
supVecMachineCVGS = SVC(C=10, kernel='linear', probability=True)

xgbc = XGBClassifier()
xgbcCVGS = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=200, min_child_weight=1, gamma=0.1)

naiveBayes = GaussianNB()

randomForest = RandomForestClassifier()
randomForestCVGS = RandomForestClassifier(n_estimators=1000, criterion='entropy', bootstrap=True, max_depth=None, max_features='log2')

gbdt = GradientBoostingClassifier()
gbdtCVGS = GradientBoostingClassifier(loss='exponential', n_estimators=600, learning_rate=0.1, max_depth=3, subsample=0.5, max_features='sqrt')

lgbm = LGBMClassifier()
lgbmCVGS = LGBMClassifier(learning_rate=0.1, n_estimators=600, max_depth=5, subsample=0.5)

adaBoost = AdaBoostClassifier()
adaBoostCVGS = AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)

modelsLUT = {
    'logiRegr': logiRegr,
    'logiRegrCVGS': logiRegrCVGS,
    'supVecMachine': supVecMachine,
    'supVecMachineCVGS': supVecMachineCVGS,
    'xgbc': xgbc,
    'xgbcCVGS': xgbcCVGS,
    'naiveBayes': naiveBayes,
    'randomForest': randomForest,
    'randomForestCVGS': randomForestCVGS,
    'gbdt': gbdt,
    'gbdtCVGS': gbdtCVGS,
    'lgbm': lgbm,
    'lgbmCVGS': lgbmCVGS,
    'adaBoost': adaBoost,
    'adaBoostCVGS': adaBoostCVGS
}

# Fit every model on the same training set, in the order listed above.
for model in modelsLUT.values():
    model.fit(X, Y_train)

# Prediction window
dateStart = '2018-04-14'
dateEnd = '2018-06-01'
Team_A = None
Team_B = None

# Training Data Volume
print('---------- Training Data Volume ----------')
print('# of data =', len(X))
print('------------------------------------')

# W/L prediction
gamePrediction(dfFile, modelsLUT, dateStart, dateEnd, period, Team_A, Team_B, featureSel)

---------- Training Data Volume ----------
# of data = 11702
------------------------------------


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


---------- Prediction Accuracy ----------
featureSel = 3
logiRegr = 70.88607594936708 %
logiRegrCVGS = 69.62025316455697 %
supVecMachine = 70.88607594936708 %
supVecMachineCVGS = 70.88607594936708 %
xgbc = 75.9493670886076 %
xgbcCVGS = 75.9493670886076 %
naiveBayes = 59.49367088607595 %
randomForest = 55.69620253164557 %
randomForestCVGS = 73.41772151898735 %
gbdt = 74.68354430379746 %
gbdtCVGS = 70.88607594936708 %
lgbm = 69.62025316455697 %
lgbmCVGS = 63.29113924050633 %
adaBoost = 67.08860759493672 %
adaBoostCVGS = 74.68354430379746 %
------------------------------------
