# Data Mining Final Project - NBA Game Winning Forecasting
## Attributes Generators and Ground Truth Generators

In [1]:
import numpy as np
import pandas as pd
import time

## Function - featureEng()

In [2]:
# @param X: pandas.DataFrame (must contain the columns used by the chosen featureSel)
# @param featureSel: int
#        None/0 -> X is returned untouched
#        1      -> add 'PTS_DIFF' (= PTS_A - PTS_B)
#        2      -> drop 'PTS_A' and 'PTS_B'
#        3      -> add 'PTS_DIFF', then drop 'PTS_A' and 'PTS_B'
#        4      -> add 'PTS_DIFF', 'STL+BLK_A', 'STL+BLK_B', then drop the raw
#                  shooting/rebound/foul/points/steal/block columns
#        any other value -> X is returned untouched
# @return X: pandas.DataFrame
def featureEng(X, featureSel=None):
    """Apply the feature-engineering variant selected by featureSel.

    The caller's DataFrame is never modified: derived columns are added to
    a copy, so the input keeps its original columns after the call.
    """
    # Feature Engineering
    if not featureSel or featureSel not in (1, 2, 3, 4):
        return X

    scoreCols = ['PTS_A', 'PTS_B']

    if featureSel == 2:
        # drop() already returns a new frame, so no copy is needed here.
        return X.drop(columns=scoreCols)

    # Every remaining variant adds columns; work on a copy so the caller's
    # frame is not mutated as a side effect.
    X = X.copy()
    X['PTS_DIFF'] = X['PTS_A'] - X['PTS_B']

    if featureSel == 1:
        return X
    if featureSel == 3:
        return X.drop(columns=scoreCols)

    # featureSel == 4
    X['STL+BLK_A'] = X['STL_A'] + X['BLK_A']
    X['STL+BLK_B'] = X['STL_B'] + X['BLK_B']
    attriToDrop = [
        'FGM_A', 'FGA_A', '3PM_A', '3PA_A', 'FTM_A', 'FTA_A', 'OREB_A', 'DREB_A', 'PF_A',
        'FGM_B', 'FGA_B', '3PM_B', '3PA_B', 'FTM_B', 'FTA_B', 'OREB_B', 'DREB_B', 'PF_B',
        'PTS_A', 'PTS_B', 'STL_A', 'STL_B', 'BLK_A', 'BLK_B',
    ]
    return X.drop(columns=attriToDrop)

## Function - attriGen()

In [3]:
# @param df: pandas.DataFrame (from 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param period: int (Number of previous games to be considered)
# @param Team_A, Team_B: str
# @param homeAway: int (None for played game prediction; the flag is then
#        read from the row of df matching date/Team_A/Team_B)
# @param featureSel: int (forwarded to featureEng())
# @return X: pandas.DataFrame (single row: Home/Away_A + mean of Team_A's
#         recent games + mean of Team_B's recent games)
# @raise ValueError: homeAway is None and df does not contain exactly one
#        row for the requested date/Team_A/Team_B
def attriGen(df, date, period, Team_A, Team_B, homeAway=None, featureSel=None):
    # True Home/Away at the game day
    if homeAway is None:
        isGameDay = (df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B)
        gameDayFlag = df.loc[isGameDay, 'Home/Away_A']
        # Calling int() on a whole Series is deprecated in recent pandas and
        # fails obscurely for 0 or several matches, so check explicitly and
        # read the single scalar instead.
        if len(gameDayFlag) != 1:
            raise ValueError(
                "Expected exactly one game for {} vs {} on {}, found {}; "
                "pass homeAway explicitly for games not in the data.".format(
                    Team_A, Team_B, date, len(gameDayFlag)
                )
            )
        homeAway = int(gameDayFlag.iloc[0])

    # Date selections: only games strictly before `date` are used.
    # (String comparison; relies on the 'YYYY-MM-DD' format sorting chronologically.)
    df = df.loc[df.Date_A < date, :].reset_index(drop=True)
    # Most recent `period` games of each team, keeping the first 24 columns
    # (presumably the '_A' side columns -- verify against the CSV layout).
    X_A = df.loc[(df.Team_A == Team_A), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)
    X_B = df.loc[(df.Team_A == Team_B), :].sort_values(by=['Date_A'], ascending=False).iloc[0:period, 0:24].reset_index(drop=True)

    # Drop unnecessary attributes
    colToDrop = ['Home/Away_A'] + ['Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A']
    X_A = X_A.drop(columns=colToDrop)
    X_B = X_B.drop(columns=colToDrop)

    # Rename X_B's columns: swap the trailing two characters ('_A') for '_B'
    X_B = X_B.rename(columns=lambda x: x[0:-2] + '_B')

    # Get X = [Home/Away_A + X_A + X_B]
    X = pd.DataFrame(data=pd.concat([X_A.mean(), X_B.mean()])).transpose()
    X = pd.concat([pd.DataFrame(data={'Home/Away_A': [homeAway]}), X], axis=1)

    # Feature Engineering
    X = featureEng(X, featureSel)

    return X

## Function - groundTruthGen()

In [4]:
# @param df: pandas.DataFrame (from 'nba_preprocessed.csv')
# @param date: str in the format of 'YYYY-MM-DD'
# @param Team_A, Team_B: str
# @param featureSel: int (forwarded to featureEng())
# @return X_groundTruth, Y_groundTruth: pandas.DataFrame
def groundTruthGen(df, date, Team_A, Team_B, featureSel=None):
    # Keep only the row(s) of the requested game
    isTargetGame = (df.Date_A == date) & (df.Team_A == Team_A) & (df.Team_B == Team_B)
    game = df.loc[isTargetGame, :].reset_index(drop=True)

    # Label Y: Team_A's win/loss column, exposed as 'Label'
    Y_groundTruth = game[['W/L_A']].rename(columns={'W/L_A': 'Label'})

    # Attributes X: everything except identifier/result columns,
    # then the selected feature engineering
    identifierCols = [
        'Team_A', 'Date_A', 'W/L_A', 'Score_A', 'Opponent_A',
        'Team_B', 'Date_B', 'W/L_B', 'Home/Away_B', 'Score_B', 'Opponent_B'
    ]
    X_groundTruth = featureEng(game.drop(columns=identifierCols), featureSel)

    return X_groundTruth, Y_groundTruth

## Attributes Generator
- attriGen() generates the average attributes of the two teams over a defined number of previous games.

In [5]:
# Example: (Existing Game)
# homeAway is passed as None, so attriGen() reads the Home/Away flag from
# the game's own row in the data.
df = pd.read_csv('nba_preprocessed.csv')
date, period = '2018-06-03', 5
Team_A = 'GSW'
Team_B = 'CLE'
featureSel = 3
X_toBePredicted = attriGen(
    df, date, period, Team_A, Team_B,
    homeAway=None, featureSel=featureSel,
)
X_toBePredicted

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PTS_DIFF
0,1,0.466,39.0,83.6,0.382,12.8,33.2,0.7866,14.4,18.0,...,23.4,44.8,9.6,35.2,16.2,4.8,6.2,13.0,20.0,4.4


In [6]:
# Example: (Future Game)
# homeAway is supplied explicitly (0) instead of being looked up in the data.
df = pd.read_csv('nba_preprocessed.csv')
date, period = '2018-06-08', 5
Team_A = 'GSW'
Team_B = 'CLE'
featureSel = 3
X_toBePredicted = attriGen(
    df, date, period, Team_A, Team_B,
    homeAway=0, featureSel=featureSel,
)
X_toBePredicted

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PTS_DIFF
0,0,0.517,43.4,84.0,0.391,13.8,35.0,0.7296,13.8,18.6,...,22.4,45.4,13.4,32.0,18.8,6.0,5.0,11.0,18.2,11.4


In [7]:
# Example: (Future Game)
# Same call shape as the previous example with the two teams swapped;
# homeAway is again supplied explicitly (0).
df = pd.read_csv('nba_preprocessed.csv')
date, period = '2018-06-06', 5
Team_A = 'CLE'
Team_B = 'GSW'
featureSel = 3
X_toBePredicted = attriGen(
    df, date, period, Team_A, Team_B,
    homeAway=0, featureSel=featureSel,
)
X_toBePredicted

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PTS_DIFF
0,0,0.44,36.6,83.2,0.3032,9.2,31.2,0.7332,16.8,22.8,...,19.4,41.8,7.8,34.0,25.6,8.0,6.4,12.6,20.6,-12.0


## Ground Truth Generator
- groundTruthGen() generates the ground-truth attributes and label of a specified game (identified by date and teams).
- It can be used for evaluating the model prediction.

In [8]:
# Example: ground truth (attributes and label) of one specific game
df = pd.read_csv('nba_preprocessed.csv')
date = '2018-06-03'
Team_A = 'GSW'
Team_B = 'CLE'
featureSel = 3
X_groundTruth, Y_groundTruth = groundTruthGen(
    df, date, Team_A, Team_B, featureSel=featureSel
)

In [9]:
# Display the ground-truth attributes returned by groundTruthGen().
X_groundTruth

Unnamed: 0,Home/Away_A,FG%_A,FGM_A,FGA_A,3P%_A,3PM_A,3PA_A,FT%_A,FTM_A,FTA_A,...,FTA_B,REB_B,OREB_B,DREB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PTS_DIFF
0,1,0.573,47,82,0.417,15,36,0.619,13,21,...,26,42,16,26,25,9,4,10,15,19


In [10]:
# Display the ground-truth label ('Label' column) returned by groundTruthGen().
Y_groundTruth

Unnamed: 0,Label
0,1
