# Data Mining Final Project - NBA Game Winning Forecasting
## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import time

# @param df: pandas.DataFrame
# @return pandas.DataFrame
# NaN cleaner (Numerical)
def cleanDataFrame(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].reset_index(drop=True)

# @param df: pandas.DataFrame
# @return team_lut: dict
# Team Dictionary
def teamLUT(df):
    team_lut = {}
    for idx, score in enumerate(df['Score']):
        if score[:3] in team_lut:
            team_lut[score[:3]] += 1
        else:
            team_lut[score[:3]] = 1
    return team_lut

# @param df: pandas.DataFrame
# @return df: pandas.DataFrame
# Add opponent label to a game
def addOpponentCol(df):
    opponent = [None] * len(df['Score'])
    for idx, score in enumerate(df['Score']):
        opponent[idx] = score[:3]
    df['Opponent'] = opponent
    return df

# @param df: pandas.DataFrame
# @return pandas.DataFrame
# Drop objects which are NaN in Score's label (String)
def dropNanScore(df):
    index = []
    for idx, score in enumerate(df['Score']):
        if score[:3] == 'NAN' or score[:3] == 'NaN':
            index.append(idx)
    print("Number of objects dropped =", len(index))
    return df.drop(df.index[index]).reset_index(drop=True)

# @param df: pandas.DataFrame
# @return df_team, df_oppo: pandas.DataFrame
# Pair two teams in a single game by searching 'Date' and 'Opponent' labels.
def pairGamePlayers(df):  
    startTime = time.time()
    invalid_idx = []
    duplicate = 0
    not_found = 0
    # Declare empty dataframe w/ columns from existing dataframe
    df_team = pd.DataFrame(columns = list(df)) # Team attributes
    df_oppo = pd.DataFrame(columns = list(df)) # Opponent attributes
    df_dupl = pd.DataFrame(columns = list(df)) # Duplicated dataframe
    for idx, date, team in zip(df.index.tolist(), df['Date'], df['Team']):
        df_oppo_searched = df.loc[lambda df: df.Date == date, :].loc[lambda df: df.Opponent == team, :]
        if len(df_oppo_searched.index.tolist()) > 1:
            duplicate += 1
            df_dupl = pd.concat([df_dupl, df_oppo_searched], ignore_index=True)
            df_oppo_searched = df_oppo_searched.iloc[0:1, :]
        if not df_oppo_searched.empty:
            df_team = pd.concat([df_team, df.iloc[idx:idx+1, :]], ignore_index=True)
            df_oppo = pd.concat([df_oppo, df_oppo_searched], ignore_index=True)
        else:
            invalid_idx.append(idx)
            not_found += 1
    
    print("Duplicate found =", duplicate)
    print("Opponent not found =", not_found)
    print("Team length = ", len(df_team.index.tolist()))
    print("Oppo length = ", len(df_oppo.index.tolist()))
    print("Execution time =", time.time() - startTime)
    return df_team, df_oppo, invalid_idx

# @param df_team, df_oppo: pandas.DataFrame
# @return df_team, df_oppo: pandas.DataFrame
# Check game validity after pairGamePlayers(df) which pairs two teams in a single game.
def checkGameValidity(df_team, df_oppo):
    startTime = time.time()
    err = 0
    invalid_idx = []
    print("Team length = ", len(df_team.index.tolist()))
    print("Oppo length = ", len(df_oppo.index.tolist()))
    for idx in df_team.index.tolist():
        if df_team.loc[idx]['Date'] != df_oppo.loc[idx]['Date'] or \
        df_team.loc[idx]['Opponent'] != df_oppo.loc[idx]['Team'] or \
        df_team.loc[idx]['W/L'] == df_oppo.loc[idx]['W/L'] or \
        df_team.loc[idx]['Home/Away'] == df_oppo.loc[idx]['Home/Away']:
            err += 1
            invalid_idx.append(idx)
    
    df_team = df_team.drop(df_team.index[invalid_idx]).reset_index(drop=True)
    df_oppo = df_oppo.drop(df_oppo.index[invalid_idx]).reset_index(drop=True)
    
    print("Number of invalid games =", err, "@", [x for x in invalid_idx])
    print("Execution time =", time.time() - startTime)
    return df_team, df_oppo, invalid_idx

# Playoff
## Preprocess

In [2]:
playoff = pd.read_csv("team_playoff_all.csv")
playoff = cleanDataFrame(playoff)   # Clean NaN in 3P% attribute
playoff = addOpponentCol(playoff)
# Binary encode W/L and Home/Away
playoff['W/L'] = playoff['W/L'].map({'W':1, 'L':0})
playoff['Home/Away'] = playoff['Home/Away'].map({'Home':1, 'Away':0})

In [3]:
print(teamLUT(playoff))
print("Number of playoff teams =", len(teamLUT(playoff)))
print("Number of playoff games =", sum(teamLUT(playoff).values()))

{'BOS': 244, 'CLE': 170, 'DET': 230, 'NYK': 167, 'MIL': 95, 'NAN': 275, 'LAL': 327, 'MIA': 207, 'POR': 183, 'ATL': 177, 'PHX': 174, 'NOP': 86, 'ORL': 119, 'UTA': 209, 'BKN': 117, 'IND': 213, 'WAS': 60, 'SAC': 67, 'DEN': 99, 'DAL': 178, 'LAC': 82, 'SAS': 317, 'MIN': 42, 'OKC': 96, 'GSW': 122, 'HOU': 190, 'MEM': 75, 'CHI': 255, 'TOR': 66, 'CHA': 8}
Number of playoff teams = 30
Number of playoff games = 4650


In [4]:
playoff = dropNanScore(playoff)

Number of objects dropped = 275


In [5]:
print(teamLUT(playoff))
print("Number of playoff teams =", len(teamLUT(playoff)))
print("Number of playoff games =", sum(teamLUT(playoff).values()))

{'BOS': 244, 'CLE': 170, 'DET': 230, 'NYK': 167, 'MIL': 95, 'LAL': 327, 'MIA': 207, 'POR': 183, 'ATL': 177, 'PHX': 174, 'NOP': 86, 'ORL': 119, 'UTA': 209, 'BKN': 117, 'IND': 213, 'WAS': 60, 'SAC': 67, 'DEN': 99, 'DAL': 178, 'LAC': 82, 'SAS': 317, 'MIN': 42, 'OKC': 96, 'GSW': 122, 'HOU': 190, 'MEM': 75, 'CHI': 255, 'TOR': 66, 'CHA': 8}
Number of playoff teams = 29
Number of playoff games = 4375


## pairGamePlayers() and checkGameValidity

In [6]:
df_playoff_team, df_playoff_oppo, invalid_idx = pairGamePlayers(playoff)

Duplicate found = 0
Opponent not found = 263
Team length =  4112
Oppo length =  4112
Execution time = 32.781365156173706


In [7]:
df_playoff_team.tail()

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
4107,GSW,2017-06-01,1,1,CLE91-113GSW,0.425,45,106,0.364,12,...,50,14,36,31,12,3,4,24,113,CLE
4108,GSW,2017-06-04,1,1,CLE113-132GSW,0.517,46,89,0.419,18,...,53,10,43,34,5,7,20,19,132,CLE
4109,GSW,2017-06-07,1,0,CLE113-118GSW,0.482,40,83,0.485,16,...,44,8,36,29,8,4,18,28,118,CLE
4110,GSW,2017-06-09,0,0,CLE137-116GSW,0.448,39,87,0.282,11,...,40,16,24,26,5,7,12,27,116,CLE
4111,GSW,2017-06-12,1,1,CLE120-129GSW,0.511,46,90,0.368,14,...,42,13,29,27,8,2,13,24,129,CLE


In [8]:
df_playoff_oppo.tail()

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
4107,CLE,2017-06-01,0,0,GSW113-91CLE,0.349,30,86,0.355,11,...,59,15,44,15,0,6,20,23,91,GSW
4108,CLE,2017-06-04,0,0,GSW132-113CLE,0.45,45,100,0.276,8,...,41,10,31,27,15,1,9,18,113,GSW
4109,CLE,2017-06-07,0,1,GSW118-113CLE,0.444,40,90,0.273,12,...,37,10,27,17,9,3,12,25,113,GSW
4110,CLE,2017-06-09,1,1,GSW116-137CLE,0.529,46,87,0.533,24,...,41,11,30,27,6,3,11,24,137,GSW
4111,CLE,2017-06-12,0,0,GSW129-120CLE,0.534,47,88,0.458,11,...,40,12,28,22,6,5,14,22,120,GSW


In [9]:
df_playoff_team_n, df_playoff_oppo_n, invalid_idx = checkGameValidity(df_playoff_team, df_playoff_oppo)

Team length =  4112
Oppo length =  4112
Number of invalid games = 0 @ []
Execution time = 4.601314067840576


In [10]:
df_playoff_team_n['Date'].count()

4112

In [11]:
df_playoff_oppo_n['Date'].count()

4112

# Season
## Preprocess

In [12]:
season = pd.read_csv("team_season_all.csv")
season = cleanDataFrame(season)   # Clean NaN in 3P% attribute
season = addOpponentCol(season)
# Binary encode W/L and Home/Away
season['W/L'] = season['W/L'].map({'W':1, 'L':0})
season['Home/Away'] = season['Home/Away'].map({'Home':1, 'Away':0})

In [13]:
print(teamLUT(season))
print("Number of season teams =", len(teamLUT(season)))
print("Number of season games =", sum(teamLUT(season).values()))

{'CLE': 2370, 'DET': 2373, 'GSW': 2419, 'LAC': 2411, 'NAN': 4980, 'BKN': 2471, 'NYK': 2354, 'MIL': 2373, 'IND': 2363, 'ATL': 2374, 'DEN': 2413, 'UTA': 2419, 'LAL': 2412, 'SAC': 2411, 'PHX': 2412, 'SAS': 2427, 'HOU': 2421, 'BOS': 2355, 'DAL': 2425, 'POR': 2409, 'MIA': 2170, 'NOP': 2170, 'MIN': 2141, 'ORL': 2114, 'TOR': 1703, 'MEM': 1742, 'WAS': 1574, 'CHA': 923, 'OKC': 797, 'CHI': 2368, 'PHI': 164}
Number of season teams = 31
Number of season games = 68458


In [14]:
season = dropNanScore(season)

Number of objects dropped = 4980


In [15]:
print(teamLUT(season))
print("Number of season teams =", len(teamLUT(season)))
print("Number of season games =", sum(teamLUT(season).values()))

{'CLE': 2370, 'DET': 2373, 'GSW': 2419, 'LAC': 2411, 'BKN': 2471, 'NYK': 2354, 'MIL': 2373, 'IND': 2363, 'ATL': 2374, 'DEN': 2413, 'UTA': 2419, 'LAL': 2412, 'SAC': 2411, 'PHX': 2412, 'SAS': 2427, 'HOU': 2421, 'BOS': 2355, 'DAL': 2425, 'POR': 2409, 'MIA': 2170, 'NOP': 2170, 'MIN': 2141, 'ORL': 2114, 'TOR': 1703, 'MEM': 1742, 'WAS': 1574, 'CHA': 923, 'OKC': 797, 'CHI': 2368, 'PHI': 164}
Number of season teams = 30
Number of season games = 63478


## pairGamePlayers() and checkGameValidity

In [16]:
df_season_team, df_season_oppo, invalid_idx = pairGamePlayers(season)

Duplicate found = 0
Opponent not found = 6136
Team length =  57342
Oppo length =  57342
Execution time = 2721.7649500370026


In [17]:
df_season_team.tail()

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
57337,GSW,2018-04-03,1,0,OKC107-111GSW,0.494,41,83,0.276,8,...,40,6,34,27,9,5,13,25,111,OKC
57338,GSW,2018-04-05,0,0,IND126-106GSW,0.446,37,83,0.31,9,...,39,13,26,29,6,5,16,14,106,IND
57339,GSW,2018-04-07,0,1,NOP126-120GSW,0.544,49,90,0.455,15,...,42,8,34,31,3,0,17,21,120,NOP
57340,GSW,2018-04-08,1,0,PHX100-117GSW,0.522,47,90,0.406,13,...,41,5,36,27,9,7,11,15,117,PHX
57341,GSW,2018-04-10,0,0,UTA119-79GSW,0.349,30,86,0.263,5,...,39,9,30,14,7,3,15,18,79,UTA


In [18]:
df_season_oppo.tail()

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
57337,OKC,2018-04-03,0,1,GSW111-107OKC,0.376,35,93,0.237,9,...,58,22,36,13,8,3,16,20,107,GSW
57338,IND,2018-04-05,1,1,GSW106-126IND,0.538,50,93,0.517,15,...,47,13,34,32,11,3,12,24,126,GSW
57339,NOP,2018-04-07,1,0,GSW120-126NOP,0.563,49,87,0.407,11,...,34,4,30,39,15,6,8,16,126,GSW
57340,PHX,2018-04-08,0,1,GSW117-100PHX,0.424,39,92,0.286,10,...,44,8,36,24,4,0,13,16,100,GSW
57341,UTA,2018-04-10,1,1,GSW79-119UTA,0.533,48,90,0.371,13,...,54,12,42,25,10,6,15,20,119,GSW


In [19]:
df_season_team_n, df_season_oppo_n, invalid_idx = checkGameValidity(df_season_team, df_season_oppo)

Team length =  57342
Oppo length =  57342
Number of invalid games = 252 @ [1931, 1968, 1976, 1981, 1995, 1998, 2006, 2037, 4119, 4166, 4194, 4227, 6230, 6294, 6301, 6341, 6400, 8457, 8501, 8529, 8585, 9576, 9594, 9671, 9678, 11759, 11765, 11844, 11858, 13912, 13964, 14008, 14010, 16117, 16121, 16147, 16174, 18100, 18240, 18290, 18341, 18357, 18870, 18892, 18950, 18960, 21015, 21020, 21094, 21128, 23200, 23235, 23288, 23292, 25365, 25384, 25449, 25481, 27287, 27314, 27332, 27384, 28736, 28758, 28774, 28778, 28802, 28809, 28850, 30757, 30768, 30790, 30806, 30835, 30840, 30889, 32781, 32798, 32816, 32819, 32863, 32865, 32896, 32928, 33103, 33138, 33147, 33171, 33212, 33224, 33232, 34973, 34982, 35009, 35042, 35046, 35064, 35094, 35109, 36971, 37045, 37055, 37108, 37114, 37120, 37121, 37152, 37172, 39079, 39143, 39145, 39157, 39166, 39185, 39197, 39220, 39247, 40948, 40966, 40970, 40992, 41036, 41051, 41074, 41089, 43025, 43035, 43045, 43095, 43124, 43131, 43152, 45071, 45099, 45109, 45130

In [20]:
df_season_team_n['Date'].count()

57090

In [21]:
df_season_oppo_n['Date'].count()

57090

In [22]:
df_season_team.iloc[invalid_idx, :]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
1931,CHI,2014-12-03,1,0,NOP95-102CHI,0.463,37,80,0.429,12,...,49,4,45,27,4,1,11,18,102,NOP
1968,CHI,2015-02-25,0,1,NOP98-86CHI,0.442,34,77,0.176,3,...,43,9,34,24,6,11,12,17,86,NOP
1976,CHI,2015-03-13,0,0,NOP101-91CHI,0.378,31,82,0.469,15,...,36,4,32,18,4,6,10,25,91,NOP
1981,CHI,2015-03-23,1,1,NOP86-98CHI,0.451,37,82,0.188,3,...,46,6,40,21,5,7,6,20,98,NOP
1995,CHI,2015-11-03,0,0,NOP130-105CHI,0.489,44,90,0.467,14,...,33,4,29,26,5,0,12,19,105,NOP
1998,CHI,2015-11-13,1,1,NOP97-102CHI,0.440,37,84,0.250,4,...,57,10,47,27,4,6,11,16,102,NOP
2006,CHI,2015-12-05,0,1,NOP102-96CHI,0.416,37,89,0.360,9,...,52,12,40,29,5,5,14,19,96,NOP
2037,CHI,2016-02-08,0,0,NOP108-91CHI,0.389,35,90,0.400,10,...,41,8,33,25,5,8,7,22,91,NOP
4119,HOU,2014-12-31,1,1,NOP83-102HOU,0.432,38,88,0.395,17,...,41,10,31,22,10,7,14,25,102,NOP
4166,HOU,2015-04-13,1,0,NOP90-100HOU,0.412,35,85,0.257,9,...,49,10,39,19,9,10,10,18,100,NOP


In [23]:
df_season_oppo.iloc[invalid_idx, :]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
1931,CHA,2014-12-03,0,1,CHI102-95NOP,0.400,36,90,0.133,2,...,42,7,35,17,5,3,5,21,95,CHI
1968,CHA,2015-02-25,1,0,CHI86-98NOP,0.464,39,84,0.286,4,...,45,12,33,16,8,3,9,18,98,CHI
1976,CHA,2015-03-13,1,1,CHI91-101NOP,0.371,33,89,0.368,7,...,55,17,38,15,4,7,12,17,101,CHI
1981,CHA,2015-03-23,0,0,CHI98-86NOP,0.346,28,81,0.400,8,...,51,11,40,11,2,8,10,25,86,CHI
1995,CHA,2015-11-03,1,1,CHI105-130NOP,0.516,47,91,0.609,14,...,52,12,40,26,6,4,9,11,130,CHI
1998,CHA,2015-11-13,0,0,CHI102-97NOP,0.379,36,95,0.455,15,...,45,9,36,21,9,4,6,21,97,CHI
2006,CHA,2015-12-05,1,0,CHI96-102NOP,0.416,37,89,0.294,10,...,50,13,37,23,10,3,9,18,102,CHI
2037,CHA,2016-02-08,1,1,CHI91-108NOP,0.438,35,80,0.389,14,...,56,10,46,20,6,8,13,19,108,CHI
4119,CHA,2014-12-31,0,0,HOU102-83NOP,0.382,29,76,0.167,3,...,41,7,34,15,6,9,16,17,83,HOU
4166,CHA,2015-04-13,0,1,HOU100-90NOP,0.410,34,83,0.406,13,...,43,7,36,23,8,5,15,17,90,HOU


In [24]:
df_season_team_n.tail(20)

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
57070,GSW,2018-03-02,1,0,ATL109-114GSW,0.532,42,79,0.355,11,...,35,2,33,33,12,5,18,20,114,ATL
57071,GSW,2018-03-06,1,1,BKN101-114GSW,0.566,47,83,0.478,11,...,41,5,36,30,8,7,17,18,114,BKN
57072,GSW,2018-03-08,1,1,SAS107-110GSW,0.432,41,95,0.267,8,...,49,12,37,30,6,12,10,18,110,SAS
57073,GSW,2018-03-09,0,0,POR125-108GSW,0.487,38,78,0.429,12,...,33,6,27,20,5,3,10,20,108,POR
57074,GSW,2018-03-11,0,0,MIN109-103GSW,0.407,37,91,0.278,10,...,50,12,38,22,6,8,13,17,103,MIN
57075,GSW,2018-03-14,1,1,LAL106-117GSW,0.55,44,80,0.3,6,...,50,11,39,25,8,6,22,24,117,LAL
57076,GSW,2018-03-16,0,1,SAC98-93GSW,0.42,34,81,0.462,12,...,40,9,31,21,10,5,11,21,93,SAC
57077,GSW,2018-03-17,1,0,PHX109-124GSW,0.526,50,95,0.419,13,...,43,9,34,29,8,9,13,21,124,PHX
57078,GSW,2018-03-19,0,0,SAS89-75GSW,0.405,30,74,0.158,3,...,36,2,34,17,7,9,15,24,75,SAS
57079,GSW,2018-03-23,1,1,ATL94-106GSW,0.452,42,93,0.4,12,...,46,10,36,28,8,6,7,17,106,ATL


In [25]:
df_season_oppo_n.tail(20)

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
57070,ATL,2018-03-02,0,1,GSW114-109ATL,0.442,38,86,0.429,15,...,35,3,32,21,10,2,19,21,109,GSW
57071,BKN,2018-03-06,0,0,GSW114-101BKN,0.424,36,85,0.265,9,...,38,10,28,24,7,1,12,18,101,GSW
57072,SAS,2018-03-08,0,0,GSW110-107SAS,0.453,43,95,0.273,6,...,45,10,35,29,7,3,12,20,107,GSW
57073,POR,2018-03-09,1,1,GSW108-125POR,0.494,44,89,0.41,16,...,46,14,32,21,4,2,11,17,125,GSW
57074,MIN,2018-03-11,1,1,GSW103-109MIN,0.463,44,95,0.3,6,...,50,13,37,27,7,3,9,15,109,GSW
57075,LAL,2018-03-14,0,0,GSW117-106LAL,0.458,38,83,0.324,12,...,33,6,27,25,12,7,14,19,106,GSW
57076,SAC,2018-03-16,1,0,GSW93-98SAC,0.434,36,83,0.452,14,...,47,11,36,25,7,4,13,16,98,GSW
57077,PHX,2018-03-17,0,1,GSW124-109PHX,0.476,39,82,0.458,11,...,42,9,33,22,8,5,19,19,109,GSW
57078,SAS,2018-03-19,1,1,GSW75-89SAS,0.402,33,82,0.348,8,...,48,11,37,21,6,5,14,20,89,GSW
57079,ATL,2018-03-23,0,0,GSW106-94ATL,0.44,37,84,0.344,11,...,44,5,39,24,5,5,11,14,94,GSW


## My season checker

In [27]:
season_sel = season.loc[lambda df: (df.Date > '2010-08-01') & (df.Date < '2011-08-01'), :].reset_index(drop=True)

In [28]:
season_sel

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
0,CHI,2010-10-27,0,0,OKC106-95CHI,0.430,40,93,0.143,2,...,51,15,36,21,6,6,14,28,95,OKC
1,CHI,2010-10-30,1,1,DET91-101CHI,0.430,37,86,0.278,5,...,51,14,37,17,11,10,15,26,101,DET
2,CHI,2010-11-01,1,1,POR98-110CHI,0.606,43,71,0.385,5,...,36,9,27,27,4,4,15,33,110,POR
3,CHI,2010-11-04,0,1,NYK120-112CHI,0.519,42,81,0.474,9,...,42,12,30,27,8,6,20,21,112,NYK
4,CHI,2010-11-05,0,0,BOS110-105CHI,0.494,42,85,0.400,4,...,43,7,36,25,7,2,19,23,105,BOS
5,CHI,2010-11-08,1,1,DEN92-94CHI,0.415,34,82,0.385,5,...,47,8,39,20,6,12,15,19,94,DEN
6,CHI,2010-11-11,1,1,GSW90-120CHI,0.548,46,84,0.368,7,...,46,11,35,29,14,10,12,7,120,GSW
7,CHI,2010-11-13,1,1,WAS96-103CHI,0.447,34,76,0.391,9,...,40,12,28,24,9,10,14,14,103,WAS
8,CHI,2010-11-16,1,0,HOU92-95CHI,0.486,36,74,0.667,8,...,40,14,26,18,12,4,19,24,95,HOU
9,CHI,2010-11-17,0,0,SAS103-94CHI,0.424,39,92,0.353,6,...,42,12,30,17,8,3,11,21,94,SAS


In [29]:
df_season_sel_team, df_season_sel_oppo, invalid_idx = pairGamePlayers(season_sel)

Duplicate found = 0
Opponent not found = 160
Team length =  2140
Oppo length =  2140
Execution time = 15.10933804512024


In [30]:
# Check the missing opponents
season_sel.iloc[invalid_idx, :]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
29,CHI,2010-12-31,1,1,BKN81-90CHI,0.395,30,76,0.273,6,...,43,11,32,21,5,4,11,17,90,BKN
32,CHI,2011-01-05,0,0,BKN96-94CHI,0.442,34,77,0.316,6,...,36,10,26,17,4,0,11,22,94,BKN
64,CHI,2011-03-17,1,0,BKN73-84CHI,0.407,33,81,0.200,4,...,50,17,33,19,6,6,11,18,84,BKN
78,CHI,2011-04-13,1,1,BKN92-97CHI,0.430,34,79,0.333,7,...,45,15,30,23,3,9,13,20,97,BKN
137,HOU,2011-02-26,1,1,BKN108-123HOU,0.512,42,82,0.444,8,...,46,11,35,33,8,2,12,26,123,BKN
151,HOU,2011-03-29,1,0,BKN87-112HOU,0.465,46,99,0.478,11,...,56,16,40,30,7,2,9,8,112,BKN
211,SAS,2011-02-14,1,0,BKN85-102SAS,0.450,36,80,0.438,7,...,50,12,38,21,0,4,13,20,102,BKN
214,SAS,2011-02-25,1,1,BKN96-106SAS,0.463,38,82,0.421,8,...,42,7,35,22,5,3,13,19,106,BKN
259,DAL,2010-12-09,1,1,BKN89-102DAL,0.547,41,75,0.250,4,...,38,5,33,31,11,4,15,17,102,BKN
280,DAL,2011-01-22,1,0,BKN86-87DAL,0.347,26,75,0.381,8,...,39,11,28,18,11,0,14,24,87,BKN


In [31]:
df_season_sel_team_n, df_season_sel_oppo_n, invalid_idx_ = checkGameValidity(df_season_sel_team, df_season_sel_oppo)

Team length =  2140
Oppo length =  2140
Number of invalid games = 0 @ []
Execution time = 2.3190550804138184


## Note: Some Manipulation

In [32]:
playoff.loc[lambda df: df.Date == '1986-04-17', :]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
0,CHI,1986-04-17,0,0,BOS123-104CHI,0.488,42,86,0.5,1,...,30,11,19,17,8,2,17,30,104,BOS
245,HOU,1986-04-17,1,1,SAC87-107HOU,0.516,49,95,0.0,0,...,50,21,29,31,8,8,11,24,107,SAC
423,SAS,1986-04-17,0,0,LAL135-88SAS,0.452,38,84,0.0,0,...,18,5,13,27,13,5,24,21,88,LAL
1187,SAC,1986-04-17,0,0,HOU107-87SAC,0.378,34,90,0.5,1,...,51,27,24,13,5,3,15,17,87,HOU
1428,LAL,1986-04-17,1,1,SAS88-135LAL,0.663,57,86,0.25,1,...,47,11,36,38,15,8,23,22,135,SAS
2419,BOS,1986-04-17,1,1,CHI104-123BOS,0.595,44,74,0.6,3,...,41,9,32,29,7,6,16,21,123,CHI
3192,ATL,1986-04-17,1,1,DET122-140ATL,0.56,51,91,0.5,1,...,42,13,29,36,6,6,13,23,140,DET
3911,DET,1986-04-17,0,0,ATL140-122DET,0.48,48,100,0.0,0,...,48,18,30,33,5,1,16,30,122,ATL


In [33]:
playoff.loc[lambda df: df.Date == '1986-04-17', :].loc[lambda df: df.Opponent == 'BOS', :]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
0,CHI,1986-04-17,0,0,BOS123-104CHI,0.488,42,86,0.5,1,...,30,11,19,17,8,2,17,30,104,BOS


In [34]:
# Declare empty dataframe w/ columns from existing dataframe
col = list(playoff)
df_temp = pd.DataFrame(columns = col)
df_temp = df_temp.append(playoff.loc[lambda df: df.Date == '1986-04-17', :].loc[lambda df: df.Opponent == 'BOS', :])
df_temp.head()

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
0,CHI,1986-04-17,0,0,BOS123-104CHI,0.488,42,86,0.5,1,...,30,11,19,17,8,2,17,30,104,BOS


In [35]:
playoff.iloc[:]

Unnamed: 0,Team,Date,W/L,Home/Away,Score,FG%,FGM,FGA,3P%,3PM,...,REB,OREB,DREB,AST,STL,BLK,TOV,PF,PTS,Opponent
0,CHI,1986-04-17,0,0,BOS123-104CHI,0.488,42,86,0.500,1,...,30,11,19,17,8,2,17,30,104,BOS
1,CHI,1986-04-20,0,0,BOS135-131CHI,0.461,47,102,0.000,0,...,44,14,30,15,9,4,11,34,131,BOS
2,CHI,1986-04-22,0,1,BOS122-104CHI,0.438,39,89,0.250,1,...,45,16,29,19,7,5,15,28,104,BOS
3,CHI,1987-04-23,0,0,BOS108-104CHI,0.487,38,78,0.333,1,...,30,8,22,26,5,3,9,22,104,BOS
4,CHI,1987-04-26,0,0,BOS105-96CHI,0.442,34,77,0.571,4,...,38,12,26,14,1,4,15,25,96,BOS
5,CHI,1987-04-28,0,1,BOS105-94CHI,0.386,34,88,0.333,2,...,46,19,27,24,11,6,10,29,94,BOS
6,CHI,1988-04-28,1,1,CLE93-104CHI,0.467,43,92,0.000,0,...,50,18,32,21,9,6,16,24,104,CLE
7,CHI,1988-05-01,1,1,CLE101-106CHI,0.443,47,106,0.000,0,...,57,24,33,30,7,2,10,20,106,CLE
8,CHI,1988-05-03,0,0,CLE110-102CHI,0.464,39,84,0.000,0,...,29,12,17,24,6,4,8,28,102,CLE
9,CHI,1988-05-05,0,0,CLE97-91CHI,0.453,39,86,0.000,0,...,41,13,28,19,9,5,13,26,91,CLE
