In [1]:
import pickle
import pandas as pd

In [2]:
from datetime import datetime

In [3]:
with open('../Data/allplayerFantasyGameLogs.pickle', 'rb') as handle:
  allplayerFantasyGameLogs = pickle.load(handle)

In [4]:
allplayerFantasyGameLogs.set_index('GAME_DATE', inplace=True)

In [5]:
allplayerFantasyGameLogs.columns

Index([   u'SEASON_ID',    u'Player_ID',      u'Game_ID',      u'MATCHUP',
                 u'WL',          u'MIN',          u'FGM',          u'FGA',
             u'FG_PCT',         u'FG3M',         u'FG3A',      u'FG3_PCT',
                u'FTM',          u'FTA',       u'FT_PCT',         u'OREB',
               u'DREB',          u'REB',          u'AST',          u'STL',
                u'BLK',          u'TOV',           u'PF',          u'PTS',
         u'PLUS_MINUS',        u'DouBL',        u'TriBL',       u'FanPTs',
           u'fullName',    u'position1',         u'Team', u'OpponentTeam',
           u'HomeGame'],
      dtype='object')

##Data Integration

We will split the allplayerGameLogs into training and test sets. And we need to use two helpful functions to extract player-level and team-level features to train the prediction model for fantasy points from a player. We also need another helpful function to aggregate corresponding information from the test set for the testing purpose.

In [56]:
def aggr(group):
    test_df = pd.DataFrame()    
    test_df['LastFanPTs'] = group['FanPTs'][-1:]
    test_df['AvgFanPTs'] = group['FanPTs'].mean()
    test_df['AvgPTS'] = group['PTS'].mean()
    test_df['LastPT'] = group['PTS'][-1:]
    test_df['AvgMIN'] = group['MIN'].mean()
    test_df['LastMIN'] = group['MIN'][-1:]
    test_df['AvgFGM'] = group['FGM'].mean()
    test_df['LastFGM'] = group['FGM'][-1:]
    test_df['AvgFGA'] = group['FGA'].mean()
    test_df['LastFGA'] = group['FGA'][-1:]
    test_df['AvgFG3M'] = group['FG3M'].mean()
    test_df['LastFG3M'] = group['FG3M'][-1:]
    test_df['AvgFG3A'] = group['FG3A'].mean()
    test_df['LastFG3A'] = group['FG3A'][-1:]
    test_df['AvgREB'] = group['REB'].mean()
    test_df['LastREB'] = group['REB'][-1:]
    test_df['AvgAST'] = group['AST'].mean()
    test_df['LastAST'] = group['AST'][-1:]
    test_df['AvgSTL'] = group['STL'].mean()
    test_df['AvgTOV'] = group['TOV'].mean() 
    test_df['LastTOV'] = group['TOV'][-1:]
    test_df['AvgPF'] = group['PF'].mean()
    test_df['LastPF'] = group['PF'][-1:]
    test_df['AvgPLUS_MINUS'] = group['PLUS_MINUS'].mean()
    test_df['LastPLUS_MINUS'] = group['PLUS_MINUS'][-1:]
    #group['NumDouBL'] = group['DouBL'].sum()
    #group['NumTriBL'] = group['TriBL'].sum()

    test_df['Last3GameAvgFanPTs'] = group['FanPTs'][-3:].mean()
    test_df['Last3GameAvgMIN'] = group['MIN'][-3:].mean()
    test_df['Last3GameAvgPTS'] = group['PTS'][-3:].mean()
    
    num_team = len(group['Team'].unique())
    if(num_team==1):
        test_df['fullName'] = group['fullName'].unique()
        test_df['Player_ID'] = group['Player_ID'].unique()
        test_df['Team'] = group['Team'].unique()[0]
        test_df['position1'] = group['position1'].unique()[0]
    else:
        test_df['fullName'] = group['fullName'].unique()
        test_df['Player_ID'] = group['Player_ID'].unique()
        test_df['Team'] = group['Team'].unique()[num_team-1]
        test_df['position1'] = group['position1'].unique()       
    
    return(test_df)
    
def aggr_stats(date,allplayerFantasyGameLogs):
    interest_columns = ['fullName','Player_ID','Team','position1','MIN','PTS','FGM','FGA', 'FG3M','FG3A', \
                        'REB','AST','STL','TOV','PF','PLUS_MINUS','DouBL','TriBL','FanPTs']
    tmp = allplayerFantasyGameLogs.ix['2015-10-27':date]
    
    playerID_tmp = tmp.reset_index().copy()
    tmp.grouped = playerID_tmp[interest_columns].groupby('Player_ID')
    Newdf = pd.DataFrame()
    ids = playerID_tmp['Player_ID'].unique()
    
    for id in ids:
        group = tmp.grouped.get_group(id)
        df = aggr(group)
        Newdf = pd.concat([Newdf,df],axis=0)
    
    bins = [-10, 10, 20, 30, 40, 100]
    group_names = ['benchPlayer','belowAvg','average','advanced','top']
    Newdf['Rank']= pd.cut(Newdf['AvgFanPTs'],bins,labels=group_names)
    
    return(Newdf)

In [57]:
def aggr_teamVSteam(group):
        group['TeamStdVSFanPTs'] = group['FanPTs'].std()
        group['TeamAvgVSFanPTs'] = group['FanPTs'].mean()
        group['TeamMaxVSFanPTs'] = group['FanPTs'].max()
        return group

def aggr_team(group):
        group['TeamStdFanPTs'] = group['TeamStdVSFanPTs'].mean()
        group['TeamAvgFanPTs'] = group['TeamAvgVSFanPTs'].mean()
        group['TeamMaxFanPTs'] = group['TeamMaxVSFanPTs'].mean()
        return group    

def generate_team_features(playerGameLogs, playerFeatureTable, date):
    tmp = playerGameLogs['2015-10-27': date]
    tmp = tmp.reset_index()
    bad_players = playerFeatureTable[playerFeatureTable.Rank=='benchPlayer']['Player_ID']
    interest_cols = ['fullName','Player_ID','Team','OpponentTeam','position1','FanPTs','MIN']
    tmp = tmp[interest_cols]
    tmp = tmp[~tmp['Player_ID'].isin(bad_players)]
    
    newdf = tmp.copy()
    newdf_grouped = newdf.groupby(['Team','OpponentTeam'])
        
    Newdf = newdf_grouped.apply(aggr_teamVSteam)
    Newdf.drop(['fullName','Player_ID','MIN','FanPTs','position1'],inplace=True,axis=1)
    Newdf.drop_duplicates(['Team','OpponentTeam'],inplace=True)
    
    Newdf.drop('OpponentTeam',axis=1,inplace=True)
    
    Newdf2 = Newdf.copy()
    Newdf2_grouped = Newdf2.groupby('Team')
    
    Newdf_overall = Newdf2_grouped.apply(aggr_team)
    Newdf_overall.drop(['TeamStdVSFanPTs','TeamAvgVSFanPTs','TeamMaxVSFanPTs'],inplace=True,axis=1)
    Newdf_overall.drop_duplicates('Team',inplace=True)
    
    return(Newdf_overall)

In [58]:
def drop_y(df):
    # list comprehension of the cols that end with '_y'
    to_drop = [x for x in df if x.endswith('_y')]
    df.drop(to_drop, axis=1, inplace=True)

def rename_x(df):
    for col in df:
        if col.endswith('_x'):
            df.rename(columns={col:col.rstrip('_x')}, inplace=True)

In [59]:
def get_train_test(train_date, test_date): #format like'2/10/2016'
    train_date_index = pd.date_range(start='11/10/2015', end=train_date, freq='D')
    train_df = pd.DataFrame()
    
    alldates = allplayerFantasyGameLogs.index
    trydates = pd.date_range(start='10/27/2015', end='2/28/2016', freq='D')
    s = set(alldates)
    nodates = [x for x in trydates if x not in s]
    
    for idx in train_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            trainLogs = allplayerFantasyGameLogs.ix['2015-10-27':idx]
            train_player_df = aggr_stats(idx,trainLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = allplayerFantasyGameLogs[['fullName', 'Player_ID','Team','OpponentTeam','HomeGame','FanPTs']].ix[next_date]
            tmpLogs.rename(columns={'FanPTs':'NewGameFanPTs'},inplace=True)
            #join the tmpLogs and player festure table by Player_ID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='Player_ID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            train_team_df = generate_team_features(allplayerFantasyGameLogs, train_player_df, idx)
            newgame_df = pd.merge(newgame_df,train_team_df,how='left',on='Team')
            train_df = pd.concat([train_df,newgame_df],axis=0)

    test_date_index = pd.date_range(start=train_date, end=test_date, freq='D')[1:]
    start_test_date = pd.date_range(start=train_date, end=test_date, freq='D')[0]
    test_df = pd.DataFrame()
    for idx in test_date_index:
        tmp_idx = idx+1
        if tmp_idx not in nodates and idx not in nodates:
            #aggregate the statistics from players -> player-level features
            testLogs = allplayerFantasyGameLogs.ix[start_test_date:idx]
            test_player_df = aggr_stats(idx,testLogs)   
            #next we need to collect the player's next game Fantasy Points.
            next_date = idx + 1
            tmpLogs = allplayerFantasyGameLogs[['fullName', 'Player_ID','Team','OpponentTeam','HomeGame','FanPTs']].ix[next_date]
            tmpLogs.rename(columns={'FanPTs':'NewGameFanPTs'},inplace=True)
            #join the tmpLogs and player festure table by Player_ID, which is based on the players on a new game day
            newgame_df = pd.merge(tmpLogs,train_player_df,how='inner',on='Player_ID')
            drop_y(newgame_df)
            rename_x(newgame_df)

            #get the team features table 
            test_team_df = generate_team_features(allplayerFantasyGameLogs, test_player_df, idx)
            newgame_df = pd.merge(newgame_df,test_team_df,how='left',on='Team')
            test_df = pd.concat([test_df,newgame_df],axis=0)                
        
    return(train_df, test_df)

In [60]:
train_set, test_set = get_train_test('2/18/2016', '2/28/2016')

In [61]:
train_set.shape

(14503, 39)

In [62]:
test_set.shape

(1511, 39)

In [66]:
train_set.head(5).transpose()

Unnamed: 0,0,1,2,3,4
fullName,Jamal Crawford,DeAndre Jordan,JJ Hickson,Frank Kaminsky,Mike Conley
Player_ID,2037,201599,201581,1.62616e+06,201144
Team,LAC,LAC,DEN,CHA,MEM
OpponentTeam,DAL,DAL,MIL,NYK,GSW
HomeGame,0,0,1,1,1
NewGameFanPTs,12.5,23.75,19.5,12,24
LastFanPTs,15.25,38.5,44.25,-0.5,32.5
AvgFanPTs,15.6071,36.8214,21.35,6.25,28.5938
AvgPTS,10,10.1429,10,3.14286,13.75
LastPT,13,13,19,0,16


In [63]:
with open('../Data/train_set_02_29.pickle', 'wb') as handle:
  pickle.dump(train_set, handle)

In [64]:
with open('../Data/test_set_02_29.pickle', 'wb') as handle:
  pickle.dump(test_set, handle)

##Prepocessing

In [21]:
with open('../Data/train_set_02_29.pickle', 'rb') as handle:
  train_set = pickle.load(handle)

In [22]:
with open('../Data/test_set_02_29.pickle', 'rb') as handle:
  test_set = pickle.load(handle)

In [67]:
test_set.shape

(1511, 39)

In [73]:
#Combine into data:
train_set['source']= 'train'
test_set['source'] = 'test'
data=pd.concat([train_set, test_set],ignore_index=True)
data.shape

(16014, 40)

####Note that in our problem, we don't care about the benchplayers. So we don't have to train on those data, which might bias our prediction.

In [74]:
data = data[data.Rank!='benchPlayer']

Index([          u'fullName',          u'Player_ID',               u'Team',
             u'OpponentTeam',           u'HomeGame',      u'NewGameFanPTs',
               u'LastFanPTs',          u'AvgFanPTs',             u'AvgPTS',
                   u'LastPT',             u'AvgMIN',            u'LastMIN',
                   u'AvgFGM',            u'LastFGM',             u'AvgFGA',
                  u'LastFGA',            u'AvgFG3M',           u'LastFG3M',
                  u'AvgFG3A',           u'LastFG3A',             u'AvgREB',
                  u'LastREB',             u'AvgAST',            u'LastAST',
                   u'AvgSTL',             u'AvgTOV',            u'LastTOV',
                    u'AvgPF',             u'LastPF',      u'AvgPLUS_MINUS',
           u'LastPLUS_MINUS', u'Last3GameAvgFanPTs',    u'Last3GameAvgMIN',
          u'Last3GameAvgPTS',          u'position1',               u'Rank',
            u'TeamStdFanPTs',      u'TeamAvgFanPTs',      u'TeamMaxFanPTs',
            

###Check Missing values

In [69]:
data.apply(lambda x: sum(x.isnull()))

fullName              0
Player_ID             0
Team                  0
OpponentTeam          0
HomeGame              0
NewGameFanPTs         0
LastFanPTs            0
AvgFanPTs             0
AvgPTS                0
LastPT                0
AvgMIN                0
LastMIN               0
AvgFGM                0
LastFGM               0
AvgFGA                0
LastFGA               0
AvgFG3M               0
LastFG3M              0
AvgFG3A               0
LastFG3A              0
AvgREB                0
LastREB               0
AvgAST                0
LastAST               0
AvgSTL                0
AvgTOV                0
LastTOV               0
AvgPF                 0
LastPF                0
AvgPLUS_MINUS         0
LastPLUS_MINUS        0
Last3GameAvgFanPTs    0
Last3GameAvgMIN       0
Last3GameAvgPTS       0
position1             0
Rank                  0
TeamStdFanPTs         0
TeamAvgFanPTs         0
TeamMaxFanPTs         0
source                0
dtype: int64

###Look at categories of all object variables

In [70]:
var = ['Team','OpponentTeam', 'position1','Rank']
for v in var:
    print '\nFrequency count for variable %s'%v
    print data[v].value_counts()


Frequency count for variable Team
SAS    596
DAL    586
BOS    578
NYK    570
BKN    568
POR    566
MIN    559
ATL    556
UTA    550
PHI    549
GSW    543
OKC    543
DEN    538
LAL    538
ORL    535
LAC    532
IND    531
HOU    527
PHX    526
DET    517
CHA    516
MIL    516
WAS    513
TOR    508
CLE    506
NOP    506
MEM    500
CHI    484
SAC    482
MIA    475
Name: Team, dtype: int64

Frequency count for variable OpponentTeam
PHI    574
LAL    566
BKN    565
BOS    563
DAL    556
DET    554
PHX    554
DEN    552
POR    550
NYK    542
MIN    542
UTA    539
SAS    537
MIL    536
MEM    535
HOU    534
ATL    533
IND    528
CHA    526
ORL    523
NOP    520
WAS    519
CLE    518
OKC    517
LAC    514
SAC    512
GSW    509
MIA    508
TOR    508
CHI    480
Name: OpponentTeam, dtype: int64

Frequency count for variable position1
SG    3517
PF    3338
PG    3325
SF    3028
C     2806
Name: position1, dtype: int64

Frequency count for variable Rank
belowAvg       5991
average        4161
benc

###Numerical Coding and One-Hot Coding

In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Team','OpponentTeam','position1','HomeGame','Rank']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])

In [71]:
var_to_encode = ['Team','OpponentTeam','position1','HomeGame','Rank']
data = pd.get_dummies(data, columns=var_to_encode)
data.columns

Index([        u'fullName',        u'Player_ID',    u'NewGameFanPTs',
             u'LastFanPTs',        u'AvgFanPTs',           u'AvgPTS',
                 u'LastPT',           u'AvgMIN',          u'LastMIN',
                 u'AvgFGM',
       ...
           u'position1_PG',     u'position1_SF',     u'position1_SG',
             u'HomeGame_0',       u'HomeGame_1',    u'Rank_advanced',
           u'Rank_average',    u'Rank_belowAvg', u'Rank_benchPlayer',
               u'Rank_top'],
      dtype='object', length=107)

In [72]:
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
fullName,Jamal Crawford,DeAndre Jordan,JJ Hickson,Frank Kaminsky,Mike Conley
Player_ID,2037,201599,201581,1.62616e+06,201144
NewGameFanPTs,12.5,23.75,19.5,12,24
LastFanPTs,15.25,38.5,44.25,-0.5,32.5
AvgFanPTs,15.6071,36.8214,21.35,6.25,28.5938
AvgPTS,10,10.1429,10,3.14286,13.75
LastPT,13,13,19,0,16
AvgMIN,20.8571,32.1429,18,9.14286,31
LastMIN,20,36,30,3,35
AvgFGM,3.28571,4.14286,4.2,1,4.625


###Separate train & test:

In [30]:
train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']

In [31]:
train.drop('source',axis=1,inplace=True)
test.drop('source',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [32]:
train.to_csv('../Data/train_modified_0229.csv',index=False)
test.to_csv('../Data/test_modified_0229.csv',index=False)

In [42]:
tt =[1,2,3,4,5]

In [43]:
tt[:3]

[1, 2, 3]

In [44]:
tt[:-3]

[1, 2]

In [49]:
allplayerFantasyGameLogs['FanPTs'][-1:]

GAME_DATE
2016-02-29    24
Name: FanPTs, dtype: float64