In [1]:
import pandas as pd
import os
import json
from datetime import datetime, date
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from roc_curve import ROC_Curve
import statsmodels.api as sm

%matplotlib inline

In [2]:
# function to extract information from the json files
def json_extract(file):
    with open('data/{}'.format(file)) as f:
        data = json.load(f)
        cols = data['resultSets'][0]['headers']
        vals = data['resultSets'][0]['rowSet']
    return cols, vals


# takes a list of df's and remove all rows with nan's
def drop_na(df_lst):
    map(lambda x: x.dropna(inplace=True), df_lst)
    return None


# function to create dataframe from the json information
# keyword = gamelog, season_stats, or heights_weights
def create_df(keyword, add_year=False):

    fns = os.listdir('data/')

    cols = json_extract('2013_{}.json'.format(keyword))[0]
    if add_year:
        cols += ['YEAR']
    df = pd.DataFrame(columns=cols)

    for fn in fns:
        if keyword in fn:
            temp_cols, temp_vals = json_extract(fn)
            df_temp = pd.DataFrame(temp_vals, columns=temp_cols)
            if add_year:
                df_temp['YEAR'] = int(fn[0:4])
            df = df.append(df_temp)
            del df_temp, temp_cols, temp_vals
    return df


def parse_date(df, date_col, create_sep_cols=True):

    df[date_col] = pd.to_datetime(df[date_col])
    if create_sep_cols:
        date = df[date_col]
        df['YEAR'] = date.apply(lambda x: x.year)
        df['MONTH'] = date.apply(lambda x: x.month)
        df['DAY'] = date.apply(lambda x: x.day)

    return df

# preprocess injury df
def prep_injury(df):

    drop_vars = ['Unnamed: 0', 'Team']
    df.drop(drop_vars, axis=1, inplace=True)

    # converting the Date column to datetime objects
#     df = parse_date(df, 'Date')

    # filter out all events not directly related to basketball
    df = df[(~df['Notes'].str.contains('flu')) &
            (~df['Notes'].str.contains('rest')) &
            (~df['Notes'].str.contains('jail')) &
            (~df['Notes'].str.contains('ill')) &
            (~df['Notes'].str.contains('asthma')) &
            (~df['Notes'].str.contains('virus')) &
            (~df['Notes'].str.contains('return')) &
            (~df['Notes'].str.contains('pneumonia')) &
            (~df['Notes'].str.contains('coach')) &
            (~df['Notes'].str.contains('sister')) &
            (~df['Notes'].str.contains('Fined')) &
            (~df['Notes'].str.contains('flu')) &
            (~df['Notes'].str.contains('GM')) &
            (~df['Notes'].str.contains('flu')) &
            (~df['Notes'].str.contains('team')) &
            (~df['Notes'].str.contains('canal')) &
            (~df['Notes'].str.contains('food')) &
            (~df['Notes'].str.contains('virus')) &
            (~df['Notes'].str.contains('wife')) &
            (~df['Notes'].str.contains('asthma')) &
            (~df['Notes'].str.contains('chin')) &
            (~df['Notes'].str.contains('headache')) &
            (~df['Notes'].str.contains('anemia')) &
            (~df['Notes'].str.contains('dizziness')) &
            (~df['Notes'].str.contains('cold')) &
            (~df['Notes'].str.contains('throat')) &
            (~df['Notes'].str.contains('molar')) &
            (~df['Notes'].str.contains('dizziness')) &
            (~df['Notes'].str.contains('rash')) &
            (~df['Notes'].str.contains('stomach ache')) &
            (~df['Notes'].str.contains('bronchitis')) &
            (~df['Notes'].str.contains('concussion')) &
            (~df['Notes'].str.contains('recover')) &
            (~df['Notes'].str.contains('mump'))]

    # removing characters like (a) and (b)
    df['Player'] = df['Player'].apply(lambda x: ' '.join(x.split()[:2]) if re.match(r'.+\(.+\)', x) else x)

    # stripping blank spaces from player names
    df['Player'] = df['Player'].apply(lambda x: x.strip())

    nba_players = ss_df['PLAYER_NAME'].unique()
    for player in nba_players:
        df['Player'] = df['Player'].apply(lambda x: player if player in x else x)

    df['Player'][21205] = 'Tony Parker'

    return df

# preprocess
def prep_gamelog(df):

    # converting the Date column to datetime objects
    df = parse_date(df, 'GAME_DATE')

    drop_vars = ['SEASON_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
                 'MATCHUP', 'WL', 'FG_PCT', 'FG3_PCT', 'FT_PCT',
                 'VIDEO_AVAILABLE']
    df.drop(drop_vars, axis=1, inplace=True)

    return df


# merge the season_stats and heights_weights df
def merge(ss_df, hw_df):
    # drop all nan's rows
    drop_na([ss_df, hw_df])

    # heights/weights data has duplicate players for a specific year for players
    # who were traded.  Dropping duplicates
    hw_df = hw_df.drop_duplicates(['PLAYER_ID', 'YEAR'])

    # merging heights/weights and season stats on the player id and the year
    merged_df = ss_df.merge(hw_df, left_on=['PLAYER_ID', 'YEAR'],
                                   right_on=['PLAYER_ID','YEAR'])

    return merged_df

def preprocess(gl_df, injury_df, span):

    injury_df = prep_injury(injury_df)
    gl_df = prep_gamelog(gl_df)
    
    injury_subset = injury_df[injury_df['YEAR'] == span]
    gl_subset = gl_df[gl_df['YEAR'] == span]

    # remove players
#     df.drop_duplicates('Player', inplace=True)
    
#     df = df.groupby('PLAYER_NAME').mean()
#     df.reset_index(inplace=True)
    
    
    Xy = injury_subset.merge(gl_subset, left_on=['YEAR', 'Player'],
                                        right_on=['YEAR', 'PLAYER_NAME'],
                                        how='outer')

    # drop players who did not player in Jan 2014
    Xy = Xy[Xy['PLAYER_NAME'].notnull() == True]

    Xy['Injured'] = Xy['Notes'].notnull()

    # Xy_drop_cols = ['Date', 'Team', 'Player', 'Notes', 'PLAYER_NAME',
    #                 'PLAYER_ID', 'YEAR', 'MONTH', 'DAY']
    # Xy.drop(Xy_drop_cols, axis=1, inplace=True)

    y = Xy['Injured']
    X  = Xy.drop('Injured', axis=1)

    return X, y

def start_end_season(keyword, add_year=False):

    fns = os.listdir('data/')

    cols = json_extract('2013_{}.json'.format(keyword))[0]
    if add_year:
        cols += ['YEAR']
    df = pd.DataFrame(columns=cols)

    season_range = []
    for fn in fns:
        if keyword in fn:
            cols, vals = json_extract(fn)
            df = pd.DataFrame(vals, columns=cols)
            season_range.append((df['GAME_DATE'].min(),
                                 df['GAME_DATE'].max()))
            
    return season_range

In [3]:
ss_df = create_df('season_stats', add_year=True)
hw_df = create_df('heights_weights', add_year=True)
ss_hw_df = merge(ss_df, hw_df)
injury_df = pd.read_csv('data/injuries.csv')
injury_df = prep_injury(injury_df)
gamelog_df = create_df('gamelog')
gamelog_df = prep_gamelog(gamelog_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
# creating dataframe of everyday in a season
season_dt_range = start_end_season('gamelog')

seasons = pd.DataFrame(columns=['Date'])
for range in season_dt_range:
    tmp = pd.DataFrame(pd.date_range(range[0], range[1], freq='D'), columns=['Date'])
    seasons = seasons.append(tmp)

In [5]:
injury_df = parse_date(injury_df, 'Date', False)

In [6]:
reg_season_injuries = seasons.merge(injury_df, on='Date', how='left')

In [7]:
reg_season_injuries = parse_date(reg_season_injuries, 'Date')

In [8]:
reg_season_injuries_2000 = reg_season_injuries[reg_season_injuries['YEAR'] == 2000]

In [9]:
reg_season_injuries_2000.reset_index(inplace=True, drop=True)

In [10]:
reg_season_gamelog = seasons.merge(gamelog_df, left_on='Date', right_on='GAME_DATE', how='left')

In [11]:
reg_season_gamelog_2000 = reg_season_gamelog[reg_season_gamelog['YEAR'] == 2000]

In [12]:
min_mpg = 20.0
reg_players = ss_df[(ss_df['YEAR'] == 2000.0) & (ss_df['MIN'] > min_mpg)]['PLAYER_NAME']

# keeping only players who play for more than 20 mins
reg_season_gamelog_2000 = reg_season_gamelog_2000[reg_season_gamelog_2000['PLAYER_NAME'].isin(reg_players)]

In [14]:
reg_season_injuries_2000['adjusted_date'] = reg_season_injuries_2000['Date'].apply(lambda x: x - pd.DateOffset(days=5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
reg_season_gamelog_2000.reset_index(drop=True, inplace=True)

In [276]:
feat_matrix = pd.DataFrame()

# prediction window
pred_window = 14

for i in xrange(500, 2000, 20):
    player = reg_season_gamelog_2000['PLAYER_NAME'].loc[i]
    player_id = reg_season_gamelog_2000['PLAYER_ID'].loc[i]
    start_date = reg_season_gamelog_2000['GAME_DATE'].loc[i]
    end_date = start_date + pd.DateOffset(days=pred_window)
    mask = (reg_season_gamelog_2000['GAME_DATE'] >= start_date) &\
           (reg_season_gamelog_2000['GAME_DATE'] <= end_date) &\
           (reg_season_gamelog_2000['PLAYER_NAME'] == player)
#     grouped_cnt = mask.sum()
    tmp = reg_season_gamelog_2000[mask]
    
    col_func = {}
    col_names = reg_season_gamelog_2000.columns.tolist()
    col_names.remove('PLAYER_ID')
    col_names
    for col in col_names:
        if col not in ['DAY', 'Date', 'GAME_DATE', 'PLAYER_NAME']:
            col_func[col] = ['mean']
        else:
            col_func[col] = ['min']
    
    col_func = lambda x: x.mean() if isinstance(x, float) else x.min()
    feat_matrix = feat_matrix.append(tmp.groupby(['PLAYER_ID']).agg(col_func))

In [279]:
feat_matrix.reset_index(inplace=True)

In [301]:
Xy = feat_matrix.merge(reg_season_injuries_2000, left_on=['GAME_DATE', 'PLAYER_NAME'],
                                                 right_on=['Date', 'Player'], how='outer')

In [311]:
Xy['Injured'] = Xy['Notes'].notnull()
Xy[['GAME_DATE', 'Date_y', 'adjusted_date', 'Player', 'PLAYER_NAME', 'Injured']]
Xy.groupby(['GAME_DATE', 'PLAYER_ID']).mean().reset_index()

Unnamed: 0,GAME_DATE,PLAYER_ID,MIN,FGM,FGA,FG3M,FG3A,FTM,FTA,OREB,...,PTS,PLUS_MINUS,YEAR_x,MONTH_x,DAY_x,index,YEAR_y,MONTH_y,DAY_y,Injured
0,2000-11-04,45.0,19.0,2.0,5.0,0.0,1.0,0.0,0.0,0.0,...,4.0,-15.0,2000.0,11.0,4.0,,,,,False
1,2000-11-04,349.0,29.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,...,0.0,-5.0,2000.0,11.0,4.0,,,,,False
2,2000-11-04,951.0,30.0,3.0,14.0,0.0,6.0,0.0,0.0,0.0,...,12.0,-15.0,2000.0,11.0,4.0,,,,,False
3,2000-11-04,970.0,20.0,3.0,6.0,0.0,0.0,0.0,0.0,1.0,...,10.0,-12.0,2000.0,11.0,4.0,,,,,False
4,2000-11-04,1005.0,20.0,2.0,6.0,0.0,2.0,0.0,0.0,0.0,...,4.0,-18.0,2000.0,11.0,4.0,,,,,False
5,2000-11-04,1501.0,22.0,3.0,6.0,0.0,1.0,0.0,0.0,0.0,...,10.0,-9.0,2000.0,11.0,4.0,,,,,False
6,2000-11-05,1838.0,17.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,...,1.0,-23.0,2000.0,11.0,5.0,,,,,False
7,2000-11-06,3.0,13.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,-9.0,2000.0,11.0,6.0,,,,,False
8,2000-11-06,1722.0,20.0,4.0,8.0,0.0,1.0,0.0,0.0,0.0,...,9.0,-14.0,2000.0,11.0,6.0,,,,,False
9,2000-11-06,1740.0,31.0,3.0,9.0,2.0,5.0,0.0,0.0,1.0,...,12.0,-20.0,2000.0,11.0,6.0,,,,,False


In [18]:
reg_season_injuries_2000.drop(reg_season_injuries_2000.index[drop_rows])

In [19]:
feat_matrix.reset_index(inplace=True)
reg_season_injuries_2000.reset_index(inplace=True)

In [20]:
Xy = pd.concat([feat_matrix, reg_season_injuries_2000], axis=1)