In [1]:
import pandas as pd
import os
import json
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, auc
from roc_curve import ROC_Curve
from sklearn.svm import SVC
import statsmodels.api as sm

%matplotlib inline

In [None]:
# gl_df['LOCATION'] = gl_2010_df['MATCHUP'].apply(lambda x: re.split(r'@|vs.', x)[1])

In [2]:
# function to extract information from the json files
def json_extract(file):
    with open('data/{}'.format(file)) as f:
        data = json.load(f)
        cols = data['resultSets'][0]['headers']
        vals = data['resultSets'][0]['rowSet']
    return cols, vals

In [3]:
# takes a list of df's and remove all rows with nan's
def drop_na(df_lst):
    map(lambda x: x.dropna(inplace=True), df_lst)

In [4]:
# function to create dataframe from the json information
# keyword = gamelog, season_stats, or heights_weights
def create_df(keyword, add_year=False):

    fns = os.listdir('data/')

    cols = json_extract('2013_{}.json'.format(keyword))[0]
    if add_year:
        cols += ['YEAR']
    df = pd.DataFrame(columns=cols)

    for fn in fns:
        if keyword in fn:
            temp_cols, temp_vals = json_extract(fn)
            df_temp = pd.DataFrame(temp_vals, columns=temp_cols)
            if add_year:
                df_temp['YEAR'] = int(fn[0:4])
            df = df.append(df_temp)
            del df_temp, temp_cols, temp_vals
    return df

In [5]:
# creating dataframes
# gl = gamelogs, ss = season_stats, hw = heights_weights
gl_df = create_df('gamelog')
ss_df = create_df('season_stats', add_year=True)
hw_df = create_df('heights_weights', add_year=True)

gl_df = gl_df.reset_index(drop=True)
ss_df = ss_df.reset_index(drop=True)
hw_df = hw_df.reset_index(drop=True)

# load the injury data
injury_df = pd.read_csv('data/injuries.csv')
injury_df.drop('Unnamed: 0', axis=1, inplace=True)

# converting the Date column to datetime objects and
# extracting the year, month, and day into their own columns
injury_df['Date'] = pd.to_datetime(injury_df['Date'])
date = injury_df['Date']
injury_df['Year'] = date.apply(lambda x: x.year)
injury_df['Month'] = date.apply(lambda x: x.month)
injury_df['Day'] = date.apply(lambda x: x.day)

# filter out all events not directly related to basketball
injury_df = injury_df[(~injury_df['Notes'].str.contains('flu')) &
                      (~injury_df['Notes'].str.contains('rest')) &
                      (~injury_df['Notes'].str.contains('jail')) &
                      (~injury_df['Notes'].str.contains('ill')) &
                      (~injury_df['Notes'].str.contains('asthma')) &
                      (~injury_df['Notes'].str.contains('virus')) &
                      (~injury_df['Notes'].str.contains('return')) &
                      (~injury_df['Notes'].str.contains('pneumonia')) &
                      (~injury_df['Notes'].str.contains('coach')) &
                      (~injury_df['Notes'].str.contains('sister')) &
                      (~injury_df['Notes'].str.contains('Fined')) &
                      (~injury_df['Notes'].str.contains('flu')) &
                      (~injury_df['Notes'].str.contains('GM')) &
                      (~injury_df['Notes'].str.contains('flu')) &
                      (~injury_df['Notes'].str.contains('team')) &
                      (~injury_df['Notes'].str.contains('canal')) &
                      (~injury_df['Notes'].str.contains('food')) &
                      (~injury_df['Notes'].str.contains('virus')) &
                      (~injury_df['Notes'].str.contains('wife')) &
                      (~injury_df['Notes'].str.contains('asthma')) &
                      (~injury_df['Notes'].str.contains('chin')) &
                      (~injury_df['Notes'].str.contains('headache')) &
                      (~injury_df['Notes'].str.contains('anemia')) &
                      (~injury_df['Notes'].str.contains('dizziness')) &
                      (~injury_df['Notes'].str.contains('cold')) &
                      (~injury_df['Notes'].str.contains('throat')) &
                      (~injury_df['Notes'].str.contains('molar')) &
                      (~injury_df['Notes'].str.contains('dizziness')) &
                      (~injury_df['Notes'].str.contains('rash')) &
                      (~injury_df['Notes'].str.contains('stomach ache')) &
                      (~injury_df['Notes'].str.contains('bronchitis')) &
                      (~injury_df['Notes'].str.contains('concussion'))]


# drop all nan's rows
drop_na([ss_df, hw_df])

# heights/weights data has duplicate players for a specific year for players
# who were traded.  Dropping duplicates
hw_df = hw_df.drop_duplicates(['PLAYER_ID', 'YEAR'])

# merging heights/weights and season stats on the player id and the year
ss_hw_df = ss_df.merge(hw_df, left_on=['PLAYER_ID', 'YEAR'],
                              right_on=['PLAYER_ID','YEAR'])

In [6]:
injury_subset = injury_df[(injury_df['Year'] == 2014)]

In [7]:
# removing characters like (a) and (b)
injury_subset['Player'] = injury_subset['Player'].apply(lambda x: ' '.join(x.split()[:2]) if re.match(r'.+\(.+\)', x) else x)

# removing white spaces from player names
injury_subset['Player'] = injury_subset['Player'].apply(lambda x: x.strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
injury_subset.drop_duplicates('Player', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)


In [11]:
nba_players = ss_df['PLAYER_NAME'].unique()
for player in nba_players:
    injury_subset['Player'] = injury_subset['Player'].apply(lambda x: player if player in x else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [12]:
injury_subset['Player'][21205] = 'Tony Parker'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
gl_df['GAME_DATE'] = pd.to_datetime(gl_df['GAME_DATE'])
gl_date = gl_df['GAME_DATE']
gl_df['YEAR'] = gl_date.apply(lambda x: x.year)
gl_df['MONTH'] = gl_date.apply(lambda x: x.month)
gl_df['DAY'] = gl_date.apply(lambda x: x.day)

In [14]:
gl_subset = gl_df[(gl_df['YEAR'] == 2014)]

In [15]:
drop_gl_vars = ['SEASON_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
                'MATCHUP', 'WL', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'PLUS_MINUS',
                'VIDEO_AVAILABLE']
gl_subset.drop(drop_gl_vars, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
gl_subset = gl_subset.groupby('PLAYER_NAME').mean()

In [17]:
gl_subset.reset_index(inplace=True)

In [None]:
Xy = injury_subset.merge(gl_subset, left_on='Player', right_on='PLAYER_NAME', how='outer')

In [None]:
Xy[['Player', 'PLAYER_NAME', 'Date', 'YEAR', 'MONTH']].shape

In [None]:
# drop players who did not player in Jan 2014
Xy = Xy[Xy['PLAYER_NAME'].notnull() == True]

In [None]:
Xy['Injured'] = Xy['Notes'].notnull()

In [None]:
Xy_drop_cols = ['Date', 'Team', 'Player', 'Notes', 'Year', 'Month', 'Day', 'PLAYER_NAME',
                'PLAYER_ID', 'YEAR', 'MONTH', 'DAY']
Xy.drop(Xy_drop_cols, axis=1, inplace=True)

In [None]:
pd.scatter_matrix(Xy, figsize=(20,20));

In [None]:
Xy.corr()

In [None]:
Xy.drop(['FGM', 'FG3M', 'FTM', 'REB', 'PTS'], axis=1, inplace=True)

In [None]:
pd.scatter_matrix(Xy, figsize=(20, 20));

In [None]:
y = Xy['Injured']

In [None]:
X  = Xy.drop('Injured', axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [None]:
rfc = RandomForestClassifier(class_weight='balanced').fit(X_train, y_train)
rfc.score(X_test, y_test)

In [None]:
gbc = GradientBoostingClassifier().fit(X_train, y_train)
gbc.score(X_test, y_test)

In [None]:
logreg_cv = LogisticRegressionCV(class_weight='balanced').fit(X_train, y_train)
logreg_cv.score(X_test, y_test)

In [None]:
svc = SVC(class_weight='balanced', probability=True).fit(X_train, y_train)

In [None]:
svc.score(X_test, y_test)

In [None]:
roc = ROC_Curve(logreg_cv.predict_proba(X_test)[:, 1], y_test)

In [None]:
tpr, fpr, thres = roc.calculate()

In [None]:
roc.plot_curve(size=(10, 10))

In [None]:
roc.calc_auc()

In [21]:
gl_subset.merge(injury_subset, left_on=['PLAYER_NAME', 'YEAR'], right_on=['Player', 'Year'], how='outer')[['PLAYER_NAME', 'Player', 'Year']]

Unnamed: 0,PLAYER_NAME,Player,Year
0,A.J. Price,,
1,Aaron Brooks,Aaron Brooks,2014
2,Aaron Gordon,Aaron Gordon,2014
3,Aaron Gray,,
4,Adonis Thomas,,
5,Adreian Payne,,
6,Al Harrington,,
7,Al Horford,,
8,Al Jefferson,Al Jefferson,2014
9,Al-Farouq Aminu,,
