In [1]:
import json
import numpy as np
import pandas as pd

# Player List

- Positions: QB, RB (and variants), WR (and variants), TE
- Keep unique ids because players can be labelled under multiple variants

In [2]:
with open('../../data/player_links.json', 'r') as f:
    players = []
    for line in f:
        players.append(json.loads(line))
        
players = pd.DataFrame(players)

In [3]:
positions_tracked =  ['QB', 'RB', 'GLB', '3RB', 'FB', 'WR1', 'WR2', 'WR3', 'TE']
players = players.loc[lambda df: df.player_position.isin(positions_tracked), :]
players['player_position'].replace(['GLB', '3RB', 'FB', 'WR1', 'WR2', 'WR3'], ['RB', 'RB', 'RB', 'WR', 'WR', 'WR'], inplace=True)
players['player_id'] = players['player_link'].str.extract('([\d]+)', expand=False)
players.drop_duplicates('player_id', inplace=True)

In [4]:
print(players.shape)
players.head()

(521, 4)


Unnamed: 0,player_link,player_name,player_position,player_id
0,/player/nfl/13045/josh-allen,Josh Allen,QB,13045
1,/player/nfl/3233/derek-anderson,Derek Anderson,QB,3233
2,/player/nfl/12220/nathan-peterman,Nathan Peterman,QB,12220
3,/player/nfl/5168/lesean-mccoy,LeSean McCoy,RB,5168
4,/player/nfl/6168/chris-ivory,Chris Ivory,RB,6168


In [5]:
players.to_csv('../../data/player_list.csv', index=False)

# Player Stats

In [6]:
with open('../../data/player_stats.json', 'r') as f:
    weekly_stats = []
    for line in f:
        weekly_stats.append(json.loads(line))
        
weekly_stats = pd.DataFrame(weekly_stats)

In [7]:
weekly_stats['player_id'] = weekly_stats['url'].str.extract('([\d]+)', expand=False)
weekly_stats['away_game'] = weekly_stats['opp'].str.contains('@')
weekly_stats['opp'] = weekly_stats['opp'].str.replace('@', '')
weekly_stats['date'] = pd.to_datetime(weekly_stats['date'] + ' 2018', format='%b %d %Y')
weekly_stats = weekly_stats[weekly_stats['player_id'].isin(players['player_id'])]

In [8]:
cols_in_order = ['player_id', 'player', 'date', 'week', 'opp', 'away_game', 
                 'pass_attempts', 'pass_completions', 'pass_percent',  'pass_yards', 'pass_ya', 'pass_td', 'pass_int',
                 'rush_attempts', 'rush_yards', 'rush_avg', 'rush_td', 
                 'reception', 'rec_yards', 'rec_avg', 'rec_td', 
                 'fumb_lost', 'ko_ret_td', 'ko_ret_yards', 'punt_ret_td', 'punt_ret_yards']

if all([True if col in weekly_stats.columns else False for col in cols_in_order]):
    weekly_stats = weekly_stats[cols_in_order]

In [9]:
for col in weekly_stats.loc['pass_attempts' : 'punt_ret_yards']:
    weekly_stats[col] = weekly_stats[col].fillna(0)

In [10]:
print(weekly_stats.shape)
weekly_stats.head()

(2590, 26)


Unnamed: 0,player_id,player,date,week,opp,away_game,pass_attempts,pass_completions,pass_percent,pass_yards,...,rush_td,reception,rec_yards,rec_avg,rec_td,fumb_lost,ko_ret_td,ko_ret_yards,punt_ret_td,punt_ret_yards
0,6880,Patrick DiMarco,2018-09-09,1,BAL,True,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6880,Patrick DiMarco,2018-09-16,2,LAC,False,0.0,0.0,0.0,0.0,...,0.0,1.0,24.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6880,Patrick DiMarco,2018-09-23,3,MIN,True,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6880,Patrick DiMarco,2018-09-30,4,GB,True,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13045,Josh Allen,2018-09-09,1,BAL,True,15.0,6.0,40.0,74.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
weekly_stats.to_csv('../../data/player_stats.csv', index=False)

In [12]:
player_game_count = players.copy()
stats_group = weekly_stats.groupby('player_id').size()
stats_group = stats_group.reset_index()
stats_group.rename(columns={0: 'game_count'}, inplace=True)
player_game_count = player_game_count.merge(stats_group, how='left', on='player_id')
player_game_count['game_count'].fillna(0, inplace=True)

# Histogram of game stats per player
player_game_count['game_count'].value_counts().sort_index()

0.0     46
1.0     26
2.0     25
3.0     39
4.0     35
5.0     45
6.0    118
7.0    172
8.0     15
Name: game_count, dtype: int64

# Player News

- Need to filter news updates down to game updates
- Some players might not have game news updates

In [13]:
with open('../../data/player_news.json', 'r') as f:
    news = []
    for line in f:
        news.append(json.loads(line))
        
news = pd.DataFrame(news)
news.set_index(news.index[::-1], inplace=True)
news.sort_index(inplace=True)

In [14]:
news['player_id'] = news['url'].str.extract('([\d]+)', expand=False)
# Different entries have different time formats. One format is missing year
news['date2'] = pd.to_datetime(news['date'], errors='coerce', format='%a, %b %d, %Y %I:%M:%S %p')
news['date3'] = pd.to_datetime(news['date'], errors='coerce', format='%b %d - %I:%M %p')

news = news[news['player_id'].isin(players['player_id'])]

In [15]:
def fill_dates(group):
    new_dates = []
    prev_year = None
    prev_month = None
    
    for row in group.itertuples():
        if row.date2 is not pd.NaT:
            prev_year = row.date2.year
            prev_month = row.date2.month
            new_dates.append(pd.NaT)
        elif row.date3 is not pd.NaT and prev_year is not None and prev_month is not None:
            if row.date3.month >= prev_month:
                new_dates.append(pd.to_datetime(str(prev_year) + str(row.date3)[4:]))
            else:
                new_dates.append(pd.to_datetime(str(prev_year + 1) + str(row.date3)[4:]))
        else:
            new_dates.append(row.date3)
    
    future_year = None
    future_moth = None
    new_dates = new_dates[::-1]
    for i, row in enumerate(group[::-1].itertuples()):
        if row.date2 is not pd.NaT:
            future_year = row.date2.year
            future_month = row.date2.month
        elif row.date3 is not pd.NaT and future_year is not None and future_month is not None and new_dates[i] is pd.NaT:
            if row.date3.month <= future_month:
                new_dates[i] = pd.to_datetime(str(future_year) + str(row.date3)[4:])
            else:
                new_dates[i] = pd.to_datetime(str(future_year - 1) + str(row.date3)[4:])
                
    return pd.DataFrame(new_dates[::-1], index=group.index, columns=['new_dates'])

In [16]:
# This code assumes chronological order by player. Use this to populate missing year values
grouped_news = news.groupby('player_id')
output = grouped_news.apply(fill_dates)

news['new_dates'] = output['new_dates']
# For players with news entries only in the no year format. Assume year = 2018
news['new_dates'] = np.where(news['new_dates'].notna() & news['new_dates'].dt.year == 1900,
                             pd.to_datetime('2018' + news['new_dates'].dt.strftime('%Y-%m-%d %H:%M:%S').str[4:]), news['new_dates'])

# Combine final date column and drop temp columns
news['date'] = np.where(news['date2'].notna(), news['date2'], news['new_dates'])
news.drop(columns=['date2', 'date3', 'new_dates'], inplace=True)

# Filter news before the 2018-2019 season
news = news[news['date'] >= pd.to_datetime('2018-09-06')]

# Filter news on game day
news['game_date'] = news['date'].dt.date
weekly_stats['game_date'] = weekly_stats['date'].dt.date
news = pd.merge(news, weekly_stats[['player_id', 'game_date']], how='inner', on=['player_id', 'game_date'])
news.drop(columns='game_date', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
print(news.shape)
news.head()

(364, 10)


Unnamed: 0,date,impact,player,position,report,source_link,source_text,team,url,player_id
0,2018-10-14 18:03:00,The score was Davis' first of the season. He w...,Vernon Davis,TE,Vernon Davis caught 3-of-3 targets for 48 yard...,,,Redskins,http://www.rotoworld.com/recent/nfl/3638/verno...,3638
1,2018-10-21 21:13:00,Odd. Even without Jamison Crowder and Paul Ric...,Vernon Davis,TE,Vernon Davis failed to record a single target ...,,,,http://www.rotoworld.com/recent/nfl/3638/verno...,3638
2,2018-09-09 19:53:00,"Walker appeared to dislocate his ankle, or at ...",Delanie Walker,TE,Delanie Walker was carted off the field with a...,,,Titans,http://www.rotoworld.com/recent/nfl/3976/delan...,3976
3,2018-09-09 21:47:00,As expected. Walker had his ankle rolled up u...,Delanie Walker,TE,NFL Network's Ian Rapoport reports Delanie Wal...,https://twitter.com/RapSheet/status/1038966003...,Ian Rapoport on Twitter,Titans,http://www.rotoworld.com/recent/nfl/3976/delan...,3976
4,2018-09-16 20:42:00,"Smith was reportedly expected to see ""plenty o...",Jonnu Smith,TE,Jonnu Smith didn't receive a target in the Tit...,,,Titans,http://www.rotoworld.com/recent/nfl/12344/jonn...,12344


In [18]:
news.to_csv('../../data/player_news.csv', index=False)

In [19]:
player_news_count = players.copy()
news_group = news.groupby('player_id').size()
news_group = news_group.reset_index()
news_group.rename(columns={0: 'news_count'}, inplace=True)
player_news_count = player_news_count.merge(news_group, how='left', on='player_id')
player_news_count['news_count'].fillna(0, inplace=True)

# Histogram of game summary news updates per player
player_news_count['news_count'].value_counts().sort_index()

0.0    429
1.0     18
2.0     15
3.0      7
4.0      8
5.0     17
6.0     14
7.0     11
8.0      1
9.0      1
Name: news_count, dtype: int64