In [24]:
%matplotlib inline
import pandas as pd
import urllib2
import bs4
import re
import datetime
import unidecode
import numpy as np
import time
import itertools
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12, 16)

### TODO:
- url to player's page is 'http://www.hockey-reference.com/players/%s/%s.html' % (pid[0], pid)
- url to player's gamelogs is 'http://www.hockey-reference.com/players/%s/%s/gamelog/%d/' % (pid[0], pid, year) where year is the 2nd year of the season (2015-2016 => year=2016)

In [2]:
def get_player_list():
    BASEURL = 'http://www.hockey-reference.com'
    URL = '%s/players' % BASEURL
    dic = dict(LEAGUE=[], NAME=[], PID=[], START_YEAR=[], END_YEAR=[], POS=[], HEIGHT=[], WEIGHT=[], BIRTHDAY=[])
    for i in range(97, 97+26):
        letter = chr(i)
        url = '%s/%s/' % (URL, letter)
        try:
            page = urllib2.urlopen(url)
        except:
            print letter, 'page not found'
            continue
        html = page.read()
        soup = bs4.BeautifulSoup(html, "lxml")
        
        rows = soup.findAll('tr')[1:]
        for row in rows:
            lg = row['class'][0]
            items = row.findAll('td')
            a = items[0].find('a')
            name = unidecode.unidecode(a.text)
            href = a['href']
            try:
                pid = re.search('/players/%s/(.+)\.html' % letter, href).group(1)
            except:
                continue
            
            try:
                yr0 = int(items[1].text)
            except ValueError:
                yr0 = None
            
            try:
                yr1 = int(items[2].text)
            except ValueError:
                yr1 = None
            
            pos = items[3].text
            hgt = items[4].text
            
            try:
                wgt = int(items[5].text)
            except ValueError:
                wgt = None
            
            try:
                bd = datetime.datetime.strptime(items[6].text, '%B %d, %Y').date()
            except:
                bd = None
            
            dic['LEAGUE'].append(lg)
            dic['NAME'].append(name)
            dic['PID'].append(pid)
            dic['START_YEAR'].append(yr0)
            dic['END_YEAR'].append(yr1)
            dic['POS'].append(pos)
            dic['HEIGHT'].append(hgt)
            dic['WEIGHT'].append(wgt)
            dic['BIRTHDAY'].append(bd)
    return pd.DataFrame(dic)

# pl = get_player_list()

In [91]:
# pl.to_csv('/Users/andingo/Desktop/NHL/data/playerlist.csv')

In [3]:
pl = pd.read_csv('/Users/andingo/Desktop/NHL/data/playerlist.csv')

In [4]:
def get_gamelog_url(pid, season):
    return 'http://www.hockey-reference.com/players/%s/%s/gamelog/%d/' % (pid[0], pid, season)

def get_player_gamelogs(pid, season):
    url = get_gamelog_url(pid, season)
    page = urllib2.urlopen(url)
    html = page.read()
    soup = bs4.BeautifulSoup(html, 'lxml')
    
    cols = [x.text for x in soup.findAll('thead')[0].findAll('tr')[1].findAll('th')]
    if cols[5] == '':
        cols[5] = 'H/A'
    if cols[7] == '':
        cols[7] = 'W/L'
    FLAG = True
    FLAG2 = True
    for k, col in enumerate(cols):
        if FLAG2:
            if col == 'G':
                cols[k] = 'GM_NUM'
                FLAG2 = False
        if FLAG:
            cols[k] = 'GOAL_%s' % col if col in ('EV','PP','SH','GW') else col
        else:
            cols[k] = 'AST_%s' % col if col in ('EV','PP','SH','GW') else col
        if col == 'GW':
            FLAG = False
    
    output = []
    for row in soup.findAll('tbody')[0].findAll('tr'):
        if row['class'] != ['']:
            continue
        
        items = [x.text for x in row.findAll('td')]
        items = map(lambda x: 0 if x == '' else x, items)
        for i in range(len(items)):
            try:
                if '.' in items[i]:
                    items[i] = float(items[i])
                else:
                    items[i] = int(items[i])
            except:
                try:
                    items[i] = datetime.datetime.strptime(items[i], '%Y-%m-%d').date()
                except:
                    try:
                        m, s = items[i].split(':')
                        items[i] = int(m) + int(s)/60.
                    except:
                        pass
            
        output.append(items)
    
    output = pd.DataFrame(output, columns=cols)
    output['H/A'][output['H/A']==0] = 'v'
    output['PID'] = pid
    output['GID'] = ['%s@%s%s' % (t,o,d) if ha=='@' else '%s@%s%s' % (o,t,d) for i,d,t,ha,o in output[['Date','Tm','H/A','Opp']].itertuples()]
    output['SEASON'] = season
    return output

# gl = get_player_gamelogs(pl.PID.iloc[0], 1999)

In [7]:
start_time = time.time()
global_stats = pd.DataFrame()
goalie_stats = pd.DataFrame()
pids = pl[(pl.END_YEAR>=2015) & (pl.LEAGUE=='nhl')].PID
tmp = pl.set_index('PID')
for k, pid in enumerate(pids):
#     if k < 468:
#         continue
    print pid, '%d / %d' % (k, len(pids))
    if tmp.loc[pid].START_YEAR < 2016:
        try:
            gl = get_player_gamelogs(pid, 2015)
            try:
                global_stats = global_stats.append(gl, ignore_index=True)
            except:
                goalie_stats = goalie_stats.append(gl, ignore_index=True)
        except Exception, e:
            print pid, 'error in 2015', get_gamelog_url(pid, 2015), e.message
    
    if tmp.loc[pid].END_YEAR == 2016:
        try:
            gl = get_player_gamelogs(pid, 2016)
            try:
                global_stats = global_stats.append(gl, ignore_index=True)
            except:
                goalie_stats = goalie_stats.append(gl, ignore_index=True)
        except Exception, e:
            print pid, 'error in 2016', get_gamelog_url(pid, 2016), e.message
print 'Time Elapsed:', time.time() - start_time

abdelju01 0 / 1100
actonwi01 1 / 1100
adamlu01 2 / 1100
adamscr01 3 / 1100
agozzan01 4 / 1100
akesoja01 5 / 1100
allenbr01 6 / 1100
allenco01 7 / 1100
allenja01 8 / 1100
altma01 9 / 1100
alzneka01 10 / 1100
anderfr01 11 / 1100
andercr01 12 / 1100
anderjo05 13 / 1100
anderjo03 14 / 1100
andrean01 15 / 1100
andrisv01 16 / 1100
angelmi01 17 / 1100
anisiar01 18 / 1100
arcobma01 19 / 1100
armiajo01 20 / 1100
arvidvi01 21 / 1100
ashtoca01 22 / 1100
athanan01 23 / 1100
atkinca01 24 / 1100
aulieke01 25 / 1100
bachmri01 26 / 1100
backeda01 27 / 1100
backlmi01 28 / 1100
backsni02 29 / 1100
backsni01 30 / 1100
baertsv01 31 / 1100
baileca01 32 / 1100
bailejo01 33 / 1100
ballake01 34 / 1100
barbema01 35 / 1100
barkoal01 36 / 1100
barrity01 37 / 1100
bartkma01 38 / 1100
bartlvi01 39 / 1100
bassco01 40 / 1100
bassco01

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 error in 2015 http://www.hockey-reference.com/players/b/bassco01/gamelog/2015/ list index out of range
baunky01 41 / 1100
beaglja01 42 / 1100
beaucfr01 43 / 1100
beaulna01 44 / 1100
beckta01 45 / 1100
belesma01 46 / 1100
bellepi01 47 / 1100
bellebr01 48 / 1100
bennja01 49 / 1100
bennjo01 50 / 1100
bennebe01 51 / 1100
bennesa01 52 / 1100
benoian01 53 / 1100
bergese01 54 / 1100
bergepa01 55 / 1100
berglpa01 56 / 1100
bernijo01 57 / 1100
bernist01 58 / 1100
berrare01 59 / 1100
bertsch02 60 / 1100
berubje02 61 / 1100
bickest01 62 / 1100
bickebr01 63 / 1100
biegaal01 64 / 1100
biegada01 65 / 1100
biekske01 66 / 1100
bigrach01 67 / 1100
binnijo01 68 / 1100
bishobe01 69 / 1100
bitetan01 70 / 1100
bjugsni01 71 / 1100
blackje01 72 / 1100
blandjo01 73 / 1100
blumjo02 74 / 1100
blundmi01 75 / 1100
bobrose01 76 / 1100
bodietr01 77 / 1100
bodnaan01 78 / 1100
bodnaan01 error in 2015 http://www.hockey-reference.com/players/b/bodnaan01/gamelog/2015/ list index out of range
boedkmi01 79 / 1100
bogosza

In [60]:
global_stats.to_csv('/Users/andingo/Desktop/NHL/data/global_stats.csv')
goalie_stats.to_csv('/Users/andingo/Desktop/NHL/data/goalie_stats.csv')

In [11]:
global_stats = pd.read_csv('/Users/andingo/Desktop/NHL/data/global_stats.csv')
goalie_stats = pd.read_csv('/Users/andingo/Desktop/NHL/data/goalie_stats.csv')
global_stats.Date = [datetime.datetime.strptime(x, '%Y-%m-%d').date() for x in global_stats.Date]
goalie_stats.Date = [datetime.datetime.strptime(x, '%Y-%m-%d').date() for x in goalie_stats.Date]

In [7]:
def fantasy_points(GL):
    gl = GL.copy()
    if 'GA' in gl.keys():
        gl['DK'] = 3*(gl['W/L']=='W') - gl.GA + .2*gl.SV + gl.SO
        gl['FD'] = gl.DK
    else:
        gl['DK'] = 3*gl.G + 2*gl.A + .5*gl.S + .5*gl.BLK + gl.GOAL_SH + gl.AST_SH + (1.5*(gl.G>=3) if 'HT' not in gl.keys() else 1.5*gl.HT)
        gl['FD'] = 3*gl.G + 2*gl.A + .5*gl.GOAL_PP + .5*gl.AST_PP + .4*gl.S + gl['+/-'] + .25*gl.PIM
    return gl

In [12]:
global_stats = fantasy_points(global_stats)

## FanDuel

In [30]:
def get_pid(name, pl, **kwargs):
    I = pl.NAME == name
    if not any(I):
        raise Exception('%s not found' % name)
    for col, st in kwargs.iteritems():
        I = I & (pl[col] == st)
    if not any(I):
        raise Exception('%s not found with specifications %s' % (name, kwargs))
    return pl[I].PID.iloc[0]

In [112]:
fdpl = pd.read_csv('/Users/andingo/Desktop/NHL/PlayerLists/pl_fd_nhl_2016-02-11.csv')
fdpl['NAME'] = ['%s %s' % (f,l) for f,l in itertools.izip(fdpl['First Name'], fdpl['Last Name'])]
# fdpl['PID'] = [get_pid(NAME_CORRECTIONS[x], pl) if x in NAME_CORRECTIONS else get_pid(x, pl) for x in fdpl.NAME]

# for x in fdpl.NAME:
#     try:
#         get_pid(NAME_CORRECTIONS[x], pl) if x in NAME_CORRECTIONS else get_pid(x, pl)
#     except Exception, e:
#         print e.message

Anrew Miller not found
Jakub Nakladal not found




In [111]:
NAME_CORRECTIONS = {
    'Alexander Ovechkin' : 'Alex Ovechkin',
    'Johnny Gaudreau' : 'John Gaudreau',
    'Jon Quick' : 'Jonathan Quick',
    "Matthew O'Connor" : "Matt O'Connor",
    'Cameron Talbot' : 'Cam Talbot',
    'Jacob Muzzin' : 'Jake Muzzin',
    'Samuel Bennett' : 'Sam Bennett',
    'Michael Ferland' : 'Micheal Ferland',
    'Pierre Parenteau' : 'P.A. Parenteau',
    'Viatcheslav Voynov' : 'Slava Voynov',
    'Alex Burmistrov' : 'Alexander Burmistrov',
    'Trevor Van Riemsdyk' : 'Trevor van Riemsdyk',
    'Dylan Demelo' : 'Dylan DeMelo',
    'Tobias Enstrom' : 'Toby Enstrom',
    'Chris Vande Velde' : 'Chris VandeVelde',
    'Joe Morrow' : 'Joseph Morrow',
    'Andew Miller' : 'Andrew Miller',
    'Phil Varone' : 'Philip Varone',
    'Richard Clune' : 'Rich Clune',
    'Max Talbot' : 'Maxime Talbot',
    'Yevgeni Medvedev' : 'Evgeny Medvedev',
}

In [110]:
pl[['Medvedev' in x for x in pl.NAME]].NAME
# pl[pl.NAME=='Arturs Kulda']

6102    Andrei Medvedev
6103    Evgeny Medvedev
Name: NAME, dtype: object

In [108]:
pl[pl.PID=='naklaja01']

Unnamed: 0.1,Unnamed: 0,BIRTHDAY,END_YEAR,HEIGHT,LEAGUE,NAME,PID,POS,START_YEAR,WEIGHT
