In [156]:
# imports
import warnings
warnings.filterwarnings('ignore')
from bs4 import BeautifulSoup
import pandas as pd
from scipy import stats
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from urllib.request import urlopen
import json
import dill

In [None]:
# pre-2019 data
pre_2019=pd.read_csv('skater_stats.csv',encoding = "ISO-8859-1")

In [None]:
pre_2019.Tm.unique()

In [None]:
open_space_lst = ['Tm','G','Pos','A','PTS']

In [None]:
def fill_spaces(df,feature):
    """Remove every space and hyphen from the string values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The column is overwritten on this frame.
    feature : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so ``frame = fill_spaces(frame, col)`` works.
    """
    def _squeeze(value):
        # Non-string entries (e.g. NaN or already-numeric values) pass through
        # unchanged instead of raising AttributeError on ``.replace``.
        if isinstance(value, str):
            # Both removals are chained so the second one no longer discards
            # the result of the first.
            return value.replace(" ", "").replace("-", "")
        return value

    # Write to the frame that was passed in, not to a notebook-level global.
    df[feature] = df[feature].map(_squeeze)
    return df

In [None]:
for f in open_space_lst:    
    pre_2019=fill_spaces(pre_2019,f)

In [None]:
num_lst=['+/-','PIM','EVG','PPG','SHG','GWG','EVA','PPA','SHA','S','S%','TOI']

In [None]:
str(pre_2019['PIM'][193]).split()[0]

In [None]:
# Keep only the first whitespace-delimited token of every value in the numeric
# columns. Assigning the whole column back avoids the chained
# ``pre_2019[f][i] = ...`` write, which may target a temporary copy.
for f in num_lst:
    pre_2019[f] = pre_2019[f].map(lambda value: str(value).split()[0])

In [None]:
pre_2019.PIM.unique()[0:20]

In [None]:
# url for scraping 2019 season
url='https://www.hockey-reference.com/leagues/NHL_2019_skaters.html'

In [None]:
# assigning variable to requests getting this url
r=requests.get(url)

In [None]:
# creating soup object
soup = BeautifulSoup(r.content,'html.parser')

In [None]:
#soup.prettify

In [None]:
# gathering information on the players I need data from
player_list=[]
for player in soup.find_all('tbody'):
    player_list.append(player.find_all('tr'))

In [None]:
#player_list

In [None]:
# when I pulled the data it came as a nested list so I need to step inside that list
player_list=player_list[0]

In [None]:
# quick view at what one player looks like
player_list[0]

In [None]:
# total amount of players - sort of - there are repeats that will be discussed further later on
len(player_list)

In [None]:
# creating list of attributes for first player
a=[]
for stat in player_list[0].find_all('td'):
    a.append(stat.text)

In [None]:
# creating list of attributes for every player
# Flatten the text of every <td> cell, row by row. Iterating over player_list
# itself (instead of a hard-coded 1121) keeps this working when the number of
# table rows on the page changes.
b = [stat.text for row in player_list for stat in row.find_all('td')]

In [None]:
# quick look at beginning of b - what we see is that it captured a little too much and I will need to break this into pieces
b[0:100]

In [None]:
# gauge of how long each player's stats are
b[0:27]

In [None]:
# list comprehension to split up the list of everything into individual players
chunks = [b[x:x+27] for x in range(0, len(b), 27)]

In [None]:
# what does the first player look like - notice how this is a list within a list at index 0
chunks[0]

In [None]:
# what does every number correspond to?
for stat in player_list[0].find_all(class_='right'):
    print(stat['data-stat'],stat.text)

In [None]:
# instantiating data frame to hold data
# One empty row per scraped chunk; sizing the index from chunks (instead of a
# hard-coded 1076) keeps the frame in step with whatever the page returned.
df=pd.DataFrame(columns=['name','age','team','pos','gp','goals','assists','points','plusmin','plm','ps','goals_ev','goals_pp','goals_sh',
                        'goals_gw','assists_ev','assists_pp','assists_sh','shots','shot_pct','toi','toi_avg',
                        'hits','faceoff_wins','faceoff_losses','faceoff_percent'],
               index=range(len(chunks)))

In [None]:
# quick look at the empty data frame
df

In [None]:
# features
len(df.columns)

In [None]:
# number of players to populate this list
len(chunks)

In [None]:
# adding name to data frame
for i in range(len(chunks)):
    df.name[i] = chunks[i][0]

In [None]:
# looking for any change in data frame shape
df.shape

In [None]:
# double checking that adding the name worked
df.head()

In [None]:
# this number corresponds to blocks which will be important later
chunks[0][22]

In [None]:
# adding everything to the data frame
# Column name -> position of that stat inside each 27-item chunk.
# Position 22 (blocks) is deliberately absent here; it is added in a later cell.
stat_positions = {
    'name': 0, 'age': 1, 'team': 2, 'pos': 3, 'gp': 4, 'goals': 5,
    'assists': 6, 'points': 7, 'plusmin': 8, 'plm': 9, 'ps': 10,
    'goals_ev': 11, 'goals_pp': 12, 'goals_sh': 13, 'goals_gw': 14,
    'assists_ev': 15, 'assists_pp': 16, 'assists_sh': 17, 'shots': 18,
    'shot_pct': 19, 'toi': 20, 'toi_avg': 21, 'hits': 23,
    'faceoff_wins': 24, 'faceoff_losses': 25, 'faceoff_percent': 26,
}

# Fill one whole column at a time. This replaces the per-cell chained writes
# (df.col[i] = ...), which can silently update a temporary copy.
for col, stat_idx in stat_positions.items():
    df[col] = [chunks[i][stat_idx] for i in range(len(df))]

In [None]:
# quick look at data - notice I have repeating name values
df.head()

In [None]:
# populating list of blocks by player
block_list = []
for i in range(len(chunks)):
    block_list.append(int(chunks[i][22]))

In [None]:
# quick look at list of blocks by player
block_list[0:5]

In [None]:
# length of list of blocks by player
len(block_list)

In [None]:
# adding block statistic to data frame - assigned as a whole column with
# bracket syntax, because element-wise writes through df.block[i] go through a
# chained lookup that is not guaranteed to reach the frame
df['block'] = block_list[:len(df)]

In [None]:
# overview of data
df.head()

In [None]:
# tail of data
df.tail()

In [None]:
# data info - notice I will need to change a lot of numbers to be integers or floats
df.info()

In [None]:
# mapping age as an integer
df.age=df.age.astype(int)

In [None]:
# mapping games played as an integer
df.gp=df.gp.astype(int)

In [None]:
# mapping other features as integer or float - shot_pct, toi, toi_avg and
# faceoff_percent gave me trouble and are left as scraped strings here
int_features = ['goals', 'goals_ev', 'goals_gw', 'goals_pp', 'goals_sh',
                'assists', 'assists_ev', 'assists_pp', 'assists_sh',
                'points', 'plusmin', 'plm',
                'hits', 'faceoff_wins', 'faceoff_losses']
float_features = ['ps', 'shots']

for feature in int_features:
    df[feature] = df[feature].astype(int)
for feature in float_features:
    df[feature] = df[feature].astype(float)

In [None]:
# review of data
df.head()

In [None]:
# remove faceoff percent
df.drop('faceoff_percent',axis=1,inplace=True)

In [None]:
# add faceoff percent back as a float
df['faceoff_pct']=round(df.faceoff_wins/(df.faceoff_losses+df.faceoff_wins),3)

In [None]:
# adding season column so I can later add more data
df['season']=2019

In [None]:
# review of data
df.head()

In [None]:
#df.shot_pct=df.shot_pct.astype(float)
#df.toi=df.toi.astype(float)
#df.toi_avg=df.toi_avg.astype(float)

In [None]:
# removing shot percent
df.drop('shot_pct',axis=1,inplace=True)

In [None]:
# adding shot percent back as a float
df['shot_pct']=df.goals/df.shots

In [None]:
# time on ice gave me trouble so I am going to have to break this down
df.toi_avg

In [None]:
# first instance of toi
df.toi_avg[0]

In [None]:
# length of toi "type 1"
len(df.toi_avg[0])

In [None]:
# minutes component
df.toi_avg[0].split()[0][0:2]

In [None]:
# seconds component
df.toi_avg[0].split()[0][3:5]

In [None]:
# toi "type 2"

df.toi_avg[1073]

In [None]:
# length of toi "type 2"
len(df.toi_avg[1073])

In [None]:
# minutes
df.toi_avg[1073].split()[0][0:1]

In [None]:
# seconds
df.toi_avg[1073].split()[0][2:4]

In [None]:
# converting minutes and seconds to total seconds
def _mmss_to_seconds(value):
    """Turn an 'M:SS' / 'MM:SS' string into whole seconds.

    Anything that is not a minutes:seconds string (including a value that was
    already converted) is returned unchanged, so the cell can be re-run safely.
    """
    if isinstance(value, str) and ':' in value:
        minutes, seconds = value.strip().split(':', 1)
        if minutes.isdigit() and seconds.isdigit():
            return int(minutes) * 60 + int(seconds)
    return value

# Splitting on ':' does not depend on the string being exactly 4 or 5
# characters long, and whole-column assignment avoids the chained
# df.toi_avg[i] = ... write.
df['toi_avg'] = df['toi_avg'].map(_mmss_to_seconds)

In [None]:
# review data having made some modifications
df.head()

In [None]:
# data types for data
df.info()

In [None]:
# toi_avg as an integer (total seconds)
df.toi_avg=df.toi_avg.astype(int)

In [None]:
# review data
df.head()

In [None]:
# data types for data
df.info()

In [None]:
# how many unique players are there?
df.groupby('name').sum().shape

In [None]:
# review data
df.head()

In [None]:
# review data I will later merge
pre_2019.head()

In [None]:
# checking names of columns in 2019 season
df.columns

In [None]:
# checking names of columns in the pre-2019 data
pre_2019.columns

In [None]:
# adding stat missing from 2019 season
df['gpg']=df.goals/df.gp

In [None]:
# removing pointless column
# errors='ignore' makes the cell safe to re-run once the column is already gone
pre_2019 = pre_2019.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# do the data frames align?
len(df.columns)==len(pre_2019.columns)

In [None]:
# checking for difference
pre_2019.columns

In [None]:
# checking for difference
df.columns

In [None]:
# removing the extra column
df.drop('ps',axis=1,inplace=True)

In [None]:
# reorganizing the order of pre-2019 data columns
pre_2019_2 = pre_2019[['Player', 'Age', 'Tm', 'Pos', 'GP', 'G', 'A', 'PTS','+/-', 'PIM', 'EVG', 'PPG', 'SHG', 
                       'GWG', 'EVA', 'PPA', 'SHA', 'S', 'TOI', 'ATOI', 'HIT', 'FOwin', 'FOloss', 
                       'BLK', 'FO%', 'Season', 'S%', 'GPG']]

In [None]:
# assigning uniform name to columns of separate data frames
pre_2019_2.columns=df.columns

In [None]:
# data of pre-2019 season with new feature names
pre_2019_2.head()

In [None]:
# Trim surrounding whitespace from the team codes. pre_2019_2 is a column
# selection of pre_2019, so build a fresh frame with .assign rather than
# writing cell-by-cell through chained indexing.
pre_2019_2 = pre_2019_2.assign(team=pre_2019_2['team'].str.strip())

In [None]:
# figuring out which players changed teams
df[df.name.duplicated()==True].name.unique()

In [None]:
# checking how traded players are represented
df[df.name=='Ryan Strome']

In [None]:
df[df.name.duplicated()==True].name.unique()

In [None]:
# creating data frame with only the final team that player was on - I intend to declare the final team played for as that
# player's team for the season. This way I get a full picture of things like goals and points
final_team_df = df[df.name.duplicated()==True].drop_duplicates(subset='name',keep='last')

In [None]:
# where did "repeat" players end their season
final_team_df.head()

In [None]:
# reset index of this data frame
final_team_df.reset_index(inplace=True)

In [None]:
# drop pointless column
final_team_df.drop('index',axis=1,inplace=True)

In [None]:
# create dictionary to assign values to team name 
# player name -> team on that player's last listed row
final_team_dict = dict(zip(final_team_df.name, final_team_df.team))

In [None]:
# quick look at dictionary beginning
list(final_team_dict.items())[0:15]

In [None]:
# quick look at the first few dictionary keys
list(final_team_dict.keys())[0:5]

In [None]:
# removing repeats of the same player and keeping only total season stats (TOT)
df=df.drop_duplicates(subset='name',keep='first')

In [None]:
# new look of data
df.head()

In [None]:
# ensuring that every name value is a str
df.name = df.name.astype(str)

In [None]:
# checking that my dictionary will map well
'Pontus Aberg' in list(final_team_dict.keys())

In [None]:
# reset index to deal with players removed
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

In [None]:
# replace "TOT" with the final team that player played for
# Players found in final_team_dict get that team; everyone else keeps the team
# already on their row. One vectorised lookup replaces the per-row chained
# df.team.iloc[i] = ... write and the list(keys()) rebuild on every iteration.
df['team'] = df['name'].map(final_team_dict).fillna(df['team'])

In [None]:
# head of data
df.head()

In [None]:
# tail of data
df.tail()

In [None]:
# data frame to combine with
pre_2019_2.head()

In [None]:
# data frame to combine with
pre_2019_2.tail()

In [None]:
open_space_lst

In [None]:
# each distinct goals value -> the same value with spaces removed
goals_dict = {raw: raw.replace(" ", "") for raw in pre_2019_2.goals.unique()}

In [None]:
pre_2019_2.goals=pre_2019_2.goals.map(lambda x: goals_dict[x])

In [None]:
# each distinct assists value -> the same value with spaces removed
ast_dict = {raw: raw.replace(" ", "") for raw in pre_2019_2.assists.unique()}

In [None]:
pre_2019_2.assists=pre_2019_2.assists.map(lambda x: ast_dict[x])

In [None]:
# each distinct position value -> the same value with spaces removed
pos_dict = {raw: raw.replace(" ", "") for raw in pre_2019_2.pos.unique()}

In [None]:
pre_2019_2.pos=pre_2019_2.pos.map(lambda x: pos_dict[x])

In [None]:
# each distinct points value -> the same value with spaces removed
pts_dict = {raw: raw.replace(" ", "") for raw in pre_2019_2.points.unique()}

In [None]:
pre_2019_2.points=pre_2019_2.points.map(lambda x: pts_dict[x])

In [None]:
total_df=pd.concat([df,pre_2019_2])

In [None]:
total_df.head()

In [None]:
total_df.tail()

In [None]:
total_df.season.unique()

In [None]:
total_df.columns

# I need to add two or three goalies for each year!

# So that is in a separate notebook, but just for fun I have added a scraper function for skaters

In [157]:
def map_season_final_tm(df,yr):
    """Relabel one season's multi-row players with their last listed team.

    A player whose name appears on more than one row of season ``yr`` is
    treated as having changed teams; the team on the last of those rows is
    written onto all of that player's rows for the season.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``team`` and ``season`` columns. The ``team``
        column is modified in place for rows of season ``yr``.
    yr : same type as the ``season`` column
        Season to process.

    Returns
    -------
    pandas.DataFrame
        One row per multi-row player of the season (the last such row),
        re-indexed from 0.
    """
    season_rows = df[df.season == yr]
    final_team_df = (
        season_rows[season_rows['name'].duplicated()]
        .drop_duplicates(subset='name', keep='last')
        .reset_index(drop=True)
    )
    # Build the lookup from final_team_df itself. Looping over the length of
    # the whole season while indexing this shorter frame raised KeyError.
    final_team_dict = dict(zip(final_team_df['name'], final_team_df['team']))

    # Restrict the rewrite to the requested season, so a lookup built from one
    # season never relabels the same name in another season.
    # NOTE(review): confirm that season-scoped relabelling is the intent.
    hit = ((df['season'] == yr) & df['name'].isin(list(final_team_dict))).to_numpy()
    # Positional (numpy) mask and values keep this valid when df carries a
    # non-unique index, e.g. after pd.concat.
    df.loc[hit, 'team'] = df['name'][hit].map(final_team_dict).to_numpy()
    return final_team_df

In [158]:
def create_skater_dataframe(year):
    """Scrape one season of skater statistics from hockey-reference.com.

    Downloads ``NHL_{year}_skaters.html``, reads the text of the ``<td>`` cells
    of the first ``<tbody>`` in groups of 27 (one group per player row) and
    returns one row per player name.

    Parameters
    ----------
    year : str or int
        Season to fetch; interpolated into the URL and stored as ``int(year)``
        in the ``season`` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: name, age, team, pos, gp, goals, assists, points,
        plusmin, plm, goals_ev, goals_pp, goals_sh, goals_gw, assists_ev,
        assists_pp, assists_sh, shots, toi, toi_avg, hits, faceoff_wins,
        faceoff_losses, block, faceoff_pct, season, shot_pct, gpg, ppg.
        ``gp``, ``goals``, ``points``, ``shots``, ``faceoff_wins`` and
        ``faceoff_losses`` are ints, ``toi_avg`` is whole seconds, the derived
        ratios are floats; the remaining stats are left as scraped strings.

    Notes
    -----
    Rows are de-duplicated on ``name`` alone: the first row for a name is kept
    and its team is replaced by the team on the last row for that name. Two
    different players who share a name in one season therefore collapse into
    a single row.
    """
    def _toi_to_seconds(value):
        # 'MM:SS' -> seconds; a blank cell counts as 0, like the other blanks.
        text = str(value).strip()
        if not text:
            return 0
        minutes, _, seconds = text.partition(':')
        return int(minutes) * 60 + int(seconds or 0)

    # Position of each stat inside a 27-cell row.
    stat_names = ['name','age','team','pos','gp','goals','assists','points','plusmin','plm','ps',
                  'goals_ev','goals_pp','goals_sh','goals_gw','assists_ev','assists_pp','assists_sh',
                  'shots','shot_pct','toi','toi_avg','block','hits','faceoff_wins','faceoff_losses',
                  'faceoff_percent']

    url=f'https://www.hockey-reference.com/leagues/NHL_{year}_skaters.html'
    r=requests.get(url)
    soup = BeautifulSoup(r.content,'html.parser')

    # Only the first <tbody> is used; rows without <td> cells contribute nothing.
    rows = soup.find_all('tbody')[0].find_all('tr')
    cells = [td.text for row in rows for td in row.find_all('td')]
    chunks = [cells[x:x+27] for x in range(0, len(cells), 27)]

    # Build the frame in one step instead of writing cell-by-cell through
    # chained indexing, then move 'block' behind the other scraped columns.
    df = pd.DataFrame(chunks, columns=stat_names)
    df = df[[c for c in stat_names if c != 'block'] + ['block']]

    # The scraped percentage is discarded and recomputed from wins/losses below.
    df = df.drop(columns=['faceoff_percent'])
    for f in ['faceoff_wins','faceoff_losses','goals','shots','gp','points']:
        # Blank cells count as 0. Assigning the result back (rather than a
        # chained inplace replace) guarantees that the frame is updated.
        df[f] = df[f].replace("", 0).astype(int)

    df['faceoff_pct'] = (df.faceoff_wins/(df.faceoff_losses+df.faceoff_wins)).round(3)
    df['season'] = int(year)
    # The scraped shot_pct string is replaced by a ratio computed from the ints.
    df = df.drop(columns=['shot_pct'])
    df['shot_pct'] = df.goals/df.shots
    df['toi_avg'] = df['toi_avg'].map(_toi_to_seconds).astype(int)
    df['gpg'] = df.goals/df.gp
    df['ppg'] = df.points/df.gp
    df = df.drop(columns=['ps'])

    # A name on several rows is read as one player with several team rows:
    # keep the first row and relabel it with the team from the last row.
    repeats = df[df['name'].duplicated()].drop_duplicates(subset=['name'], keep='last')
    final_team_dict = dict(zip(repeats['name'], repeats['team']))

    df = df.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
    df['name'] = df['name'].astype(str)
    df['team'] = df['name'].map(final_team_dict).fillna(df['team'])

    return df

In [159]:
# one scraped frame per season, 2008 through 2019
dfdfs = [create_skater_dataframe(str(season)) for season in range(2008, 2020)]

In [160]:
aaa_df=pd.concat(dfdfs)

In [161]:
# Force every 'Sebastian Aho' row onto CAR. A positional (numpy) mask with .loc
# replaces the row-by-row chained iloc write and stays valid even though the
# concatenated frame repeats index labels.
aaa_df.loc[(aaa_df['name'] == 'Sebastian Aho').to_numpy(), 'team'] = 'CAR'

In [180]:
def create_skater_dataframe_early(year):
    """Scrape one older season of skater statistics from hockey-reference.com.

    Same idea as the later-season scraper, but each player row is read as 22
    ``<td>`` cells; ``hits``, ``faceoff_wins``, ``faceoff_losses`` and
    ``block`` are not read from the page and are filled with 0.

    Parameters
    ----------
    year : str or int
        Season to fetch; interpolated into the URL and stored as ``int(year)``
        in the ``season`` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: name, age, team, pos, gp, goals, assists, points,
        plusmin, plm, goals_ev, goals_pp, goals_sh, goals_gw, assists_ev,
        assists_pp, assists_sh, shots, toi, toi_avg, hits, faceoff_wins,
        faceoff_losses, block, faceoff_pct, season, shot_pct, gpg, ppg.
        ``toi_avg`` is left as the scraped string.

    Notes
    -----
    Rows are de-duplicated on ``name`` alone, so two different players who
    share a name in one season collapse into a single row.
    """
    # Position of each stat inside a 22-cell row.
    stat_names = ['name','age','team','pos','gp','goals','assists','points','plusmin','plm','ps',
                  'goals_ev','goals_pp','goals_sh','goals_gw','assists_ev','assists_pp','assists_sh',
                  'shots','shot_pct','toi','toi_avg']

    url=f'https://www.hockey-reference.com/leagues/NHL_{year}_skaters.html'
    r=requests.get(url)
    soup = BeautifulSoup(r.content,'html.parser')

    # Only the first <tbody> is used; rows without <td> cells contribute nothing.
    rows = soup.find_all('tbody')[0].find_all('tr')
    cells = [td.text for row in rows for td in row.find_all('td')]
    chunks = [cells[x:x+22] for x in range(0, len(cells), 22)]

    # Build the frame in one step instead of cell-by-cell chained writes.
    df = pd.DataFrame(chunks, columns=stat_names)
    # Stats not scraped for these seasons: constant 0, set once rather than on
    # every pass of a row loop (which also skipped 'block' for an empty page).
    for missing in ['hits', 'faceoff_wins', 'faceoff_losses', 'block']:
        df[missing] = 0

    for f in ['faceoff_wins','faceoff_losses','goals','shots','gp','points']:
        # Blank cells count as 0. Assigning the result back (rather than a
        # chained inplace replace) guarantees that the frame is updated.
        df[f] = df[f].replace("", 0).astype(int)

    # wins and losses are both 0 here, so this ratio is NaN for every row
    df['faceoff_pct'] = (df.faceoff_wins/(df.faceoff_losses+df.faceoff_wins)).round(3)
    df['season'] = int(year)
    # The scraped shot_pct string is replaced by a ratio computed from the ints.
    df = df.drop(columns=['shot_pct'])
    df['shot_pct'] = df.goals/df.shots
    df['gpg'] = df.goals/df.gp
    df['ppg'] = df.points/df.gp
    df = df.drop(columns=['ps'])

    # str(year) so that an int 2006 takes the same branch as the string '2006'.
    # NOTE(review): why 2006 needs its own handling is not visible here.
    if str(year) != '2006':
        # A name on several rows is read as one player with several team rows:
        # keep the first row and relabel it with the team from the last row.
        repeats = df[df['name'].duplicated()].drop_duplicates(subset='name', keep='last')
        final_team_dict = dict(zip(repeats['name'], repeats['team']))

        df = df.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
        df['name'] = df['name'].astype(str)
        df['team'] = df['name'].map(final_team_dict).fillna(df['team'])

    else:
        # A 'TOT' row takes the team of the row directly below it. A 'TOT' on
        # the very last row has no row below and is left as it is.
        next_team = df['team'].shift(-1)
        is_tot = (df['team'].eq('TOT') & next_team.notna()).to_numpy()
        df.loc[is_tot, 'team'] = next_team[is_tot].to_numpy()

        df = df.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
        df['name'] = df['name'].astype(str)

    return df

In [181]:
dfdfs2=[]
for s in range(1918,2008):
    try:
        dfdfs2.append(create_skater_dataframe_early(str(s)))
    except Exception as err:
        # Skip seasons that cannot be scraped, but say why. A bare except would
        # also swallow KeyboardInterrupt and hide the failure reason.
        print(s, repr(err))

2005


In [182]:
aaa_df2=pd.concat(dfdfs2)

In [183]:
aaa_df2

Unnamed: 0,name,age,team,pos,gp,goals,assists,points,plusmin,plm,...,toi_avg,hits,faceoff_wins,faceoff_losses,block,faceoff_pct,season,shot_pct,gpg,ppg
0,Jack Adams*,23,TRA,C,8,0,0,0,,31,...,,0,0,0,0,,1918,,0.000000,0.000000
1,Billy Bell,26,MTW,F,9,1,0,1,,6,...,,0,0,0,0,,1918,inf,0.111111,0.111111
2,Louis Berlinguette,30,MTL,LW,20,2,1,3,,12,...,,0,0,0,0,,1918,inf,0.100000,0.150000
3,Georges Boucher*,21,OTS,F,21,9,8,17,,46,...,,0,0,0,0,,1918,inf,0.428571,0.809524
4,Morley Bruce,23,OTS,D,7,0,0,0,,0,...,,0,0,0,0,,1918,,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,Marek Zidlicky,29,NSH,D,79,4,26,30,8,72,...,19:43,0,0,0,0,,2007,0.035088,0.050633,0.379747
853,Mike Zigomanis,26,PHX,C,75,14,9,23,-8,46,...,14:53,0,0,0,0,,2007,0.098592,0.186667,0.306667
854,Sergei Zubov*,36,DAL,D,78,12,42,54,0,26,...,25:57,0,0,0,0,,2007,0.076923,0.153846,0.692308
855,Dainius Zubrus,28,BUF,RW,79,24,36,60,-19,62,...,19:29,0,0,0,0,,2007,0.151899,0.303797,0.759494


In [184]:
final_team_df_changed =pd.concat([aaa_df,aaa_df2])

In [185]:
final_team_df_changed=final_team_df_changed.sort_values(by=['season','team'])

In [186]:
final_team_df_changed.columns

Index(['name', 'age', 'team', 'pos', 'gp', 'goals', 'assists', 'points',
       'plusmin', 'plm', 'goals_ev', 'goals_pp', 'goals_sh', 'goals_gw',
       'assists_ev', 'assists_pp', 'assists_sh', 'shots', 'toi', 'toi_avg',
       'hits', 'faceoff_wins', 'faceoff_losses', 'block', 'faceoff_pct',
       'season', 'shot_pct', 'gpg', 'ppg'],
      dtype='object')

In [187]:
final_team_df_changed=final_team_df_changed[['season','team','name', 'age', 'pos', 'gp', 'goals', 'assists', 'points',
       'plusmin', 'plm', 'goals_ev', 'goals_pp', 'goals_sh', 'goals_gw',
       'assists_ev', 'assists_pp', 'assists_sh', 'shots', 'toi', 'toi_avg',
       'hits', 'faceoff_wins', 'faceoff_losses', 'block', 'faceoff_pct',
       'shot_pct', 'gpg', 'ppg']]

In [188]:
final_team_df_changed.team.unique()

array(['MTL', 'MTW', 'OTS', 'TRA', 'QBC', 'TRS', 'HAM', 'BOS', 'MTM',
       'NYA', 'PTP', 'CBH', 'DTC', 'NYR', 'TOR', 'DTF', 'PHQ', 'DET',
       'TOT', 'STE', 'BRO', 'LAK', 'MNS', 'OAK', 'PHI', 'PIT', 'STL',
       'BUF', 'CGS', 'VAN', 'ATF', 'NYI', 'KCS', 'WSH', 'CLE', 'CLR',
       'EDM', 'HAR', 'QUE', 'WIN', 'CGY', 'NJD', 'CHI', 'SJS', 'OTT',
       'TBL', 'DAL', 'FLA', 'MDA', 'COL', 'PHX', 'CAR', 'NSH', 'ATL',
       'CBJ', 'MIN', 'ANA', 'WPG', 'ARI', 'VEG'], dtype=object)

In [189]:
final_team_df_changed.dtypes

season              int64
team               object
name               object
age                object
pos                object
gp                  int64
goals               int64
assists            object
points              int64
plusmin            object
plm                object
goals_ev           object
goals_pp           object
goals_sh           object
goals_gw           object
assists_ev         object
assists_pp         object
assists_sh         object
shots               int64
toi                object
toi_avg            object
hits               object
faceoff_wins        int64
faceoff_losses      int64
block              object
faceoff_pct       float64
shot_pct          float64
gpg               float64
ppg               float64
dtype: object

In [190]:
final_team_df_changed.assists=final_team_df_changed.assists.astype(float)

In [191]:
final_team_df_changed.plusmin.unique()

array(['', '-6', '-2', '-7', '-14', '2', '-11', '-4', '-3', '-12', '-10',
       '-18', '-15', '-1', '3', '-24', '6', '-5', '1', '0', '17', '27',
       '-9', '13', '10', '20', '4', '14', '11', '-26', '16', '-13', '9',
       '-19', '28', '8', '37', '24', '32', '19', '5', '7', '21', '18',
       '-31', '-27', '-16', '-8', '-23', '12', '-22', '-28', '-21', '-25',
       '29', '30', '-20', '31', '23', '26', '-46', '-43', '-17', '-36',
       '-42', '-62', '-44', '-41', '-33', '-58', '35', '55', '25', '48',
       '34', '47', '40', '-39', '22', '15', '-30', '-37', '33', '-35',
       '-45', '-34', '-29', '43', '54', '50', '44', '42', '45', '-32',
       '36', '38', '-40', '41', '59', '69', '124', '39', '98', '57',
       '-38', '51', '-57', '-48', '83', '53', '60', '63', '61', '77',
       '52', '70', '-53', '-47', '84', '-49', '80', '46', '-50', '-52',
       '-56', '-54', '72', '79', '-69', '-65', '-82', '-60', '-51', '67',
       '64', '73', '74', '66', '56', '49', '89', '120', '58', '

In [192]:
final_team_df_changed.plusmin=final_team_df_changed.plusmin.replace("",0)

In [193]:
final_team_df_changed.plusmin=final_team_df_changed.plusmin.astype(float)

In [194]:
# Blank strings become 0, every listed column becomes float, and any remaining
# NaN is filled with 0. The result is assigned back to the column, because a
# chained fillna(..., inplace=True) may only modify a temporary copy.
for col in ['gp', 'goals', 'assists', 'points',
       'plusmin', 'plm', 'goals_ev', 'goals_pp', 'goals_sh', 'goals_gw',
       'assists_ev', 'assists_pp', 'assists_sh', 'shots', 'toi',
       'hits', 'faceoff_wins', 'faceoff_losses', 'block', 'faceoff_pct',
       'shot_pct', 'gpg', 'ppg']:
    final_team_df_changed[col] = (
        final_team_df_changed[col].replace("", 0).astype(float).fillna(0)
    )

In [195]:
final_team_df_changed.drop('toi_avg',axis=1,inplace=True)

In [196]:
final_team_df_changed['toi_avg']=final_team_df_changed.toi/final_team_df_changed.gp

In [197]:
import numpy as np

In [198]:
final_team_df_changed.replace([np.inf, -np.inf], np.nan,inplace=True)

In [199]:
# Fill the NaN values left in the numeric columns with 0. Assigning the result
# back is required: a chained fillna(..., inplace=True) may only modify a
# temporary copy of the column.
for col in ['gp', 'goals', 'assists', 'points',
       'plusmin', 'plm', 'goals_ev', 'goals_pp', 'goals_sh', 'goals_gw',
       'assists_ev', 'assists_pp', 'assists_sh', 'shots', 'toi',
       'hits', 'faceoff_wins', 'faceoff_losses', 'block', 'faceoff_pct',
       'shot_pct', 'gpg', 'ppg','toi_avg']:
    final_team_df_changed[col] = final_team_df_changed[col].fillna(0)

In [200]:
final_team_df_changed.to_csv('skater_data.csv')