In [1]:
# Importing standard packages for data exploration and processing.

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

This notebook is going to focus on processing the players' match statistics only. The other three files will be processed in separate notebooks.

In [2]:
# Unlike in the Stage 1 notebooks, the raw data is kept in a variable of its own and is never modified in-place.
# Every processing step writes into a new variable, so the original data stays available for review.

raw_players_match = pd.read_csv(filepath_or_buffer='../raw_data/raw_players_match.csv')

In [3]:
# Does everything seem to be alright with the data?

raw_players_match

Unnamed: 0,URL,Player name,IDSeason,Season,Team,Date,Teams,Score,№,G,...,BLS,FOA,W,L,SOP,GA,Sv,%Sv,GAA,SO
0,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,28 Dec 2013,Barys - Amur,8:2,91,0,...,,,,,,,,,,
1,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,3 Jan 2014,Amur - Lokomotiv,2:1,91,0,...,,,,,,,,,,
2,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,5 Jan 2014,Amur - SKA,1:6,91,0,...,,,,,,,,,,
3,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,7 Jan 2014,Amur - Atlant,2:3 Б,91,0,...,,,,,,,,,,
4,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,9 Jan 2014,Amur - Severstal,1:3,91,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451101,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,13 Dec 2009,Severstal - CSKA,4:3 Б,15,0,...,,,,,,,,,,
451102,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,23 Dec 2009,Barys - Severstal,3:4,15,0,...,,,,,,,,,,
451103,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,25 Dec 2009,Salavat Yulaev - Severstal,2:3,15,0,...,,,,,,,,,,
451104,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,27 Dec 2009,Avangard - Severstal,3:1,15,0,...,,,,,,,,,,


We can already see that there are some issues with missing data. In addition, the player's team is only indicated by an id rather than its official name.

In [4]:
# What would the summary tell us?

raw_players_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451106 entries, 0 to 451105
Data columns (total 40 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   URL          451106 non-null  object 
 1   Player name  451106 non-null  object 
 2   IDSeason     451106 non-null  int64  
 3   Season       451106 non-null  object 
 4   Team         451106 non-null  int64  
 5   Date         451106 non-null  object 
 6   Teams        451106 non-null  object 
 7   Score        451106 non-null  object 
 8   №            451106 non-null  int64  
 9   G            451106 non-null  int64  
 10  Assists      451106 non-null  int64  
 11  PTS          409063 non-null  float64
 12  +/-          409063 non-null  float64
 13  +            409063 non-null  float64
 14  -            409063 non-null  float64
 15  PIM          451106 non-null  int64  
 16  ESG          409063 non-null  float64
 17  PPG          409063 non-null  float64
 18  SHG          409063 non-

We can see that in many columns there is no missing data at all. Some columns are stored as floats while they should in fact be integers. At the same time, for other columns there is a clear separation into skaters (forwards and defencemen) and goalies.

For example, we can see that the season statistics appear to have 409063 rows of data for skaters and 42020 rows for goalies, with a total of 451083 rows. However, there are 451106 rows in the dataframe, so 23 rows seem to be unaccounted for in either group.

Let us find out who is messing up our data. We can see that icetime has exactly 451083 non-null values which is in line with our calculations, so we are probably interested in the cases when icetime is null.

In [5]:
# We need the rows for which icetime are null.

raw_players_match[raw_players_match['TOI'].isnull()]

Unnamed: 0,URL,Player name,IDSeason,Season,Team,Date,Teams,Score,№,G,...,BLS,FOA,W,L,SOP,GA,Sv,%Sv,GAA,SO
35442,https://en.khl.ru/players/33314/,Casey Bailey,671,Regular season 2018/2019,246,20 Feb 2019,Jokerit - Slovan,7:1,25,0,...,,,,,,,,,,
46321,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,20 Sep 2017,Slovan - Ak Bars,3:6,23,0,...,,,,,,,,,,
46322,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,23 Sep 2017,Vityaz - Slovan,4:0,23,0,...,,,,,,,,,,
46323,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,25 Sep 2017,CSKA - Slovan,3:2,23,0,...,,,,,,,,,,
46324,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,27 Sep 2017,Slovan - Vityaz,4:3,23,0,...,,,,,,,,,,
46325,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,3 Oct 2017,Slovan - Severstal,5:4 Б,23,0,...,,,,,,,,,,
46326,https://en.khl.ru/players/29144/,David Boldizar,468,Regular season 2017/2018,246,5 Oct 2017,Slovan - Torpedo,0:1,23,0,...,,,,,,,,,,
46327,https://en.khl.ru/players/29144/,David Boldizar,671,Regular season 2018/2019,246,22 Jan 2019,Dinamo R - Slovan,3:2,61,0,...,,,,,,,,,,
46328,https://en.khl.ru/players/29144/,David Boldizar,671,Regular season 2018/2019,246,24 Jan 2019,Lokomotiv - Slovan,7:0,61,0,...,,,,,,,,,,
46329,https://en.khl.ru/players/29144/,David Boldizar,671,Regular season 2018/2019,246,26 Jan 2019,Slovan - Dinamo Mn,2:4,61,0,...,,,,,,,,,,


We have multiple culprits here. Something must have gone wrong with the way their data was stored.

In [6]:
raw_players_match[raw_players_match['TOI'].isnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23 entries, 35442 to 354778
Data columns (total 40 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   URL          23 non-null     object 
 1   Player name  23 non-null     object 
 2   IDSeason     23 non-null     int64  
 3   Season       23 non-null     object 
 4   Team         23 non-null     int64  
 5   Date         23 non-null     object 
 6   Teams        23 non-null     object 
 7   Score        23 non-null     object 
 8   №            23 non-null     int64  
 9   G            23 non-null     int64  
 10  Assists      23 non-null     int64  
 11  PTS          0 non-null      float64
 12  +/-          0 non-null      float64
 13  +            0 non-null      float64
 14  -            0 non-null      float64
 15  PIM          23 non-null     int64  
 16  ESG          0 non-null      float64
 17  PPG          0 non-null      float64
 18  SHG          0 non-null      float64
 19  OT

Most of the data is missing, and not because it is supposed to be a zero. After all, icetime cannot be zero if a player has participated in a match. And the only values present are integers, so there is definitely something weird with the formatting.

We do not know whether the player has zeroes in all those columns or if it is just a data storage issue. Since those are only a few broken rows, let us just drop them altogether. And, while we are at it, drop all rows where a player got no icetime.

In [7]:
raw_players_match.groupby('TOI').size()

TOI
-        9362
0       18734
0.0         1
0:01       16
0:02        8
        ...  
9:55      212
9:56      199
9:57      215
9:58      193
9:59      239
Length: 3694, dtype: int64

In [8]:
# Zero icetime can be recorded as either NaN, '-', '0' or '0.0' and we want none of them.
# The lowercase `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0 and later.

zero_list = [np.nan, '-', '0', '0.0']

# With NaN in the list, `isin` also flags the rows whose icetime is missing altogether.
# The index is rebuilt afterwards so that it stays continuous after the rows are removed.

players_match = raw_players_match[~raw_players_match['TOI'].isin(zero_list)].reset_index(drop=True)

Now we can create a new column indicating whether a player is a skater or a goalie. Let us use the shifts for separation.

In [9]:
# The number of shifts ('SFT') is only tracked for skaters, so a missing value marks the row as a goalie's.

players_match['Role'] = players_match['SFT'].isnull().map({True: 'Goalie', False: 'Skater'})

In [10]:
players_match

Unnamed: 0,URL,Player name,IDSeason,Season,Team,Date,Teams,Score,№,G,...,FOA,W,L,SOP,GA,Sv,%Sv,GAA,SO,Role
0,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,28 Dec 2013,Barys - Amur,8:2,91,0,...,,,,,,,,,,Skater
1,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,3 Jan 2014,Amur - Lokomotiv,2:1,91,0,...,,,,,,,,,,Skater
2,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,5 Jan 2014,Amur - SKA,1:6,91,0,...,,,,,,,,,,Skater
3,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,7 Jan 2014,Amur - Atlant,2:3 Б,91,0,...,,,,,,,,,,Skater
4,https://en.khl.ru/players/16673/,Sergei Abramov,244,Regular season 2013/2014,54,9 Jan 2014,Amur - Severstal,1:3,91,0,...,,,,,,,,,,Skater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422981,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,11 Dec 2009,Severstal - Vityaz,5:1,15,0,...,,,,,,,,,,Skater
422982,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,13 Dec 2009,Severstal - CSKA,4:3 Б,15,0,...,,,,,,,,,,Skater
422983,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,23 Dec 2009,Barys - Severstal,3:4,15,0,...,,,,,,,,,,Skater
422984,https://en.khl.ru/players/11543/,Alexander Zevakhin,167,Regular season 2009/2010,56,25 Dec 2009,Salavat Yulaev - Severstal,2:3,15,0,...,,,,,,,,,,Skater


We have quite a bit of work ahead of us. Many columns contain data that we would like to see in other columns, such as years, home/visit team and whether the game was finished in the main time or in overtime/by shootouts.

UPDATE: as it turns out, there is a number of rows for which the score is not actually a valid score. Let us look at it.

In [11]:
players_match[players_match['Score'] == '(-:-)']

Unnamed: 0,URL,Player name,IDSeason,Season,Team,Date,Teams,Score,№,G,...,FOA,W,L,SOP,GA,Sv,%Sv,GAA,SO,Role
3049,https://en.khl.ru/players/14597/,Yegor Averin,167,Regular season 2009/2010,34,9 Jan 2010,Vityaz - Avangard,(-:-),29,0,...,,,,,,,,,,Skater
21757,https://en.khl.ru/players/14315/,Sergei Belokon,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),61,0,...,,,,,,,,,,Skater
23288,https://en.khl.ru/players/3431/,Viktor Bobrov,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),38,0,...,,,,,,,,,,Skater
27560,https://en.khl.ru/players/14653/,Vadim Berdnikov,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),36,0,...,,,,,,,,,,Skater
35837,https://en.khl.ru/players/15606/,Georgy Belousov,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),25,0,...,,,,,,,,,,Skater
40534,https://en.khl.ru/players/3950/,Anton Babchuk,167,Regular season 2009/2010,34,9 Jan 2010,Vityaz - Avangard,(-:-),78,0,...,,,,,,,,,,Skater
41588,https://en.khl.ru/players/10176/,Anton Belov,167,Regular season 2009/2010,34,9 Jan 2010,Vityaz - Avangard,(-:-),77,0,...,,,,,,,,,,Skater
42360,https://en.khl.ru/players/6428/,Alexei Bondarev,167,Regular season 2009/2010,34,9 Jan 2010,Vityaz - Avangard,(-:-),58,0,...,,,,,,,,,,Skater
48932,https://en.khl.ru/players/13252/,Rafael Batyrshin,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),23,0,...,,,,,,,,,,Skater
68239,https://en.khl.ru/players/15295/,Sergei Denisov,167,Regular season 2009/2010,19,9 Jan 2010,Vityaz - Avangard,(-:-),20,0,...,,0.0,0.0,0.0,0.0,0.0,-,0.0,0.0,Goalie


Can you see how all of those refer to a single match, Vityaz - Avangard on January 9th 2010?

That is a unique match of sorts for the KHL. As a result of two mass fights early into the game, both teams racked up an enormous number of penalties and did not have enough players to even continue the match. As such, it was cancelled and the score was recorded the way you see above, breaking the data processing workflow. Let us just drop those observations altogether.

In [12]:
# The cancelled match is recorded with the placeholder score '(-:-)', which cannot be split into two scores.

players_match = players_match[players_match['Score'] != '(-:-)']

# We do not really need the 'IDSeason' column, as the current 'Season' column is indicative enough.

players_match = players_match.drop('IDSeason', axis=1)

# Separating the 'Season' column into the type of season and the years would allow us to more easily sort it.
# The years are taken from the last 9 characters ('2013/2014'); the 10th character from the end is the separating space.
# NOTE: the type of season ('Regular season' / 'Playoffs') is stored in 'Year' while the years are stored in 'Season'.

season_full = players_match['Season']

players_match['Year'] = season_full.str[:-10]
players_match['Season'] = season_full.str[-9:]

# We need to separate the teams into two columns.
# It is important to remove the surrounding spaces from the results.

teams = players_match['Teams'].str.split('-')

players_match['Home_team'] = teams.str[0].str.strip()
players_match['Away_team'] = teams.str[1].str.strip()

# Now separating the match score into each team's corresponding score.
# The split on a space separates the scores from an overtime indicator, and the split on a colon separates teams' scores.

score_parts = players_match['Score'].str.split(' ')
goals = score_parts.str[0].str.split(':')

players_match['Home_score'] = goals.str[0]
players_match['Away_score'] = goals.str[1]

# In addition, we create a 'Length' column that will indicate in which period the game has ended.
# A score without an overtime indicator splits into a single element, so looking up the second element gives NaN.
# Filling that NaN leaves us with the overtime indicator if it is present or a blank string if it is not.

players_match['Length'] = score_parts.str[1].fillna('')

In [13]:
# What values do we have here?

players_match['Length'].unique()

array(['', 'Б', 'ОТ'], dtype=object)

We could previously see a Russian letter 'Б' in the 'Score' column. It indicates shootouts and was not properly changed to English it seems. Therefore, we need to change it and, while we are at it, might as well change all values to the more obvious ones.

In [14]:
# A blank indicator means the game ended in regulation time; the other two indicators are Russian abbreviations.

length_dict = {
    '': 'Standard',
    'ОТ': 'Overtime',
    'Б': 'Shootouts',
}

players_match['Length'] = players_match['Length'].map(length_dict)

We can now rearrange the columns.

In [15]:
# 'Teams' and 'Score' have been split into separate columns already, so the originals can go.

players_match = players_match.drop(columns=['Teams', 'Score'])

# The columns created during processing sit at the very end and need to be moved next to the related ones.
# The slices below rely on the seven created columns being the last seven columns of the dataframe.

columns = players_match.columns

players_match = players_match[
    list(columns[:2])
    + ['Role', 'Year']
    + list(columns[2:5])
    + ['Home_team', 'Away_team', 'Home_score', 'Away_score', 'Length']
    + list(columns[5:-7])
]

# Replacing the raw abbreviations with descriptive names, listed in the new column order.

header = [
    'Profile', 'Player', 'Role', 'Year', 'Season', 'Team_id', 'Date', 'Home_team', 'Away_team', 'Home_score',
    'Away_score', 'Length', 'Number', 'Goals', 'Assists', 'Points', 'Plus_minus', 'Plus', 'Minus',
    'Penalties', 'Goals_even', 'Goals_powerplay', 'Goals_shorthanded', 'Goals_overtime', 'Game_winning_goals',
    'Game_winning_shootouts', 'Shots', 'Shots_percentage', 'Faceoffs', 'Faceoffs_won', 'Faceoffs_percentage',
    'Icetime', 'Shifts', 'Hits', 'Shots_blocked', 'Penalties_against', 'Wins', 'Losses', 'Shootouts', 'Goals_against',
    'Saves', 'Saves_percentage', 'Goals_against_average', 'Shutouts',
]

players_match.columns = header

Can we start changing the data types? Not really.

Most of our columns still have many NaN values because different statistics are tracked for skaters and goalies. And integers do not like having NaN values in them. It could be worked around but such an approach would not necessarily be the best one.

We could, of course, leave it as it is or replace missing values with zeros. However, analysing skaters and goalies together in the future sounds like a bad analysis design since the two groups are very distinct. Therefore, let us separate the data into two distinct dataframes and store skater statistics and goalie statistics separately. That way, we can also change floats into integers within each dataframe separately.

In [16]:
# The 'Role' column tells the two groups apart; each group gets its own dataframe with a fresh index.

skaters_match = players_match.loc[players_match['Role'].eq('Skater')].reset_index(drop=True)
goalies_match = players_match.loc[players_match['Role'].eq('Goalie')].reset_index(drop=True)

In [17]:
# Most columns are either null or non-null in every row except for 'Hits', 'Shots_blocked' and 'Penalties_against'.

skaters_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399663 entries, 0 to 399662
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Profile                 399663 non-null  object 
 1   Player                  399663 non-null  object 
 2   Role                    399663 non-null  object 
 3   Year                    399663 non-null  object 
 4   Season                  399663 non-null  object 
 5   Team_id                 399663 non-null  int64  
 6   Date                    399663 non-null  object 
 7   Home_team               399663 non-null  object 
 8   Away_team               399663 non-null  object 
 9   Home_score              399663 non-null  object 
 10  Away_score              399663 non-null  object 
 11  Length                  399663 non-null  object 
 12  Number                  399663 non-null  int64  
 13  Goals                   399663 non-null  int64  
 14  Assists             

In [18]:
# And all is perfect here!

goalies_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23283 entries, 0 to 23282
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Profile                 23283 non-null  object 
 1   Player                  23283 non-null  object 
 2   Role                    23283 non-null  object 
 3   Year                    23283 non-null  object 
 4   Season                  23283 non-null  object 
 5   Team_id                 23283 non-null  int64  
 6   Date                    23283 non-null  object 
 7   Home_team               23283 non-null  object 
 8   Away_team               23283 non-null  object 
 9   Home_score              23283 non-null  object 
 10  Away_score              23283 non-null  object 
 11  Length                  23283 non-null  object 
 12  Number                  23283 non-null  int64  
 13  Goals                   23283 non-null  int64  
 14  Assists                 23283 non-null

What is going on here? It feels as if for some reason those three columns are only recorded part of the time. Is it happening all the time?

In [19]:
# Distribution of NaN values for 'Hits' by season.

skaters_match[skaters_match['Hits'].isnull()].groupby(['Season', 'Year']).size()

Season     Year          
2008/2009  Playoffs           2241
           Regular season    24804
2009/2010  Playoffs           2659
           Regular season    25299
2010/2011  Playoffs           3174
           Regular season    23044
2011/2012  Playoffs           3103
           Regular season    23167
2012/2013  Playoffs           3161
           Regular season    25681
2013/2014  Playoffs           3176
           Regular season    28986
dtype: int64

Would you look at that, the missing values only occur in seasons 2008/2009 through 2013/2014! Clearly, the indicators were just not tracked in those years. Mystery solved.

However, what should we do with it? Replacing it with zeros would not be very fair and can mess up our analysis. At the same time, we cannot change the column to integers without replacing the NaN values. Oh well, we might leave it as is for now and keep in mind that any further analysis needs to take into account that change from season 2014/2015 onwards. Now let us move onto the rest of the columns.

We are going to remove the null columns and change the non-null ones to integers. Actually, in a few cases we would need to change the columns to floats for things such as '%SOG' (percentage of shots on goal that scored) which seem to be stored as objects right now.

At the same time, some columns that we want to store as floats have '-' for their value, which cannot be converted into a float value. This is because they are obtained by dividing one statistic by another and one of the two may not be suitable for such an operation. We are going to replace those values with NaN.

Moreover, the 'TOI' column is stored in the format 'minutes:seconds' and is thus not convertible to floats. A new column will be added for it, calculated as an integer value in seconds.

In [20]:
# We can copy paste parts of the previously created list of column names instead of typing them up manually.

print(header)

['Profile', 'Player', 'Role', 'Year', 'Season', 'Team_id', 'Date', 'Home_team', 'Away_team', 'Home_score', 'Away_score', 'Length', 'Number', 'Goals', 'Assists', 'Points', 'Plus_minus', 'Plus', 'Minus', 'Penalties', 'Goals_even', 'Goals_powerplay', 'Goals_shorthanded', 'Goals_overtime', 'Game_winning_goals', 'Game_winning_shootouts', 'Shots', 'Shots_percentage', 'Faceoffs', 'Faceoffs_won', 'Faceoffs_percentage', 'Icetime', 'Shifts', 'Hits', 'Shots_blocked', 'Penalties_against', 'Wins', 'Losses', 'Shootouts', 'Goals_against', 'Saves', 'Saves_percentage', 'Goals_against_average', 'Shutouts']


In [21]:
# Starting with the more numerous skaters: the goalie-only statistics are empty for them and are dropped.

skaters_match = skaters_match.drop(columns=['Wins', 'Losses', 'Shootouts', 'Goals_against', 'Saves', 'Saves_percentage',
                                            'Goals_against_average', 'Shutouts'])

# A list of columns to be changed into integers.

skaters_int = ['Home_score', 'Away_score', 'Goals', 'Assists', 'Points', 'Plus_minus', 'Plus',
               'Minus', 'Penalties', 'Goals_even', 'Goals_powerplay', 'Goals_shorthanded', 'Goals_overtime',
               'Game_winning_goals', 'Game_winning_shootouts', 'Shots', 'Faceoffs', 'Faceoffs_won']

skaters_match[skaters_int] = skaters_match[skaters_int].astype('int')

# We cannot directly convert shifts which are in float format but stored as objects.

skaters_match['Shifts'] = skaters_match['Shifts'].astype('float').astype('int')

# A list of columns to be changed into floats.
# Remember, we need to fix the '-' symbol and cannot change the 'Icetime' column.
# The lowercase `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0 and later.

skaters_float = ['Shots_percentage', 'Faceoffs_percentage']

skaters_match[skaters_float] = skaters_match[skaters_float].replace('-', np.nan).astype('float')

# The player's number and team id are currently stored as integers, let us change them into object.

skaters_object = ['Team_id', 'Number']

skaters_match[skaters_object] = skaters_match[skaters_object].astype('object')

# The dates are stored as text and are parsed into proper datetime values.

skaters_match['Date'] = pd.to_datetime(skaters_match['Date'])

# Finally, let us add the icetime in seconds. For icetime, we are okay with having zero values instead of NaN.
# The last two characters are the seconds, and everything before the separating colon is the minutes.

skaters_match['Icetime_seconds'] = skaters_match['Icetime'].apply(lambda x: int(x[:-3]) * 60 + int(x[-2:]))

# Moving the new column to be right after our existing icetime.
# The new column is currently the last one, hence the '-1' when taking the remaining columns.

header_skaters = skaters_match.columns

icetime_end = header_skaters.get_loc('Icetime') + 1

skaters_match = skaters_match[list(header_skaters[:icetime_end]) + ['Icetime_seconds'] +
                              list(header_skaters[icetime_end:-1])]

Some of the goalies have an issue where the icetime is stored in seconds to begin with. That breaks the algorithm in multiple ways, and requires writing a separate function to apply to the column itself. Ideally, we want not only to get the correct icetime in seconds but also fix the original 'Icetime' column so that it always displays in 'minutes:seconds' format.

UPDATE: no idea why the commented-out function does not work. It returns a Series of tuples when applied to the 'Icetime' column, and after two hours the solution as to how to unpack it into two separate columns is still unclear. Therefore, just a quick patch for now.

In [22]:
# def icetime(icetime):
    
#     # We can have no less than 4 symbols for icetime stored in a 'minutes:seconds' format.
    
#     time_list = icetime[0].split(':')
    
#     if len(time_list) == 2:
        
#         minutes = int(time_list[0])
#         seconds = int(time_list[1])
        
#         return icetime
    
#     else:
           
#         minutes = 0
#         seconds = int(time_list[0])
        
#     while seconds >= 60:
            
#         minutes += 1
#         seconds -= 60
        
#     return f'{minutes}:{seconds}', seconds

In [23]:
def icetime(icetime):
    """Return an icetime value in the 'minutes:seconds' format.

    Parameters
    ----------
    icetime : str
        Either an icetime that already contains exactly one colon, or a
        total number of seconds written as a string of digits.

    Returns
    -------
    str
        The value itself if it already contains exactly one colon, otherwise
        the seconds converted into 'minutes:seconds' with the seconds
        zero-padded to two digits.

    Raises
    ------
    ValueError
        If the value has to be converted and its first colon-separated part
        is not an integer.
    """

    time_list = icetime.split(':')

    # Exactly one colon means the value is in the target format already.
    if len(time_list) == 2:
        return icetime

    # Otherwise the value is treated as a total number of seconds.
    minutes, seconds = divmod(int(time_list[0]), 60)

    # The seconds are zero-padded so that 65 seconds become '1:05' rather than '1:5'.
    return f'{minutes}:{seconds:02d}'
        
def icetime_seconds(icetime):
    """Convert an icetime in the 'minutes:seconds' format into a total number of seconds.

    Meant to be used after all icetime values have been turned into the
    'minutes:seconds' format. The part before the first colon is read as the
    minutes and the part right after it as the seconds.
    """

    parts = icetime.split(':')

    return int(parts[0]) * 60 + int(parts[1])

In [24]:
# Now for the goalies: the skater-only statistics are empty for them and are dropped.

goalies_match = goalies_match.drop(columns=['Points', 'Plus_minus', 'Plus', 'Minus', 'Goals_even', 'Goals_powerplay',
                                            'Goals_shorthanded', 'Goals_overtime', 'Game_winning_goals',
                                            'Game_winning_shootouts', 'Shots_percentage', 'Faceoffs', 'Faceoffs_won',
                                            'Faceoffs_percentage', 'Shifts', 'Hits', 'Shots_blocked',
                                            'Penalties_against'])

# A list of columns to be changed into integers.
# The match scores are included so that they end up as integers for goalies just like they do for skaters.

goalies_int = ['Home_score', 'Away_score', 'Goals', 'Assists', 'Penalties', 'Shots', 'Wins', 'Losses', 'Shootouts',
               'Goals_against', 'Saves', 'Shutouts']

goalies_match[goalies_int] = goalies_match[goalies_int].astype('int')

# A list of columns to be changed into floats.
# We still need to fix the '-' symbol and cannot change the 'Icetime' column.
# The lowercase `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0 and later.

goalies_float = ['Saves_percentage', 'Goals_against_average']

goalies_match[goalies_float] = goalies_match[goalies_float].replace('-', np.nan).astype('float')

# The player's number and team id are currently stored as integers, let us change them into object.

goalies_object = ['Team_id', 'Number']

goalies_match[goalies_object] = goalies_match[goalies_object].astype('object')

# The dates are stored as text and are parsed into proper datetime values.

goalies_match['Date'] = pd.to_datetime(goalies_match['Date'])

# NOTE(review): this refreshes the skaters' header and looks like a leftover; nothing in this cell reads it.

header_skaters = skaters_match.columns

# Finally, let us fix the icetime and add the icetime in seconds.
# Some goalie icetimes are stored in seconds only, so the format is unified before the seconds are calculated.

goalies_match['Icetime'] = goalies_match['Icetime'].apply(icetime)
goalies_match['Icetime_seconds'] = goalies_match['Icetime'].apply(icetime_seconds)

# Moving the new column to be right after our existing icetime.
# The new column is currently the last one, hence the '-1' when taking the remaining columns.

header_goalies = goalies_match.columns

icetime_end = header_goalies.get_loc('Icetime') + 1

goalies_match = goalies_match[list(header_goalies[:icetime_end]) + ['Icetime_seconds'] +
                              list(header_goalies[icetime_end:-1])]

Everything seems to be in order, good job us! Now, for the best part.

In [25]:
# Each processed dataframe goes into its own file; the index is left out of the saved data.

skaters_match.to_csv(path_or_buf='../data/skaters_match.csv', index=False, encoding='utf8')
goalies_match.to_csv(path_or_buf='../data/goalies_match.csv', index=False, encoding='utf8')