In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sqlite3
import datetime
from typing import Optional, List
from pprint import pprint

%matplotlib inline

## Gather Data

In [2]:
def get_dataframe_from_sql(query, db_path = 'football.sqlite'):
    """
    Returns a pandas dataframe containing the db data returned
    by the provided SQL query.
    """
    # establish a connection to the database
    conn = sqlite3.connect(db_path)

    # load the query results into a pandas dataframe
    df = pd.read_sql_query(query, conn)

    # close the connection to the database
    conn.close()

    # return the dataframe
    return df

In [3]:
get_dataframe_from_sql("SELECT name FROM sqlite_master WHERE type='table';")

Unnamed: 0,name
0,sqlite_sequence
1,Player_Attributes
2,Player
3,Match
4,League
5,Country
6,Team
7,Team_Attributes


In [4]:
get_dataframe_from_sql("PRAGMA table_info(Player_Attributes);").head(30)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,1
1,1,player_fifa_api_id,INTEGER,0,,0
2,2,player_api_id,INTEGER,0,,0
3,3,date,TEXT,0,,0
4,4,overall_rating,INTEGER,0,,0
5,5,potential,INTEGER,0,,0
6,6,preferred_foot,TEXT,0,,0
7,7,attacking_work_rate,TEXT,0,,0
8,8,defensive_work_rate,TEXT,0,,0
9,9,crossing,INTEGER,0,,0


#### get the available countries

In [5]:
countries_df = get_dataframe_from_sql("SELECT * FROM Country;")
countries_df

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


#### get the top 5 leagues

In [6]:
top_5_league_country_names = np.array(['England', 'France', 'Germany', 'Italy', 'Spain'])
top_5_league_country_names

array(['England', 'France', 'Germany', 'Italy', 'Spain'], dtype='<U7')

In [7]:
countries_df[countries_df.name.isin(top_5_league_country_names)].index

Int64Index([1, 2, 3, 4, 9], dtype='int64')

In [8]:
leagues_df = get_dataframe_from_sql("SELECT * FROM League;")
leagues_df

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [9]:
leagues_df = leagues_df[leagues_df.index.isin(countries_df[countries_df.name.isin(top_5_league_country_names)].index)]
leagues_df

Unnamed: 0,id,country_id,name
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
9,21518,21518,Spain LIGA BBVA


#### filter the matches that are in each league

In [10]:
top_5_leagues_matches_df = get_dataframe_from_sql('SELECT * FROM Match').query('league_id in @leagues_df.id')
top_5_leagues_matches_df = top_5_leagues_matches_df.sort_values("date").reset_index(drop=True)
top_5_leagues_matches_df.sample(10)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
1560,8000,7809,7809,2008/2009,29,2009-04-25 00:00:00,499571,9789,9790,2,...,2.75,2.2,3.3,3.0,2.25,3.3,3.0,2.38,3.3,2.88
12548,24077,21518,21518,2014/2015,34,2015-04-29 00:00:00,1778373,9910,9864,1,...,,1.73,3.9,5.25,,,,,,
9490,12502,10257,10257,2013/2014,8,2013-10-18 00:00:00,1536597,8686,9875,2,...,3.5,2.2,3.5,3.5,,,,,,
9293,12355,10257,10257,2013/2014,3,2013-09-15 00:00:00,1536540,8535,8529,1,...,7.5,1.45,4.75,7.5,,,,,,
1612,8020,7809,7809,2008/2009,30,2009-05-02 00:00:00,499582,9810,9789,0,...,2.25,3.25,3.4,2.0,3.0,3.3,2.2,3.0,3.4,2.25
2667,2218,1729,1729,2009/2010,19,2009-12-27 00:00:00,658925,9825,10252,3,...,5.5,1.73,3.5,5.5,1.73,3.5,4.75,1.73,3.5,5.0
7571,9305,7809,7809,2012/2013,6,2012-09-30 00:00:00,1239514,8721,9905,0,...,3.13,2.25,3.5,3.3,2.2,3.5,3.3,2.25,3.4,3.0
9507,6686,4769,4769,2013/2014,10,2013-10-19 00:00:00,1468256,9851,9873,2,...,4.33,1.92,3.5,4.4,,,,,,
13434,24218,21518,21518,2015/2016,13,2015-11-29 00:00:00,2030203,8305,10205,2,...,,3.25,3.3,2.4,,,,,,
9073,3563,1729,1729,2012/2013,38,2013-05-19 00:00:00,1228312,8466,10194,1,...,5.0,1.8,3.6,5.0,1.75,3.5,4.75,1.73,3.5,5.0


#### get the teams

In [11]:
teams_df = get_dataframe_from_sql("SELECT * FROM Team;")
teams_df.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [12]:
# Get the top 5 leagues' teams
# use the api id of both home and away teams in case some teams have missing home/away match info
top_5_leagues_team_ids = np.unique(np.concatenate((top_5_leagues_matches_df.home_team_api_id.unique(), top_5_leagues_matches_df.away_team_api_id.unique())))
teams_df = teams_df[teams_df.team_api_id.isin(top_5_leagues_team_ids)].reset_index(drop=True)
teams_df.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,3457,10260,11.0,Manchester United,MUN
1,3458,10261,13.0,Newcastle United,NEW
2,3459,9825,1.0,Arsenal,ARS
3,3460,8659,109.0,West Bromwich Albion,WBA
4,3461,8472,106.0,Sunderland,SUN


### get attributes of teams in the top 5 leagues

In [13]:
team_attributes_df = get_dataframe_from_sql("SELECT * FROM Team_Attributes;")
team_attributes_df.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [14]:
# Get the attributes of the top 5 leagues' teams
team_attributes_df = team_attributes_df[team_attributes_df.team_api_id.isin(teams_df.team_api_id.unique())]
team_attributes_df.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
9,10,614,8576,2010-02-22 00:00:00,60,Balanced,,Little,40,Mixed,...,55,Normal,Organised,30,Deep,70,Double,30,Narrow,Offside Trap
10,11,614,8576,2011-02-22 00:00:00,65,Balanced,,Little,45,Mixed,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover
11,12,614,8576,2012-02-22 00:00:00,59,Balanced,,Little,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover
12,13,614,8576,2013-09-20 00:00:00,59,Balanced,,Little,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover
13,14,614,8576,2014-09-19 00:00:00,59,Balanced,57.0,Normal,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover


### get the players

In [15]:
players_df = get_dataframe_from_sql('SELECT * FROM Player;')
players_df.head()

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


### get the player attributes

In [16]:
player_attributes_df = get_dataframe_from_sql('SELECT * FROM Player_Attributes;')
player_attributes_df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


### get the champions league history data

In [17]:
champs_league_hist_df = pd.read_csv("./champs_league_hist.csv", header=None, names=["year", "team_name", "progress", "team_country"])
champs_league_hist_df.head()

Unnamed: 0,year,team_name,progress,team_country
0,1994,A.C. Milan,1. Winner,Italy
1,1994,Barcelona,2. Runner Up,Spain
2,1994,A.C. Milan,3. Semifinalist,Italy
3,1994,Barcelona,3. Semifinalist,Spain
4,1994,Monaco,3. Semifinalist,France


## Assess Data - Quality

### Assessing match data

In [18]:
predictions_start_idx = -30
match_events_start_idx = -38
away_players_start_idx = -49
home_players_start_idx = -60
away_players_y_pos_start_idx = -71
home_players_y_pos_start_idx = -82
away_players_x_pos_start_idx = -93
home_players_x_pos_start_idx = -104

In [19]:
top_5_leagues_matches_df.iloc[:, :home_players_x_pos_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                14585 non-null  int64 
 1   country_id        14585 non-null  int64 
 2   league_id         14585 non-null  int64 
 3   season            14585 non-null  object
 4   stage             14585 non-null  int64 
 5   date              14585 non-null  object
 6   match_api_id      14585 non-null  int64 
 7   home_team_api_id  14585 non-null  int64 
 8   away_team_api_id  14585 non-null  int64 
 9   home_team_goal    14585 non-null  int64 
 10  away_team_goal    14585 non-null  int64 
dtypes: int64(9), object(2)
memory usage: 1.2+ MB


In [20]:
top_5_leagues_matches_df.iloc[:, :home_players_x_pos_start_idx].describe()

Unnamed: 0,id,country_id,league_id,stage,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
count,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0
mean,10739.324306,9271.884882,9271.884882,19.152143,1198544.0,9513.471718,9513.674734,1.539184,1.140075
std,6984.373837,6921.447469,6921.447469,10.810203,494169.8,8097.777199,8097.767842,1.289485,1.129409
min,1729.0,1729.0,1729.0,1.0,483129.0,4087.0,4087.0,0.0,0.0
25%,5375.0,4769.0,4769.0,10.0,705602.0,8535.0,8535.0,1.0,0.0
50%,9021.0,7809.0,7809.0,19.0,1216821.0,8686.0,8686.0,1.0,1.0
75%,12667.0,10257.0,10257.0,28.0,1709701.0,9869.0,9869.0,2.0,2.0
max,24557.0,21518.0,21518.0,38.0,2118418.0,208931.0,208931.0,10.0,9.0


In [21]:
# investigate the results with max home and away goals to confirm that they're not shit
def get_max_home_goals_league_match_details():
    league_name = leagues_df[leagues_df.id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].league_id.values[0]].name
    home_team_name = teams_df[teams_df.team_api_id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].home_team_api_id.values[0]].team_long_name.values[0]
    away_team_name = teams_df[teams_df.team_api_id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].away_team_api_id.values[0]].team_long_name.values[0]
    match_info = top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10][["home_team_goal", "away_team_goal", "season", "date"]].squeeze()
    home_team_goal = match_info.home_team_goal
    away_team_goal = match_info.away_team_goal
    season = match_info.season
    date = match_info.date

    return f"{home_team_name} {home_team_goal} - {away_team_goal} {away_team_name} played on {date} in season {season}"

get_max_home_goals_league_match_details()

'Real Madrid CF 10 - 2 Rayo Vallecano played on 2015-12-20 00:00:00 in season 2015/2016'

In [22]:
def get_max_away_goals_league_match_details():
    mask = top_5_leagues_matches_df.away_team_goal == top_5_leagues_matches_df.away_team_goal.max()
    max_away_goals_match = top_5_leagues_matches_df[mask]
    league_name = leagues_df[leagues_df.id == max_away_goals_match.league_id.values[0]].name
    home_team_name = teams_df[teams_df.team_api_id == max_away_goals_match.home_team_api_id.values[0]].team_long_name.values[0]
    away_team_name = teams_df[teams_df.team_api_id == max_away_goals_match.away_team_api_id.values[0]].team_long_name.values[0]
    match_info = max_away_goals_match[["home_team_goal", "away_team_goal", "season", "date"]].squeeze()
    home_team_goal = match_info.home_team_goal
    away_team_goal = match_info.away_team_goal
    season = match_info.season
    date = match_info.date

    return f"{home_team_name} {home_team_goal} - {away_team_goal} {away_team_name} played on {date} in season {season}"

get_max_away_goals_league_match_details()

'ES Troyes AC 0 - 9 Paris Saint-Germain played on 2016-03-13 00:00:00 in season 2015/2016'

In [23]:
# check the max and min dates of the matches
top_5_leagues_matches_df.date.min(), top_5_leagues_matches_df.date.max()

('2008-08-09 00:00:00', '2016-05-17 00:00:00')

In [24]:
# check the range of the seasons
top_5_leagues_matches_df.season.unique()

array(['2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013',
       '2013/2014', '2014/2015', '2015/2016'], dtype=object)

In [25]:
# check the range of values for stage
top_5_leagues_matches_df.stage.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38])

### assessing match players

In [26]:
# home players
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   home_player_1   14547 non-null  float64
 1   home_player_2   14538 non-null  float64
 2   home_player_3   14545 non-null  float64
 3   home_player_4   14541 non-null  float64
 4   home_player_5   14547 non-null  float64
 5   home_player_6   14551 non-null  float64
 6   home_player_7   14533 non-null  float64
 7   home_player_8   14541 non-null  float64
 8   home_player_9   14551 non-null  float64
 9   home_player_10  14512 non-null  float64
 10  home_player_11  14523 non-null  float64
dtypes: float64(11)
memory usage: 1.2 MB


In [27]:
# home players
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe()

Unnamed: 0,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11
count,14547.0,14538.0,14545.0,14541.0,14547.0,14551.0,14533.0,14541.0,14551.0,14512.0,14523.0
mean,67540.999244,90120.123401,75328.502991,78945.332233,94028.161477,86215.306989,82200.12076,92028.973798,94381.943509,88395.371348,82525.980307
std,79081.728488,97652.409271,88316.786481,86523.706029,103101.134223,97119.99795,94373.382459,102354.036812,101888.416488,96795.703335,92267.511513
min,2984.0,2802.0,2752.0,2752.0,2752.0,2802.0,2802.0,2802.0,2770.0,2802.0,2802.0
25%,30380.0,30861.0,27492.0,27684.0,30983.0,30721.0,30530.0,30930.0,31235.0,30881.0,30853.0
50%,36479.0,39731.5,37482.0,38432.0,40985.0,39376.0,39198.0,40731.0,40601.0,39638.0,38848.0
75%,56829.0,141113.0,93457.0,101070.0,130155.0,112035.0,107930.0,121044.0,144993.0,121633.0,104045.0
max,698273.0,748432.0,696443.0,696443.0,720738.0,722766.0,692984.0,693171.0,722766.0,742405.0,696365.0


In [28]:
# confirm that the max number actually exists in the players df
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe().loc["max", :].unique().size == players_df[players_df.player_api_id.isin(top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe().loc["max", :].unique())].shape[0]

True

In [29]:
# away players
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   away_player_1   14556 non-null  float64
 1   away_player_2   14536 non-null  float64
 2   away_player_3   14539 non-null  float64
 3   away_player_4   14539 non-null  float64
 4   away_player_5   14541 non-null  float64
 5   away_player_6   14540 non-null  float64
 6   away_player_7   14542 non-null  float64
 7   away_player_8   14530 non-null  float64
 8   away_player_9   14538 non-null  float64
 9   away_player_10  14523 non-null  float64
 10  away_player_11  14513 non-null  float64
dtypes: float64(11)
memory usage: 1.2 MB


In [30]:
# away players
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe()

Unnamed: 0,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
count,14556.0,14536.0,14539.0,14539.0,14541.0,14540.0,14542.0,14530.0,14538.0,14523.0,14513.0
mean,67814.32928,91015.047193,74546.03329,80039.389917,93992.660684,86831.252063,83094.74192,94315.749966,95323.630898,90212.77484,84306.015641
std,79174.061489,98841.807943,86654.868263,89174.013853,102718.305201,97069.715662,95644.730922,105128.693217,103836.645073,99666.106104,93356.739627
min,2796.0,2790.0,2752.0,2752.0,2790.0,2802.0,2802.0,2802.0,2802.0,2770.0,2802.0
25%,30380.0,30894.0,27476.0,27679.0,30977.0,30731.0,30598.0,30920.0,31304.0,30893.0,30853.0
50%,36479.0,39841.0,37451.0,38432.0,40985.0,39487.0,39267.0,41098.0,40636.0,39793.0,39225.0
75%,56829.0,144999.0,89475.0,103089.0,130155.0,113465.0,109330.0,128827.0,144993.0,127982.5,108809.0
max,698273.0,748432.0,696443.0,696443.0,720738.0,722766.0,750435.0,710807.0,722766.0,722766.0,717270.0


In [31]:
# confirm that the max number actually exists in the players df
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe().loc["max", :].unique().size == players_df[players_df.player_api_id.isin(top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe().loc["max", :].unique())].shape[0]

True

### assessing match events

In [32]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   goal        13567 non-null  object
 1   shoton      13567 non-null  object
 2   shotoff     13567 non-null  object
 3   foulcommit  13567 non-null  object
 4   card        13567 non-null  object
 5   cross       13567 non-null  object
 6   corner      13567 non-null  object
 7   possession  13567 non-null  object
dtypes: object(8)
memory usage: 911.7+ KB


In [33]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].describe()

Unnamed: 0,goal,shoton,shotoff,foulcommit,card,cross,corner,possession
count,13567,13567,13567,13567,13567,13567,13567,13567
unique,12606,8121,8121,8123,13165,8123,8122,8081
top,<goal />,<shoton />,<shotoff />,<foulcommit />,<card />,<cross />,<corner />,<possession />
freq,962,5447,5447,5445,403,5445,5446,5487


In [34]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].describe().loc["top", "goal"]

'<goal />'

In [35]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].iloc[-1, 0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>22</value><value>6</value></coordinates><elapsed>43</elapsed><player2>413557</player2><subtype>shot</subtype><player1>30829</player1><sortorder>2</sortorder><team>10260</team><id>5623457</id><n>169</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>28</value><value>8</value></coordinates><elapsed>75</elapsed><player2>35327</player2><subtype>shot</subtype><player1>696365</player1><sortorder>0</sortorder><team>10260</team><id>5623542</id><n>329</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>25</value><value>6</value></coordinates><elapsed>87</el

Seems those numbers are not really reflective of the full data in the columns, so will not pay attention to them for now.

### Assessing match predictions

In [36]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-30:-15])].describe()

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA
count,14573.0,14573.0,14573.0,14558.0,14558.0,14558.0,14540.0,14540.0,14540.0,14568.0,14568.0,14568.0,7292.0,7292.0,7292.0
mean,2.595606,3.809271,4.70451,2.541709,3.738281,4.455709,2.454475,3.625189,4.193165,2.512505,3.706167,4.429821,2.787401,4.140014,5.128762
std,1.756999,1.171708,3.794316,1.625401,1.090557,3.359353,1.434216,0.838685,2.881999,1.606546,1.069737,3.44706,2.191845,1.68015,4.701423
min,1.04,1.4,1.08,1.03,1.65,1.1,1.05,1.5,1.1,1.04,1.4,1.1,1.04,2.2,1.09
25%,1.67,3.25,2.6,1.7,3.2,2.6,1.7,3.2,2.55,1.67,3.2,2.5,1.71,3.38,2.6175
50%,2.1,3.4,3.6,2.1,3.4,3.45,2.1,3.3,3.3,2.1,3.4,3.4,2.18,3.6,3.73
75%,2.75,3.8,5.25,2.7,3.8,5.0,2.6,3.7,4.65,2.7,3.75,5.0,2.94,4.18,5.55
max,26.0,17.0,51.0,34.0,19.5,51.0,20.0,11.0,25.0,26.0,19.0,51.0,36.0,29.0,47.5


In [37]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-15:])].describe()

Unnamed: 0,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
count,14568.0,14568.0,14568.0,11074.0,11074.0,11074.0,14555.0,14555.0,14555.0,9081.0,9081.0,9081.0,9085.0,9085.0,9085.0
mean,2.569285,3.62959,4.570721,2.533537,3.736561,4.680812,2.64294,3.875681,4.909706,2.472426,3.622296,4.386528,2.465726,3.62515,4.418149
std,1.688519,0.987731,3.745467,1.627928,1.036067,3.716216,1.916398,1.325661,4.465039,1.454708,0.883999,2.998704,1.460544,0.871147,3.158631
min,1.02,1.02,1.08,1.04,1.4,1.1,1.03,1.62,1.08,1.05,1.45,1.12,1.04,1.33,1.12
25%,1.7,3.2,2.62,1.67,3.25,2.63,1.7,3.25,2.62,1.7,3.2,2.6,1.67,3.2,2.62
50%,2.15,3.3,3.4,2.1,3.4,3.6,2.15,3.5,3.6,2.1,3.3,3.5,2.1,3.3,3.4
75%,2.7,3.75,5.0,2.7,3.75,5.25,2.8,4.0,5.4,2.63,3.7,5.0,2.6,3.75,5.0
max,26.0,17.0,51.0,23.0,15.0,41.0,36.0,26.0,67.0,21.0,11.0,34.0,17.0,13.0,34.0


In [38]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-30:-15])].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   B365H   14573 non-null  float64
 1   B365D   14573 non-null  float64
 2   B365A   14573 non-null  float64
 3   BWH     14558 non-null  float64
 4   BWD     14558 non-null  float64
 5   BWA     14558 non-null  float64
 6   IWH     14540 non-null  float64
 7   IWD     14540 non-null  float64
 8   IWA     14540 non-null  float64
 9   LBH     14568 non-null  float64
 10  LBD     14568 non-null  float64
 11  LBA     14568 non-null  float64
 12  PSH     7292 non-null   float64
 13  PSD     7292 non-null   float64
 14  PSA     7292 non-null   float64
dtypes: float64(15)
memory usage: 1.7 MB


In [39]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-15:])].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   WHH     14568 non-null  float64
 1   WHD     14568 non-null  float64
 2   WHA     14568 non-null  float64
 3   SJH     11074 non-null  float64
 4   SJD     11074 non-null  float64
 5   SJA     11074 non-null  float64
 6   VCH     14555 non-null  float64
 7   VCD     14555 non-null  float64
 8   VCA     14555 non-null  float64
 9   GBH     9081 non-null   float64
 10  GBD     9081 non-null   float64
 11  GBA     9081 non-null   float64
 12  BSH     9085 non-null   float64
 13  BSD     9085 non-null   float64
 14  BSA     9085 non-null   float64
dtypes: float64(15)
memory usage: 1.7 MB


### assessing player attributes

In [40]:
player_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  int64  
 1   player_fifa_api_id   183978 non-null  int64  
 2   player_api_id        183978 non-null  int64  
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

In [41]:
player_attributes_df.isna().sum().unique()

array([   0,  836, 3230, 2713])

In [42]:
attributes_start_idx = 4

In [43]:
player_attributes_df.iloc[:, :attributes_start_idx].describe()

Unnamed: 0,id,player_fifa_api_id,player_api_id
count,183978.0,183978.0,183978.0
mean,91989.5,165671.524291,135900.617324
std,53110.01825,53851.094769,136927.84051
min,1.0,2.0,2625.0
25%,45995.25,155798.0,34763.0
50%,91989.5,183488.0,77741.0
75%,137983.75,199848.0,191080.0
max,183978.0,234141.0,750584.0


In [44]:
# confirm that the max value make sense
players_df[players_df.player_api_id == player_attributes_df.iloc[:, :attributes_start_idx].describe().loc["max", :].player_api_id]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
470,473,750584,Alexandre Azevedo,234141,1997-01-28 00:00:00,175.26,150


In [45]:
player_attributes_df.iloc[:, attributes_start_idx:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 38 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   overall_rating       183142 non-null  float64
 1   potential            183142 non-null  float64
 2   preferred_foot       183142 non-null  object 
 3   attacking_work_rate  180748 non-null  object 
 4   defensive_work_rate  183142 non-null  object 
 5   crossing             183142 non-null  float64
 6   finishing            183142 non-null  float64
 7   heading_accuracy     183142 non-null  float64
 8   short_passing        183142 non-null  float64
 9   volleys              181265 non-null  float64
 10  dribbling            183142 non-null  float64
 11  curve                181265 non-null  float64
 12  free_kick_accuracy   183142 non-null  float64
 13  long_passing         183142 non-null  float64
 14  ball_control         183142 non-null  float64
 15  acceleration     

In [46]:
player_attributes_df.preferred_foot.unique()

array(['right', 'left', None], dtype=object)

In [47]:
player_attributes_df.attacking_work_rate.unique()

array(['medium', 'high', None, 'low', 'None', 'le', 'norm', 'stoc', 'y'],
      dtype=object)

In [48]:
player_attributes_df.defensive_work_rate.unique()

array(['medium', 'high', 'low', '_0', None, '5', 'ean', 'o', '1', 'ormal',
       '7', '2', '8', '4', 'tocky', '0', '3', '6', '9', 'es'],
      dtype=object)

In [49]:
player_attributes_df.iloc[:, attributes_start_idx:attributes_start_idx+15].describe()

Unnamed: 0,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control
count,183142.0,183142.0,183142.0,183142.0,183142.0,183142.0,181265.0,183142.0,181265.0,183142.0,183142.0,183142.0
mean,68.600015,73.460353,55.086883,49.921078,57.266023,62.429672,49.468436,59.175154,52.965675,49.38095,57.06988,63.388879
std,7.041139,6.592271,17.242135,19.038705,16.488905,14.194068,18.256618,17.744688,18.255788,17.831746,14.394464,15.196671
min,33.0,39.0,1.0,1.0,1.0,3.0,1.0,1.0,2.0,1.0,3.0,5.0
25%,64.0,69.0,45.0,34.0,49.0,57.0,35.0,52.0,41.0,36.0,49.0,58.0
50%,69.0,74.0,59.0,53.0,60.0,65.0,52.0,64.0,56.0,50.0,59.0,67.0
75%,73.0,78.0,68.0,65.0,68.0,72.0,64.0,72.0,67.0,63.0,67.0,73.0
max,94.0,97.0,95.0,97.0,98.0,97.0,93.0,97.0,94.0,97.0,97.0,97.0


In [50]:
player_attributes_df.iloc[:, attributes_start_idx+15:attributes_start_idx+30].describe()

Unnamed: 0,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties
count,183142.0,183142.0,181265.0,183142.0,181265.0,183142.0,181265.0,183142.0,183142.0,183142.0,183142.0,183142.0,183142.0,181265.0,183142.0
mean,67.659357,68.051244,65.97091,66.103706,65.189496,61.808427,66.969045,67.038544,67.424529,53.339431,60.948046,52.009271,55.786504,57.87355,55.003986
std,12.983326,12.569721,12.954585,9.155408,13.063188,16.135143,11.006734,13.165262,12.07228,18.367025,16.089521,19.450133,18.448292,15.144086,15.546519
min,10.0,12.0,11.0,17.0,12.0,2.0,14.0,10.0,10.0,1.0,6.0,1.0,2.0,1.0,2.0
25%,61.0,62.0,58.0,61.0,58.0,54.0,60.0,61.0,60.0,41.0,51.0,34.0,45.0,49.0,45.0
50%,69.0,69.0,68.0,67.0,67.0,65.0,68.0,69.0,69.0,58.0,64.0,57.0,60.0,60.0,57.0
75%,77.0,77.0,75.0,72.0,74.0,73.0,74.0,76.0,76.0,67.0,73.0,68.0,69.0,69.0,67.0
max,97.0,97.0,96.0,96.0,96.0,97.0,96.0,96.0,96.0,96.0,97.0,96.0,96.0,97.0,96.0


In [51]:
player_attributes_df.iloc[:, attributes_start_idx+30:].describe()

Unnamed: 0,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
count,183142.0,183142.0,181265.0,183142.0,183142.0,183142.0,183142.0,183142.0
mean,46.772242,50.351257,48.001462,14.704393,16.063612,20.998362,16.132154,16.441439
std,21.227667,21.483706,21.598778,16.865467,15.867382,21.45298,16.099175,17.198155
min,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
25%,25.0,29.0,25.0,7.0,8.0,8.0,8.0,8.0
50%,50.0,56.0,53.0,10.0,11.0,12.0,11.0,11.0
75%,66.0,69.0,67.0,13.0,15.0,15.0,15.0,15.0
max,96.0,95.0,95.0,94.0,93.0,97.0,96.0,96.0


### Assessing teams

In [52]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                164 non-null    int64  
 1   team_api_id       164 non-null    int64  
 2   team_fifa_api_id  164 non-null    float64
 3   team_long_name    164 non-null    object 
 4   team_short_name   164 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 6.5+ KB


In [53]:
teams_df.describe()

Unnamed: 0,id,team_api_id,team_fifa_api_id
count,164.0,164.0,164.0
mean,19329.432927,10836.079268,15386.189024
std,13772.187036,17435.893334,37424.238876
min,3457.0,4087.0,1.0
25%,9543.75,8470.5,64.75
50%,15629.5,8664.0,362.5
75%,22042.5,9865.5,1826.0
max,48358.0,208931.0,112409.0


In [54]:
team_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924 entries, 9 to 1449
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              924 non-null    int64  
 1   team_fifa_api_id                924 non-null    int64  
 2   team_api_id                     924 non-null    int64  
 3   date                            924 non-null    object 
 4   buildUpPlaySpeed                924 non-null    int64  
 5   buildUpPlaySpeedClass           924 non-null    object 
 6   buildUpPlayDribbling            308 non-null    float64
 7   buildUpPlayDribblingClass       924 non-null    object 
 8   buildUpPlayPassing              924 non-null    int64  
 9   buildUpPlayPassingClass         924 non-null    object 
 10  buildUpPlayPositioningClass     924 non-null    object 
 11  chanceCreationPassing           924 non-null    int64  
 12  chanceCreationPassingClass      924

In [55]:
for attr_name in [col for col in team_attributes_df.columns if col.endswith("Class")]:
    print(f"{attr_name} unique values: {team_attributes_df[attr_name].unique()}")

buildUpPlaySpeedClass unique values: ['Balanced' 'Slow' 'Fast']
buildUpPlayDribblingClass unique values: ['Little' 'Normal' 'Lots']
buildUpPlayPassingClass unique values: ['Mixed' 'Short' 'Long']
buildUpPlayPositioningClass unique values: ['Organised' 'Free Form']
chanceCreationPassingClass unique values: ['Normal' 'Safe' 'Risky']
chanceCreationCrossingClass unique values: ['Normal' 'Little' 'Lots']
chanceCreationShootingClass unique values: ['Normal' 'Lots' 'Little']
chanceCreationPositioningClass unique values: ['Organised' 'Free Form']
defencePressureClass unique values: ['Deep' 'Medium' 'High']
defenceAggressionClass unique values: ['Double' 'Press' 'Contain']
defenceTeamWidthClass unique values: ['Narrow' 'Normal' 'Wide']
defenceDefenderLineClass unique values: ['Offside Trap' 'Cover']


In [56]:
for attr_name in [col for col in team_attributes_df.iloc[:, attributes_start_idx:].columns if not col.endswith("Class")]:
    print(f"number of unique values for {attr_name}: {team_attributes_df[attr_name].unique().size}")

number of unique values for buildUpPlaySpeed: 56
number of unique values for buildUpPlayDribbling: 48
number of unique values for buildUpPlayPassing: 58
number of unique values for chanceCreationPassing: 49
number of unique values for chanceCreationCrossing: 55
number of unique values for chanceCreationShooting: 54
number of unique values for defencePressure: 48
number of unique values for defenceAggression: 46
number of unique values for defenceTeamWidth: 42


In [57]:
for col_name in ["buildUpPlayDribbling", "buildUpPlaySpeed"]:
    print(team_attributes_df[col_name].unique())

[nan 57. 70. 41. 61. 48. 46. 49. 51. 32. 37. 45. 52. 50. 38. 55. 35. 40.
 30. 29. 34. 24. 39. 31. 60. 44. 36. 54. 53. 33. 56. 59. 43. 47. 69. 62.
 58. 42. 65. 77. 28. 68. 66. 71. 26. 27. 74. 67.]
[60 65 59 45 48 55 42 46 50 23 41 39 56 40 62 66 75 25 30 70 67 63 35 64
 57 47 68 43 24 36 52 58 69 73 37 51 44 38 49 71 74 76 31 54 32 80 53 61
 34 72 29 78 26 28 20 77]


In [58]:
team_attributes_df.iloc[:, attributes_start_idx:].describe()

Unnamed: 0,buildUpPlaySpeed,buildUpPlayDribbling,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
count,924.0,308.0,924.0,924.0,924.0,924.0,924.0,924.0,924.0
mean,53.541126,48.373377,48.666667,52.498918,53.971861,54.248918,45.968615,49.742424,52.270563
std,11.318181,10.66335,11.285103,10.859738,11.388694,10.548179,10.294372,9.438097,9.345908
min,20.0,24.0,20.0,21.0,20.0,22.0,23.0,27.0,29.0
25%,46.0,41.0,40.0,46.0,47.0,49.0,38.0,44.0,48.0
50%,54.0,49.0,50.0,52.0,54.0,54.0,45.0,49.0,52.0
75%,63.0,55.0,55.0,60.0,63.0,63.0,51.0,55.0,58.0
max,80.0,77.0,80.0,80.0,80.0,80.0,72.0,72.0,73.0


### Assessing champions league data

In [59]:
champs_league_hist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          1666 non-null   int64 
 1   team_name     1666 non-null   object
 2   progress      1666 non-null   object
 3   team_country  1666 non-null   object
dtypes: int64(1), object(3)
memory usage: 52.2+ KB


In [60]:
champs_league_hist_df.describe()

Unnamed: 0,year
count,1666.0
mean,2007.840336
std,8.001033
min,1994.0
25%,2001.0
50%,2008.0
75%,2015.0
max,2021.0


In [61]:
for col_name in champs_league_hist_df.columns:
    pprint(f"{col_name} unique values: {champs_league_hist_df[col_name].unique()}")

('year unique values: [1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 '
 '2005 2006 2007\n'
 ' 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]')
("team_name unique values: ['A.C. Milan' 'Barcelona' 'Monaco' 'Porto' 'Spartak "
 "Moscow' 'Galatasaray'\n"
 " 'Werder Bremen' 'Anderlecht' 'Feyenoord' 'Steaua Bucuresti'\n"
 " 'Levski Sofia' 'Copenhagen' 'Sparta Prague' 'Manchester United'\n"
 " 'Lech Poznan' 'Austria Wien' 'Floriana' 'Akranes S.C.' 'AEK Athens'\n"
 " 'Croatia Zagreb' 'Rangers (IL)' 'Dinamo Minsk' 'Linfield' 'FC Aarau'\n"
 " 'AIK' 'HJK Helsinki' 'Kispest Honved' 'Cork City' 'Beitar Jerusalem'\n"
 " 'Skonto' 'Dynamo Kyiv' 'Rosenborg BK' 'Ajax' 'Bayern Munich'\n"
 " 'Paris Saint-Germain' 'IFK Goteborg' 'Hajduk Split' 'Benfica'\n"
 " 'FC Red Bull Salzburg' 'Avenir Beggen' 'Silkeborg' 'Vac FC-Samsung'\n"
 " 'Legia Warszawa' 'Servette' 'Maccabi Haifa' 'Juventus' 'Panathinaikos'\n"
 " 'Nantes' 'Borussia Dortmund' 'Real Madrid' 'Aalborg BK'\n"
 " 'Blac

In [62]:
champs_league_hist_df[champs_league_hist_df.progress == "1. Winner"]

Unnamed: 0,year,team_name,progress,team_country
0,1994,A.C. Milan,1. Winner,Italy
62,1995,Ajax,1. Winner,Netherlands
124,1996,Juventus,1. Winner,Italy
178,1997,Borussia Dortmund,1. Winner,Germany
231,1998,Real Madrid,1. Winner,Spain
269,1999,Manchester United,1. Winner,England
307,2000,Real Madrid,1. Winner,Spain
369,2001,Bayern Munich,1. Winner,Germany
431,2002,Real Madrid,1. Winner,Spain
493,2003,A.C. Milan,1. Winner,Italy


Cross referencing the winners with Wikipedia shows that they're consistent, so I'll consider the data to be valid and accurate.

## Assess data - Tidiness

### Assessing league data

In [63]:
leagues_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 9
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5 non-null      int64 
 1   country_id  5 non-null      int64 
 2   name        5 non-null      object
dtypes: int64(2), object(1)
memory usage: 160.0+ bytes


In [64]:
leagues_df

Unnamed: 0,id,country_id,name
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
9,21518,21518,Spain LIGA BBVA


### assessing team data

In [65]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                164 non-null    int64  
 1   team_api_id       164 non-null    int64  
 2   team_fifa_api_id  164 non-null    float64
 3   team_long_name    164 non-null    object 
 4   team_short_name   164 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 6.5+ KB


In [66]:
teams_df.describe()

Unnamed: 0,id,team_api_id,team_fifa_api_id
count,164.0,164.0,164.0
mean,19329.432927,10836.079268,15386.189024
std,13772.187036,17435.893334,37424.238876
min,3457.0,4087.0,1.0
25%,9543.75,8470.5,64.75
50%,15629.5,8664.0,362.5
75%,22042.5,9865.5,1826.0
max,48358.0,208931.0,112409.0


In [67]:
teams_df.team_api_id.unique().size == teams_df.shape[0]

True

In [68]:
team_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924 entries, 9 to 1449
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              924 non-null    int64  
 1   team_fifa_api_id                924 non-null    int64  
 2   team_api_id                     924 non-null    int64  
 3   date                            924 non-null    object 
 4   buildUpPlaySpeed                924 non-null    int64  
 5   buildUpPlaySpeedClass           924 non-null    object 
 6   buildUpPlayDribbling            308 non-null    float64
 7   buildUpPlayDribblingClass       924 non-null    object 
 8   buildUpPlayPassing              924 non-null    int64  
 9   buildUpPlayPassingClass         924 non-null    object 
 10  buildUpPlayPositioningClass     924 non-null    object 
 11  chanceCreationPassing           924 non-null    int64  
 12  chanceCreationPassingClass      924

In [69]:
for col_name in ["team_api_id", "team_fifa_api_id", "date"]:
    pprint(team_attributes_df[col_name].unique().size == team_attributes_df.shape[0])

False
False
False


That means team api ids are not unique rows and neither is date. Perhaps combining the two will give a unique row.

In [70]:
team_attributes_df.apply(lambda row: row.date + "_" + str(row.team_api_id), axis=1).unique().size == team_attributes_df.shape[0]

True

Indeed it does! So that means each row represents data collected for a unique team on a unique date for that team.

### assessing match data

In [71]:
pprint([*top_5_leagues_matches_df.columns])

['id',
 'country_id',
 'league_id',
 'season',
 'stage',
 'date',
 'match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal',
 'home_player_X1',
 'home_player_X2',
 'home_player_X3',
 'home_player_X4',
 'home_player_X5',
 'home_player_X6',
 'home_player_X7',
 'home_player_X8',
 'home_player_X9',
 'home_player_X10',
 'home_player_X11',
 'away_player_X1',
 'away_player_X2',
 'away_player_X3',
 'away_player_X4',
 'away_player_X5',
 'away_player_X6',
 'away_player_X7',
 'away_player_X8',
 'away_player_X9',
 'away_player_X10',
 'away_player_X11',
 'home_player_Y1',
 'home_player_Y2',
 'home_player_Y3',
 'home_player_Y4',
 'home_player_Y5',
 'home_player_Y6',
 'home_player_Y7',
 'home_player_Y8',
 'home_player_Y9',
 'home_player_Y10',
 'home_player_Y11',
 'away_player_Y1',
 'away_player_Y2',
 'away_player_Y3',
 'away_player_Y4',
 'away_player_Y5',
 'away_player_Y6',
 'away_player_Y7',
 'away_player_Y8',
 'away_player_Y9',
 'away_player_Y10',
 'away_player

In [72]:
for col_name in ["match_api_id"]:
    pprint(top_5_leagues_matches_df[col_name].unique().size == top_5_leagues_matches_df.shape[0])

True


In [73]:
top_5_leagues_matches_df.match_api_id.duplicated().sum()

0

### assessing player data

In [74]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11060 entries, 0 to 11059
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  11060 non-null  int64  
 1   player_api_id       11060 non-null  int64  
 2   player_name         11060 non-null  object 
 3   player_fifa_api_id  11060 non-null  int64  
 4   birthday            11060 non-null  object 
 5   height              11060 non-null  float64
 6   weight              11060 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 605.0+ KB


In [75]:
players_df.player_api_id.unique().size == players_df.shape[0]

True

In [76]:
players_df.player_api_id.duplicated().sum() == 0

True

In [77]:
player_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  int64  
 1   player_fifa_api_id   183978 non-null  int64  
 2   player_api_id        183978 non-null  int64  
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

In [78]:
for col_name in ["player_api_id", "player_fifa_api_id", "date"]:
    pprint(player_attributes_df[col_name].unique().size == player_attributes_df.shape[0])
    pprint(player_attributes_df[col_name].duplicated().sum() == 0)

False
False
False
False
False
False


In [79]:
# try using a combination of api id and date
pprint(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1).unique().size == player_attributes_df.shape[0])
pprint(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1).unique().size)
pprint(player_attributes_df.shape[0])

False
183142
183978


does one of those contain null values?

In [80]:
pprint(player_attributes_df.date.isna().sum())
pprint(player_attributes_df.player_api_id.isna().sum())

0
0


does the combination yield duplicated values?

In [81]:
pd.Series(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1)).duplicated().sum()

836

### Assessing champs league data

In [82]:
champs_league_hist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          1666 non-null   int64 
 1   team_name     1666 non-null   object
 2   progress      1666 non-null   object
 3   team_country  1666 non-null   object
dtypes: int64(1), object(3)
memory usage: 52.2+ KB


In [83]:
champs_league_hist_df[champs_league_hist_df.duplicated()]

Unnamed: 0,year,team_name,progress,team_country
109,1995,Galatasaray,7. Last 32,Turkey
111,1995,IFK Goteborg,7. Last 32,Sweden
113,1995,Dynamo Kyiv,7. Last 32,Ukraine
114,1995,Paris Saint-Germain,7. Last 32,France
117,1995,Hajduk Split,7. Last 32,Croatia
118,1995,Steaua Bucuresti,7. Last 32,Romania
120,1995,AEK Athens,7. Last 32,Greece
123,1995,FC Red Bull Salzburg,7. Last 32,Austria


In [84]:
champs_league_hist_df[(champs_league_hist_df.team_name == "Galatasaray") & (champs_league_hist_df.year == 1995) & (champs_league_hist_df.progress == "7. Last 32")]

Unnamed: 0,year,team_name,progress,team_country
101,1995,Galatasaray,7. Last 32,Turkey
109,1995,Galatasaray,7. Last 32,Turkey


## Clean data

### Create copies of the dataframes

In [85]:
# create a copy of the dataframes to clean
countries_df_clean = countries_df.copy()
leagues_df_clean = leagues_df.copy()
teams_df_clean = teams_df.copy()
team_attributes_df_clean = team_attributes_df.copy()
players_df_clean = players_df.copy()
player_attributes_df_clean = player_attributes_df.copy()
top_5_leagues_matches_df_clean = top_5_leagues_matches_df.copy()
champs_league_hist_df_clean = champs_league_hist_df.copy()

### match table - quality

#### `date` column should be datetime

##### Define
convert the date column to datetime using pandas to_datetime function

##### Code

In [86]:
top_5_leagues_matches_df_clean["date"] = pd.to_datetime(top_5_leagues_matches_df_clean.date)
top_5_leagues_matches_df_clean.date.head()

0   2008-08-09
1   2008-08-09
2   2008-08-09
3   2008-08-09
4   2008-08-09
Name: date, dtype: datetime64[ns]

##### Test

In [87]:
top_5_leagues_matches_df_clean.date.dtype

dtype('<M8[ns]')

#### `season` should be categorical

##### Define
- convert season to categorical by getting the unique values and assigning them to a list
- the list should be ordered with the oldest season first and the latest season last

##### Code

In [88]:
season_categories = pd.CategoricalDtype(top_5_leagues_matches_df_clean.season.unique(), ordered=True)
season_categories

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [89]:
# convert the season column for matches to this categorical variable
top_5_leagues_matches_df_clean["season"] = top_5_leagues_matches_df_clean.season.astype(season_categories)

##### Test

In [90]:
top_5_leagues_matches_df_clean.season.dtypes

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### rename to `stage` to `matchday` and should be categorical

##### Define

- rename the column
- make it categorical with the categories being the unique values in the column
- should be ordered

##### Code

In [91]:
top_5_leagues_matches_df_clean.rename(columns={"stage": "matchday"}, inplace=True)

In [92]:
matchday_categories = pd.CategoricalDtype(top_5_leagues_matches_df_clean.matchday.unique(), ordered=True)

In [93]:
top_5_leagues_matches_df_clean["matchday"] = top_5_leagues_matches_df_clean.matchday.astype(matchday_categories)

##### Test

In [94]:
top_5_leagues_matches_df_clean.matchday

0         1
1         1
2         1
3         1
4         1
         ..
14580    38
14581    38
14582    38
14583    38
14584    38
Name: matchday, Length: 14585, dtype: category
Categories (38, int64): [1 < 2 < 3 < 4 ... 35 < 36 < 37 < 38]

#### player IDs (`home_player_1` ... `away_player_11`) are null for some matches

##### Define

- drop all values that are null for these columns

##### Code

In [95]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "home_player_1":"away_player_11"].isna().sum(axis=1) > 0].index, inplace=True)

##### Test

In [96]:
assert top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "home_player_1":"away_player_11"].isna().sum(axis=1) > 0].size == 0

#### match events (`goal` ... `possession`) are null for some matches

##### Define

- confirm that matches with a 0-0 scoreline don't have a null `goal` column
- drop all values that are null for these columns

##### Code

In [97]:
# get the values of the goals columns for 0-0 matches
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 0) & (top_5_leagues_matches_df_clean.away_team_goal == 0)].goal.value_counts()

<goal />                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           909
<goal><value><comment>dg</comment><event_incident_typefk>304</event_incident_typefk><elapsed>87</elapsed><subtype>loose_ball</subtype><player1>23934</player1><sortorder>2</sortorder><team>8559</team><id>389536</id><n>172</n><type>goal</type><goal_type>dg</goal_type></value><value><comment>dg</comment><event_incident_typefk>298</event_incident_typefk><elapsed>90</elapsed><subtype>shot</subtype><player1>23934</player1><sortorder>2</sortorder><team

That above confirms that 0-0 matches don't have null values for the `goal` column

In [98]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].index

Int64Index([   0,    1,    3,    4,    6,    7,    9,   13,   14,   15,
            ...
            5873, 5874, 5876, 5877, 5879, 5890, 5894, 5897, 5898, 7494],
           dtype='int64', length=933)

In [99]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].index, inplace=True)

##### Test

In [100]:
assert top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].size == 0

#### predictions are null for some matches

##### Define

- drop all rows that have null values

##### Code

In [101]:
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].shape[0]

12723

In [102]:
(top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].isna().sum(axis=1) > 0).sum()

10960

Most of the rows have missing predictions! I can't drop the null rows.

In [103]:
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12723 entries, 10 to 14584
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   B365H   12714 non-null  float64
 1   B365D   12714 non-null  float64
 2   B365A   12714 non-null  float64
 3   BWH     12699 non-null  float64
 4   BWD     12699 non-null  float64
 5   BWA     12699 non-null  float64
 6   IWH     12686 non-null  float64
 7   IWD     12686 non-null  float64
 8   IWA     12686 non-null  float64
 9   LBH     12709 non-null  float64
 10  LBD     12709 non-null  float64
 11  LBA     12709 non-null  float64
 12  PSH     7023 non-null   float64
 13  PSD     7023 non-null   float64
 14  PSA     7023 non-null   float64
 15  WHH     12708 non-null  float64
 16  WHD     12708 non-null  float64
 17  WHA     12708 non-null  float64
 18  SJH     9391 non-null   float64
 19  SJD     9391 non-null   float64
 20  SJA     9391 non-null   float64
 21  VCH     12696 non-null  float64
 2

##### new strategy
- drop columns that have less than 75% non-null predictions (below that feels too incomplete to me)
- drop all match rows that have no predictions at all
- fill in the remaining null predictions using the average of those values for the row (i.e. null home predictions will be average of home predictions, likewise for draw and away)

**drop columns that have less than 75% non-null predictions (below that feels too incomplete to me)**

In [104]:
num_missing_predictions = top_5_leagues_matches_df_clean.isna().sum().iloc[-30:]
num_missing_predictions

B365H       9
B365D       9
B365A       9
BWH        24
BWD        24
BWA        24
IWH        37
IWD        37
IWA        37
LBH        14
LBD        14
LBA        14
PSH      5700
PSD      5700
PSA      5700
WHH        15
WHD        15
WHA        15
SJH      3332
SJD      3332
SJA      3332
VCH        27
VCD        27
VCA        27
GBH      5272
GBD      5272
GBA      5272
BSH      5268
BSD      5268
BSA      5268
dtype: int64

In [105]:
# drop the columns for the betting providers with more than 1/4 of the data missing
top_5_leagues_matches_df_clean.drop(num_missing_predictions[num_missing_predictions > round(top_5_leagues_matches_df_clean.shape[0] * 1 / 4)].index, axis=1, inplace=True)
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].columns

Index(['away_player_8', 'away_player_9', 'away_player_10', 'away_player_11',
       'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner',
       'possession', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH',
       'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD',
       'VCA'],
      dtype='object')

In [106]:
predictions_start_idx = -18

In [107]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]
match_predictions_df.columns

Index(['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
       'LBH', 'LBD', 'LBA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA'],
      dtype='object')

In [108]:
match_predictions_df.describe()

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA
count,12714.0,12714.0,12714.0,12699.0,12699.0,12699.0,12686.0,12686.0,12686.0,12709.0,12709.0,12709.0,12708.0,12708.0,12708.0,12696.0,12696.0,12696.0
mean,2.625455,3.851312,4.724529,2.570322,3.779627,4.47566,2.478364,3.656283,4.206142,2.547071,3.754477,4.471715,2.604511,3.662175,4.61116,2.682743,3.931064,4.962055
std,1.821225,1.188397,3.867266,1.681922,1.109067,3.427111,1.483711,0.853358,2.938152,1.668051,1.087585,3.533333,1.753303,0.999789,3.834338,1.992439,1.344774,4.567066
min,1.04,1.4,1.08,1.03,1.65,1.1,1.05,1.5,1.1,1.04,1.4,1.1,1.02,1.02,1.08,1.03,1.62,1.08
25%,1.67,3.3,2.6,1.67,3.25,2.55,1.7,3.2,2.55,1.67,3.25,2.5,1.7,3.2,2.6,1.7,3.3,2.6
50%,2.1,3.4,3.5,2.1,3.4,3.45,2.1,3.3,3.3,2.1,3.4,3.4,2.15,3.3,3.4,2.15,3.5,3.6
75%,2.8,4.0,5.25,2.75,3.8,5.0,2.6,3.7,4.7,2.75,3.75,5.0,2.75,3.75,5.0,2.88,4.0,5.5
max,26.0,17.0,51.0,34.0,19.5,51.0,20.0,11.0,25.0,26.0,19.0,51.0,26.0,17.0,51.0,36.0,26.0,67.0


**drop all match rows that have no predictions at all**

In [109]:
top_5_leagues_matches_df_clean.shape

(12723, 103)

In [110]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[match_predictions_df.iloc[:, predictions_start_idx:].isna().sum(axis=1) == abs(predictions_start_idx)].index, inplace=True)
top_5_leagues_matches_df_clean.shape

(12716, 103)

In [111]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]

**fill in the remaining null predictions using the average of those values for the row (i.e. null home predictions will be average of home predictions, likewise for draw and away)**

In [112]:
def series_to_dataframe(series: pd.Series, column_names: List[str]):
    # Create an empty DataFrame
    df = pd.DataFrame()

    # Repeat the series for each column name and assign it to the DataFrame
    for column_name in column_names:
        df[column_name] = series

    return df


In [113]:
# strategy: get the average home odds for the row, and fill in with that value
# fillna allows me to specify a dataframe with values for each column to fill in with
# so I can specify a dataframe with the average home odds for each row, in the home column only
# do the same for the away and draw columns

def fill_missing_predictions(suffix: str, df: pd.DataFrame):
    """
    Fills the missing predictions for the columns ending with the provided suffix in the provided dataframe.
    The suffix indicates if the prediction is for the home team (H), away team (A) or a draw (D).

    The missing values are filled with the mean of the other predictions for the same match for the same outcome,
    meaning that missing home win predictions are filled with the mean of other home win predictions for that match.
    """
    predictions_cols = list(filter(lambda x: x.endswith(suffix), df.columns))
    df[predictions_cols] = df[predictions_cols].fillna(series_to_dataframe(df[predictions_cols].mean(axis=1), predictions_cols))
    return df

In [114]:
for suffix in ["H", "A", "D"]:
    top_5_leagues_matches_df_clean = fill_missing_predictions(suffix, top_5_leagues_matches_df_clean)

##### Test

In [115]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]
match_predictions_df.isna().sum()

B365H    0
B365D    0
B365A    0
BWH      0
BWD      0
BWA      0
IWH      0
IWD      0
IWA      0
LBH      0
LBD      0
LBA      0
WHH      0
WHD      0
WHA      0
VCH      0
VCD      0
VCA      0
dtype: int64

In [116]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].isna().sum(axis=1) > 0]

Unnamed: 0,id,country_id,league_id,season,matchday,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA


### player attributes table - quality

#### `date` column should be datetime

##### Define
- convert using `pd.to_datetime`

##### Code

In [117]:
# convert the date strings to date time objects
player_attributes_df_clean["date"] = pd.to_datetime(player_attributes_df_clean.date)
player_attributes_df_clean.date.head()

0   2016-02-18
1   2015-11-19
2   2015-09-21
3   2015-03-20
4   2007-02-22
Name: date, dtype: datetime64[ns]

##### Test

In [118]:
player_attributes_df_clean.date.dtype

dtype('<M8[ns]')

#### missing a `season` column

##### Define

- create from date column

##### Code

In [119]:
season_categories

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [120]:
player_attributes_df.date.min(), player_attributes_df.date.max()

('2007-02-22 00:00:00', '2016-07-07 00:00:00')

**Rule**: from january up to and including july is the season that started the previous summer, august to december is the season that started the current year

In [121]:
# add a season column to the player attributes df based on the date column
def find_season(items, condition):
    for item in items:
        if condition(item):
            return item
    return None  # Return None if no match is found


def get_season_from_date(date: datetime.datetime) -> Optional[pd.CategoricalDtype]:
    """
    Returns the season category for the supplied date, starting with the 2008/2009 season
    """
    july = 7
    if date.year < 2008 or (date.year == 2008 and date.month <= july):
        return None

    return find_season(season_categories.categories, lambda season: (f"{date.year}/" in season and date.month > july) or (f"/{date.year}" in season and date.month <= july))

In [122]:
player_attributes_df_clean["season"] = player_attributes_df_clean.date.apply(get_season_from_date).astype(season_categories)
player_attributes_df_clean.season.head()

0    2015/2016
1    2015/2016
2    2015/2016
3    2014/2015
4          NaN
Name: season, dtype: category
Categories (8, object): ['2008/2009' < '2009/2010' < '2010/2011' < '2011/2012' < '2012/2013' < '2013/2014' < '2014/2015' < '2015/2016']

##### Test

In [123]:
assert "season" in player_attributes_df_clean.columns
player_attributes_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### missing a `league_id` column

##### Define

- create from the league of the first team the player played for that season

##### Code

In [124]:
player_attributes_df_clean.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes', 'season'],
      dtype='object')

In [125]:
top_5_leagues_matches_df_clean.home_player_1

10        27284.0
18        23686.0
19        32562.0
20        36374.0
22        31465.0
           ...   
14580    532942.0
14581     31047.0
14582    192064.0
14583     24224.0
14584    182917.0
Name: home_player_1, Length: 12716, dtype: float64

In [126]:
def add_player_attributes_league_id_col(pa_df_clean):
    # Reshape the matches dataframe to have a single column for player IDs
    home_player_cols = [f'home_player_{i}' for i in range(1, 12)]
    away_player_cols = [f'away_player_{i}' for i in range(1, 12)]
    player_api_ids = pd.concat([top_5_leagues_matches_df_clean[home_player_cols], top_5_leagues_matches_df_clean[away_player_cols]], axis=1).stack().reset_index(drop=True)
    player_api_ids = player_api_ids.rename('player_api_id')

    # remove the players that do not feature in any match in the top 5 leagues over the seasons
    pa_df_clean = pa_df_clean[pa_df_clean.player_api_id.isin(player_api_ids)]

    # get individual dataframes containing the match id, league_id, and season for each player of each match
    player_matches_df = pd.melt(top_5_leagues_matches_df_clean, id_vars=["id"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
    league_matches_df = pd.melt(top_5_leagues_matches_df_clean, id_vars=["league_id"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
    season_matches_df = pd.melt(top_5_leagues_matches_df_clean, id_vars=["season"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
    # merge these dataframes together to get a dataframe with the player_api_id, league_id and season
    player_matches_df = player_matches_df.merge(league_matches_df, left_index=True, right_index=True).merge(season_matches_df, left_index=True, right_index=True).drop(labels=["player_api_id_x", "player_api_id_y"], axis=1)

    # preserve only the first occurrence of a player in a season
    player_matches_df = player_matches_df[~(player_matches_df[['player_api_id', 'season']].duplicated())]

    # merge these dataframes to add the league id column to the player attributes dataframe
    pa_df_clean = pd.merge(pa_df_clean, player_matches_df, left_on=['player_api_id', 'season'], right_on=['player_api_id', 'season'], how='left')
    # remove the rows that don't have a season (i.e. the season is outside our range of seasons from 2008-2016)
    pa_df_clean = pa_df_clean[~(pa_df_clean.season.isna())]
    # separate the players who don't have a league id into another dataframe
    pa_league_isna_df_clean = pa_df_clean[pa_df_clean.league_id.isna()]
    pa_df_clean = pa_df_clean[~(pa_df_clean.league_id.isna())]
    pa_df_clean = pa_df_clean.rename({"id_x": "id"}, axis=1).drop("id_y", axis=1)
    pa_league_isna_df_clean = pa_league_isna_df_clean.rename({"id_x": "id"}, axis=1).drop("id_y", axis=1)

    return pa_df_clean, pa_league_isna_df_clean

player_attributes_df_clean, no_league_player_attributes_df_clean = add_player_attributes_league_id_col(player_attributes_df_clean.copy())
player_attributes_df_clean.shape, no_league_player_attributes_df_clean.shape

((59984, 44), (49712, 44))

- There's still a small issue: there's a big drop off in the number of rows of player attributes after removing rows that don't have a league ID. Need to review the logic to be sure that's the correct behavior.
- UPDATE: started checking and it seems that the missing players actually did not have any matches for the given seasons. However, I checked and the first player is Aaron Cresswell, who I know and who has played in the premier league for a while. So I need to confirm if it's true that he actually didn't play any premier league match in the 2013/2014 season.
- UPDATE 2: Cresswell actually wasn't in the premier league in the 2013/2014 season. He was in the championship at Ipswich. So it appears that the logic is correct.

##### Test

In [127]:
player_attributes_df_clean.league_id.isna().sum()

0

In [128]:
assert no_league_player_attributes_df_clean.shape[0] == no_league_player_attributes_df_clean.league_id.isna().sum()

In [129]:
def test_null_league_id(df):
    """
    Receives a dataframe of player attributes with null league_id values, finds a match that the user featured in
    that season, and gets the league id from that match
    """
    home_player_cols = [f'home_player_{i}' for i in range(1, 12)]
    away_player_cols = [f'away_player_{i}' for i in range(1, 12)]
    player_api_id = df.player_api_id
    season = df.season
    for player_col in home_player_cols + away_player_cols:
        match = top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean[player_col] == player_api_id) & (top_5_leagues_matches_df_clean.season == season)]
        if not match.empty:
            assert False, "Found a match for this player in this season"

# commented out for now because it takes a long time to run and I don't want to waste that much time while this project is still in progress
# TODO: uncomment when the project is complete
# no_league_player_attributes_df_clean.apply(test_null_league_id, axis=1)



#### preferred foot should be categorical

##### Define

- convert to categorical (should be unordered)

##### Code

In [130]:
player_attributes_df_clean.preferred_foot.value_counts()

right    45140
left     14802
Name: preferred_foot, dtype: int64

In [131]:
preferred_foot_categories = pd.CategoricalDtype(categories=set(filter(None, player_attributes_df_clean.preferred_foot.unique())), ordered=False)
preferred_foot_categories

CategoricalDtype(categories=['right', 'left'], ordered=False)

In [132]:
player_attributes_df_clean["preferred_foot"] = player_attributes_df_clean.preferred_foot.astype(preferred_foot_categories)

##### Test

In [133]:
player_attributes_df_clean.preferred_foot.dtype

CategoricalDtype(categories=['right', 'left'], ordered=False)

#### all attributes missing for some rows

##### Define

- drop all rows that have null values

##### Code

In [134]:
# establish the number of columns
num_attributes = player_attributes_df_clean.iloc[:, attributes_start_idx:].columns.size
num_attributes

40

In [135]:
# find the number of rows that have all values missing
(player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes).sum()

0

In [136]:
# delete them
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes].index, inplace=True)

##### Test

In [137]:
assert (player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes).sum() == 0

#### attacking work rate null for some players

##### Define

- drop rows where this value is null

##### Code

In [138]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.attacking_work_rate.isna()].index, inplace=True)

##### Test

In [139]:
player_attributes_df_clean.attacking_work_rate.isna().sum()

0

#### attacking work rate has strange values

##### Define

- find common ground between the values that are mixed but sensible
- normalize those values to make them uniform
- convert nonsensical values to null
- drop rows where this value is null

##### Code

In [140]:
player_attributes_df_clean.attacking_work_rate.value_counts()

medium    39081
high      16722
low        3042
None        672
y            45
norm         40
stoc         21
le           19
Name: attacking_work_rate, dtype: int64

In [141]:
players_attacking_work_rate_mapping = {
    "medium": "medium",
    "high": "high",
    "low": "low",
    "norm": "medium",
    # all these other values should be None
    "stoc": None,
    "y": None,
    "le": None,
    "None": None,
}

In [142]:
player_attributes_df_clean["attacking_work_rate"] = player_attributes_df_clean.attacking_work_rate.apply(lambda x: players_attacking_work_rate_mapping[x])

In [143]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.attacking_work_rate.isna()].index, inplace=True)

##### Test

In [144]:
player_attributes_df_clean.attacking_work_rate.value_counts()

medium    39121
high      16722
low        3042
Name: attacking_work_rate, dtype: int64

In [145]:
player_attributes_df_clean.attacking_work_rate.isna().sum()

0

#### defensive work rate has strange values

##### Define

- find common ground between the values that are mixed but sensible
- normalize those values to make them uniform
- convert nonsensical values to null
- drop rows where this value is null

##### Code

In [146]:
player_attributes_df_clean.defensive_work_rate.value_counts()

medium    41990
high      11189
low        5627
ormal        40
2            10
0             9
4             7
1             5
5             4
3             4
Name: defensive_work_rate, dtype: int64

In [147]:
players_defensive_work_rate_mapping = {
    "medium": "medium",
    "high": "high",
    "low": "low",
    "ormal": "medium",
    "2": None,
    "1": None,
    "6": None,
    "5": None,
    "4": None,
    "3": None,
    "0": None,
    "9": None,
    "7": None,
    "8": None,
}

In [148]:
player_attributes_df_clean["defensive_work_rate"] = player_attributes_df_clean.defensive_work_rate.apply(lambda x: players_defensive_work_rate_mapping[x])

In [149]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.defensive_work_rate.isna()].index, inplace=True)

##### Test

In [150]:
player_attributes_df_clean.defensive_work_rate.value_counts()

medium    42030
high      11189
low        5627
Name: defensive_work_rate, dtype: int64

In [151]:
player_attributes_df_clean.defensive_work_rate.isna().sum()

0

#### attacking work rate should be categorical

##### Define

- convert the values to categorical

##### Code

In [152]:
work_rate_categories = pd.CategoricalDtype(categories=["low", "medium", "high"], ordered=True)
player_attributes_df_clean["attacking_work_rate"] = player_attributes_df_clean.attacking_work_rate.astype(work_rate_categories)

##### Test

In [153]:
player_attributes_df_clean.attacking_work_rate.dtype

CategoricalDtype(categories=['low', 'medium', 'high'], ordered=True)

#### defensive work rate should be categorical

##### Define

- convert to categorical

##### Code

In [154]:
player_attributes_df_clean["defensive_work_rate"] = player_attributes_df_clean.defensive_work_rate.astype(work_rate_categories)

##### Test

In [155]:
player_attributes_df_clean.defensive_work_rate.dtype

CategoricalDtype(categories=['low', 'medium', 'high'], ordered=True)

### team attributes table - quality

#### `date` column should be datetime

##### Define

- convert to datetime using `pd.to_datetime`

##### Code

In [156]:
team_attributes_df_clean["date"] = pd.to_datetime(team_attributes_df_clean.date)

##### Test

In [157]:
team_attributes_df_clean.date.dtype

dtype('<M8[ns]')

#### missing a `season` column

##### Define

- create from date column
- make it categorical

##### Code

In [158]:
team_attributes_df_clean["season"] = team_attributes_df_clean.date.apply(get_season_from_date).astype(season_categories)

##### Test

In [159]:
team_attributes_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### all the columns that end with `Class` should be categorical

##### Define

- for each of them get their unique values
- create a categorical variable (ordered if needed)
- convert them to that variable

##### Code

In [160]:
for col_name in filter(lambda x: x.endswith("Class"), team_attributes_df_clean.columns):
    print(f'"{col_name}": ({team_attributes_df_clean[col_name].unique()}, True),')

"buildUpPlaySpeedClass": (['Balanced' 'Slow' 'Fast'], True),
"buildUpPlayDribblingClass": (['Little' 'Normal' 'Lots'], True),
"buildUpPlayPassingClass": (['Mixed' 'Short' 'Long'], True),
"buildUpPlayPositioningClass": (['Organised' 'Free Form'], True),
"chanceCreationPassingClass": (['Normal' 'Safe' 'Risky'], True),
"chanceCreationCrossingClass": (['Normal' 'Little' 'Lots'], True),
"chanceCreationShootingClass": (['Normal' 'Lots' 'Little'], True),
"chanceCreationPositioningClass": (['Organised' 'Free Form'], True),
"defencePressureClass": (['Deep' 'Medium' 'High'], True),
"defenceAggressionClass": (['Double' 'Press' 'Contain'], True),
"defenceTeamWidthClass": (['Narrow' 'Normal' 'Wide'], True),
"defenceDefenderLineClass": (['Offside Trap' 'Cover'], True),


In [161]:
team_attrs_class_columns_mapping = {
    "buildUpPlaySpeedClass": (['2. Balanced', '1. Slow', '3. Fast'], True),
    "buildUpPlayDribblingClass": (['1. Little', '2. Normal', '3. Lots'], True),
    "buildUpPlayPassingClass": (['2. Mixed', '1. Short', '3. Long'], True),
    "buildUpPlayPositioningClass": (['1. Organised', '1. Free Form'], False),
    "chanceCreationPassingClass": (['2. Normal', '1. Safe', '3. Risky'], True),
    "chanceCreationCrossingClass": (['2. Normal', '1. Little', '3. Lots'], True),
    "chanceCreationShootingClass": (['2. Normal', '3. Lots', '1. Little'], True),
    "chanceCreationPositioningClass": (['1. Organised', '1. Free Form'], False),
    "defencePressureClass": (['1. Deep', '2. Medium', '3. High'], True),
    "defenceAggressionClass": (['3. Double', '2. Press', '1. Contain'], True),
    "defenceTeamWidthClass": (['1. Narrow', '2. Normal', '3. Wide'], True),
    "defenceDefenderLineClass": (['2. Offside Trap', '1. Cover'], True),
}

In [162]:
for key, value in team_attrs_class_columns_mapping.items():
    sorted_list = sorted(value[0])
    ordered = value[1]
    cat_variable = pd.CategoricalDtype([item[3:] for item in sorted_list], ordered=ordered)
    team_attributes_df_clean[key] = team_attributes_df_clean[key].astype(cat_variable)

##### Test

In [163]:
pprint(team_attributes_df_clean.defenceAggressionClass.dtype)

CategoricalDtype(categories=['Contain', 'Press', 'Double'], ordered=True)


In [164]:
for key in team_attrs_class_columns_mapping.keys():
    pprint(team_attributes_df_clean[key].dtype)

CategoricalDtype(categories=['Slow', 'Balanced', 'Fast'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Short', 'Mixed', 'Long'], ordered=True)
CategoricalDtype(categories=['Free Form', 'Organised'], ordered=False)
CategoricalDtype(categories=['Safe', 'Normal', 'Risky'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Free Form', 'Organised'], ordered=False)
CategoricalDtype(categories=['Deep', 'Medium', 'High'], ordered=True)
CategoricalDtype(categories=['Contain', 'Press', 'Double'], ordered=True)
CategoricalDtype(categories=['Narrow', 'Normal', 'Wide'], ordered=True)
CategoricalDtype(categories=['Cover', 'Offside Trap'], ordered=True)


#### `buildUpPlayDribbling` has (a lot of) null values

##### Define

- drop the column

##### Code

In [165]:
team_attributes_df_clean.drop(["buildUpPlayDribbling"], axis=1, inplace=True)

##### Test

In [166]:
assert "buildUpPlayDribbling" not in team_attributes_df_clean.columns

### champs league table - quality

#### contains data for years that are not part of our period of interest

##### Define

- those years should be removed

##### Code

In [167]:
champs_league_hist_df_clean = champs_league_hist_df_clean[champs_league_hist_df_clean.year.isin(top_5_leagues_matches_df_clean.date.dt.year.unique())]

##### Test

In [168]:
assert champs_league_hist_df_clean.year.unique().size == top_5_leagues_matches_df_clean.date.dt.year.unique().size

#### order assigned to `progress` values is not consistent eg `5. Last 16` and `6. Last 16`

##### Define

- create a dictionary with the correct order
- use that dictionary to replace the values in the column
- remove the numbers from the values

##### Code

In [169]:
champs_league_hist_df_clean.progress.unique()

array(['1. Winner', '2. Runner Up', '3. Semifinalist',
       '4. Quarterfinalist', '5. Last 16', '7. Group Stage'], dtype=object)

Turns out all the inconsistent values got removed once the rows for other years were removed.

#### `progress` should be categorical

##### Define

- convert to categorical

##### Code

In [170]:
champs_lg_stages_categories = pd.CategoricalDtype(categories=[stage[3:] for stage in sorted(champs_league_hist_df_clean.progress.unique())[::-1]], ordered=True)
champs_lg_stages_categories

CategoricalDtype(categories=['Group Stage', 'Last 16', 'Quarterfinalist', 'Semifinalist',
                  'Runner Up', 'Winner'],
, ordered=True)

In [171]:
champs_league_hist_df_clean["progress"] = champs_league_hist_df_clean.progress.str.slice(3).astype(champs_lg_stages_categories)

##### Test

In [172]:
champs_league_hist_df_clean.progress.dtype

CategoricalDtype(categories=['Group Stage', 'Last 16', 'Quarterfinalist', 'Semifinalist',
                  'Runner Up', 'Winner'],
, ordered=True)

#### missing team api id column

##### Define

- find all the teams in the teams table that have a matching name to the teams in the champs league table
- set the team api id for those teams in the champs league table, set others to null
- for the teams without an API ID, manually create a mapping for their names to whatever their API ID is
- use the mapping to set their API IDs in the champs league table

##### Code

In [173]:
teams_df_clean.columns

Index(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name',
       'team_short_name'],
      dtype='object')

In [174]:
teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique()).sum()

15

In [175]:
teams_df_clean[teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique())][["team_api_id", "team_long_name"]]

Unnamed: 0,team_api_id,team_long_name
0,10260,Manchester United
2,9825,Arsenal
5,8650,Liverpool
9,8456,Manchester City
13,8586,Tottenham Hotspur
18,8455,Chelsea
42,9748,Olympique Lyonnais
45,9847,Paris Saint-Germain
72,9789,Borussia Dortmund
102,8543,Lazio


In [176]:
team_name_api_id_mapping = teams_df_clean[teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique())][["team_api_id", "team_long_name"]].set_index("team_long_name")["team_api_id"].to_dict()
team_name_api_id_mapping

{'Manchester United': 10260,
 'Arsenal': 9825,
 'Liverpool': 8650,
 'Manchester City': 8456,
 'Tottenham Hotspur': 8586,
 'Chelsea': 8455,
 'Olympique Lyonnais': 9748,
 'Paris Saint-Germain': 9847,
 'Borussia Dortmund': 9789,
 'Lazio': 8543,
 'Fiorentina': 8535,
 'Juventus': 9885,
 'Roma': 8686,
 'Napoli': 9875,
 'Real Sociedad': 8560}

In [177]:
# set these values as team api id in the champs league df
champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.team_name.apply(lambda x: team_name_api_id_mapping.get(x, None))
champs_league_hist_df_clean.team_api_id

803     10260.0
804      8455.0
805     10260.0
806      8455.0
807      8650.0
         ...   
1356        NaN
1357        NaN
1358        NaN
1359     9748.0
1360        NaN
Name: team_api_id, Length: 558, dtype: float64

In [178]:
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        393
8455.0      28
10260.0     25
9825.0      22
9885.0      14
9748.0      13
9847.0      12
9789.0      11
8686.0      10
8456.0      10
8650.0       9
8535.0       3
8586.0       3
9875.0       3
8543.0       1
8560.0       1
Name: team_api_id, dtype: int64

So there are 393 teams whose names still need to be updated. Possible strategy:
- do a fuzzy match on the team names to find the closest match
    - first get the unique team names from the champs league table that lack an api id, and from the teams df
    - for each unique champs league team name, find the loop over the unique teams df names and find the closest match, and returns its ID
    - this leaves a list of IDs, which can be used to update the champs league table
- to start, do fuzzy match using a 2-way substring match i.e. if string 1 is a substring of string 2 or vice versa, then they are a match

In [179]:
champs_league_teams_no_api_id = champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()
champs_league_teams_no_api_id

array(['Barcelona', 'Schalke 04', 'Fenerbahce', 'Porto', 'Celtic',
       'Real Madrid', 'A.C. Milan', 'Internazionale', 'Sevilla',
       'Olympiacos', 'Marseille', 'Besiktas', 'Rosenborg BK', 'Valencia',
       'Werder Bremen', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Stuttgart', 'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven',
       'CSKA Moscow', 'Slavia Prague', 'Steaua Bucuresti',
       'Bayern Munich', 'Villarreal', 'Athletico Madrid', 'Panathinaikos',
       'Bordeaux', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Wolfsburg', 'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan',
       'Unirea Urziceni', 'Standard Liege', 'AZ Alkmaar', 'Copenhagen',
       'Twente', 'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow',
       'Zilina', 'Ajax', 'Auxerre', 'Braga', 'FK Partizan',
       'Bayer Leverkusen', 'Trabzonspor', 'Lille OSC', 'Otelul Galati',
       'Dinamo Zagreb', 'Genk', 'Viktoria Plzen', '

In [180]:
available_team_names = teams_df_clean.team_long_name.unique()
available_team_names

array(['Manchester United', 'Newcastle United', 'Arsenal',
       'West Bromwich Albion', 'Sunderland', 'Liverpool',
       'West Ham United', 'Wigan Athletic', 'Aston Villa',
       'Manchester City', 'Everton', 'Blackburn Rovers', 'Middlesbrough',
       'Tottenham Hotspur', 'Bolton Wanderers', 'Stoke City', 'Hull City',
       'Fulham', 'Chelsea', 'Portsmouth', 'Birmingham City',
       'Wolverhampton Wanderers', 'Burnley', 'Blackpool', 'Swansea City',
       'Queens Park Rangers', 'Norwich City', 'Southampton', 'Reading',
       'Crystal Palace', 'Cardiff City', 'Leicester City', 'Bournemouth',
       'Watford', 'AJ Auxerre', 'FC Nantes', 'Girondins de Bordeaux',
       'SM Caen', 'Le Havre AC', 'OGC Nice', 'Le Mans FC', 'FC Lorient',
       'Olympique Lyonnais', 'Toulouse FC', 'AS Monaco',
       'Paris Saint-Germain', 'AS Nancy-Lorraine', 'LOSC Lille',
       'Stade Rennais FC', 'Olympique de Marseille',
       'FC Sochaux-Montbéliard', 'Grenoble Foot 38', 'Valenciennes FC',
    

In [181]:
champs_lg_name_team_name_mapping = {}
for champs_lg_team_name in champs_league_teams_no_api_id:
    for _team_name in available_team_names:
        if _team_name.lower() in champs_lg_team_name.lower() or champs_lg_team_name.lower() in _team_name.lower():
            champs_lg_name_team_name_mapping[champs_lg_team_name] = _team_name

champs_lg_name_team_name_mapping

{'Barcelona': 'FC Barcelona',
 'Schalke 04': 'FC Schalke 04',
 'Real Madrid': 'Real Madrid CF',
 'A.C. Milan': 'Milan',
 'Internazionale': 'Inter',
 'Sevilla': 'Sevilla FC',
 'Marseille': 'Olympique de Marseille',
 'Valencia': 'Valencia CF',
 'Werder Bremen': 'SV Werder Bremen',
 'Stuttgart': 'VfB Stuttgart',
 'Bayern Munich': 'FC Bayern Munich',
 'Villarreal': 'Villarreal CF',
 'Bordeaux': 'Girondins de Bordeaux',
 'Wolfsburg': 'VfL Wolfsburg',
 'Auxerre': 'AJ Auxerre',
 'Montpellier': 'Montpellier Hérault SC',
 'Monaco': 'AS Monaco'}

That mapping is correct, so we need to add the team api ids to the champs league table.

In [182]:
def update_champs_league_team_api_id(team_row) -> int:
    team_name = team_row["team_name"]
    team_api_id = team_row["team_api_id"]
    if team_api_id and not np.isnan(team_api_id):
        # this team already has an API ID
        return team_row["team_api_id"]
    if team_name not in champs_lg_name_team_name_mapping:
        return None
    team_api_id = teams_df_clean.loc[teams_df_clean["team_long_name"] == champs_lg_name_team_name_mapping[team_name], "team_api_id"]
    if not team_api_id.empty:
        return team_api_id[team_api_id.index[0]]
    else:
        return None

champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.apply(update_champs_league_team_api_id, axis=1)
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        208
8634.0      37
9823.0      32
8633.0      32
8455.0      28
10260.0     25
9825.0      22
9885.0      14
8636.0      14
10189.0     13
8564.0      13
9748.0      13
9847.0      12
9789.0      11
8686.0      10
8456.0      10
8592.0       9
8650.0       9
10267.0      7
8302.0       5
10205.0      4
9827.0       4
8721.0       4
9875.0       3
9829.0       3
8586.0       3
8535.0       3
10269.0      3
8697.0       3
8583.0       1
10249.0      1
8560.0       1
8543.0       1
Name: team_api_id, dtype: int64

At this point I need a better fuzzy matching solution.

In [183]:
from thefuzz import fuzz, process as fuzz_process

In [184]:
champs_league_teams_no_api_id = champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()
champs_league_teams_no_api_id

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Athletico Madrid',
       'Panathinaikos', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan', 'Unirea Urziceni',
       'Standard Liege', 'AZ Alkmaar', 'Copenhagen', 'Twente',
       'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow', 'Zilina', 'Ajax',
       'Braga', 'FK Partizan', 'Bayer Leverkusen', 'Trabzonspor',
       'Lille OSC', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Malaga', 'Anderlecht',
       'Nordsjaelland', 'Austria Wien', 'Malmo FF',
       'PFC Ludogorets Razgrad', 'Maribor', 'Athletic Bilbao', 'Gent',
       'Astana', 'Borussia Monchengladbach', 'Maccabi Tel Aviv'],
 

In [185]:
champs_lg_name_team_name_mapping = {}
for champs_lg_team_name in champs_league_teams_no_api_id:
    matching_available_teams = fuzz_process.extract(champs_lg_team_name, available_team_names)
    if matching_available_teams:
        champs_lg_name_team_name_mapping[champs_lg_team_name] = matching_available_teams

champs_lg_name_team_name_mapping

{'Fenerbahce': [('FC Energie Cottbus', 54),
  ('Inter', 54),
  ('CD Tenerife', 48),
  ('RC Lens', 45),
  ('Eintracht Frankfurt', 45)],
 'Porto': [('Portsmouth', 72),
  ('RC Deportivo de La Coruña', 72),
  ('Real Sporting de Gijón', 72),
  ('Xerez Club Deportivo', 72),
  ('Blackpool', 60)],
 'Celtic': [('Wigan Athletic', 60),
  ('Athletic Club de Bilbao', 60),
  ('Atlético Madrid', 60),
  ('RC Celta de Vigo', 60),
  ('Elche CF', 54)],
 'Olympiacos': [('Olympique Lyonnais', 57),
  ('Olympique de Marseille', 54),
  ('Milan', 54),
  ('Roma', 45),
  ('Empoli', 45)],
 'Besiktas': [('Brescia', 67),
  ('Stade Brestois 29', 56),
  ('SC Bastia', 56),
  ('Siena', 54),
  ('SD Eibar', 48)],
 'Rosenborg BK': [('Siena', 54),
  ('Genoa', 54),
  ('Arsenal', 51),
  ('Blackburn Rovers', 48),
  ('1. FC Nürnberg', 48)],
 'Benfica': [('Brescia', 57),
  ('Valencia CF', 53),
  ('Siena', 50),
  ('Genoa', 50),
  ('FC Bayern Munich', 49)],
 'Shakhtar Donetsk': [('Udinese', 51),
  ('Stade de Reims', 47),
  ('Roma

The matching didn't go very well. The majority of these teams are not in the top 5 leagues, so I'll just manually add the ones that are.

In [186]:
champs_league_teams_no_api_id

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Athletico Madrid',
       'Panathinaikos', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan', 'Unirea Urziceni',
       'Standard Liege', 'AZ Alkmaar', 'Copenhagen', 'Twente',
       'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow', 'Zilina', 'Ajax',
       'Braga', 'FK Partizan', 'Bayer Leverkusen', 'Trabzonspor',
       'Lille OSC', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Malaga', 'Anderlecht',
       'Nordsjaelland', 'Austria Wien', 'Malmo FF',
       'PFC Ludogorets Razgrad', 'Maribor', 'Athletic Bilbao', 'Gent',
       'Astana', 'Borussia Monchengladbach', 'Maccabi Tel Aviv'],
 

In [187]:
champs_lg_name_team_name_mapping = {
    "Athletico Madrid": "Atlético Madrid",
    "Bayer Leverkusen": "Bayer 04 Leverkusen",
    "Lille OSC": "LOSC Lille",
    "Malaga": "Málaga CF",
    "Athletic Bilbao": "Athletic Club de Bilbao",
    "Borussia Monchengladbach": "Borussia Mönchengladbach",
}

In [188]:
champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.apply(update_champs_league_team_api_id, axis=1)
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        178
8634.0      37
9823.0      32
8633.0      32
8455.0      28
10260.0     25
9825.0      22
9906.0      16
9885.0      14
8636.0      14
8564.0      13
9748.0      13
10189.0     13
9847.0      12
9789.0      11
8456.0      10
8686.0      10
8592.0       9
8650.0       9
10267.0      7
8178.0       7
8302.0       5
8721.0       4
10205.0      4
9827.0       4
8535.0       3
8586.0       3
9875.0       3
10269.0      3
9864.0       3
8697.0       3
9829.0       3
8639.0       2
8583.0       1
8543.0       1
10249.0      1
8560.0       1
8315.0       1
9788.0       1
Name: team_api_id, dtype: int64

Can drop all the remaining rows that don't have API IDs because we won't need them.

In [189]:
champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Panathinaikos', 'CFR Cluj',
       'Anorthosis', 'Basel', 'Aalborg BK', 'Zenit Saint Petersburg',
       'BATE Borisov', 'Maccabi Haifa', 'Zurich', 'APOEL', 'Debrecen',
       'Rubin Kazan', 'Unirea Urziceni', 'Standard Liege', 'AZ Alkmaar',
       'Copenhagen', 'Twente', 'Hapoel Tel Aviv', 'Bursaspor',
       'Spartak Moscow', 'Zilina', 'Ajax', 'Braga', 'FK Partizan',
       'Trabzonspor', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Anderlecht', 'Nordsjaelland',
       'Austria Wien', 'Malmo FF', 'PFC Ludogorets Razgrad', 'Maribor',
       'Gent', 'Astana', 'Maccabi Tel Aviv'], dtype=object)

In [190]:
champs_league_hist_df_clean.drop(champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].index, inplace=True)

##### Test

In [191]:
assert champs_league_hist_df_clean.team_api_id.isna().sum() == 0

#### missing a `season` column

##### Define

- create from year column
- drop rows for years that are not in our timeframe

##### Code

In [192]:
champs_league_hist_df_clean["season"] = champs_league_hist_df_clean.year.apply(lambda x: f"{x - 1}/{x}").astype(season_categories)

In [193]:
champs_league_hist_df_clean.drop(champs_league_hist_df_clean[champs_league_hist_df_clean.season.isna()].index, inplace=True)

##### Test

In [194]:
champs_league_hist_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [195]:
assert champs_league_hist_df_clean.season.isna().sum() == 0

#### missing a league id column

##### Define

- create from country column by matching the country name to the country table and getting the league id (which is the same as the country id)

##### Code

In [196]:
country_name_league_id_mapping = {country_name: countries_df_clean.query(f'name == "{country_name}"').id.iloc[0] for country_name in champs_league_hist_df_clean.team_country.unique()}
country_name_league_id_mapping

{'Spain': 21518,
 'England': 1729,
 'Germany': 7809,
 'France': 4769,
 'Italy': 10257}

In [197]:
champs_league_hist_df_clean["league_id"] = champs_league_hist_df_clean.team_country.apply(lambda x: country_name_league_id_mapping.get(x, None))

##### Test

In [198]:
champs_league_hist_df_clean.league_id.value_counts(dropna=False)

21518    97
1729     80
7809     69
10257    50
4769     42
Name: league_id, dtype: int64

### match table - tidiness

#### the xml columns (`goal` ... `possession`) contain multiple details that can and should be in a separate table

##### Define

- get a function that can parse the xml and return a dictionary containing the details
- check all the keys and nested keys in the dictionary to see what the possible values are and understand the structure
- decide on the columns that should be in the new table
- create a new dataframe(s) that transfers the data from the xml columns to the new table(s)
- drop those columns from the match table

##### Code

**GOAL**

In [199]:
import xmltodict

In [200]:
xmltodict.parse(top_5_leagues_matches_df_clean.goal.values[0])

OrderedDict([('goal',
              OrderedDict([('value',
                            [OrderedDict([('comment', 'n'),
                                          ('stats',
                                           OrderedDict([('goals', '1'),
                                                        ('shoton', '1')])),
                                          ('event_incident_typefk', '71'),
                                          ('elapsed', '12'),
                                          ('player1', '30872'),
                                          ('sortorder', '0'),
                                          ('team', '9823'),
                                          ('id', '375301'),
                                          ('n', '21'),
                                          ('type', 'goal'),
                                          ('goal_type', 'n')]),
                             OrderedDict([('comment', 'p'),
                                          ('stats',
        

I need something more finetuned than that.

In [201]:
top_5_leagues_matches_df_clean.goal.values[0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>71</event_incident_typefk><elapsed>12</elapsed><player1>30872</player1><sortorder>0</sortorder><team>9823</team><id>375301</id><n>21</n><type>goal</type><goal_type>n</goal_type></value><value><comment>p</comment><stats><penalties>1</penalties></stats><event_incident_typefk>20</event_incident_typefk><elapsed>16</elapsed><player1>32118</player1><sortorder>0</sortorder><team>9823</team><id>375302</id><n>22</n><type>goal</type><goal_type>p</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>71</event_incident_typefk><elapsed>25</elapsed><player1>33101</player1><sortorder>0</sortorder><team>9790</team><id>375303</id><n>20</n><type>goal</type><goal_type>n</goal_type></value><value><comment>p</comment><stats><penalties>1</penalties></stats><event_incident_typefk>20</event_incident_typefk><elapsed>56</elapsed><player1>39106<

In [202]:
top_5_leagues_matches_df_clean.goal.unique()[:1000]

array(['<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>71</event_incident_typefk><elapsed>12</elapsed><player1>30872</player1><sortorder>0</sortorder><team>9823</team><id>375301</id><n>21</n><type>goal</type><goal_type>n</goal_type></value><value><comment>p</comment><stats><penalties>1</penalties></stats><event_incident_typefk>20</event_incident_typefk><elapsed>16</elapsed><player1>32118</player1><sortorder>0</sortorder><team>9823</team><id>375302</id><n>22</n><type>goal</type><goal_type>p</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>71</event_incident_typefk><elapsed>25</elapsed><player1>33101</player1><sortorder>0</sortorder><team>9790</team><id>375303</id><n>20</n><type>goal</type><goal_type>n</goal_type></value><value><comment>p</comment><stats><penalties>1</penalties></stats><event_incident_typefk>20</event_incident_typefk><elapsed>56</elapsed><player1

In [203]:
import xml.etree.ElementTree as ET
import pdb

def extract_unique_tags(xml_list):
    """
    Extracts unique tag names from a list of XML strings by recursively traversing the XML tree.
    
    Args:
        xml_list (list): List of XML strings.
    
    Returns:
        list: List of unique tag names.
    """
    tag_names = set()

    def traverse_xml(element):
        tag_names.add(element.tag)
        for child in element:
            traverse_xml(child)

    for xml_string in xml_list:
        root = ET.fromstring(xml_string)
        traverse_xml(root)

    return list(tag_names)

extract_unique_tags(top_5_leagues_matches_df_clean.goal)

['coordinates',
 'subtype',
 'player2',
 'elapsed',
 'owngoals',
 'value',
 'goal',
 'type',
 'shotoff',
 'goals',
 'id',
 'del',
 'team',
 'shoton',
 'event_incident_typefk',
 'sortorder',
 'goal_type',
 'player1',
 'elapsed_plus',
 'n',
 'penalties',
 'comment',
 'stats']

In [204]:
def parse_xml_to_dict(xml_string):
    root = ET.fromstring(xml_string)
    return _traverse_xml(root)

def _traverse_xml(element):
    result = {}
    if element.text:
        result[element.tag] = element.text.strip()
    for child in element:
        child_data = _traverse_xml(child)
        if child.tag in result:
            if isinstance(result[child.tag], list):
                result[child.tag].append(child_data)
            else:
                result[child.tag] = [result[child.tag], child_data]
        else:
            result[child.tag] = child_data
    return result

parse_xml_to_dict(top_5_leagues_matches_df_clean.goal.unique()[0])

{'value': [{'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '71'},
   'elapsed': {'elapsed': '12'},
   'player1': {'player1': '30872'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '9823'},
   'id': {'id': '375301'},
   'n': {'n': '21'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'n'}},
  {'comment': {'comment': 'p'},
   'stats': {'penalties': {'penalties': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '20'},
   'elapsed': {'elapsed': '16'},
   'player1': {'player1': '32118'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '9823'},
   'id': {'id': '375302'},
   'n': {'n': '22'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'p'}},
  {'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '71'},
   'elapsed': {'elapsed': '25'},
   '

In [205]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.goal.unique()[1])

{'value': {'comment': {'comment': 'n'},
  'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
  'event_incident_typefk': {'event_incident_typefk': '393'},
  'elapsed': {'elapsed': '4'},
  'player2': {'player2': '39297'},
  'subtype': {'subtype': 'shot'},
  'player1': {'player1': '26181'},
  'sortorder': {'sortorder': '2'},
  'team': {'team': '9825'},
  'id': {'id': '375546'},
  'n': {'n': '231'},
  'type': {'type': 'goal'},
  'goal_type': {'goal_type': 'n'}}}

In [206]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.goal.unique()[2])

{'value': {'comment': {'comment': 'n'},
  'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
  'event_incident_typefk': {'event_incident_typefk': '407'},
  'elapsed': {'elapsed': '83'},
  'player2': {'player2': '30889'},
  'subtype': {'subtype': 'distance'},
  'player1': {'player1': '30853'},
  'sortorder': {'sortorder': '0'},
  'team': {'team': '8650'},
  'id': {'id': '378041'},
  'n': {'n': '344'},
  'type': {'type': 'goal'},
  'goal_type': {'goal_type': 'n'}}}

In [207]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.goal.unique()[3])

{'value': [{'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '393'},
   'elapsed': {'elapsed': '4'},
   'player2': {'player2': '36394'},
   'subtype': {'subtype': 'shot'},
   'player1': {'player1': '23139'},
   'sortorder': {'sortorder': '2'},
   'team': {'team': '8654'},
   'id': {'id': '376060'},
   'n': {'n': '244'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'n'}},
  {'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '393'},
   'elapsed': {'elapsed': '10'},
   'player2': {'player2': '37277'},
   'subtype': {'subtype': 'shot'},
   'player1': {'player1': '23139'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8654'},
   'id': {'id': '376165'},
   'n': {'n': '251'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'n'}},
  {'comment': {'comment': 'n'},


In [208]:
teams_df_clean[teams_df_clean.team_long_name == "Manchester United"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,3457,10260,11.0,Manchester United,MUN


In [209]:
teams_df_clean[teams_df_clean.team_long_name == "Leicester City"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
31,8021,8197,95.0,Leicester City,LEI


In [210]:
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 5) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 10260) & (top_5_leagues_matches_df_clean.home_team_api_id == 8197)]

Unnamed: 0,id,country_id,league_id,season,matchday,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA
11145,4342,1729,1729,2014/2015,5,2014-09-21,1724025,8197,10260,5,...,1.55,5.0,4.0,1.66,5.5,3.5,1.67,5.75,4.0,1.67


In [211]:
# check out the goal column of the 5-3 match between Leicester City and Manchester United in the 2014/2015 season
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 5) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 10260) & (top_5_leagues_matches_df_clean.home_team_api_id == 8197)].goal.values[0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>406</event_incident_typefk><coordinates><value>18</value><value>68</value></coordinates><elapsed>13</elapsed><player2>22543</player2><subtype>header</subtype><player1>30843</player1><sortorder>1</sortorder><team>10260</team><id>3752800</id><n>262</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>29</value><value>60</value></coordinates><elapsed>16</elapsed><player2>30829</player2><subtype>shot</subtype><player1>46509</player1><sortorder>2</sortorder><team>10260</team><id>3752854</id><n>265</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>406</event_incident_typefk><coordinates><value>24</value><value>5</value></coordinates><elapsed>17</

In [212]:
# check out the goal column of the 5-3 match between Leicester City and Manchester United in the 2014/2015 season
parse_xml_to_dict(top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 5) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 10260) & (top_5_leagues_matches_df_clean.home_team_api_id == 8197)].goal.values[0])

{'value': [{'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '406'},
   'coordinates': {'value': [{'value': '18'}, {'value': '68'}]},
   'elapsed': {'elapsed': '13'},
   'player2': {'player2': '22543'},
   'subtype': {'subtype': 'header'},
   'player1': {'player1': '30843'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '10260'},
   'id': {'id': '3752800'},
   'n': {'n': '262'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'n'}},
  {'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '393'},
   'coordinates': {'value': [{'value': '29'}, {'value': '60'}]},
   'elapsed': {'elapsed': '16'},
   'player2': {'player2': '30829'},
   'subtype': {'subtype': 'shot'},
   'player1': {'player1': '46509'},
   'sortorder': {'sortorder': '2'},
   'team': {'team': '10260'},
   'id': {'

Basic structure so far is:
- each row is a list of goal events, with key-value pairs describing each event
- `stats` key marks a goal as a shot on target if it's not a penalty
- `event_incident_typefk` must refer to a categorization of the event type, though it's not clear where it's from
- `coordinates` appear to be `y` then `x` where `y` is the distance between the touchlines and `x` is the distance between the goal lines
- `elapsed` represents the minute that the goal was scored
- `subtype` is the type of goal, e.g. `header`, `shot`, and appears to be missing for penalties
- `player1` and `player2` likely correspond to the scorer and assister
- `team` is the team that scored the goal
- `goal_type` is a one-letter representation of the type of goal (I see `n` and `p` so far, which seems to mean normal and penalty)

In [213]:
# now check direct free kick goal in Chelsea 1 - 3 Southampton from 2015/2016 season
teams_df_clean[teams_df_clean.team_short_name == "CHE"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
18,3475,8455,5.0,Chelsea,CHE


In [214]:
teams_df_clean[teams_df_clean.team_short_name == "SOU"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
27,6504,8466,17.0,Southampton,SOU


In [215]:
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 1) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 8466) & (top_5_leagues_matches_df_clean.home_team_api_id == 8455)]

Unnamed: 0,id,country_id,league_id,season,matchday,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA
13100,4752,1729,1729,2015/2016,8,2015-10-03,1988783,8455,8466,1,...,4.7,1.8,3.75,4.5,1.83,3.2,4.8,1.83,3.75,4.75


In [216]:
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 1) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 8466) & (top_5_leagues_matches_df_clean.home_team_api_id == 8455)].goal.values[0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>130</event_incident_typefk><coordinates><value>7</value><value>16</value></coordinates><elapsed>10</elapsed><subtype>direct_freekick</subtype><player1>94086</player1><sortorder>0</sortorder><team>8455</team><id>4868674</id><n>76</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>411</event_incident_typefk><coordinates><value>26</value><value>57</value></coordinates><elapsed>44</elapsed><player2>41280</player2><subtype>volley</subtype><player1>23792</player1><sortorder>1</sortorder><team>8466</team><id>4869077</id><n>244</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>24</value><value>61</value></coordinates><elapsed>60</elapsed><player2

In [217]:
parse_xml_to_dict(top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 1) & (top_5_leagues_matches_df_clean.away_team_goal == 3) & (top_5_leagues_matches_df_clean.away_team_api_id == 8466) & (top_5_leagues_matches_df_clean.home_team_api_id == 8455)].goal.values[0])

{'value': [{'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '130'},
   'coordinates': {'value': [{'value': '7'}, {'value': '16'}]},
   'elapsed': {'elapsed': '10'},
   'subtype': {'subtype': 'direct_freekick'},
   'player1': {'player1': '94086'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '8455'},
   'id': {'id': '4868674'},
   'n': {'n': '76'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'n'}},
  {'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '411'},
   'coordinates': {'value': [{'value': '26'}, {'value': '57'}]},
   'elapsed': {'elapsed': '44'},
   'player2': {'player2': '41280'},
   'subtype': {'subtype': 'volley'},
   'player1': {'player1': '23792'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8466'},
   'id': {'id': '4869077'},
   'n': {'n'

More learnings:
- direct free kick is not treated like penalty i.e. it is a shot on target
- subtype is `direct_freekick`
- no assist is recorded for direct free kicks (i.e. only `player1` is provided)

In [218]:
# now check own goal scored by Santiago Vergini in the 14/15 season in Southampton v Sunderland
teams_df_clean[teams_df_clean.team_short_name == "SOU"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
27,6504,8466,17.0,Southampton,SOU


In [219]:
teams_df_clean[teams_df_clean.team_short_name == "SUN"]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
4,3461,8472,106.0,Sunderland,SUN


In [220]:
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_api_id == 8466) & (top_5_leagues_matches_df_clean.away_team_api_id == 8472) & (top_5_leagues_matches_df_clean.home_team_goal == 8) & (top_5_leagues_matches_df_clean.away_team_goal == 0) & (top_5_leagues_matches_df_clean.home_team_goal == 8)]

Unnamed: 0,id,country_id,league_id,season,matchday,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA
11310,4376,1729,1729,2014/2015,8,2014-10-18,1724059,8466,8472,8,...,5.1,1.61,3.8,6.0,1.62,3.4,6.5,1.62,4.0,6.5


In [221]:
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_api_id == 8466) & (top_5_leagues_matches_df_clean.away_team_api_id == 8472) & (top_5_leagues_matches_df_clean.home_team_goal == 8) & (top_5_leagues_matches_df_clean.away_team_goal == 0) & (top_5_leagues_matches_df_clean.home_team_goal == 8)].goal.values[0]

'<goal><value><comment>o</comment><stats><owngoals>1</owngoals></stats><event_incident_typefk>115</event_incident_typefk><coordinates><value>17</value><value>6</value></coordinates><elapsed>12</elapsed><player1>188540</player1><sortorder>3</sortorder><team>8472</team><id>3843125</id><n>436</n><type>goal</type><goal_type>o</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>721</event_incident_typefk><coordinates><value>27</value><value>4</value></coordinates><elapsed>18</elapsed><player2>23792</player2><subtype>tap_in</subtype><player1>41280</player1><sortorder>1</sortorder><team>8466</team><id>3843257</id><n>360</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>721</event_incident_typefk><coordinates><value>27</value><value>2</value></coordinates><elapsed>37</elapsed><player2>94267</player2><subtype>tap_in</subtype><play

In [222]:
parse_xml_to_dict(top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_api_id == 8466) & (top_5_leagues_matches_df_clean.away_team_api_id == 8472) & (top_5_leagues_matches_df_clean.home_team_goal == 8) & (top_5_leagues_matches_df_clean.away_team_goal == 0) & (top_5_leagues_matches_df_clean.home_team_goal == 8)].goal.values[0])

{'value': [{'comment': {'comment': 'o'},
   'stats': {'owngoals': {'owngoals': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '115'},
   'coordinates': {'value': [{'value': '17'}, {'value': '6'}]},
   'elapsed': {'elapsed': '12'},
   'player1': {'player1': '188540'},
   'sortorder': {'sortorder': '3'},
   'team': {'team': '8472'},
   'id': {'id': '3843125'},
   'n': {'n': '436'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type': 'o'}},
  {'comment': {'comment': 'n'},
   'stats': {'goals': {'goals': '1'}, 'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '721'},
   'coordinates': {'value': [{'value': '27'}, {'value': '4'}]},
   'elapsed': {'elapsed': '18'},
   'player2': {'player2': '23792'},
   'subtype': {'subtype': 'tap_in'},
   'player1': {'player1': '41280'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8466'},
   'id': {'id': '3843257'},
   'n': {'n': '360'},
   'type': {'type': 'goal'},
   'goal_type': {'goal_type

More learnings:
- own goal is not recorded as a shot on target, similar to penalties
- has no subtype
- goal_type is `o`
- no assist is recorded for own goals (i.e. only `player1` is provided)
- there's a subtype called "tap_in", interesting

**Strategy Outline**
- NOTE: all keys are always objects (or lists of objects) where the key is repeated as the nested key of the object eg `'type': {'type': 'goal'}`
- columns:
    - `stats` is a comma-separated list of all the nested keys it contains except the `goals` key
        - after seeing the values in it, they'll be converted to individual columns if needed
    - preserve `event_incident_typefk`
    - save coordinates as `y` and `x` coords of the shot that led to the goal i.e. value of first array member is `y` and second is `x`
    - `elapsed` is the minute the goal was scored
    - `player1` is the scorer
    - `player2` if present is the assister
    - preserve `subtype`
    - `team` is the team API ID
    - preserve `goal_type`

Excellent! So what other columns do we need?
- match api id
... I think that's it

So the steps are:
- initiate an outer variable containing all the columns we want
- for every row, parse the xml and use the values to update the columns in the outer variable
- create the dataframe based on the outer variable

In [223]:
def create_goal_events_dataframe(goal_events_data: pd.DataFrame):
    goal_events_xml: str = goal_events_data.goal
    match_api_id: int = goal_events_data.match_api_id
    df_data_rows = []

    def get_goal_events(xml_str: str):
        goal_events = parse_xml_to_dict(goal_events_xml).get("value")
        if not goal_events:
            return

        if not isinstance(goal_events, list):
            goal_events = [goal_events]

        for goal_event in goal_events:
            df_data = {}
            event_api_id = goal_event.get("id", {}).get("id")
            stats_keys = ",".join([key for key in goal_event.get("stats", {}) if key != "goals"])
            coordinates = goal_event.get("coordinates", {}).get("value", [])
            y = coordinates[0]["value"] if coordinates else None
            x = coordinates[1]["value"] if len(coordinates) > 1 else None
            elapsed = goal_event.get("elapsed", {}).get("elapsed")
            elapsed_plus = goal_event.get("elapsed_plus", {}).get("elapsed_plus")
            player1 = goal_event.get("player1", {}).get("player1")
            player2 = goal_event.get("player2", {}).get("player2")
            subtype = goal_event.get("subtype", {}).get("subtype")
            type = goal_event.get("type", {}).get("type")
            _del = goal_event.get("del", {}).get("del")
            team = goal_event.get("team", {}).get("team")
            goal_type = goal_event.get("goal_type", {}).get("goal_type")
            event_incident_typefk = goal_event.get("event_incident_typefk", {}).get("event_incident_typefk")

            df_data["match_api_id"] = match_api_id
            df_data["event_api_id"] = event_api_id
            df_data["event_incident_typefk"] = event_incident_typefk
            df_data["stats"] = stats_keys
            df_data["y"] = y
            df_data["x"] = x
            df_data["elapsed"] = elapsed
            df_data["elapsed_plus"] = elapsed_plus
            df_data["player1"] = player1
            df_data["player2"] = player2
            df_data["subtype"] = subtype
            df_data["type"] = type
            df_data["_del"] = _del
            df_data["team"] = team
            df_data["goal_type"] = goal_type

            df_data_rows.append(df_data)

    get_goal_events(goal_events_xml)

    return df_data_rows

goal_events_list = []
top_5_leagues_matches_df_clean.loc[:, ["match_api_id", "goal"]].apply(create_goal_events_dataframe, axis=1).apply(lambda x: goal_events_list.extend(x))
goal_events_df = pd.DataFrame(goal_events_list)
goal_events_df.head()

Unnamed: 0,match_api_id,event_api_id,event_incident_typefk,stats,y,x,elapsed,elapsed_plus,player1,player2,subtype,type,_del,team,goal_type
0,499317,375301,71,shoton,,,12,,30872,,,goal,,9823,n
1,499317,375302,20,penalties,,,16,,32118,,,goal,,9823,p
2,499317,375303,71,shoton,,,25,,33101,,,goal,,9790,n
3,499317,375308,20,penalties,,,56,,39106,,,goal,,9790,p
4,489043,375546,393,shoton,,,4,,26181,39297.0,shot,goal,,9825,n


In [224]:
events_xml_columns = top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].columns
events_xml_columns

Index(['goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner',
       'possession'],
      dtype='object')

**SHOTON**

In [225]:
# some rows have just "<shoton />" because there were no shots on goal, so we have to filter those out
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(10)[top_5_leagues_matches_df_clean.shoton.str.len() > 10].shoton.values[0])

  parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(10)[top_5_leagues_matches_df_clean.shoton.str.len() > 10].shoton.values[0])


{'value': [{'stats': {'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '136'},
   'elapsed': {'elapsed': '15'},
   'subtype': {'subtype': 'header'},
   'player1': {'player1': '33028'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '10267'},
   'n': {'n': '250'},
   'type': {'type': 'shoton'},
   'id': {'id': '1933269'}},
  {'stats': {'shoton': {'shoton': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '147'},
   'elapsed': {'elapsed': '16'},
   'subtype': {'subtype': 'shot'},
   'player1': {'player1': '38886'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '9869'},
   'n': {'n': '251'},
   'type': {'type': 'shoton'},
   'id': {'id': '1933275'}},
  {'stats': {'blocked': {'blocked': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '61'},
   'elapsed': {'elapsed': '32'},
   'subtype': {'subtype': 'blocked_shot'},
   'player1': {'player1': '150739'},
   'sortorder': {'sortorder': '2'},
   'team': {'team': '10267'

Learnings:
- `stats` key includes a `shoton` nested key that should be ignored (similar to `goals` key previously)
- `event_incident_typefk`, `elapsed`, `subtype`, `player1`, `team` and `id` are still present
    - of course `player1` is the shooter
- there's a subtype of shots on target named `big chance shot`, which can be a way to assess the excitement of a match (how many of these came up in a match)

In [226]:
def create_shoton_events_dataframe(events_data: pd.DataFrame):
    events_xml: str = events_data.shoton
    match_api_id: int = events_data.match_api_id
    df_data_rows = []

    def get_events(xml_str: str):
        events = parse_xml_to_dict(events_xml).get("value")
        if not events:
            return

        if not isinstance(events, list):
            events = [events]

        for event in events:
            df_data = {}
            event_api_id = event.get("id", {}).get("id")
            stats_keys = ",".join([key for key in event.get("stats", {}) if key != "shoton"])
            elapsed = event.get("elapsed", {}).get("elapsed")
            elapsed_plus = event.get("elapsed_plus", {}).get("elapsed_plus")
            player1 = event.get("player1", {}).get("player1")
            subtype = event.get("subtype", {}).get("subtype")
            type = event.get("type", {}).get("type")
            _del = event.get("del", {}).get("del")
            card_type = event.get("card_type", {}).get("card_type")
            team = event.get("team", {}).get("team")
            event_incident_typefk = event.get("event_incident_typefk", {}).get("event_incident_typefk")

            df_data["match_api_id"] = match_api_id
            df_data["event_api_id"] = event_api_id
            df_data["event_incident_typefk"] = event_incident_typefk
            df_data["stats"] = stats_keys
            df_data["elapsed"] = elapsed
            df_data["elapsed_plus"] = elapsed_plus
            df_data["player1"] = player1
            df_data["subtype"] = subtype
            df_data["type"] = type
            df_data["_del"] = _del
            df_data["card_type"] = card_type
            df_data["team"] = team

            df_data_rows.append(df_data)

    get_events(events_xml)

    return df_data_rows

shoton_events_list = []
top_5_leagues_matches_df_clean.loc[:, ["match_api_id", "shoton"]].apply(create_shoton_events_dataframe, axis=1).apply(lambda x: shoton_events_list.extend(x))
shoton_events_df = pd.DataFrame(shoton_events_list)
shoton_events_df.head()

Unnamed: 0,match_api_id,event_api_id,event_incident_typefk,stats,elapsed,elapsed_plus,player1,subtype,type,_del,card_type,team
0,489043,375556,61,blocked,7,,31013,blocked_shot,shoton,,,9825
1,489043,375557,137,,7,,30960,distance,shoton,,,9825
2,489043,375563,61,blocked,9,,26111,blocked_shot,shoton,,,9825
3,489043,375596,135,,23,,36410,shot,shoton,,,9825
4,489043,375628,136,,32,,26111,header,shoton,,,9825


In [227]:
shoton_events_df.card_type.value_counts(dropna=False)

None    85100
y           1
Name: card_type, dtype: int64

In [228]:
shoton_events_df[shoton_events_df.card_type == 'y']

Unnamed: 0,match_api_id,event_api_id,event_incident_typefk,stats,elapsed,elapsed_plus,player1,subtype,type,_del,card_type,team
35040,1083354,1952281,494,blocked,84,,73845,blocked_shot,shoton,,y,9857


In [229]:
shoton_events_df._del.value_counts(dropna=False)

None    85037
1          64
Name: _del, dtype: int64

In [230]:
shoton_events_df[shoton_events_df._del == '1']

Unnamed: 0,match_api_id,event_api_id,event_incident_typefk,stats,elapsed,elapsed_plus,player1,subtype,type,_del,card_type,team
31986,1051808,1828530,311,,53,,,deflected,shoton,1,,
32271,1030061,1834656,62,blocked,75,,,blocked_shot,shoton,1,,
32549,1051811,1844974,62,blocked,55,,,blocked_shot,shoton,1,,
33628,1025282,1889204,61,blocked,45,3,,blocked_shot,shoton,1,,
33888,1051841,1906554,153,,16,,,shot,shoton,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...
80497,2060461,5391728,135,,63,,,shot,shoton,1,,
81046,2002332,5419017,61,blocked,89,,,blocked_shot,shoton,1,,
82307,2002419,5494136,61,blocked,32,,,blocked_shot,shoton,1,,
83044,2030504,5528381,137,,72,,,distance,shoton,1,,


**SHOTOFF**

In [231]:
(top_5_leagues_matches_df_clean.shotoff.str.len() <= 11).sum()

5050

In [232]:
# some rows have just "<shotoff />" because there were no shots on goal, so we have to filter those out
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(10)[top_5_leagues_matches_df_clean.shotoff.str.len() > 11].shotoff.values[0])

  parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(10)[top_5_leagues_matches_df_clean.shotoff.str.len() > 11].shotoff.values[0])


{'value': [{'stats': {'shotoff': {'shotoff': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '317'},
   'elapsed': {'elapsed': '12'},
   'subtype': {'subtype': 'deflected'},
   'player1': {'player1': '30655'},
   'sortorder': {'sortorder': '3'},
   'team': {'team': '10189'},
   'n': {'n': '159'},
   'type': {'type': 'shotoff'},
   'id': {'id': '1257485'}},
  {'stats': {'shotoff': {'shotoff': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '46'},
   'elapsed': {'elapsed': '23'},
   'subtype': {'subtype': 'shot'},
   'player1': {'player1': '30655'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '10189'},
   'n': {'n': '166'},
   'type': {'type': 'shotoff'},
   'id': {'id': '1257504'}},
  {'stats': {'shotoff': {'shotoff': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '47'},
   'elapsed': {'elapsed': '25'},
   'subtype': {'subtype': 'header'},
   'player1': {'player1': '164706'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '815

Learnings:
- seems like the same structure as shots on target
- there's a subtype of shots off target named `crossbar`, which can be a way to get more info on goal mouth action for shots off target
    - there's a similar subtype called `big chance header`, interesting for the same reason that `big chance shot` is interesting
- there's an `elapsed_plus` key which tells us how many minutes into added time the event happened
    - can be quite powerful for measuring last minute action, as a measure of excitability
    - however I need to add this column to all the other tables that have `elapsed` so that I can have that info there too
- there's a `bicycle_kick` subtype, which is could be a measure of excitement

In [233]:
def create_shotoff_events_dataframe(events_data: pd.DataFrame):
    events_xml: str = events_data.shotoff
    match_api_id: int = events_data.match_api_id
    df_data_rows = []

    def get_events(xml_str: str):
        events = parse_xml_to_dict(events_xml).get("value")
        if not events:
            return

        if not isinstance(events, list):
            events = [events]

        for event in events:
            df_data = {}
            event_api_id = event.get("id", {}).get("id")
            stats_keys = ",".join([key for key in event.get("stats", {}) if key != "shotoff"])
            elapsed = event.get("elapsed", {}).get("elapsed")
            elapsed_plus = event.get("elapsed_plus", {}).get("elapsed_plus")
            player1 = event.get("player1", {}).get("player1")
            subtype = event.get("subtype", {}).get("subtype")
            type = event.get("type", {}).get("type")
            _del = event.get("del", {}).get("del")
            card_type = event.get("card_type", {}).get("card_type")
            team = event.get("team", {}).get("team")
            event_incident_typefk = event.get("event_incident_typefk", {}).get("event_incident_typefk")

            df_data["match_api_id"] = match_api_id
            df_data["event_api_id"] = event_api_id
            df_data["event_incident_typefk"] = event_incident_typefk
            df_data["stats"] = stats_keys
            df_data["elapsed"] = elapsed
            df_data["elapsed_plus"] = elapsed_plus
            df_data["player1"] = player1
            df_data["subtype"] = subtype
            df_data["type"] = type
            df_data["_del"] = _del
            df_data["card_type"] = card_type
            df_data["team"] = team

            df_data_rows.append(df_data)

    get_events(events_xml)

    return df_data_rows

shotoff_events_list = []
top_5_leagues_matches_df_clean.loc[:, ["match_api_id", "shotoff"]].apply(create_shotoff_events_dataframe, axis=1).apply(lambda x: shotoff_events_list.extend(x))
shotoff_events_df = pd.DataFrame(shotoff_events_list)
shotoff_events_df.head()

Unnamed: 0,match_api_id,event_api_id,event_incident_typefk,stats,elapsed,elapsed_plus,player1,subtype,type,_del,card_type,team
0,489043,375553,81,,6,,23257,direct_freekick,shotoff,,,8659
1,489043,375561,317,,9,,26181,deflected,shotoff,,,9825
2,489043,375572,9,,11,,38835,distance,shotoff,,,9825
3,489043,375585,317,,19,,26111,deflected,shotoff,,,9825
4,489043,375603,47,,25,,36410,header,shotoff,,,9825


Try creating a function that will look through every event in every row and extract all the unique keys, so that I can know right out of the gate what columns I need to create
- should be more doable now that I have the function for parsing the xml to get the dictionary of events

In [234]:
def get_all_event_keys(events_xml: str):
    events_keys = set()

    def get_events(xml_str: str):
        events = parse_xml_to_dict(events_xml).get("value")
        if not events:
            return

        if not isinstance(events, list):
            events = [events]

        for event in events:
            df_data = {}
            keys_set = set(event.keys())
            for unrecorded_key in (events_keys | keys_set) - events_keys:
                events_keys.add(unrecorded_key)

    get_events(events_xml)

    return events_keys

def combine_sets(series: pd.Series):
    combined_set = set()
    for s in series:
        combined_set.update(s)
    return combined_set

# start checking with goal events
combine_sets(top_5_leagues_matches_df_clean.goal.apply(get_all_event_keys))

{'comment',
 'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'goal_type',
 'id',
 'n',
 'player1',
 'player2',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

`del` is the only one I hadn't already seen, so I've updated the goal events code above with it.

In [235]:
# next is shoton
combine_sets(top_5_leagues_matches_df_clean.shoton.apply(get_all_event_keys))

{'card_type',
 'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'goal_type',
 'id',
 'n',
 'player1',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

`del` and `card_type` are the new ones, I've updated the code above to include them in the df.

In [236]:
# next is shotoff
combine_sets(top_5_leagues_matches_df_clean.shotoff.apply(get_all_event_keys))

{'card_type',
 'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'id',
 'n',
 'player1',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

Same as for `shoton`, not surprising.

**FOULCOMMIT**

In [237]:
combine_sets(top_5_leagues_matches_df_clean.foulcommit.apply(get_all_event_keys))

{'card_type',
 'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'id',
 'injury_time',
 'n',
 'player1',
 'player2',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value',
 'venue'}

In [238]:
foulcommit_event_keys = combine_sets(top_5_leagues_matches_df_clean.foulcommit.apply(get_all_event_keys))
foulcommit_event_keys

{'card_type',
 'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'id',
 'injury_time',
 'n',
 'player1',
 'player2',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value',
 'venue'}

In [239]:
def create_events_df_with_columns(cols_to_save, df_column_name):
    """
    Returns a function that creates a dataframe with the given columns from the given column of the top_5_leagues_matches_df_clean dataframe
    """
    def create_events_dataframe(events_data: pd.DataFrame):
        events_xml: str = events_data[df_column_name]
        match_api_id: int = events_data.match_api_id
        df_data_rows = []

        def get_events(xml_str: str):
            events = parse_xml_to_dict(events_xml).get("value")
            if not events:
                return

            if not isinstance(events, list):
                events = [events]

            for event in events:
                df_data = {}
                for col in cols_to_save:
                    if col == "id":
                        col_name = "event_api_id"
                    elif col == "del":
                        col_name = "_del"
                    else:
                        col_name = col

                    df_data["match_api_id"] = match_api_id

                    if col == "stats":
                        df_data[col] = ",".join([key for key in event.get(col, {}) if key != df_column_name])
                    else:
                        df_data[col_name] = event.get(col, {}).get(col, None)

                df_data_rows.append(df_data)

        get_events(events_xml)

        return df_data_rows

    return create_events_dataframe

def create_df_from_events(events_series: pd.Series):
    events_list = []
    events_series.apply(lambda x: events_list.extend(x))
    return pd.DataFrame(events_list)

foulcommit_events_df = create_df_from_events(
    top_5_leagues_matches_df_clean[["match_api_id", "foulcommit"]].apply(create_events_df_with_columns(
        foulcommit_event_keys.difference({"sortorder", "n", "coordinates", "value"}),
        "foulcommit"
    ), axis=1)
)
foulcommit_events_df.head()

Unnamed: 0,match_api_id,card_type,type,player1,elapsed_plus,subtype,venue,team,player2,elapsed,event_api_id,_del,event_incident_typefk,stats,injury_time
0,489043,,foulcommit,36832,,,,8659,26181,2,375531,,37,foulscommitted,
1,489043,,foulcommit,31291,,,,9825,23257,3,375541,,37,foulscommitted,
2,489043,,foulcommit,30935,,pushing,,9825,31088,5,375551,,320,foulscommitted,
3,489043,,foulcommit,30960,,shirt_pull,,9825,24171,10,375564,,18,foulscommitted,
4,489043,,foulcommit,30960,,shirt_pull,,9825,23115,17,375581,,18,foulscommitted,


In [240]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(1).foulcommit.values[0])

{}

In [241]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.foulcommit.str.len() < 20].foulcommit.value_counts(dropna=False)

<foulcommit />    5048
Name: foulcommit, dtype: int64

The `<foulcommit />` value indicates that there are no foul records for the match. That is truly strange and almost impossible, so this more than likely means the records are just not available within the table. Worth remembering because the analysis has to factor this into the conclusions.

**CARD**

In [242]:
card_event_keys = combine_sets(top_5_leagues_matches_df_clean.card.apply(get_all_event_keys))
card_event_keys

{'card_type',
 'comment',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'goal_type',
 'id',
 'n',
 'player1',
 'sortorder',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

In [243]:
card_events_df = create_df_from_events(
    top_5_leagues_matches_df_clean[["match_api_id", "card"]].apply(create_events_df_with_columns(
        card_event_keys.difference({"sortorder", "n", "value"}),
        "card"
    ), axis=1)
)
card_events_df.head()

Unnamed: 0,match_api_id,subtype,elapsed,event_incident_typefk,goal_type,card_type,type,player1,elapsed_plus,event_api_id,_del,comment,stats,team
0,499317,,19,70,,y,card,30749,,375310,,y,ycards,9790
1,499317,,64,70,,y,card,38244,,375311,,y,ycards,9823
2,499317,,44,70,,y,card,33101,,375312,,y,ycards,9790
3,499317,,44,70,,y,card,30598,,375313,,y,ycards,9790
4,499317,,85,70,,y,card,37156,,375317,,y,ycards,9790


In [244]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(1).card.values[0])

{'value': [{'comment': {'comment': 'y'},
   'stats': {'ycards': {'ycards': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '70'},
   'elapsed': {'elapsed': '37'},
   'card_type': {'card_type': 'y'},
   'player1': {'player1': '42434'},
   'sortorder': {'sortorder': '0'},
   'team': {'team': '8540'},
   'n': {'n': '232'},
   'type': {'type': 'card'},
   'id': {'id': '4371284'}},
  {'comment': {'comment': 'y'},
   'stats': {'ycards': {'ycards': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '70'},
   'elapsed': {'elapsed': '55'},
   'card_type': {'card_type': 'y'},
   'player1': {'player1': '108401'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8540'},
   'n': {'n': '330'},
   'type': {'type': 'card'},
   'id': {'id': '4371849'}},
  {'comment': {'comment': 'y'},
   'stats': {'ycards': {'ycards': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '70'},
   'elapsed': {'elapsed': '66'},
   'card_type': {'card_type': 'y'},
   'player1': {'pla

**CROSS**

In [245]:
cross_event_keys = combine_sets(top_5_leagues_matches_df_clean.cross.apply(get_all_event_keys))
cross_event_keys

{'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'goal_type',
 'id',
 'n',
 'player1',
 'sortorder',
 'spectators',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

In [246]:
cross_events_df = create_df_from_events(
    top_5_leagues_matches_df_clean[["match_api_id", "cross"]].apply(create_events_df_with_columns(
        cross_event_keys.difference({"sortorder", "n", "value", "coordinates"}),
        "cross"
    ), axis=1)
)
cross_events_df.head()

Unnamed: 0,match_api_id,spectators,type,player1,elapsed_plus,subtype,elapsed,event_api_id,_del,event_incident_typefk,goal_type,stats,team
0,489043,,cross,39297,,cross,3,375536,,7,,crosses,9825
1,489043,,corner,39297,,cross,3,375538,,329,,corners,9825
2,489043,,cross,31291,,cross,10,375566,,7,,crosses,9825
3,489043,,cross,26111,,cross,11,375568,,7,,crosses,9825
4,489043,,cross,31291,,cross,11,375570,,7,,crosses,9825


In [247]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(1).cross.values[0])

{'value': [{'stats': {'corners': {'corners': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '329'},
   'elapsed': {'elapsed': '3'},
   'subtype': {'subtype': 'cross'},
   'player1': {'player1': '23916'},
   'sortorder': {'sortorder': '3'},
   'team': {'team': '8655'},
   'n': {'n': '211'},
   'type': {'type': 'corner'},
   'id': {'id': '2004456'}},
  {'stats': {'corners': {'corners': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '329'},
   'elapsed': {'elapsed': '4'},
   'subtype': {'subtype': 'cross'},
   'player1': {'player1': '23916'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8655'},
   'n': {'n': '212'},
   'type': {'type': 'corner'},
   'id': {'id': '2004463'}},
  {'stats': {'crosses': {'crosses': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '7'},
   'elapsed': {'elapsed': '7'},
   'subtype': {'subtype': 'cross'},
   'player1': {'player1': '175947'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '8472'},
   'n'

**CORNER**

In [248]:
corner_event_keys = combine_sets(top_5_leagues_matches_df_clean.corner.apply(get_all_event_keys))
corner_event_keys

{'coordinates',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'id',
 'n',
 'player1',
 'sortorder',
 'spectators',
 'stats',
 'subtype',
 'team',
 'type',
 'value'}

In [249]:
corner_events_df = create_df_from_events(
    top_5_leagues_matches_df_clean[["match_api_id", "corner"]].apply(create_events_df_with_columns(
        corner_event_keys.difference({"sortorder", "n", "value", "coordinates"}),
        "corner"
    ), axis=1)
)
corner_events_df.head()

Unnamed: 0,match_api_id,spectators,type,player1,elapsed_plus,subtype,elapsed,event_api_id,_del,event_incident_typefk,stats,team
0,489043,,corner,39297,,short,3,375535,,330,corners,9825
1,489043,,corner,39297,,cross,3,375538,,329,corners,9825
2,489043,,corner,30960,,short,9,375562,,330,corners,9825
3,489043,,corner,39297,,cross,19,375586,,329,corners,9825
4,489043,,corner,39297,,cross,25,375602,,329,corners,9825


In [250]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(1).corner.values[0])

{'value': [{'stats': {'corners': {'corners': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '871'},
   'coordinates': {'value': [{'value': '1'}, {'value': '1'}]},
   'elapsed': {'elapsed': '28'},
   'subtype': {'subtype': 'short_left'},
   'player1': {'player1': '176300'},
   'sortorder': {'sortorder': '1'},
   'team': {'team': '10194'},
   'n': {'n': '202'},
   'type': {'type': 'corner'},
   'id': {'id': '4792141'}},
  {'stats': {'corners': {'corners': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '867'},
   'coordinates': {'value': [{'value': '1'}, {'value': '1'}]},
   'elapsed': {'elapsed': '28'},
   'subtype': {'subtype': 'cross_left'},
   'player1': {'player1': '176300'},
   'sortorder': {'sortorder': '3'},
   'team': {'team': '10194'},
   'n': {'n': '214'},
   'type': {'type': 'corner'},
   'id': {'id': '4792149'}},
  {'stats': {'corners': {'corners': '1'}},
   'event_incident_typefk': {'event_incident_typefk': '867'},
   'coordinates': {'value': [{'v

**POSSESSION**

In [251]:
possession_event_keys = combine_sets(top_5_leagues_matches_df_clean.possession.apply(get_all_event_keys))
possession_event_keys

{'awaypos',
 'card_type',
 'comment',
 'del',
 'elapsed',
 'elapsed_plus',
 'event_incident_typefk',
 'homepos',
 'id',
 'injury_time',
 'n',
 'sortorder',
 'stats',
 'subtype',
 'type',
 'value'}

In [252]:
possession_events_df = create_df_from_events(
    top_5_leagues_matches_df_clean[["match_api_id", "possession"]].apply(create_events_df_with_columns(
        possession_event_keys.difference({"sortorder", "n", "value", "type"}),
        "possession"
    ), axis=1)
)
possession_events_df.head()

Unnamed: 0,match_api_id,card_type,homepos,elapsed_plus,subtype,awaypos,elapsed,event_api_id,_del,event_incident_typefk,comment,stats,injury_time
0,489043,,65,,possession,35,27,375608,,352,65,,
1,489043,,61,2.0,possession,39,45,375663,,352,61,,
2,489043,,65,,possession,35,74,375787,,352,65,,
3,489043,,66,3.0,possession,34,90,375909,,352,66,,
4,489044,,45,,possession,55,25,377876,,352,45,,


In [253]:
parse_xml_to_dict(top_5_leagues_matches_df_clean.sample(1).possession.values[0])

{}

In [254]:
top_5_leagues_matches_df_clean.drop(events_xml_columns, axis=1, inplace=True)

##### Test

In [255]:
# assert that each member of the events xml columns variable is not a column in the top 5 league matches clean df
for xml_col in events_xml_columns:
    assert xml_col not in top_5_leagues_matches_df_clean.columns

#### player positions (`home_player_X1` ... `away_player_Y11`) should be in a separate table

##### Define

- move those values into a dataframe where each row is unique on the match ID and player
- the dataframe also needs columns for the x and y position of each player
- so effectively the columns should be: match api id, player id, x pos, y pos

##### Code

In [256]:
top_5_leagues_matches_df_clean.iloc[:, -100:-30].columns

Index(['id', 'country_id', 'league_id', 'season', 'matchday', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1', 'home_player_X2',
       'home_player_X3', 'home_player_X4', 'home_player_X5', 'home_player_X6',
       'home_player_X7', 'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2', 'away_player_X3',
       'away_player_X4', 'away_player_X5', 'away_player_X6', 'away_player_X7',
       'away_player_X8', 'away_player_X9', 'away_player_X10',
       'away_player_X11', 'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6', 'home_player_Y7',
       'home_player_Y8', 'home_player_Y9', 'home_player_Y10',
       'home_player_Y11', 'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
       'away_player_Y4', 'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'away_player_Y8', 'aw

In [257]:
top_5_leagues_matches_df_clean.columns[top_5_leagues_matches_df_clean.columns.str.contains('player_')]

Index(['home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7', 'home_player_X8',
       'home_player_X9', 'home_player_X10', 'home_player_X11',
       'away_player_X1', 'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3', 'home_player_Y4',
       'home_player_Y5', 'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7', 'away_player_Y8',
       'away_player_Y9', 'away_player_Y10', 'away_player_Y11', 'home_player_1',
       'home_player_2', 'home_player_3', 'home_player_4', 'home_player_5',
       'home_player_6', 

In [258]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.match_api_id == 499317].home_player_X1

10    1.0
Name: home_player_X1, dtype: float64

In [259]:
top_5_leagues_matches_df_clean.iloc[:, -84:-29].columns

Index(['home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7', 'home_player_X8',
       'home_player_X9', 'home_player_X10', 'home_player_X11',
       'away_player_X1', 'away_player_X2', 'away_player_X3', 'away_player_X4',
       'away_player_X5', 'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3', 'home_player_Y4',
       'home_player_Y5', 'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
       'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
       'away_player_Y1', 'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7', 'away_player_Y8',
       'away_player_Y9', 'away_player_Y10', 'away_player_Y11', 'home_player_1',
       'home_player_2', 'home_player_3', 'home_player_4', 'home_player_5',
       'home_player_6', 

In [260]:
def transform_player_positions(df):
    # Get the relevant columns related to player positions
    player_columns = df.columns[df.columns.str.startswith('home_player_')]

    # Split the player_columns into player IDs and coordinate types
    player_nums = player_columns.str.extract(r'home_player_(\d+)', expand=True).dropna()[0].tolist()

    # Create a list to store the transformed rows
    rows = []

    def populate_players_pos(team_designation, match_api_id, player_nums):
        # Iterate over each player ID and coordinate type
        for player_num in player_nums:
            player_id = row[f'{team_designation}_player_{player_num}']
            x_pos = row[f'{team_designation}_player_X{player_num}']
            y_pos = row[f'{team_designation}_player_Y{player_num}']

            # Create a new row with the transformed values
            transformed_row = {
                'match_api_id': match_api_id,
                'player_id': player_id,
                'player_num': player_num,
                'team_designation': team_designation,
                'x_pos': x_pos,
                'y_pos': y_pos
            }

            # Append the transformed row to the list
            rows.append(transformed_row)

    # Iterate over each row in the original dataframe
    for _, row in df.iterrows():
        match_api_id = row['match_api_id']

        populate_players_pos('home', match_api_id, player_nums)
        populate_players_pos('away', match_api_id, player_nums)

    # Create a new dataframe from the list of transformed rows
    transformed_df = pd.DataFrame(rows)

    return transformed_df

player_match_positions_df = transform_player_positions(top_5_leagues_matches_df_clean)
player_match_positions_df.head()

Unnamed: 0,match_api_id,player_id,player_num,team_designation,x_pos,y_pos
0,499317,27284.0,1,home,1.0,1.0
1,499317,35988.0,2,home,2.0,3.0
2,499317,39774.0,3,home,4.0,3.0
3,499317,33085.0,4,home,6.0,3.0
4,499317,30894.0,5,home,8.0,3.0


##### Test

In [261]:
player_match_positions_df.describe()

Unnamed: 0,match_api_id,player_id,x_pos,y_pos
count,279752.0,279752.0,279748.0,279748.0
mean,1254489.0,88541.518305,4.635622,5.55115
std,478133.4,98344.664662,2.144022,2.986857
min,489042.0,2752.0,1.0,1.0
25%,857043.8,30740.0,3.0,3.0
50%,1229438.0,39534.0,5.0,6.0
75%,1709996.0,115670.0,6.0,8.0
max,2060645.0,750435.0,9.0,11.0


In [262]:
top_5_leagues_matches_df_clean.iloc[:, 11:33].describe()

Unnamed: 0,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,...,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11
count,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,...,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12716.0,12712.0
mean,1.000079,2.086427,4.068103,6.064092,7.444086,3.421044,4.945659,5.014391,5.763684,5.53822,...,2.085876,4.067395,6.06598,7.409641,3.455961,4.897767,5.008729,5.747641,5.631802,5.623977
std,0.008868,0.362846,0.367048,0.431985,1.795859,1.159268,1.125176,1.822142,1.830077,1.454147,...,0.389422,0.392742,0.454595,1.833562,1.248147,1.170656,1.814392,1.818487,1.501398,0.71963
min,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0
25%,1.0,2.0,4.0,6.0,8.0,3.0,4.0,3.0,5.0,4.0,...,2.0,4.0,6.0,8.0,3.0,4.0,3.0,5.0,4.0,5.0
50%,1.0,2.0,4.0,6.0,8.0,4.0,5.0,6.0,5.0,5.0,...,2.0,4.0,6.0,8.0,4.0,5.0,6.0,5.0,5.0,5.0
75%,1.0,2.0,4.0,6.0,8.0,4.0,6.0,7.0,8.0,7.0,...,2.0,4.0,6.0,8.0,4.0,6.0,7.0,8.0,7.0,6.0
max,2.0,8.0,8.0,8.0,9.0,9.0,9.0,9.0,9.0,9.0,...,8.0,8.0,8.0,9.0,9.0,8.0,9.0,9.0,9.0,7.0


Looks like the values are in the same range between 1 and 11.

In [263]:
def confirm_player_pos():
    """
    Gets a random row in the match table, picks a random player from the match, and confirms that the player's
    coordinates match the coordinates in the player match positions table
    """
    random_row = top_5_leagues_matches_df_clean.sample(1)
    random_home_player = random_row.iloc[0]['home_player_5']
    random_away_player = random_row.iloc[0]['away_player_5']
    random_match = random_row.iloc[0]['match_api_id']
    home_player_match_pos = player_match_positions_df[
        (player_match_positions_df.player_id == random_home_player) & (player_match_positions_df.match_api_id == random_match)
    ]
    away_player_match_pos = player_match_positions_df[
        (player_match_positions_df.player_id == random_away_player) & (player_match_positions_df.match_api_id == random_match)
    ]
    assert not home_player_match_pos.empty, "No matching position for home player"
    assert not away_player_match_pos.empty, "No matching position for away player"
    assert home_player_match_pos.iloc[0]['x_pos'] == random_row.iloc[0]['home_player_X5']
    assert home_player_match_pos.iloc[0]['y_pos'] == random_row.iloc[0]['home_player_Y5']
    assert away_player_match_pos.iloc[0]['x_pos'] == random_row.iloc[0]['away_player_X5']
    assert away_player_match_pos.iloc[0]['y_pos'] == random_row.iloc[0]['away_player_Y5']

confirm_player_pos()

#### match players (`home_player_1` ... `away_player_11`) should be in a separate table

##### Define

- no need to actually do anything here because the player positions table already contains rows that are unique on player ID and match ID

### player attributes table - tidiness

#### duplicated rows for the same information (same player and date)

##### Define

- investigate the affected rows to see what info if any is different
- decide on if the duplicated rows should be preserved as-is, merged using a strategy, or dropped
- implement the decision

##### Code

In [264]:
player_attributes_df_clean[pd.Series(player_attributes_df_clean.apply(lambda row: str(row.date) + "_" + str(row.player_api_id), axis=1)).duplicated()]

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,season,league_id


These have already been unintentionally removed by the other data cleaning steps.

##### Test

In [265]:
assert pd.Series(player_attributes_df_clean.apply(lambda row: str(row.date) + "_" + str(row.player_api_id), axis=1)).duplicated().sum() == 0

### champs league table - tidiness

#### some rows are duplicates

##### Define

- investigate the affected rows to see what info if any is different
- decide on if the duplicated rows should be preserved as-is, merged using a strategy, or dropped
- implement the decision

##### Code

In [266]:
champs_league_hist_df_clean.columns

Index(['year', 'team_name', 'progress', 'team_country', 'team_api_id',
       'season', 'league_id'],
      dtype='object')

In [267]:
champs_league_hist_df_clean.apply(lambda x: str(x.year) + "_" + str(x.team_api_id) + "_" + x.progress, axis=1).duplicated().sum()

0

Once again it seems that there are actually no duplicates. Lovely!

##### Test

In [268]:
assert champs_league_hist_df_clean.apply(lambda x: str(x.year) + "_" + str(x.team_api_id) + "_" + x.progress, axis=1).duplicated().sum() == 0

## Iterating - Assess Data - Quality

### assessing goal events data

#### quality

In [269]:
goal_events_df.columns

Index(['match_api_id', 'event_api_id', 'event_incident_typefk', 'stats', 'y',
       'x', 'elapsed', 'elapsed_plus', 'player1', 'player2', 'subtype', 'type',
       '_del', 'team', 'goal_type'],
      dtype='object')

In [270]:
goal_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35579 entries, 0 to 35578
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_api_id           35579 non-null  int64 
 1   event_api_id           35579 non-null  object
 2   event_incident_typefk  35579 non-null  object
 3   stats                  35579 non-null  object
 4   y                      9609 non-null   object
 5   x                      9609 non-null   object
 6   elapsed                35579 non-null  object
 7   elapsed_plus           1528 non-null   object
 8   player1                35474 non-null  object
 9   player2                15076 non-null  object
 10  subtype                19403 non-null  object
 11  type                   35579 non-null  object
 12  _del                   27 non-null     object
 13  team                   35552 non-null  object
 14  goal_type              35552 non-null  object
dtypes: int64(1), object

In [271]:
goal_events_df.type.value_counts()

goal    35579
Name: type, dtype: int64

In [272]:
goal_events_df.goal_type.value_counts()

n      30690
p       2647
o       1006
npm      666
dg       525
rp        17
psm        1
Name: goal_type, dtype: int64

In [273]:
goal_events_df.subtype.value_counts()

shot                    10457
header                   3710
distance                 1253
volley                    982
tap_in                    776
direct_freekick           596
loose_ball                546
deflected                 351
saved_back_into_play      240
saved                     117
lob                       113
bicycle_kick              106
missed                     72
post                       28
backheel                   25
indirect freekick          19
crossbar                   12
Name: subtype, dtype: int64

In [274]:
goal_events_df.stats.value_counts()

shoton          30706
penalties        2630
                 1234
owngoals         1006
stats,shoton        2
shotoff             1
Name: stats, dtype: int64

In [280]:
goal_events_df.elapsed_plus.unique()

array([None, '2', '5', '3', '1', '4', '14', '6', '7', '8', '12', '11',
       '9'], dtype=object)

In [283]:
def investigate_goal_added_time():
    goal_event = goal_events_df.query("elapsed_plus == '11'")
    match_api_id = goal_event.match_api_id.values[0]
    match = top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.match_api_id == match_api_id]
    home_team = teams_df_clean[teams_df_clean.team_api_id == match.home_team_api_id.values[0]].team_long_name.values[0]
    away_team = teams_df_clean[teams_df_clean.team_api_id == match.away_team_api_id.values[0]].team_long_name.values[0]
    return home_team, match.home_team_goal.values[0], match.away_team_goal.values[0], away_team, match.date.values[0], goal_event.elapsed.values[0], goal_event.player1.values[0], goal_event.team.values[0]

investigate_goal_added_time()

('Chelsea',
 3,
 1,
 'Norwich City',
 numpy.datetime64('2011-08-27T00:00:00.000000000'),
 '90',
 '41468',
 '8455')

Findings:
- the matches with longest added time were all valid

In [294]:
goal_events_df.team.unique()

array(['9823', '9790', '9825', '8650', '8654', '8528', '8655', '8668',
       '8549', '8559', '10194', '9879', '8667', '8697', '9912', '8722',
       '8721', '9789', '8178', '8177', '10261', '10260', '8455', '8659',
       '8472', '8586', '10252', '10189', '9911', '9810', '8600', '8540',
       '8462', '8636', '9882', '10267', '9788', '8226', '8524', '8690',
       '8533', '9885', '8535', '9857', '8564', '8686', '9875', '9804',
       '8529', '8543', '8479', '8302', '8696', '9783', '8633', '10205',
       '8371', '8456', '8530', '10281', '9906', '8634', '10233', '8551',
       '9888', '9865', '8305', '8558', '8661', '8398', '8295', '10269',
       '8315', '9869', '8603', '9864', '9904', '8388', '9905', '8165',
       '8358', '8191', '8602', '8658', '10167', '9976', '8394', '8537',
       '9867', '9868', '8483', '8152', '8350', '8560', '9880', '10278',
       '8581', '9858', '9853', '9873', '8588', '8583', '9748', '10249',
       '9941', '9874', '9851', '9831', '9827', '9847', '8521', '

#### tidiness

In [275]:
goal_events_df.event_api_id.duplicated().sum()

0

### assessing shoton events data

#### quality

In [276]:
shoton_events_df.columns

Index(['match_api_id', 'event_api_id', 'event_incident_typefk', 'stats',
       'elapsed', 'elapsed_plus', 'player1', 'subtype', 'type', '_del',
       'card_type', 'team'],
      dtype='object')

In [277]:
shoton_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85101 entries, 0 to 85100
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_api_id           85101 non-null  int64 
 1   event_api_id           85101 non-null  object
 2   event_incident_typefk  85101 non-null  object
 3   stats                  85101 non-null  object
 4   elapsed                85101 non-null  object
 5   elapsed_plus           5397 non-null   object
 6   player1                84196 non-null  object
 7   subtype                85084 non-null  object
 8   type                   85101 non-null  object
 9   _del                   64 non-null     object
 10  card_type              1 non-null      object
 11  team                   85036 non-null  object
dtypes: int64(1), object(11)
memory usage: 7.8+ MB


In [278]:
shoton_events_df._del.value_counts()

1    64
Name: _del, dtype: int64

In [279]:
shoton_events_df.type.value_counts()

shoton    85101
Name: type, dtype: int64

In [280]:
shoton_events_df.subtype.value_counts()

blocked_shot          28925
shot                  20814
distance              11648
header                 7312
big chance shot        4779
deflected              3628
direct_freekick        3079
volley                 1451
big chance header      1275
blocked_header          824
big chance blocked      508
big chance volley       280
bicycle_kick            271
indirect freekick       150
lob                     132
big chance bicycle        8
Name: subtype, dtype: int64

In [281]:
shoton_events_df.stats.value_counts()

                 55648
blocked          29439
stats               12
stats,blocked        2
Name: stats, dtype: int64

In [331]:
shoton_events_df.elapsed_plus.unique()

array([None, '2', '3', '1', '4', '5', '6', '7', '12', '8', '11', '10',
       '-10', '9'], dtype=object)

In [335]:
def investigate_shoton_added_time():
    shoton_event = shoton_events_df.query("elapsed_plus == '-10'")
    match_api_id = shoton_event.match_api_id.values[0]
    match = top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.match_api_id == match_api_id]
    home_team = teams_df_clean[teams_df_clean.team_api_id == match.home_team_api_id.values[0]].team_long_name.values[0]
    away_team = teams_df_clean[teams_df_clean.team_api_id == match.away_team_api_id.values[0]].team_long_name.values[0]
    return home_team, match.home_team_goal.values[0], match.away_team_goal.values[0], away_team, match.date.values[0], shoton_event.elapsed.values[0], shoton_event.player1.values[0], shoton_event.team.values[0]

investigate_shoton_added_time()

('Cesena',
 1,
 3,
 'Milan',
 numpy.datetime64('2012-02-19T00:00:00.000000000'),
 '29',
 '80562',
 '8564')

Findings:
- the highest values of `elapsed_plus` are correct
- the negative value (`-10`) is correct, so some investigation is needed here to see if there are any other negative values and if they're correct

In [343]:
shoton_events_df.player1.sample(200).unique()

array(['25597', '26005', '25075', '25537', '34479', '193530', '27352',
       '30709', '46820', '46010', '39225', '24450', '28907', '33418',
       '177874', '26166', '30638', '185336', '30626', '41199', '42664',
       '38699', '134217', '39106', '113836', '38215', '119525', '214426',
       '246177', '33881', '27326', '46344', '192899', '27430', '30549',
       '39027', '31906', '161328', '72623', '30981', '31435', '187017',
       '128827', '52004', '215299', '30853', '107417', '33615', '30935',
       '30618', None, '2802', '30878', '24213', '35831', '24456', '41280',
       '23291', '103714', '36615', '184536', '37273', '177720', '40055',
       '23022', '181276', '26330', '35480', '198566', '25543', '41605',
       '26111', '150330', '25594', '30893', '562697', '184138', '95082',
       '183548', '26291', '39618', '24225', '38609', '70409', '127945',
       '37506', '41445', '24160', '178812', '15403', '34267', '40128',
       '171698', '157723', '32748', '30843', '30706', '47747

#### tidiness

In [282]:
shoton_events_df.event_api_id.duplicated().sum()

0

NOTE: not showing it again but `stats` contains info that's a subset of what's in `subtype`

### assessing shotoff events data

#### quality

In [283]:
shotoff_events_df.columns

Index(['match_api_id', 'event_api_id', 'event_incident_typefk', 'stats',
       'elapsed', 'elapsed_plus', 'player1', 'subtype', 'type', '_del',
       'card_type', 'team'],
      dtype='object')

In [284]:
shotoff_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86246 entries, 0 to 86245
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_api_id           86246 non-null  int64 
 1   event_api_id           86246 non-null  object
 2   event_incident_typefk  86246 non-null  object
 3   stats                  86246 non-null  object
 4   elapsed                86246 non-null  object
 5   elapsed_plus           5429 non-null   object
 6   player1                85649 non-null  object
 7   subtype                86067 non-null  object
 8   type                   86246 non-null  object
 9   _del                   52 non-null     object
 10  card_type              2 non-null      object
 11  team                   86194 non-null  object
dtypes: int64(1), object(11)
memory usage: 7.9+ MB


In [285]:
shotoff_events_df.type.value_counts()

shotoff    86246
Name: type, dtype: int64

In [286]:
shotoff_events_df.subtype.value_counts()

distance               20731
shot                   20146
header                 14311
deflected               7286
bad shot                3874
big chance shot         3863
direct_freekick         3808
volley                  3778
post                    1751
crossbar                1718
big chance header       1352
miss_kick               1066
bicycle_kick             594
big chance volley        531
big chance post          513
big chance crossbar      352
lob                      234
indirect freekick        137
big chance bicycle        22
Name: subtype, dtype: int64

In [287]:
shotoff_events_df._del.value_counts()

1    52
Name: _del, dtype: int64

In [288]:
shotoff_events_df.card_type.value_counts()

y    1
r    1
Name: card_type, dtype: int64

In [289]:
shotoff_events_df.stats.value_counts()

         86238
stats        8
Name: stats, dtype: int64

In [315]:
shotoff_events_df.elapsed_plus.unique()

array([None, '1', '3', '5', '4', '2', '10', '6', '7', '12', '8', '9',
       '13', '11'], dtype=object)

In [322]:
def investigate_shotoff_added_time():
    shotoff_event = shotoff_events_df.query("elapsed_plus == '12'")
    match_api_id = shotoff_event.match_api_id.values[0]
    match = top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.match_api_id == match_api_id]
    home_team = teams_df_clean[teams_df_clean.team_api_id == match.home_team_api_id.values[0]].team_long_name.values[0]
    away_team = teams_df_clean[teams_df_clean.team_api_id == match.away_team_api_id.values[0]].team_long_name.values[0]
    return home_team, match.home_team_goal.values[0], match.away_team_goal.values[0], away_team, match.date.values[0], shotoff_event.elapsed.values[0]

investigate_shotoff_added_time()

('Stoke City',
 2,
 1,
 'Tottenham Hotspur',
 numpy.datetime64('2008-10-19T00:00:00.000000000'),
 '90')

Findings
- both the matches with 13 and 12 minutes of added time are real, so will not check the others with less

In [328]:
shotoff_events_df.player1.sample(200).unique()

array(['23605', '94550', '37920', '26928', '38460', '46349', '37550',
       '31306', '161421', '212643', '15732', '26437', '163670', '32148',
       '35411', '8922', '464500', '24171', '27734', '129391', '109621',
       '30496', '202341', '23916', '210404', '38899', '26494', '23760',
       '164385', '37474', '37412', '150389', '21414', '49939', '73448',
       '51545', '469852', '30960', '39535', '30827', '75310', '34248',
       '75489', '23283', '42431', '37411', '24791', '335436', '30818',
       '24213', '30646', '39701', '35532', '30829', '26151', '23949',
       '30822', '38601', '96540', '33991', '128864', '16300', '144996',
       '40636', '38403', '184138', '308843', '213729', '177503', '213812',
       '41468', '362212', '30955', '30956', '31953', '23991', '75307',
       '24435', '24531', '26209', '195998', '129817', '72518', '156008',
       '30663', '30937', '25594', '239287', '30929', '34261', '521421',
       '160713', '103714', '32616', '200965', '23354', '38433', '2

#### tidiness

In [290]:
shotoff_events_df.event_api_id.duplicated().sum()

0

### assessing foulcommit events data

#### quality

In [291]:
foulcommit_events_df.columns

Index(['match_api_id', 'elapsed', 'injury_time', 'subtype',
       'event_incident_typefk', 'player1', 'venue', '_del', 'card_type',
       'team', 'event_api_id', 'elapsed_plus', 'stats', 'player2', 'type'],
      dtype='object')

In [292]:
foulcommit_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197877 entries, 0 to 197876
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   match_api_id           197877 non-null  int64 
 1   elapsed                197877 non-null  object
 2   injury_time            1 non-null       object
 3   subtype                104637 non-null  object
 4   event_incident_typefk  197877 non-null  object
 5   player1                190708 non-null  object
 6   venue                  2 non-null       object
 7   _del                   365 non-null     object
 8   card_type              13 non-null      object
 9   team                   197511 non-null  object
 10  event_api_id           197877 non-null  object
 11  elapsed_plus           11965 non-null   object
 12  stats                  197877 non-null  object
 13  player2                172256 non-null  object
 14  type                   197877 non-null  object
dtype

In [293]:
foulcommit_events_df.card_type.value_counts()

y    12
r     1
Name: card_type, dtype: int64

In [294]:
foulcommit_events_df.subtype.value_counts()

trip                26685
pushing             25818
from_behind         11984
serious_foul        10423
hands                7376
pull                 7290
dangerous_play       6437
shirt_pull           2798
obstruction          2583
penalty              2016
advantage             847
diving                361
goalkeeper_hands       19
Name: subtype, dtype: int64

In [295]:
foulcommit_events_df.venue.value_counts()

Borussia-Park            1
Stadio Luigi Ferraris    1
Name: venue, dtype: int64

In [296]:
foulcommit_events_df._del.value_counts()

1    365
Name: _del, dtype: int64

In [297]:
foulcommit_events_df.stats.value_counts()

foulscommitted          196655
                          1206
stats,foulscommitted        16
Name: stats, dtype: int64

In [298]:
foulcommit_events_df.type.value_counts()

foulcommit    197877
Name: type, dtype: int64

In [299]:
foulcommit_events_df.elapsed_plus.unique()

array([None, '1', '4', '3', '2', '5', '6', '7', '9', '13', '8', '10',
       '11', '12', '17'], dtype=object)

In [310]:
def investigate_17_mins_added_time():
    foul_event = foulcommit_events_df.query("elapsed_plus == '17'")
    match_api_id = foul_event.match_api_id.values[0]
    match = top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.match_api_id == match_api_id]
    home_team = teams_df_clean[teams_df_clean.team_api_id == match.home_team_api_id.values[0]].team_long_name
    away_team = teams_df_clean[teams_df_clean.team_api_id == match.away_team_api_id.values[0]].team_long_name
    return foul_event.player1, foul_event.player2, foul_event.team

investigate_17_mins_added_time()

(76381    None
 Name: player1, dtype: object,
 76381    None
 Name: player2, dtype: object,
 76381    None
 Name: team, dtype: object)

Findings:
- the 17th minute incident so far appears to be non-existent
    - from [this article](http://news.bbc.co.uk/sport1/hi/football/eng_prem/16331299.stm)
- the next 2 latest occurring events, at 13 minutes and 12, were existent
- probably need to at least drop this outlandish row
    - CONFIRMED: this row actually has no player or team ID, so it's useless and should be dropped

In [309]:
foulcommit_events_df.team.unique()

array(['8659', '9825', '8650', '8472', '8654', '8528', '8668', '8655',
       '8549', '8586', '10194', '8559', '8667', '9879', '8697', '9912',
       '10261', '10260', '8455', '8462', '10252', '9823', '9789', '9911',
       '8721', '8600', '8540', '9882', '8636', '10267', '8661', '10189',
       '9885', '8535', '8633', '9783', '8456', '8634', '8696', '8686',
       '10233', '8564', '9875', '9865', '8690', '8558', '8305', '8371',
       '9790', '9804', '8543', '9869', '8530', '9888', '8603', '8722',
       '10269', '8315', '8302', '9906', '8524', '9810', '8295', '10205',
       '9857', '8551', '8529', '8178', '8226', '9864', '10281', '9904',
       '8533', '8177', '8479', '9788', '8388', '8398', '8191', '8602',
       '8658', '9905', '9976', '9867', '8394', '9868', '10167', '8537',
       '8165', '8358', None, '8483', '8350', '9880', '10278', '8152',
       '8560', '9858', '8581', '10172', '9850', '8406', '10003', '7878',
       '6269', '8370', '4087', '8639', '9853', '9941', '9873', '8

#### tidiness

In [299]:
foulcommit_events_df.event_api_id.duplicated().sum()

0

### assessing card events data

#### quality

In [300]:
card_events_df.columns

Index(['match_api_id', '_del', 'card_type', 'stats', 'elapsed',
       'event_incident_typefk', 'elapsed_plus', 'player1', 'type', 'subtype',
       'goal_type', 'team', 'comment', 'event_api_id'],
      dtype='object')

In [301]:
card_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55846 entries, 0 to 55845
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_api_id           55846 non-null  int64 
 1   _del                   311 non-null    object
 2   card_type              55270 non-null  object
 3   stats                  55846 non-null  object
 4   elapsed                55846 non-null  object
 5   event_incident_typefk  55846 non-null  object
 6   elapsed_plus           3200 non-null   object
 7   player1                55537 non-null  object
 8   type                   55846 non-null  object
 9   subtype                17165 non-null  object
 10  goal_type              1 non-null      object
 11  team                   55537 non-null  object
 12  comment                55846 non-null  object
 13  event_api_id           55846 non-null  object
dtypes: int64(1), object(13)
memory usage: 6.0+ MB


In [302]:
card_events_df.card_type.value_counts()

y     52177
r      1561
y2     1532
Name: card_type, dtype: int64

In [303]:
card_events_df.subtype.value_counts()

serious_fouls           6894
Unsportsmanlike Cond    3415
verbal_abuse            1796
emergency_brake         1208
shirt_pull               670
stall_time               585
pushing                  494
hands                    490
advantage                440
diving                   403
violence                 349
kicked_ball_away         247
Removing Shirt           174
Name: subtype, dtype: int64

In [304]:
card_events_df.goal_type.value_counts()

p    1
Name: goal_type, dtype: int64

In [305]:
card_events_df._del.value_counts()

1    311
Name: _del, dtype: int64

In [306]:
card_events_df.stats.value_counts()

ycards          52176
rcards           3093
                  574
stats,ycards        3
Name: stats, dtype: int64

In [307]:
card_events_df.type.value_counts()

card    55846
Name: type, dtype: int64

In [308]:
card_events_df.comment.value_counts(dropna=False)

y     52710
r      1587
y2     1549
Name: comment, dtype: int64

In [336]:
card_events_df.player1.sample(200).unique()

array(['164029', '95078', '42388', '30536', '303165', '27719', '26392',
       '32765', '23271', '145536', '112476', '45440', '72518', '40022',
       '33028', '46448', '24150', '25150', '26255', '37442', '27313',
       '27690', '41884', '24452', '24658', '150649', '33847', '49860',
       '339992', '37450', '35443', '30991', '37656', '425426', '38364',
       '256727', '38373', '114337', '24160', '213486', '49866', '40672',
       '111237', '213878', '280638', '39487', '2802', '562995', '96540',
       '26146', '304856', '15403', '42651', '42479', '161035', '35852',
       '49940', '564856', '41061', '19327', '26199', '59595', '17703',
       '26214', '210704', '31921', '35327', '201888', '37541', '30352',
       '103266', '191315', '39509', '31097', '25338', '39201', '46836',
       '26660', '37461', '101201', '21450', '212892', '72417', '161660',
       '39428', '40707', '33692', '75405', '74771', '32769', '40958',
       '30509', '38433', '27693', '396912', '196205', '166595', '25

#### tidiness

In [309]:
card_events_df.event_api_id.duplicated().sum()

0

### assessing cross events data

#### quality

In [310]:
cross_events_df.columns

Index(['match_api_id', '_del', 'event_api_id', 'goal_type', 'team', 'stats',
       'elapsed', 'spectators', 'event_incident_typefk', 'elapsed_plus',
       'player1', 'type', 'subtype'],
      dtype='object')

In [311]:
cross_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259225 entries, 0 to 259224
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   match_api_id           259225 non-null  int64 
 1   _del                   162 non-null     object
 2   event_api_id           259225 non-null  object
 3   goal_type              1 non-null       object
 4   team                   259060 non-null  object
 5   stats                  259225 non-null  object
 6   elapsed                259225 non-null  object
 7   spectators             1 non-null       object
 8   event_incident_typefk  259225 non-null  object
 9   elapsed_plus           14794 non-null   object
 10  player1                247361 non-null  object
 11  type                   259225 non-null  object
 12  subtype                259225 non-null  object
dtypes: int64(1), object(12)
memory usage: 25.7+ MB


In [312]:
cross_events_df.goal_type.value_counts(dropna=False)

None    259224
n            1
Name: goal_type, dtype: int64

In [313]:
cross_events_df.type.value_counts(dropna=False)

cross      212897
corner      39399
throwin      6929
Name: type, dtype: int64

In [314]:
cross_events_df.stats.value_counts(dropna=False)

crosses          212862
corners           39393
                   6906
stats,crosses        35
throwin              23
stats,corners         6
Name: stats, dtype: int64

In [315]:
cross_events_df.spectators.value_counts(dropna=False)

None     259224
27131         1
Name: spectators, dtype: int64

In [316]:
cross_events_df.subtype.value_counts(dropna=False)

cross    259225
Name: subtype, dtype: int64

In [327]:
cross_events_df.player1.sample(200).unique()

array(['41468', '200630', '102356', '191873', '18925', '24157', '150480',
       '294003', '33049', '36121', '183500', '23225', '30474', '173484',
       '24116', '23538', None, '144999', '46757', '297579', '75489',
       '30348', '36378', '32304', '27427', '188467', '282775', '198566',
       '128020', '40537', '38820', '24852', '213805', '24846', '39232',
       '293160', '15732', '426892', '38994', '26238', '575780', '427438',
       '41191', '24154', '210065', '38807', '16351', '96456', '467689',
       '40193', '33991', '41475', '131409', '38836', '409161', '51545',
       '26256', '37475', '33632', '37545', '39334', '38362', '30895',
       '72532', '46820', '244558', '52563', '96643', '46297', '24020',
       '40018', '30905', '23806', '26181', '181297', '31014', '30893',
       '24516', '24156', '45605', '466357', '52243', '207236', '214972',
       '37950', '30250', '24773', '467318', '34601', '8922', '49825',
       '30530', '36084', '26154', '43374', '39555', '128852', '235

#### tidiness

In [328]:
cross_events_df.event_api_id.duplicated().sum()

0