In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sqlite3
import datetime
from typing import Optional, List
from pprint import pprint

%matplotlib inline

## Gather Data

In [2]:
def get_dataframe_from_sql(query, db_path = 'football.sqlite'):
    """
    Returns a pandas dataframe containing the db data returned
    by the provided SQL query.
    """
    # establish a connection to the database
    conn = sqlite3.connect(db_path)

    # load the query results into a pandas dataframe
    df = pd.read_sql_query(query, conn)

    # close the connection to the database
    conn.close()

    # return the dataframe
    return df

In [3]:
get_dataframe_from_sql("SELECT name FROM sqlite_master WHERE type='table';")

Unnamed: 0,name
0,sqlite_sequence
1,Player_Attributes
2,Player
3,Match
4,League
5,Country
6,Team
7,Team_Attributes


In [4]:
get_dataframe_from_sql("PRAGMA table_info(Player_Attributes);").head(30)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,1
1,1,player_fifa_api_id,INTEGER,0,,0
2,2,player_api_id,INTEGER,0,,0
3,3,date,TEXT,0,,0
4,4,overall_rating,INTEGER,0,,0
5,5,potential,INTEGER,0,,0
6,6,preferred_foot,TEXT,0,,0
7,7,attacking_work_rate,TEXT,0,,0
8,8,defensive_work_rate,TEXT,0,,0
9,9,crossing,INTEGER,0,,0


#### get the available countries

In [212]:
countries_df = get_dataframe_from_sql("SELECT * FROM Country;")
countries_df

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy
5,13274,Netherlands
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain


#### get the top 5 leagues

In [215]:
top_5_league_country_names = np.array(['England', 'France', 'Germany', 'Italy', 'Spain'])
top_5_league_country_names

array(['England', 'France', 'Germany', 'Italy', 'Spain'], dtype='<U7')

In [216]:
countries_df[countries_df.name.isin(top_5_league_country_names)].index

Int64Index([1, 2, 3, 4, 9], dtype='int64')

In [218]:
leagues_df = get_dataframe_from_sql("SELECT * FROM League;")
leagues_df

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
5,13274,13274,Netherlands Eredivisie
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA


In [220]:
leagues_df = leagues_df[leagues_df.index.isin(countries_df[countries_df.name.isin(top_5_league_country_names)].index)]
leagues_df

Unnamed: 0,id,country_id,name
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A
9,21518,21518,Spain LIGA BBVA


#### filter the matches that are in each league

In [6]:
top_5_leagues_matches_df = get_dataframe_from_sql('SELECT * FROM Match').query('league_id in @leagues_df.id')
top_5_leagues_matches_df = top_5_leagues_matches_df.sort_values("date").reset_index(drop=True)
top_5_leagues_matches_df.sample(10)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
6685,3062,1729,1729,2011/2012,27,2012-03-03 00:00:00,1025637,8456,8559,2,...,21.0,1.13,10.0,23.0,1.14,7.0,17.0,1.14,7.0,23.0
12531,4274,1729,1729,2014/2015,33,2015-04-28 00:00:00,1724307,8667,8650,1,...,,4.1,3.4,2.05,,,,,,
12342,12726,10257,10257,2014/2015,29,2015-04-04 00:00:00,1786292,7943,8533,1,...,,2.25,3.1,3.9,,,,,,
40,4998,4769,4769,2008/2009,3,2008-08-23 00:00:00,483158,9873,8689,3,...,3.4,2.2,3.0,3.1,2.2,2.9,3.4,2.1,2.88,3.5
11827,7172,4769,4769,2014/2015,20,2015-01-10 00:00:00,1709890,8639,7819,1,...,,1.85,3.4,5.25,,,,,,
14052,10096,7809,7809,2015/2016,24,2016-03-02 00:00:00,2002301,10189,9790,3,...,,1.87,3.75,4.2,,,,,,
389,21880,21518,21518,2008/2009,8,2008-10-25 00:00:00,530215,8371,8603,0,...,3.4,2.25,3.2,3.1,2.3,3.2,3.0,2.25,3.2,3.0
5945,22681,21518,21518,2011/2012,11,2011-10-29 00:00:00,1051752,8560,8633,0,...,1.18,17.0,7.5,1.2,15.0,6.5,1.17,11.0,6.5,1.2
10072,6803,4769,4769,2013/2014,21,2014-01-19 00:00:00,1468401,9837,9748,0,...,2.25,3.3,3.2,2.4,,,,,,
10092,9423,7809,7809,2013/2014,18,2014-01-25 00:00:00,1479148,8721,9904,1,...,5.75,1.6,4.2,6.0,,,,,,


#### get the teams

In [7]:
teams_df = get_dataframe_from_sql("SELECT * FROM Team;")
teams_df.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [8]:
# Get the top 5 leagues' teams
# use the api id of both home and away teams in case some teams have missing home/away match info
top_5_leagues_team_ids = np.unique(np.concatenate((top_5_leagues_matches_df.home_team_api_id.unique(), top_5_leagues_matches_df.away_team_api_id.unique())))
teams_df = teams_df[teams_df.team_api_id.isin(top_5_leagues_team_ids)].reset_index(drop=True)
teams_df.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,3457,10260,11.0,Manchester United,MUN
1,3458,10261,13.0,Newcastle United,NEW
2,3459,9825,1.0,Arsenal,ARS
3,3460,8659,109.0,West Bromwich Albion,WBA
4,3461,8472,106.0,Sunderland,SUN


### get attributes of teams in the top 5 leagues

In [9]:
team_attributes_df = get_dataframe_from_sql("SELECT * FROM Team_Attributes;")
team_attributes_df.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [10]:
# Get the attributes of the top 5 leagues' teams
team_attributes_df = team_attributes_df[team_attributes_df.team_api_id.isin(teams_df.team_api_id.unique())]
team_attributes_df.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
9,10,614,8576,2010-02-22 00:00:00,60,Balanced,,Little,40,Mixed,...,55,Normal,Organised,30,Deep,70,Double,30,Narrow,Offside Trap
10,11,614,8576,2011-02-22 00:00:00,65,Balanced,,Little,45,Mixed,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover
11,12,614,8576,2012-02-22 00:00:00,59,Balanced,,Little,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover
12,13,614,8576,2013-09-20 00:00:00,59,Balanced,,Little,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover
13,14,614,8576,2014-09-19 00:00:00,59,Balanced,57.0,Normal,52,Mixed,...,52,Normal,Organised,38,Medium,47,Press,53,Normal,Cover


### get the players

In [11]:
players_df = get_dataframe_from_sql('SELECT * FROM Player;')
players_df.head()

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


### get the player attributes

In [12]:
player_attributes_df = get_dataframe_from_sql('SELECT * FROM Player_Attributes;')
player_attributes_df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


### get the champions league history data

In [13]:
champs_league_hist_df = pd.read_csv("./champs_league_hist.csv", header=None, names=["year", "team_name", "progress", "team_country"])
champs_league_hist_df.head()

Unnamed: 0,year,team_name,progress,team_country
0,1994,A.C. Milan,1. Winner,Italy
1,1994,Barcelona,2. Runner Up,Spain
2,1994,A.C. Milan,3. Semifinalist,Italy
3,1994,Barcelona,3. Semifinalist,Spain
4,1994,Monaco,3. Semifinalist,France


## Assess Data - Quality

### Assessing match data

In [14]:
predictions_start_idx = -30
match_events_start_idx = -38
away_players_start_idx = -49
home_players_start_idx = -60
away_players_y_pos_start_idx = -71
home_players_y_pos_start_idx = -82
away_players_x_pos_start_idx = -93
home_players_x_pos_start_idx = -104

In [15]:
top_5_leagues_matches_df.iloc[:, :home_players_x_pos_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                14585 non-null  int64 
 1   country_id        14585 non-null  int64 
 2   league_id         14585 non-null  int64 
 3   season            14585 non-null  object
 4   stage             14585 non-null  int64 
 5   date              14585 non-null  object
 6   match_api_id      14585 non-null  int64 
 7   home_team_api_id  14585 non-null  int64 
 8   away_team_api_id  14585 non-null  int64 
 9   home_team_goal    14585 non-null  int64 
 10  away_team_goal    14585 non-null  int64 
dtypes: int64(9), object(2)
memory usage: 1.2+ MB


In [16]:
top_5_leagues_matches_df.iloc[:, :home_players_x_pos_start_idx].describe()

Unnamed: 0,id,country_id,league_id,stage,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal
count,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0,14585.0
mean,10739.324306,9271.884882,9271.884882,19.152143,1198544.0,9513.471718,9513.674734,1.539184,1.140075
std,6984.373837,6921.447469,6921.447469,10.810203,494169.8,8097.777199,8097.767842,1.289485,1.129409
min,1729.0,1729.0,1729.0,1.0,483129.0,4087.0,4087.0,0.0,0.0
25%,5375.0,4769.0,4769.0,10.0,705602.0,8535.0,8535.0,1.0,0.0
50%,9021.0,7809.0,7809.0,19.0,1216821.0,8686.0,8686.0,1.0,1.0
75%,12667.0,10257.0,10257.0,28.0,1709701.0,9869.0,9869.0,2.0,2.0
max,24557.0,21518.0,21518.0,38.0,2118418.0,208931.0,208931.0,10.0,9.0


In [17]:
# investigate the results with max home and away goals to confirm that they're not shit
def get_max_home_goals_league_match_details():
    league_name = leagues_df[leagues_df.id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].league_id.values[0]].name
    home_team_name = teams_df[teams_df.team_api_id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].home_team_api_id.values[0]].team_long_name.values[0]
    away_team_name = teams_df[teams_df.team_api_id == top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10].away_team_api_id.values[0]].team_long_name.values[0]
    match_info = top_5_leagues_matches_df[top_5_leagues_matches_df.home_team_goal == 10][["home_team_goal", "away_team_goal", "season", "date"]].squeeze()
    home_team_goal = match_info.home_team_goal
    away_team_goal = match_info.away_team_goal
    season = match_info.season
    date = match_info.date

    return f"{home_team_name} {home_team_goal} - {away_team_goal} {away_team_name} played on {date} in season {season}"

get_max_home_goals_league_match_details()

'Real Madrid CF 10 - 2 Rayo Vallecano played on 2015-12-20 00:00:00 in season 2015/2016'

In [18]:
def get_max_away_goals_league_match_details():
    mask = top_5_leagues_matches_df.away_team_goal == top_5_leagues_matches_df.away_team_goal.max()
    max_away_goals_match = top_5_leagues_matches_df[mask]
    league_name = leagues_df[leagues_df.id == max_away_goals_match.league_id.values[0]].name
    home_team_name = teams_df[teams_df.team_api_id == max_away_goals_match.home_team_api_id.values[0]].team_long_name.values[0]
    away_team_name = teams_df[teams_df.team_api_id == max_away_goals_match.away_team_api_id.values[0]].team_long_name.values[0]
    match_info = max_away_goals_match[["home_team_goal", "away_team_goal", "season", "date"]].squeeze()
    home_team_goal = match_info.home_team_goal
    away_team_goal = match_info.away_team_goal
    season = match_info.season
    date = match_info.date

    return f"{home_team_name} {home_team_goal} - {away_team_goal} {away_team_name} played on {date} in season {season}"

get_max_away_goals_league_match_details()

'ES Troyes AC 0 - 9 Paris Saint-Germain played on 2016-03-13 00:00:00 in season 2015/2016'

In [19]:
# check the max and min dates of the matches
top_5_leagues_matches_df.date.min(), top_5_leagues_matches_df.date.max()

('2008-08-09 00:00:00', '2016-05-17 00:00:00')

In [20]:
# check the range of the seasons
top_5_leagues_matches_df.season.unique()

array(['2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013',
       '2013/2014', '2014/2015', '2015/2016'], dtype=object)

In [21]:
# check the range of values for stage
top_5_leagues_matches_df.stage.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38])

### assessing match players

In [22]:
# home players
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   home_player_1   14547 non-null  float64
 1   home_player_2   14538 non-null  float64
 2   home_player_3   14545 non-null  float64
 3   home_player_4   14541 non-null  float64
 4   home_player_5   14547 non-null  float64
 5   home_player_6   14551 non-null  float64
 6   home_player_7   14533 non-null  float64
 7   home_player_8   14541 non-null  float64
 8   home_player_9   14551 non-null  float64
 9   home_player_10  14512 non-null  float64
 10  home_player_11  14523 non-null  float64
dtypes: float64(11)
memory usage: 1.2 MB


In [23]:
# home players
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe()

Unnamed: 0,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11
count,14547.0,14538.0,14545.0,14541.0,14547.0,14551.0,14533.0,14541.0,14551.0,14512.0,14523.0
mean,67540.999244,90120.123401,75328.502991,78945.332233,94028.161477,86215.306989,82200.12076,92028.973798,94381.943509,88395.371348,82525.980307
std,79081.728488,97652.409271,88316.786481,86523.706029,103101.134223,97119.99795,94373.382459,102354.036812,101888.416488,96795.703335,92267.511513
min,2984.0,2802.0,2752.0,2752.0,2752.0,2802.0,2802.0,2802.0,2770.0,2802.0,2802.0
25%,30380.0,30861.0,27492.0,27684.0,30983.0,30721.0,30530.0,30930.0,31235.0,30881.0,30853.0
50%,36479.0,39731.5,37482.0,38432.0,40985.0,39376.0,39198.0,40731.0,40601.0,39638.0,38848.0
75%,56829.0,141113.0,93457.0,101070.0,130155.0,112035.0,107930.0,121044.0,144993.0,121633.0,104045.0
max,698273.0,748432.0,696443.0,696443.0,720738.0,722766.0,692984.0,693171.0,722766.0,742405.0,696365.0


In [24]:
# confirm that the max number actually exists in the players df
top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe().loc["max", :].unique().size == players_df[players_df.player_api_id.isin(top_5_leagues_matches_df.iloc[:, home_players_start_idx:away_players_start_idx].describe().loc["max", :].unique())].shape[0]

True

In [25]:
# away players
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   away_player_1   14556 non-null  float64
 1   away_player_2   14536 non-null  float64
 2   away_player_3   14539 non-null  float64
 3   away_player_4   14539 non-null  float64
 4   away_player_5   14541 non-null  float64
 5   away_player_6   14540 non-null  float64
 6   away_player_7   14542 non-null  float64
 7   away_player_8   14530 non-null  float64
 8   away_player_9   14538 non-null  float64
 9   away_player_10  14523 non-null  float64
 10  away_player_11  14513 non-null  float64
dtypes: float64(11)
memory usage: 1.2 MB


In [26]:
# away players
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe()

Unnamed: 0,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
count,14556.0,14536.0,14539.0,14539.0,14541.0,14540.0,14542.0,14530.0,14538.0,14523.0,14513.0
mean,67814.32928,91015.047193,74546.03329,80039.389917,93992.660684,86831.252063,83094.74192,94315.749966,95323.630898,90212.77484,84306.015641
std,79174.061489,98841.807943,86654.868263,89174.013853,102718.305201,97069.715662,95644.730922,105128.693217,103836.645073,99666.106104,93356.739627
min,2796.0,2790.0,2752.0,2752.0,2790.0,2802.0,2802.0,2802.0,2802.0,2770.0,2802.0
25%,30380.0,30894.0,27476.0,27679.0,30977.0,30731.0,30598.0,30920.0,31304.0,30893.0,30853.0
50%,36479.0,39841.0,37451.0,38432.0,40985.0,39487.0,39267.0,41098.0,40636.0,39793.0,39225.0
75%,56829.0,144999.0,89475.0,103089.0,130155.0,113465.0,109330.0,128827.0,144993.0,127982.5,108809.0
max,698273.0,748432.0,696443.0,696443.0,720738.0,722766.0,750435.0,710807.0,722766.0,722766.0,717270.0


In [27]:
# confirm that the max number actually exists in the players df
top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe().loc["max", :].unique().size == players_df[players_df.player_api_id.isin(top_5_leagues_matches_df.iloc[:, away_players_start_idx:match_events_start_idx].describe().loc["max", :].unique())].shape[0]

True

### assessing match events

In [28]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   goal        13567 non-null  object
 1   shoton      13567 non-null  object
 2   shotoff     13567 non-null  object
 3   foulcommit  13567 non-null  object
 4   card        13567 non-null  object
 5   cross       13567 non-null  object
 6   corner      13567 non-null  object
 7   possession  13567 non-null  object
dtypes: object(8)
memory usage: 911.7+ KB


In [29]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].describe()

Unnamed: 0,goal,shoton,shotoff,foulcommit,card,cross,corner,possession
count,13567,13567,13567,13567,13567,13567,13567,13567
unique,12606,8121,8121,8123,13165,8123,8122,8081
top,<goal />,<shoton />,<shotoff />,<foulcommit />,<card />,<cross />,<corner />,<possession />
freq,962,5447,5447,5445,403,5445,5446,5487


In [30]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].describe().loc["top", "goal"]

'<goal />'

In [31]:
top_5_leagues_matches_df.iloc[:, match_events_start_idx:predictions_start_idx].iloc[-1, 0]

'<goal><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>22</value><value>6</value></coordinates><elapsed>43</elapsed><player2>413557</player2><subtype>shot</subtype><player1>30829</player1><sortorder>2</sortorder><team>10260</team><id>5623457</id><n>169</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>28</value><value>8</value></coordinates><elapsed>75</elapsed><player2>35327</player2><subtype>shot</subtype><player1>696365</player1><sortorder>0</sortorder><team>10260</team><id>5623542</id><n>329</n><type>goal</type><goal_type>n</goal_type></value><value><comment>n</comment><stats><goals>1</goals><shoton>1</shoton></stats><event_incident_typefk>393</event_incident_typefk><coordinates><value>25</value><value>6</value></coordinates><elapsed>87</el

Seems those numbers are not really reflective of the full data in the columns, so will not pay attention to them for now.

### Assessing match predictions

In [32]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-30:-15])].describe()

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA
count,14573.0,14573.0,14573.0,14558.0,14558.0,14558.0,14540.0,14540.0,14540.0,14568.0,14568.0,14568.0,7292.0,7292.0,7292.0
mean,2.595606,3.809271,4.70451,2.541709,3.738281,4.455709,2.454475,3.625189,4.193165,2.512505,3.706167,4.429821,2.787401,4.140014,5.128762
std,1.756999,1.171708,3.794316,1.625401,1.090557,3.359353,1.434216,0.838685,2.881999,1.606546,1.069737,3.44706,2.191845,1.68015,4.701423
min,1.04,1.4,1.08,1.03,1.65,1.1,1.05,1.5,1.1,1.04,1.4,1.1,1.04,2.2,1.09
25%,1.67,3.25,2.6,1.7,3.2,2.6,1.7,3.2,2.55,1.67,3.2,2.5,1.71,3.38,2.6175
50%,2.1,3.4,3.6,2.1,3.4,3.45,2.1,3.3,3.3,2.1,3.4,3.4,2.18,3.6,3.73
75%,2.75,3.8,5.25,2.7,3.8,5.0,2.6,3.7,4.65,2.7,3.75,5.0,2.94,4.18,5.55
max,26.0,17.0,51.0,34.0,19.5,51.0,20.0,11.0,25.0,26.0,19.0,51.0,36.0,29.0,47.5


In [33]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-15:])].describe()

Unnamed: 0,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
count,14568.0,14568.0,14568.0,11074.0,11074.0,11074.0,14555.0,14555.0,14555.0,9081.0,9081.0,9081.0,9085.0,9085.0,9085.0
mean,2.569285,3.62959,4.570721,2.533537,3.736561,4.680812,2.64294,3.875681,4.909706,2.472426,3.622296,4.386528,2.465726,3.62515,4.418149
std,1.688519,0.987731,3.745467,1.627928,1.036067,3.716216,1.916398,1.325661,4.465039,1.454708,0.883999,2.998704,1.460544,0.871147,3.158631
min,1.02,1.02,1.08,1.04,1.4,1.1,1.03,1.62,1.08,1.05,1.45,1.12,1.04,1.33,1.12
25%,1.7,3.2,2.62,1.67,3.25,2.63,1.7,3.25,2.62,1.7,3.2,2.6,1.67,3.2,2.62
50%,2.15,3.3,3.4,2.1,3.4,3.6,2.15,3.5,3.6,2.1,3.3,3.5,2.1,3.3,3.4
75%,2.7,3.75,5.0,2.7,3.75,5.25,2.8,4.0,5.4,2.63,3.7,5.0,2.6,3.75,5.0
max,26.0,17.0,51.0,23.0,15.0,41.0,36.0,26.0,67.0,21.0,11.0,34.0,17.0,13.0,34.0


In [34]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-30:-15])].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   B365H   14573 non-null  float64
 1   B365D   14573 non-null  float64
 2   B365A   14573 non-null  float64
 3   BWH     14558 non-null  float64
 4   BWD     14558 non-null  float64
 5   BWA     14558 non-null  float64
 6   IWH     14540 non-null  float64
 7   IWD     14540 non-null  float64
 8   IWA     14540 non-null  float64
 9   LBH     14568 non-null  float64
 10  LBD     14568 non-null  float64
 11  LBA     14568 non-null  float64
 12  PSH     7292 non-null   float64
 13  PSD     7292 non-null   float64
 14  PSA     7292 non-null   float64
dtypes: float64(15)
memory usage: 1.7 MB


In [35]:
top_5_leagues_matches_df.loc[:, list(top_5_leagues_matches_df.columns[-15:])].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14585 entries, 0 to 14584
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   WHH     14568 non-null  float64
 1   WHD     14568 non-null  float64
 2   WHA     14568 non-null  float64
 3   SJH     11074 non-null  float64
 4   SJD     11074 non-null  float64
 5   SJA     11074 non-null  float64
 6   VCH     14555 non-null  float64
 7   VCD     14555 non-null  float64
 8   VCA     14555 non-null  float64
 9   GBH     9081 non-null   float64
 10  GBD     9081 non-null   float64
 11  GBA     9081 non-null   float64
 12  BSH     9085 non-null   float64
 13  BSD     9085 non-null   float64
 14  BSA     9085 non-null   float64
dtypes: float64(15)
memory usage: 1.7 MB


### assessing player attributes

In [36]:
player_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  int64  
 1   player_fifa_api_id   183978 non-null  int64  
 2   player_api_id        183978 non-null  int64  
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

In [37]:
player_attributes_df.isna().sum().unique()

array([   0,  836, 3230, 2713])

In [38]:
attributes_start_idx = 4

In [39]:
player_attributes_df.iloc[:, :attributes_start_idx].describe()

Unnamed: 0,id,player_fifa_api_id,player_api_id
count,183978.0,183978.0,183978.0
mean,91989.5,165671.524291,135900.617324
std,53110.01825,53851.094769,136927.84051
min,1.0,2.0,2625.0
25%,45995.25,155798.0,34763.0
50%,91989.5,183488.0,77741.0
75%,137983.75,199848.0,191080.0
max,183978.0,234141.0,750584.0


In [40]:
# confirm that the max value make sense
players_df[players_df.player_api_id == player_attributes_df.iloc[:, :attributes_start_idx].describe().loc["max", :].player_api_id]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
470,473,750584,Alexandre Azevedo,234141,1997-01-28 00:00:00,175.26,150


In [41]:
player_attributes_df.iloc[:, attributes_start_idx:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 38 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   overall_rating       183142 non-null  float64
 1   potential            183142 non-null  float64
 2   preferred_foot       183142 non-null  object 
 3   attacking_work_rate  180748 non-null  object 
 4   defensive_work_rate  183142 non-null  object 
 5   crossing             183142 non-null  float64
 6   finishing            183142 non-null  float64
 7   heading_accuracy     183142 non-null  float64
 8   short_passing        183142 non-null  float64
 9   volleys              181265 non-null  float64
 10  dribbling            183142 non-null  float64
 11  curve                181265 non-null  float64
 12  free_kick_accuracy   183142 non-null  float64
 13  long_passing         183142 non-null  float64
 14  ball_control         183142 non-null  float64
 15  acceleration     

In [42]:
player_attributes_df.preferred_foot.unique()

array(['right', 'left', None], dtype=object)

In [43]:
player_attributes_df.attacking_work_rate.unique()

array(['medium', 'high', None, 'low', 'None', 'le', 'norm', 'stoc', 'y'],
      dtype=object)

In [44]:
player_attributes_df.defensive_work_rate.unique()

array(['medium', 'high', 'low', '_0', None, '5', 'ean', 'o', '1', 'ormal',
       '7', '2', '8', '4', 'tocky', '0', '3', '6', '9', 'es'],
      dtype=object)

In [45]:
player_attributes_df.iloc[:, attributes_start_idx:attributes_start_idx+15].describe()

Unnamed: 0,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control
count,183142.0,183142.0,183142.0,183142.0,183142.0,183142.0,181265.0,183142.0,181265.0,183142.0,183142.0,183142.0
mean,68.600015,73.460353,55.086883,49.921078,57.266023,62.429672,49.468436,59.175154,52.965675,49.38095,57.06988,63.388879
std,7.041139,6.592271,17.242135,19.038705,16.488905,14.194068,18.256618,17.744688,18.255788,17.831746,14.394464,15.196671
min,33.0,39.0,1.0,1.0,1.0,3.0,1.0,1.0,2.0,1.0,3.0,5.0
25%,64.0,69.0,45.0,34.0,49.0,57.0,35.0,52.0,41.0,36.0,49.0,58.0
50%,69.0,74.0,59.0,53.0,60.0,65.0,52.0,64.0,56.0,50.0,59.0,67.0
75%,73.0,78.0,68.0,65.0,68.0,72.0,64.0,72.0,67.0,63.0,67.0,73.0
max,94.0,97.0,95.0,97.0,98.0,97.0,93.0,97.0,94.0,97.0,97.0,97.0


In [46]:
player_attributes_df.iloc[:, attributes_start_idx+15:attributes_start_idx+30].describe()

Unnamed: 0,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties
count,183142.0,183142.0,181265.0,183142.0,181265.0,183142.0,181265.0,183142.0,183142.0,183142.0,183142.0,183142.0,183142.0,181265.0,183142.0
mean,67.659357,68.051244,65.97091,66.103706,65.189496,61.808427,66.969045,67.038544,67.424529,53.339431,60.948046,52.009271,55.786504,57.87355,55.003986
std,12.983326,12.569721,12.954585,9.155408,13.063188,16.135143,11.006734,13.165262,12.07228,18.367025,16.089521,19.450133,18.448292,15.144086,15.546519
min,10.0,12.0,11.0,17.0,12.0,2.0,14.0,10.0,10.0,1.0,6.0,1.0,2.0,1.0,2.0
25%,61.0,62.0,58.0,61.0,58.0,54.0,60.0,61.0,60.0,41.0,51.0,34.0,45.0,49.0,45.0
50%,69.0,69.0,68.0,67.0,67.0,65.0,68.0,69.0,69.0,58.0,64.0,57.0,60.0,60.0,57.0
75%,77.0,77.0,75.0,72.0,74.0,73.0,74.0,76.0,76.0,67.0,73.0,68.0,69.0,69.0,67.0
max,97.0,97.0,96.0,96.0,96.0,97.0,96.0,96.0,96.0,96.0,97.0,96.0,96.0,97.0,96.0


In [47]:
player_attributes_df.iloc[:, attributes_start_idx+30:].describe()

Unnamed: 0,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
count,183142.0,183142.0,181265.0,183142.0,183142.0,183142.0,183142.0,183142.0
mean,46.772242,50.351257,48.001462,14.704393,16.063612,20.998362,16.132154,16.441439
std,21.227667,21.483706,21.598778,16.865467,15.867382,21.45298,16.099175,17.198155
min,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
25%,25.0,29.0,25.0,7.0,8.0,8.0,8.0,8.0
50%,50.0,56.0,53.0,10.0,11.0,12.0,11.0,11.0
75%,66.0,69.0,67.0,13.0,15.0,15.0,15.0,15.0
max,96.0,95.0,95.0,94.0,93.0,97.0,96.0,96.0


### Assessing teams

In [48]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                164 non-null    int64  
 1   team_api_id       164 non-null    int64  
 2   team_fifa_api_id  164 non-null    float64
 3   team_long_name    164 non-null    object 
 4   team_short_name   164 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 6.5+ KB


In [49]:
teams_df.describe()

Unnamed: 0,id,team_api_id,team_fifa_api_id
count,164.0,164.0,164.0
mean,19329.432927,10836.079268,15386.189024
std,13772.187036,17435.893334,37424.238876
min,3457.0,4087.0,1.0
25%,9543.75,8470.5,64.75
50%,15629.5,8664.0,362.5
75%,22042.5,9865.5,1826.0
max,48358.0,208931.0,112409.0


In [50]:
team_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924 entries, 9 to 1449
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              924 non-null    int64  
 1   team_fifa_api_id                924 non-null    int64  
 2   team_api_id                     924 non-null    int64  
 3   date                            924 non-null    object 
 4   buildUpPlaySpeed                924 non-null    int64  
 5   buildUpPlaySpeedClass           924 non-null    object 
 6   buildUpPlayDribbling            308 non-null    float64
 7   buildUpPlayDribblingClass       924 non-null    object 
 8   buildUpPlayPassing              924 non-null    int64  
 9   buildUpPlayPassingClass         924 non-null    object 
 10  buildUpPlayPositioningClass     924 non-null    object 
 11  chanceCreationPassing           924 non-null    int64  
 12  chanceCreationPassingClass      924

In [51]:
for attr_name in [col for col in team_attributes_df.columns if col.endswith("Class")]:
    print(f"{attr_name} unique values: {team_attributes_df[attr_name].unique()}")

buildUpPlaySpeedClass unique values: ['Balanced' 'Slow' 'Fast']
buildUpPlayDribblingClass unique values: ['Little' 'Normal' 'Lots']
buildUpPlayPassingClass unique values: ['Mixed' 'Short' 'Long']
buildUpPlayPositioningClass unique values: ['Organised' 'Free Form']
chanceCreationPassingClass unique values: ['Normal' 'Safe' 'Risky']
chanceCreationCrossingClass unique values: ['Normal' 'Little' 'Lots']
chanceCreationShootingClass unique values: ['Normal' 'Lots' 'Little']
chanceCreationPositioningClass unique values: ['Organised' 'Free Form']
defencePressureClass unique values: ['Deep' 'Medium' 'High']
defenceAggressionClass unique values: ['Double' 'Press' 'Contain']
defenceTeamWidthClass unique values: ['Narrow' 'Normal' 'Wide']
defenceDefenderLineClass unique values: ['Offside Trap' 'Cover']


In [52]:
for attr_name in [col for col in team_attributes_df.iloc[:, attributes_start_idx:].columns if not col.endswith("Class")]:
    print(f"number of unique values for {attr_name}: {team_attributes_df[attr_name].unique().size}")

number of unique values for buildUpPlaySpeed: 56
number of unique values for buildUpPlayDribbling: 48
number of unique values for buildUpPlayPassing: 58
number of unique values for chanceCreationPassing: 49
number of unique values for chanceCreationCrossing: 55
number of unique values for chanceCreationShooting: 54
number of unique values for defencePressure: 48
number of unique values for defenceAggression: 46
number of unique values for defenceTeamWidth: 42


In [53]:
for col_name in ["buildUpPlayDribbling", "buildUpPlaySpeed"]:
    print(team_attributes_df[col_name].unique())

[nan 57. 70. 41. 61. 48. 46. 49. 51. 32. 37. 45. 52. 50. 38. 55. 35. 40.
 30. 29. 34. 24. 39. 31. 60. 44. 36. 54. 53. 33. 56. 59. 43. 47. 69. 62.
 58. 42. 65. 77. 28. 68. 66. 71. 26. 27. 74. 67.]
[60 65 59 45 48 55 42 46 50 23 41 39 56 40 62 66 75 25 30 70 67 63 35 64
 57 47 68 43 24 36 52 58 69 73 37 51 44 38 49 71 74 76 31 54 32 80 53 61
 34 72 29 78 26 28 20 77]


In [54]:
team_attributes_df.iloc[:, attributes_start_idx:].describe()

Unnamed: 0,buildUpPlaySpeed,buildUpPlayDribbling,buildUpPlayPassing,chanceCreationPassing,chanceCreationCrossing,chanceCreationShooting,defencePressure,defenceAggression,defenceTeamWidth
count,924.0,308.0,924.0,924.0,924.0,924.0,924.0,924.0,924.0
mean,53.541126,48.373377,48.666667,52.498918,53.971861,54.248918,45.968615,49.742424,52.270563
std,11.318181,10.66335,11.285103,10.859738,11.388694,10.548179,10.294372,9.438097,9.345908
min,20.0,24.0,20.0,21.0,20.0,22.0,23.0,27.0,29.0
25%,46.0,41.0,40.0,46.0,47.0,49.0,38.0,44.0,48.0
50%,54.0,49.0,50.0,52.0,54.0,54.0,45.0,49.0,52.0
75%,63.0,55.0,55.0,60.0,63.0,63.0,51.0,55.0,58.0
max,80.0,77.0,80.0,80.0,80.0,80.0,72.0,72.0,73.0


### Assessing champions league data

In [55]:
champs_league_hist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          1666 non-null   int64 
 1   team_name     1666 non-null   object
 2   progress      1666 non-null   object
 3   team_country  1666 non-null   object
dtypes: int64(1), object(3)
memory usage: 52.2+ KB


In [56]:
champs_league_hist_df.describe()

Unnamed: 0,year
count,1666.0
mean,2007.840336
std,8.001033
min,1994.0
25%,2001.0
50%,2008.0
75%,2015.0
max,2021.0


In [57]:
for col_name in champs_league_hist_df.columns:
    pprint(f"{col_name} unique values: {champs_league_hist_df[col_name].unique()}")

('year unique values: [1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 '
 '2005 2006 2007\n'
 ' 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]')
("team_name unique values: ['A.C. Milan' 'Barcelona' 'Monaco' 'Porto' 'Spartak "
 "Moscow' 'Galatasaray'\n"
 " 'Werder Bremen' 'Anderlecht' 'Feyenoord' 'Steaua Bucuresti'\n"
 " 'Levski Sofia' 'Copenhagen' 'Sparta Prague' 'Manchester United'\n"
 " 'Lech Poznan' 'Austria Wien' 'Floriana' 'Akranes S.C.' 'AEK Athens'\n"
 " 'Croatia Zagreb' 'Rangers (IL)' 'Dinamo Minsk' 'Linfield' 'FC Aarau'\n"
 " 'AIK' 'HJK Helsinki' 'Kispest Honved' 'Cork City' 'Beitar Jerusalem'\n"
 " 'Skonto' 'Dynamo Kyiv' 'Rosenborg BK' 'Ajax' 'Bayern Munich'\n"
 " 'Paris Saint-Germain' 'IFK Goteborg' 'Hajduk Split' 'Benfica'\n"
 " 'FC Red Bull Salzburg' 'Avenir Beggen' 'Silkeborg' 'Vac FC-Samsung'\n"
 " 'Legia Warszawa' 'Servette' 'Maccabi Haifa' 'Juventus' 'Panathinaikos'\n"
 " 'Nantes' 'Borussia Dortmund' 'Real Madrid' 'Aalborg BK'\n"
 " 'Blac

In [58]:
champs_league_hist_df[champs_league_hist_df.progress == "1. Winner"]

Unnamed: 0,year,team_name,progress,team_country
0,1994,A.C. Milan,1. Winner,Italy
62,1995,Ajax,1. Winner,Netherlands
124,1996,Juventus,1. Winner,Italy
178,1997,Borussia Dortmund,1. Winner,Germany
231,1998,Real Madrid,1. Winner,Spain
269,1999,Manchester United,1. Winner,England
307,2000,Real Madrid,1. Winner,Spain
369,2001,Bayern Munich,1. Winner,Germany
431,2002,Real Madrid,1. Winner,Spain
493,2003,A.C. Milan,1. Winner,Italy


Cross referencing the winners with Wikipedia shows that they're consistent, so I'll consider the data to be valid and accurate.

## Assess data - Tidiness

### Assessing league data

In [59]:
leagues_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5 non-null      int64 
 1   country_id  5 non-null      int64 
 2   name        5 non-null      object
dtypes: int64(2), object(1)
memory usage: 248.0+ bytes


In [60]:
leagues_df

Unnamed: 0,id,country_id,name
0,1729,1729,England Premier League
1,4769,4769,France Ligue 1
2,7809,7809,Germany 1. Bundesliga
3,10257,10257,Italy Serie A
4,21518,21518,Spain LIGA BBVA


### assessing team data

In [61]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164 entries, 0 to 163
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                164 non-null    int64  
 1   team_api_id       164 non-null    int64  
 2   team_fifa_api_id  164 non-null    float64
 3   team_long_name    164 non-null    object 
 4   team_short_name   164 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 6.5+ KB


In [62]:
teams_df.describe()

Unnamed: 0,id,team_api_id,team_fifa_api_id
count,164.0,164.0,164.0
mean,19329.432927,10836.079268,15386.189024
std,13772.187036,17435.893334,37424.238876
min,3457.0,4087.0,1.0
25%,9543.75,8470.5,64.75
50%,15629.5,8664.0,362.5
75%,22042.5,9865.5,1826.0
max,48358.0,208931.0,112409.0


In [63]:
teams_df.team_api_id.unique().size == teams_df.shape[0]

True

In [64]:
team_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924 entries, 9 to 1449
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              924 non-null    int64  
 1   team_fifa_api_id                924 non-null    int64  
 2   team_api_id                     924 non-null    int64  
 3   date                            924 non-null    object 
 4   buildUpPlaySpeed                924 non-null    int64  
 5   buildUpPlaySpeedClass           924 non-null    object 
 6   buildUpPlayDribbling            308 non-null    float64
 7   buildUpPlayDribblingClass       924 non-null    object 
 8   buildUpPlayPassing              924 non-null    int64  
 9   buildUpPlayPassingClass         924 non-null    object 
 10  buildUpPlayPositioningClass     924 non-null    object 
 11  chanceCreationPassing           924 non-null    int64  
 12  chanceCreationPassingClass      924

In [65]:
for col_name in ["team_api_id", "team_fifa_api_id", "date"]:
    pprint(team_attributes_df[col_name].unique().size == team_attributes_df.shape[0])

False
False
False


That means team api ids are not unique rows and neither is date. Perhaps combining the two will give a unique row.

In [66]:
team_attributes_df.apply(lambda row: row.date + "_" + str(row.team_api_id), axis=1).unique().size == team_attributes_df.shape[0]

True

Indeed it does! So that means each row represents data collected for a unique team on a unique date for that team.

### assessing match data

In [67]:
pprint([*top_5_leagues_matches_df.columns])

['id',
 'country_id',
 'league_id',
 'season',
 'stage',
 'date',
 'match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal',
 'home_player_X1',
 'home_player_X2',
 'home_player_X3',
 'home_player_X4',
 'home_player_X5',
 'home_player_X6',
 'home_player_X7',
 'home_player_X8',
 'home_player_X9',
 'home_player_X10',
 'home_player_X11',
 'away_player_X1',
 'away_player_X2',
 'away_player_X3',
 'away_player_X4',
 'away_player_X5',
 'away_player_X6',
 'away_player_X7',
 'away_player_X8',
 'away_player_X9',
 'away_player_X10',
 'away_player_X11',
 'home_player_Y1',
 'home_player_Y2',
 'home_player_Y3',
 'home_player_Y4',
 'home_player_Y5',
 'home_player_Y6',
 'home_player_Y7',
 'home_player_Y8',
 'home_player_Y9',
 'home_player_Y10',
 'home_player_Y11',
 'away_player_Y1',
 'away_player_Y2',
 'away_player_Y3',
 'away_player_Y4',
 'away_player_Y5',
 'away_player_Y6',
 'away_player_Y7',
 'away_player_Y8',
 'away_player_Y9',
 'away_player_Y10',
 'away_player

In [68]:
for col_name in ["match_api_id"]:
    pprint(top_5_leagues_matches_df[col_name].unique().size == top_5_leagues_matches_df.shape[0])

True


In [69]:
top_5_leagues_matches_df.match_api_id.duplicated().sum()

0

### assessing player data

In [70]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11060 entries, 0 to 11059
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  11060 non-null  int64  
 1   player_api_id       11060 non-null  int64  
 2   player_name         11060 non-null  object 
 3   player_fifa_api_id  11060 non-null  int64  
 4   birthday            11060 non-null  object 
 5   height              11060 non-null  float64
 6   weight              11060 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 605.0+ KB


In [71]:
players_df.player_api_id.unique().size == players_df.shape[0]

True

In [72]:
players_df.player_api_id.duplicated().sum() == 0

True

In [73]:
player_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   183978 non-null  int64  
 1   player_fifa_api_id   183978 non-null  int64  
 2   player_api_id        183978 non-null  int64  
 3   date                 183978 non-null  object 
 4   overall_rating       183142 non-null  float64
 5   potential            183142 non-null  float64
 6   preferred_foot       183142 non-null  object 
 7   attacking_work_rate  180748 non-null  object 
 8   defensive_work_rate  183142 non-null  object 
 9   crossing             183142 non-null  float64
 10  finishing            183142 non-null  float64
 11  heading_accuracy     183142 non-null  float64
 12  short_passing        183142 non-null  float64
 13  volleys              181265 non-null  float64
 14  dribbling            183142 non-null  float64
 15  curve            

In [74]:
for col_name in ["player_api_id", "player_fifa_api_id", "date"]:
    pprint(player_attributes_df[col_name].unique().size == player_attributes_df.shape[0])
    pprint(player_attributes_df[col_name].duplicated().sum() == 0)

False
False
False
False
False
False


In [75]:
# try using a combination of api id and date
pprint(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1).unique().size == player_attributes_df.shape[0])
pprint(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1).unique().size)
pprint(player_attributes_df.shape[0])

False
183142
183978


does one of those contain null values?

In [76]:
pprint(player_attributes_df.date.isna().sum())
pprint(player_attributes_df.player_api_id.isna().sum())

0
0


does the combination yield duplicated values?

In [77]:
pd.Series(player_attributes_df.apply(lambda row: row.date + "_" + str(row.player_api_id), axis=1)).duplicated().sum()

836

### Assessing champs league data

In [78]:
champs_league_hist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1666 entries, 0 to 1665
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          1666 non-null   int64 
 1   team_name     1666 non-null   object
 2   progress      1666 non-null   object
 3   team_country  1666 non-null   object
dtypes: int64(1), object(3)
memory usage: 52.2+ KB


In [79]:
champs_league_hist_df[champs_league_hist_df.duplicated()]

Unnamed: 0,year,team_name,progress,team_country
109,1995,Galatasaray,7. Last 32,Turkey
111,1995,IFK Goteborg,7. Last 32,Sweden
113,1995,Dynamo Kyiv,7. Last 32,Ukraine
114,1995,Paris Saint-Germain,7. Last 32,France
117,1995,Hajduk Split,7. Last 32,Croatia
118,1995,Steaua Bucuresti,7. Last 32,Romania
120,1995,AEK Athens,7. Last 32,Greece
123,1995,FC Red Bull Salzburg,7. Last 32,Austria


In [80]:
champs_league_hist_df[(champs_league_hist_df.team_name == "Galatasaray") & (champs_league_hist_df.year == 1995) & (champs_league_hist_df.progress == "7. Last 32")]

Unnamed: 0,year,team_name,progress,team_country
101,1995,Galatasaray,7. Last 32,Turkey
109,1995,Galatasaray,7. Last 32,Turkey


## Clean data

### Create copies of the dataframes

In [81]:
# create a copy of the dataframes to clean
countries_df_clean = countries_df.copy()
leagues_df_clean = leagues_df.copy()
teams_df_clean = teams_df.copy()
team_attributes_df_clean = team_attributes_df.copy()
players_df_clean = players_df.copy()
player_attributes_df_clean = player_attributes_df.copy()
top_5_leagues_matches_df_clean = top_5_leagues_matches_df.copy()
champs_league_hist_df_clean = champs_league_hist_df.copy()

### match table

#### `date` column should be datetime

##### Define
convert the date column to datetime using pandas to_datetime function

##### Code

In [82]:
top_5_leagues_matches_df_clean["date"] = pd.to_datetime(top_5_leagues_matches_df_clean.date)
top_5_leagues_matches_df_clean.date.head()

0   2008-08-09
1   2008-08-09
2   2008-08-09
3   2008-08-09
4   2008-08-09
Name: date, dtype: datetime64[ns]

##### Test

In [83]:
top_5_leagues_matches_df_clean.date.dtype

dtype('<M8[ns]')

#### `season` should be categorical

##### Define
- convert season to categorical by getting the unique values and assigning them to a list
- the list should be ordered with the oldest season first and the latest season last

##### Code

In [84]:
season_categories = pd.CategoricalDtype(top_5_leagues_matches_df_clean.season.unique(), ordered=True)
season_categories

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [85]:
# convert the season column for matches to this categorical variable
top_5_leagues_matches_df_clean["season"] = top_5_leagues_matches_df_clean.season.astype(season_categories)

##### Test

In [86]:
top_5_leagues_matches_df_clean.season.dtypes

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### rename to `stage` to `matchday` and should be categorical

##### Define

- rename the column
- make it categorical with the categories being the unique values in the column
- should be ordered

##### Code

In [87]:
top_5_leagues_matches_df_clean.rename(columns={"stage": "matchday"}, inplace=True)

In [88]:
matchday_categories = pd.CategoricalDtype(top_5_leagues_matches_df_clean.matchday.unique(), ordered=True)

In [89]:
top_5_leagues_matches_df_clean["matchday"] = top_5_leagues_matches_df_clean.matchday.astype(matchday_categories)

##### Test

In [90]:
top_5_leagues_matches_df_clean.matchday

0         1
1         1
2         1
3         1
4         1
         ..
14580    38
14581    38
14582    38
14583    38
14584    38
Name: matchday, Length: 14585, dtype: category
Categories (38, int64): [1 < 2 < 3 < 4 ... 35 < 36 < 37 < 38]

#### player IDs (`home_player_1` ... `away_player_11`) are null for some matches

##### Define

- drop all values that are null for these columns

##### Code

In [91]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "home_player_1":"away_player_11"].isna().sum(axis=1) > 0].index, inplace=True)

##### Test

In [92]:
assert top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "home_player_1":"away_player_11"].isna().sum(axis=1) > 0].size == 0

#### match events (`goal` ... `possession`) are null for some matches

##### Define

- confirm that matches with a 0-0 scoreline don't have a null `goal` column
- drop all values that are null for these columns

##### Code

In [93]:
# get the values of the goals columns for 0-0 matches
top_5_leagues_matches_df_clean[(top_5_leagues_matches_df_clean.home_team_goal == 0) & (top_5_leagues_matches_df_clean.away_team_goal == 0)].goal.value_counts()

<goal />                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           909
<goal><value><comment>dg</comment><event_incident_typefk>304</event_incident_typefk><elapsed>87</elapsed><subtype>loose_ball</subtype><player1>23934</player1><sortorder>2</sortorder><team>8559</team><id>389536</id><n>172</n><type>goal</type><goal_type>dg</goal_type></value><value><comment>dg</comment><event_incident_typefk>298</event_incident_typefk><elapsed>90</elapsed><subtype>shot</subtype><player1>23934</player1><sortorder>2</sortorder><team

That above confirms that 0-0 matches don't have null values for the `goal` column

In [94]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].index

Int64Index([   0,    1,    3,    4,    6,    7,    9,   13,   14,   15,
            ...
            5873, 5874, 5876, 5877, 5879, 5890, 5894, 5897, 5898, 7494],
           dtype='int64', length=933)

In [95]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].index, inplace=True)

##### Test

In [96]:
assert top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.loc[:, "goal":"possession"].isna().sum(axis=1) > 0].size == 0

#### predictions are null for some matches

##### Define

- drop all rows that have null values

##### Code

In [97]:
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].shape[0]

12723

In [98]:
(top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].isna().sum(axis=1) > 0).sum()

10960

Most of the rows have missing predictions! I can't drop the null rows.

In [99]:
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12723 entries, 10 to 14584
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   B365H   12714 non-null  float64
 1   B365D   12714 non-null  float64
 2   B365A   12714 non-null  float64
 3   BWH     12699 non-null  float64
 4   BWD     12699 non-null  float64
 5   BWA     12699 non-null  float64
 6   IWH     12686 non-null  float64
 7   IWD     12686 non-null  float64
 8   IWA     12686 non-null  float64
 9   LBH     12709 non-null  float64
 10  LBD     12709 non-null  float64
 11  LBA     12709 non-null  float64
 12  PSH     7023 non-null   float64
 13  PSD     7023 non-null   float64
 14  PSA     7023 non-null   float64
 15  WHH     12708 non-null  float64
 16  WHD     12708 non-null  float64
 17  WHA     12708 non-null  float64
 18  SJH     9391 non-null   float64
 19  SJD     9391 non-null   float64
 20  SJA     9391 non-null   float64
 21  VCH     12696 non-null  float64
 2

##### new strategy
- drop columns that have less than 75% non-null predictions (below that feels too incomplete to me)
- drop all match rows that have no predictions at all
- fill in the remaining null predictions using the average of those values for the row (i.e. null home predictions will be average of home predictions, likewise for draw and away)

**drop columns that have less than 75% non-null predictions (below that feels too incomplete to me)**

In [100]:
num_missing_predictions = top_5_leagues_matches_df_clean.isna().sum().iloc[-30:]
num_missing_predictions

B365H       9
B365D       9
B365A       9
BWH        24
BWD        24
BWA        24
IWH        37
IWD        37
IWA        37
LBH        14
LBD        14
LBA        14
PSH      5700
PSD      5700
PSA      5700
WHH        15
WHD        15
WHA        15
SJH      3332
SJD      3332
SJA      3332
VCH        27
VCD        27
VCA        27
GBH      5272
GBD      5272
GBA      5272
BSH      5268
BSD      5268
BSA      5268
dtype: int64

In [101]:
# drop the columns for the betting providers with more than 1/4 of the data missing
top_5_leagues_matches_df_clean.drop(num_missing_predictions[num_missing_predictions > round(top_5_leagues_matches_df_clean.shape[0] * 1 / 4)].index, axis=1, inplace=True)
top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].columns

Index(['away_player_8', 'away_player_9', 'away_player_10', 'away_player_11',
       'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner',
       'possession', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH',
       'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD',
       'VCA'],
      dtype='object')

In [102]:
predictions_start_idx = -18

In [103]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]
match_predictions_df.columns

Index(['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
       'LBH', 'LBD', 'LBA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA'],
      dtype='object')

In [104]:
match_predictions_df.describe()

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA
count,12714.0,12714.0,12714.0,12699.0,12699.0,12699.0,12686.0,12686.0,12686.0,12709.0,12709.0,12709.0,12708.0,12708.0,12708.0,12696.0,12696.0,12696.0
mean,2.625455,3.851312,4.724529,2.570322,3.779627,4.47566,2.478364,3.656283,4.206142,2.547071,3.754477,4.471715,2.604511,3.662175,4.61116,2.682743,3.931064,4.962055
std,1.821225,1.188397,3.867266,1.681922,1.109067,3.427111,1.483711,0.853358,2.938152,1.668051,1.087585,3.533333,1.753303,0.999789,3.834338,1.992439,1.344774,4.567066
min,1.04,1.4,1.08,1.03,1.65,1.1,1.05,1.5,1.1,1.04,1.4,1.1,1.02,1.02,1.08,1.03,1.62,1.08
25%,1.67,3.3,2.6,1.67,3.25,2.55,1.7,3.2,2.55,1.67,3.25,2.5,1.7,3.2,2.6,1.7,3.3,2.6
50%,2.1,3.4,3.5,2.1,3.4,3.45,2.1,3.3,3.3,2.1,3.4,3.4,2.15,3.3,3.4,2.15,3.5,3.6
75%,2.8,4.0,5.25,2.75,3.8,5.0,2.6,3.7,4.7,2.75,3.75,5.0,2.75,3.75,5.0,2.88,4.0,5.5
max,26.0,17.0,51.0,34.0,19.5,51.0,20.0,11.0,25.0,26.0,19.0,51.0,26.0,17.0,51.0,36.0,26.0,67.0


**drop all match rows that have no predictions at all**

In [105]:
top_5_leagues_matches_df_clean.shape

(12723, 103)

In [106]:
top_5_leagues_matches_df_clean.drop(top_5_leagues_matches_df_clean[match_predictions_df.iloc[:, predictions_start_idx:].isna().sum(axis=1) == abs(predictions_start_idx)].index, inplace=True)
top_5_leagues_matches_df_clean.shape

(12716, 103)

In [107]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]

**fill in the remaining null predictions using the average of those values for the row (i.e. null home predictions will be average of home predictions, likewise for draw and away)**

In [108]:
def series_to_dataframe(series: pd.Series, column_names: List[str]):
    # Create an empty DataFrame
    df = pd.DataFrame()

    # Repeat the series for each column name and assign it to the DataFrame
    for column_name in column_names:
        df[column_name] = series

    return df


In [109]:
# strategy: get the average home odds for the row, and fill in with that value
# fillna allows me to specify a dataframe with values for each column to fill in with
# so I can specify a dataframe with the average home odds for each row, in the home column only
# do the same for the away and draw columns

def fill_missing_predictions(suffix: str, df: pd.DataFrame):
    """
    Fills the missing predictions for the columns ending with the provided suffix in the provided dataframe.
    The suffix indicates if the prediction is for the home team (H), away team (A) or a draw (D).

    The missing values are filled with the mean of the other predictions for the same match for the same outcome,
    meaning that missing home win predictions are filled with the mean of other home win predictions for that match.
    """
    predictions_cols = list(filter(lambda x: x.endswith(suffix), df.columns))
    df[predictions_cols] = df[predictions_cols].fillna(series_to_dataframe(df[predictions_cols].mean(axis=1), predictions_cols))
    return df

In [110]:
for suffix in ["H", "A", "D"]:
    top_5_leagues_matches_df_clean = fill_missing_predictions(suffix, top_5_leagues_matches_df_clean)

##### Test

In [111]:
match_predictions_df = top_5_leagues_matches_df_clean.loc[:, list(top_5_leagues_matches_df_clean.columns[predictions_start_idx:])]
match_predictions_df.isna().sum()

B365H    0
B365D    0
B365A    0
BWH      0
BWD      0
BWA      0
IWH      0
IWD      0
IWA      0
LBH      0
LBD      0
LBA      0
WHH      0
WHD      0
WHA      0
VCH      0
VCD      0
VCA      0
dtype: int64

In [112]:
top_5_leagues_matches_df_clean[top_5_leagues_matches_df_clean.iloc[:, predictions_start_idx:].isna().sum(axis=1) > 0]

Unnamed: 0,id,country_id,league_id,season,matchday,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,IWA,LBH,LBD,LBA,WHH,WHD,WHA,VCH,VCD,VCA


### player attributes table

#### `date` column should be datetime

##### Define
- convert using `pd.to_datetime`

##### Code

In [113]:
# convert the date strings to date time objects
player_attributes_df_clean["date"] = pd.to_datetime(player_attributes_df_clean.date)
player_attributes_df_clean.date.head()

0   2016-02-18
1   2015-11-19
2   2015-09-21
3   2015-03-20
4   2007-02-22
Name: date, dtype: datetime64[ns]

##### Test

In [114]:
player_attributes_df_clean.date.dtype

dtype('<M8[ns]')

#### missing a `season` column

##### Define

- create from date column

##### Code

In [115]:
season_categories

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [116]:
player_attributes_df.date.min(), player_attributes_df.date.max()

('2007-02-22 00:00:00', '2016-07-07 00:00:00')

**Rule**: from january up to and including july is the season that started the previous summer, august to december is the season that started the current year

In [117]:
# add a season column to the player attributes df based on the date column
def find_season(items, condition):
    for item in items:
        if condition(item):
            return item
    return None  # Return None if no match is found


def get_season_from_date(date: datetime.datetime) -> Optional[pd.CategoricalDtype]:
    """
    Returns the season category for the supplied date, starting with the 2008/2009 season
    """
    july = 7
    if date.year < 2008 or (date.year == 2008 and date.month <= july):
        return None

    return find_season(season_categories.categories, lambda season: (f"{date.year}/" in season and date.month > july) or (f"/{date.year}" in season and date.month <= july))

In [118]:
player_attributes_df_clean["season"] = player_attributes_df_clean.date.apply(get_season_from_date).astype(season_categories)
player_attributes_df_clean.season.head()

0    2015/2016
1    2015/2016
2    2015/2016
3    2014/2015
4          NaN
Name: season, dtype: category
Categories (8, object): ['2008/2009' < '2009/2010' < '2010/2011' < '2011/2012' < '2012/2013' < '2013/2014' < '2014/2015' < '2015/2016']

##### Test

In [119]:
assert "season" in player_attributes_df_clean.columns
player_attributes_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### preferred foot should be categorical

##### Define

- convert to categorical (should be unordered)

##### Code

In [120]:
player_attributes_df_clean.preferred_foot.value_counts()

right    138409
left      44733
Name: preferred_foot, dtype: int64

In [121]:
preferred_foot_categories = pd.CategoricalDtype(categories=set(filter(None, player_attributes_df_clean.preferred_foot.unique())), ordered=False)
preferred_foot_categories

CategoricalDtype(categories=['left', 'right'], ordered=False)

In [122]:
player_attributes_df_clean["preferred_foot"] = player_attributes_df_clean.preferred_foot.astype(preferred_foot_categories)

##### Test

In [123]:
player_attributes_df_clean.preferred_foot.dtype

CategoricalDtype(categories=['left', 'right'], ordered=False)

#### all attributes missing for some rows

##### Define

- drop all rows that have null values

##### Code

In [124]:
# establish the number of columns
num_attributes = player_attributes_df_clean.iloc[:, attributes_start_idx:].columns.size
num_attributes

39

In [125]:
# find the number of rows that have all values missing
(player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes).sum()

736

In [126]:
# delete them
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes].index, inplace=True)

##### Test

In [127]:
assert (player_attributes_df_clean.iloc[:, attributes_start_idx:].isna().sum(axis=1) == num_attributes).sum() == 0

#### attacking work rate null for some players

##### Define

- drop rows where this value is null

##### Code

In [128]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.attacking_work_rate.isna()].index, inplace=True)

##### Test

In [129]:
player_attributes_df_clean.attacking_work_rate.isna().sum()

0

#### attacking work rate has strange values

##### Define

- find common ground between the values that are mixed but sensible
- normalize those values to make them uniform
- convert nonsensical values to null
- drop rows where this value is null

##### Code

In [130]:
player_attributes_df_clean.attacking_work_rate.value_counts()

medium    125070
high       42823
low         8569
None        3639
norm         348
y            106
le           104
stoc          89
Name: attacking_work_rate, dtype: int64

In [131]:
players_attacking_work_rate_mapping = {
    "medium": "medium",
    "high": "high",
    "low": "low",
    "norm": "medium",
    # all these other values should be None
    "stoc": None,
    "y": None,
    "le": None,
    "None": None,
}

In [132]:
player_attributes_df_clean["attacking_work_rate"] = player_attributes_df_clean.attacking_work_rate.apply(lambda x: players_attacking_work_rate_mapping[x])

In [133]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.attacking_work_rate.isna()].index, inplace=True)

##### Test

In [134]:
player_attributes_df_clean.attacking_work_rate.value_counts()

medium    125418
high       42823
low         8569
Name: attacking_work_rate, dtype: int64

In [135]:
player_attributes_df_clean.attacking_work_rate.isna().sum()

0

#### defensive work rate has strange values

##### Define

- find common ground between the values that are mixed but sensible
- normalize those values to make them uniform
- convert nonsensical values to null
- drop rows where this value is null

##### Code

In [136]:
player_attributes_df_clean.defensive_work_rate.value_counts()

medium    130764
high       26972
low        18425
ormal        348
2             89
1             44
6             34
5             28
4             27
3             23
0             20
9             17
7             14
8              5
Name: defensive_work_rate, dtype: int64

In [137]:
players_defensive_work_rate_mapping = {
    "medium": "medium",
    "high": "high",
    "low": "low",
    "ormal": "medium",
    "2": None,
    "1": None,
    "6": None,
    "5": None,
    "4": None,
    "3": None,
    "0": None,
    "9": None,
    "7": None,
    "8": None,
}

In [138]:
player_attributes_df_clean["defensive_work_rate"] = player_attributes_df_clean.defensive_work_rate.apply(lambda x: players_defensive_work_rate_mapping[x])

In [139]:
player_attributes_df_clean.drop(player_attributes_df_clean[player_attributes_df_clean.defensive_work_rate.isna()].index, inplace=True)

##### Test

In [140]:
player_attributes_df_clean.defensive_work_rate.value_counts()

medium    131112
high       26972
low        18425
Name: defensive_work_rate, dtype: int64

In [141]:
player_attributes_df_clean.defensive_work_rate.isna().sum()

0

#### attacking work rate should be categorical

##### Define

- convert the values to categorical

##### Code

In [142]:
work_rate_categories = pd.CategoricalDtype(categories=["low", "medium", "high"], ordered=True)
player_attributes_df_clean["attacking_work_rate"] = player_attributes_df_clean.attacking_work_rate.astype(work_rate_categories)

##### Test

In [143]:
player_attributes_df_clean.attacking_work_rate.dtype

CategoricalDtype(categories=['low', 'medium', 'high'], ordered=True)

#### defensive work rate should be categorical

##### Define

- convert to categorical

##### Code

In [144]:
player_attributes_df_clean["defensive_work_rate"] = player_attributes_df_clean.defensive_work_rate.astype(work_rate_categories)

##### Test

In [145]:
player_attributes_df_clean.defensive_work_rate.dtype

CategoricalDtype(categories=['low', 'medium', 'high'], ordered=True)

### team attributes table

#### `date` column should be datetime

##### Define

- convert to datetime using `pd.to_datetime`

##### Code

In [146]:
team_attributes_df_clean["date"] = pd.to_datetime(team_attributes_df_clean.date)

##### Test

In [147]:
team_attributes_df_clean.date.dtype

dtype('<M8[ns]')

#### missing a `season` column

##### Define

- create from date column
- make it categorical

##### Code

In [148]:
team_attributes_df_clean["season"] = team_attributes_df_clean.date.apply(get_season_from_date).astype(season_categories)

##### Test

In [149]:
team_attributes_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

#### all the columns that end with `Class` should be categorical

##### Define

- for each of them get their unique values
- create a categorical variable (ordered if needed)
- convert them to that variable

##### Code

In [150]:
for col_name in filter(lambda x: x.endswith("Class"), team_attributes_df_clean.columns):
    print(f'"{col_name}": ({team_attributes_df_clean[col_name].unique()}, True),')

"buildUpPlaySpeedClass": (['Balanced' 'Slow' 'Fast'], True),
"buildUpPlayDribblingClass": (['Little' 'Normal' 'Lots'], True),
"buildUpPlayPassingClass": (['Mixed' 'Short' 'Long'], True),
"buildUpPlayPositioningClass": (['Organised' 'Free Form'], True),
"chanceCreationPassingClass": (['Normal' 'Safe' 'Risky'], True),
"chanceCreationCrossingClass": (['Normal' 'Little' 'Lots'], True),
"chanceCreationShootingClass": (['Normal' 'Lots' 'Little'], True),
"chanceCreationPositioningClass": (['Organised' 'Free Form'], True),
"defencePressureClass": (['Deep' 'Medium' 'High'], True),
"defenceAggressionClass": (['Double' 'Press' 'Contain'], True),
"defenceTeamWidthClass": (['Narrow' 'Normal' 'Wide'], True),
"defenceDefenderLineClass": (['Offside Trap' 'Cover'], True),


In [151]:
team_attrs_class_columns_mapping = {
    "buildUpPlaySpeedClass": (['2. Balanced', '1. Slow', '3. Fast'], True),
    "buildUpPlayDribblingClass": (['1. Little', '2. Normal', '3. Lots'], True),
    "buildUpPlayPassingClass": (['2. Mixed', '1. Short', '3. Long'], True),
    "buildUpPlayPositioningClass": (['1. Organised', '1. Free Form'], False),
    "chanceCreationPassingClass": (['2. Normal', '1. Safe', '3. Risky'], True),
    "chanceCreationCrossingClass": (['2. Normal', '1. Little', '3. Lots'], True),
    "chanceCreationShootingClass": (['2. Normal', '3. Lots', '1. Little'], True),
    "chanceCreationPositioningClass": (['1. Organised', '1. Free Form'], False),
    "defencePressureClass": (['1. Deep', '2. Medium', '3. High'], True),
    "defenceAggressionClass": (['3. Double', '2. Press', '1. Contain'], True),
    "defenceTeamWidthClass": (['1. Narrow', '2. Normal', '3. Wide'], True),
    "defenceDefenderLineClass": (['2. Offside Trap', '1. Cover'], True),
}

In [152]:
for key, value in team_attrs_class_columns_mapping.items():
    sorted_list = sorted(value[0])
    ordered = value[1]
    cat_variable = pd.CategoricalDtype([item[3:] for item in sorted_list], ordered=ordered)
    team_attributes_df_clean[key] = team_attributes_df_clean[key].astype(cat_variable)

##### Test

In [153]:
pprint(team_attributes_df_clean.defenceAggressionClass.dtype)

CategoricalDtype(categories=['Contain', 'Press', 'Double'], ordered=True)


In [154]:
for key in team_attrs_class_columns_mapping.keys():
    pprint(team_attributes_df_clean[key].dtype)

CategoricalDtype(categories=['Slow', 'Balanced', 'Fast'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Short', 'Mixed', 'Long'], ordered=True)
CategoricalDtype(categories=['Free Form', 'Organised'], ordered=False)
CategoricalDtype(categories=['Safe', 'Normal', 'Risky'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Little', 'Normal', 'Lots'], ordered=True)
CategoricalDtype(categories=['Free Form', 'Organised'], ordered=False)
CategoricalDtype(categories=['Deep', 'Medium', 'High'], ordered=True)
CategoricalDtype(categories=['Contain', 'Press', 'Double'], ordered=True)
CategoricalDtype(categories=['Narrow', 'Normal', 'Wide'], ordered=True)
CategoricalDtype(categories=['Cover', 'Offside Trap'], ordered=True)


#### `buildUpPlayDribbling` has (a lot of) null values

##### Define

- drop the column

##### Code

In [155]:
team_attributes_df_clean.drop(["buildUpPlayDribbling"], axis=1, inplace=True)

##### Test

In [156]:
assert "buildUpPlayDribbling" not in team_attributes_df_clean.columns

### champs league table

#### contains data for years that are not part of our period of interest

##### Define

- those years should be removed

##### Code

In [157]:
champs_league_hist_df_clean = champs_league_hist_df_clean[champs_league_hist_df_clean.year.isin(top_5_leagues_matches_df_clean.date.dt.year.unique())]

##### Test

In [158]:
assert champs_league_hist_df_clean.year.unique().size == top_5_leagues_matches_df_clean.date.dt.year.unique().size

#### order assigned to `progress` values is not consistent eg `5. Last 16` and `6. Last 16`

##### Define

- create a dictionary with the correct order
- use that dictionary to replace the values in the column
- remove the numbers from the values

##### Code

In [159]:
champs_league_hist_df_clean.progress.unique()

array(['1. Winner', '2. Runner Up', '3. Semifinalist',
       '4. Quarterfinalist', '5. Last 16', '7. Group Stage'], dtype=object)

Turns out all the inconsistent values got removed once the rows for other years were removed.

#### `progress` should be categorical

##### Define

- convert to categorical

##### Code

In [160]:
champs_lg_stages_categories = pd.CategoricalDtype(categories=[stage[3:] for stage in sorted(champs_league_hist_df_clean.progress.unique())[::-1]], ordered=True)
champs_lg_stages_categories

CategoricalDtype(categories=['Group Stage', 'Last 16', 'Quarterfinalist', 'Semifinalist',
                  'Runner Up', 'Winner'],
, ordered=True)

In [161]:
champs_league_hist_df_clean["progress"] = champs_league_hist_df_clean.progress.str.slice(3).astype(champs_lg_stages_categories)

##### Test

In [162]:
champs_league_hist_df_clean.progress.dtype

CategoricalDtype(categories=['Group Stage', 'Last 16', 'Quarterfinalist', 'Semifinalist',
                  'Runner Up', 'Winner'],
, ordered=True)

#### missing team api id column

##### Define

- find all the teams in the teams table that have a matching name to the teams in the champs league table
- set the team api id for those teams in the champs league table, set others to null
- for the teams without an API ID, manually create a mapping for their names to whatever their API ID is
- use the mapping to set their API IDs in the champs league table

##### Code

In [163]:
teams_df_clean.columns

Index(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name',
       'team_short_name'],
      dtype='object')

In [164]:
teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique()).sum()

15

In [165]:
teams_df_clean[teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique())][["team_api_id", "team_long_name"]]

Unnamed: 0,team_api_id,team_long_name
0,10260,Manchester United
2,9825,Arsenal
5,8650,Liverpool
9,8456,Manchester City
13,8586,Tottenham Hotspur
18,8455,Chelsea
42,9748,Olympique Lyonnais
45,9847,Paris Saint-Germain
72,9789,Borussia Dortmund
102,8543,Lazio


In [166]:
team_name_api_id_mapping = teams_df_clean[teams_df_clean.team_long_name.isin(champs_league_hist_df_clean.team_name.unique())][["team_api_id", "team_long_name"]].set_index("team_long_name")["team_api_id"].to_dict()
team_name_api_id_mapping

{'Manchester United': 10260,
 'Arsenal': 9825,
 'Liverpool': 8650,
 'Manchester City': 8456,
 'Tottenham Hotspur': 8586,
 'Chelsea': 8455,
 'Olympique Lyonnais': 9748,
 'Paris Saint-Germain': 9847,
 'Borussia Dortmund': 9789,
 'Lazio': 8543,
 'Fiorentina': 8535,
 'Juventus': 9885,
 'Roma': 8686,
 'Napoli': 9875,
 'Real Sociedad': 8560}

In [167]:
# set these values as team api id in the champs league df
champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.team_name.apply(lambda x: team_name_api_id_mapping.get(x, None))
champs_league_hist_df_clean.team_api_id

803     10260.0
804      8455.0
805     10260.0
806      8455.0
807      8650.0
         ...   
1356        NaN
1357        NaN
1358        NaN
1359     9748.0
1360        NaN
Name: team_api_id, Length: 558, dtype: float64

In [168]:
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        393
8455.0      28
10260.0     25
9825.0      22
9885.0      14
9748.0      13
9847.0      12
9789.0      11
8686.0      10
8456.0      10
8650.0       9
8535.0       3
8586.0       3
9875.0       3
8543.0       1
8560.0       1
Name: team_api_id, dtype: int64

So there are 393 teams whose names still need to be updated. Possible strategy:
- do a fuzzy match on the team names to find the closest match
    - first get the unique team names from the champs league table that lack an api id, and from the teams df
    - for each unique champs league team name, find the loop over the unique teams df names and find the closest match, and returns its ID
    - this leaves a list of IDs, which can be used to update the champs league table
- to start, do fuzzy match using a 2-way substring match i.e. if string 1 is a substring of string 2 or vice versa, then they are a match

In [169]:
champs_league_teams_no_api_id = champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()
champs_league_teams_no_api_id

array(['Barcelona', 'Schalke 04', 'Fenerbahce', 'Porto', 'Celtic',
       'Real Madrid', 'A.C. Milan', 'Internazionale', 'Sevilla',
       'Olympiacos', 'Marseille', 'Besiktas', 'Rosenborg BK', 'Valencia',
       'Werder Bremen', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Stuttgart', 'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven',
       'CSKA Moscow', 'Slavia Prague', 'Steaua Bucuresti',
       'Bayern Munich', 'Villarreal', 'Athletico Madrid', 'Panathinaikos',
       'Bordeaux', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Wolfsburg', 'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan',
       'Unirea Urziceni', 'Standard Liege', 'AZ Alkmaar', 'Copenhagen',
       'Twente', 'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow',
       'Zilina', 'Ajax', 'Auxerre', 'Braga', 'FK Partizan',
       'Bayer Leverkusen', 'Trabzonspor', 'Lille OSC', 'Otelul Galati',
       'Dinamo Zagreb', 'Genk', 'Viktoria Plzen', '

In [170]:
available_team_names = teams_df_clean.team_long_name.unique()
available_team_names

array(['Manchester United', 'Newcastle United', 'Arsenal',
       'West Bromwich Albion', 'Sunderland', 'Liverpool',
       'West Ham United', 'Wigan Athletic', 'Aston Villa',
       'Manchester City', 'Everton', 'Blackburn Rovers', 'Middlesbrough',
       'Tottenham Hotspur', 'Bolton Wanderers', 'Stoke City', 'Hull City',
       'Fulham', 'Chelsea', 'Portsmouth', 'Birmingham City',
       'Wolverhampton Wanderers', 'Burnley', 'Blackpool', 'Swansea City',
       'Queens Park Rangers', 'Norwich City', 'Southampton', 'Reading',
       'Crystal Palace', 'Cardiff City', 'Leicester City', 'Bournemouth',
       'Watford', 'AJ Auxerre', 'FC Nantes', 'Girondins de Bordeaux',
       'SM Caen', 'Le Havre AC', 'OGC Nice', 'Le Mans FC', 'FC Lorient',
       'Olympique Lyonnais', 'Toulouse FC', 'AS Monaco',
       'Paris Saint-Germain', 'AS Nancy-Lorraine', 'LOSC Lille',
       'Stade Rennais FC', 'Olympique de Marseille',
       'FC Sochaux-Montbéliard', 'Grenoble Foot 38', 'Valenciennes FC',
    

In [171]:
champs_lg_name_team_name_mapping = {}
for champs_lg_team_name in champs_league_teams_no_api_id:
    for _team_name in available_team_names:
        if _team_name.lower() in champs_lg_team_name.lower() or champs_lg_team_name.lower() in _team_name.lower():
            champs_lg_name_team_name_mapping[champs_lg_team_name] = _team_name

champs_lg_name_team_name_mapping

{'Barcelona': 'FC Barcelona',
 'Schalke 04': 'FC Schalke 04',
 'Real Madrid': 'Real Madrid CF',
 'A.C. Milan': 'Milan',
 'Internazionale': 'Inter',
 'Sevilla': 'Sevilla FC',
 'Marseille': 'Olympique de Marseille',
 'Valencia': 'Valencia CF',
 'Werder Bremen': 'SV Werder Bremen',
 'Stuttgart': 'VfB Stuttgart',
 'Bayern Munich': 'FC Bayern Munich',
 'Villarreal': 'Villarreal CF',
 'Bordeaux': 'Girondins de Bordeaux',
 'Wolfsburg': 'VfL Wolfsburg',
 'Auxerre': 'AJ Auxerre',
 'Montpellier': 'Montpellier Hérault SC',
 'Monaco': 'AS Monaco'}

That mapping is correct, so we need to add the team api ids to the champs league table.

In [172]:
def update_champs_league_team_api_id(team_row) -> int:
    team_name = team_row["team_name"]
    team_api_id = team_row["team_api_id"]
    if team_api_id and not np.isnan(team_api_id):
        # this team already has an API ID
        return team_row["team_api_id"]
    if team_name not in champs_lg_name_team_name_mapping:
        return None
    team_api_id = teams_df_clean.loc[teams_df_clean["team_long_name"] == champs_lg_name_team_name_mapping[team_name], "team_api_id"]
    if not team_api_id.empty:
        return team_api_id[team_api_id.index[0]]
    else:
        return None

champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.apply(update_champs_league_team_api_id, axis=1)
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        208
8634.0      37
9823.0      32
8633.0      32
8455.0      28
10260.0     25
9825.0      22
9885.0      14
8636.0      14
10189.0     13
8564.0      13
9748.0      13
9847.0      12
9789.0      11
8686.0      10
8456.0      10
8592.0       9
8650.0       9
10267.0      7
8302.0       5
10205.0      4
9827.0       4
8721.0       4
9875.0       3
9829.0       3
8586.0       3
8535.0       3
10269.0      3
8697.0       3
8583.0       1
10249.0      1
8560.0       1
8543.0       1
Name: team_api_id, dtype: int64

At this point I need a better fuzzy matching solution.

In [176]:
from thefuzz import fuzz, process as fuzz_process

In [174]:
champs_league_teams_no_api_id = champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()
champs_league_teams_no_api_id

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Athletico Madrid',
       'Panathinaikos', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan', 'Unirea Urziceni',
       'Standard Liege', 'AZ Alkmaar', 'Copenhagen', 'Twente',
       'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow', 'Zilina', 'Ajax',
       'Braga', 'FK Partizan', 'Bayer Leverkusen', 'Trabzonspor',
       'Lille OSC', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Malaga', 'Anderlecht',
       'Nordsjaelland', 'Austria Wien', 'Malmo FF',
       'PFC Ludogorets Razgrad', 'Maribor', 'Athletic Bilbao', 'Gent',
       'Astana', 'Borussia Monchengladbach', 'Maccabi Tel Aviv'],
 

In [178]:
champs_lg_name_team_name_mapping = {}
for champs_lg_team_name in champs_league_teams_no_api_id:
    matching_available_teams = fuzz_process.extract(champs_lg_team_name, available_team_names)
    if matching_available_teams:
        champs_lg_name_team_name_mapping[champs_lg_team_name] = matching_available_teams

champs_lg_name_team_name_mapping

{'Fenerbahce': [('FC Energie Cottbus', 54),
  ('Inter', 54),
  ('CD Tenerife', 48),
  ('RC Lens', 45),
  ('Eintracht Frankfurt', 45)],
 'Porto': [('Portsmouth', 72),
  ('RC Deportivo de La Coruña', 72),
  ('Real Sporting de Gijón', 72),
  ('Xerez Club Deportivo', 72),
  ('Blackpool', 60)],
 'Celtic': [('Wigan Athletic', 60),
  ('Athletic Club de Bilbao', 60),
  ('Atlético Madrid', 60),
  ('RC Celta de Vigo', 60),
  ('Elche CF', 54)],
 'Olympiacos': [('Olympique Lyonnais', 57),
  ('Olympique de Marseille', 54),
  ('Milan', 54),
  ('Roma', 45),
  ('Empoli', 45)],
 'Besiktas': [('Brescia', 67),
  ('Stade Brestois 29', 56),
  ('SC Bastia', 56),
  ('Siena', 54),
  ('SD Eibar', 48)],
 'Rosenborg BK': [('Siena', 54),
  ('Genoa', 54),
  ('Arsenal', 51),
  ('Blackburn Rovers', 48),
  ('1. FC Nürnberg', 48)],
 'Benfica': [('Brescia', 57),
  ('Valencia CF', 53),
  ('Siena', 50),
  ('Genoa', 50),
  ('FC Bayern Munich', 49)],
 'Shakhtar Donetsk': [('Udinese', 51),
  ('Stade de Reims', 47),
  ('Roma

The matching didn't go very well. The majority of these teams are not in the top 5 leagues, so I'll just manually add the ones that are.

In [179]:
champs_league_teams_no_api_id

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Athletico Madrid',
       'Panathinaikos', 'CFR Cluj', 'Anorthosis', 'Basel', 'Aalborg BK',
       'Zenit Saint Petersburg', 'BATE Borisov', 'Maccabi Haifa',
       'Zurich', 'APOEL', 'Debrecen', 'Rubin Kazan', 'Unirea Urziceni',
       'Standard Liege', 'AZ Alkmaar', 'Copenhagen', 'Twente',
       'Hapoel Tel Aviv', 'Bursaspor', 'Spartak Moscow', 'Zilina', 'Ajax',
       'Braga', 'FK Partizan', 'Bayer Leverkusen', 'Trabzonspor',
       'Lille OSC', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Malaga', 'Anderlecht',
       'Nordsjaelland', 'Austria Wien', 'Malmo FF',
       'PFC Ludogorets Razgrad', 'Maribor', 'Athletic Bilbao', 'Gent',
       'Astana', 'Borussia Monchengladbach', 'Maccabi Tel Aviv'],
 

In [181]:
champs_lg_name_team_name_mapping = {
    "Athletico Madrid": "Atlético Madrid",
    "Bayer Leverkusen": "Bayer 04 Leverkusen",
    "Lille OSC": "LOSC Lille",
    "Malaga": "Málaga CF",
    "Athletic Bilbao": "Athletic Club de Bilbao",
    "Borussia Monchengladbach": "Borussia Mönchengladbach",
}

In [186]:
champs_league_hist_df_clean["team_api_id"] = champs_league_hist_df_clean.apply(update_champs_league_team_api_id, axis=1)
champs_league_hist_df_clean.team_api_id.value_counts(dropna=False)

NaN        178
8634.0      37
9823.0      32
8633.0      32
8455.0      28
10260.0     25
9825.0      22
9906.0      16
9885.0      14
8636.0      14
8564.0      13
9748.0      13
10189.0     13
9847.0      12
9789.0      11
8456.0      10
8686.0      10
8592.0       9
8650.0       9
10267.0      7
8178.0       7
8302.0       5
8721.0       4
10205.0      4
9827.0       4
8535.0       3
8586.0       3
9875.0       3
10269.0      3
9864.0       3
8697.0       3
9829.0       3
8639.0       2
8583.0       1
8543.0       1
10249.0      1
8560.0       1
8315.0       1
9788.0       1
Name: team_api_id, dtype: int64

Can drop all the remaining rows that don't have API IDs because we won't need them.

In [187]:
champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].team_name.unique()

array(['Fenerbahce', 'Porto', 'Celtic', 'Olympiacos', 'Besiktas',
       'Rosenborg BK', 'Benfica', 'Shakhtar Donetsk', 'Rangers (IL)',
       'Sporting CP', 'Dynamo Kyiv', 'PSV Eindhoven', 'CSKA Moscow',
       'Slavia Prague', 'Steaua Bucuresti', 'Panathinaikos', 'CFR Cluj',
       'Anorthosis', 'Basel', 'Aalborg BK', 'Zenit Saint Petersburg',
       'BATE Borisov', 'Maccabi Haifa', 'Zurich', 'APOEL', 'Debrecen',
       'Rubin Kazan', 'Unirea Urziceni', 'Standard Liege', 'AZ Alkmaar',
       'Copenhagen', 'Twente', 'Hapoel Tel Aviv', 'Bursaspor',
       'Spartak Moscow', 'Zilina', 'Ajax', 'Braga', 'FK Partizan',
       'Trabzonspor', 'Otelul Galati', 'Dinamo Zagreb', 'Genk',
       'Viktoria Plzen', 'Galatasaray', 'Anderlecht', 'Nordsjaelland',
       'Austria Wien', 'Malmo FF', 'PFC Ludogorets Razgrad', 'Maribor',
       'Gent', 'Astana', 'Maccabi Tel Aviv'], dtype=object)

In [191]:
champs_league_hist_df_clean.drop(champs_league_hist_df_clean[champs_league_hist_df_clean.team_api_id.isna()].index, inplace=True)

##### Test

In [192]:
assert champs_league_hist_df_clean.team_api_id.isna().sum() == 0

#### missing a `season` column

##### Define

- create from year column
- drop rows for years that are not in our timeframe

##### Code

In [200]:
champs_league_hist_df_clean["season"] = champs_league_hist_df_clean.year.apply(lambda x: f"{x - 1}/{x}").astype(season_categories)

In [204]:
champs_league_hist_df_clean.drop(champs_league_hist_df_clean[champs_league_hist_df_clean.season.isna()].index, inplace=True)

##### Test

In [205]:
champs_league_hist_df_clean.season.dtype

CategoricalDtype(categories=['2008/2009', '2009/2010', '2010/2011', '2011/2012',
                  '2012/2013', '2013/2014', '2014/2015', '2015/2016'],
, ordered=True)

In [206]:
assert champs_league_hist_df_clean.season.isna().sum() == 0

#### missing a league id column

##### Define

- create from country column by matching the country name to the country table and getting the league id (which is the same as the country id)

##### Code

In [234]:
country_name_league_id_mapping = {country_name: countries_df_clean.query(f'name == "{country_name}"').id.iloc[0] for country_name in champs_league_hist_df_clean.team_country.unique()}
country_name_league_id_mapping

{'Spain': 21518,
 'England': 1729,
 'Germany': 7809,
 'France': 4769,
 'Italy': 10257}

In [238]:
champs_league_hist_df_clean["league_id"] = champs_league_hist_df_clean.team_country.apply(lambda x: country_name_league_id_mapping.get(x, None))

##### Test

In [239]:
champs_league_hist_df_clean.league_id.value_counts(dropna=False)

21518    97
1729     80
7809     69
10257    50
4769     42
Name: league_id, dtype: int64

In [53]:
# Reshape the matches dataframe to have a single column for player IDs
home_player_cols = [f'home_player_{i}' for i in range(1, 12)]
away_player_cols = [f'away_player_{i}' for i in range(1, 12)]
player_api_ids = pd.concat([top_5_leagues_matches_df[home_player_cols], top_5_leagues_matches_df[away_player_cols]], axis=1).stack().reset_index(drop=True)
player_api_ids = player_api_ids.rename('player_api_id')

player_api_ids.head()

0    30458.0
1    32571.0
2    38703.0
3    40543.0
4    26119.0
Name: player_api_id, dtype: float64

In [54]:
# remove the players that do not feature in any match in the top 5 leagues over the seasons
player_attributes_df = player_attributes_df[player_attributes_df.player_api_id.isin(player_api_ids)]
player_attributes_df.shape

(121639, 43)

In [55]:
top_5_leagues_matches_df.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,LBA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA
0,4770,4769,4769,2008/2009,1,2008-08-09 00:00:00,483130,9827,7819,2,...,6.0,1.53,3.3,6.0,1.5,3.8,7.5,1.6,3.3,5.5
1,4771,4769,4769,2008/2009,1,2008-08-09 00:00:00,483131,9746,9831,1,...,2.87,2.3,2.8,3.0,2.4,3.1,3.1,2.3,2.9,3.0
2,4772,4769,4769,2008/2009,1,2008-08-09 00:00:00,483132,8682,8689,0,...,3.5,2.0,3.0,3.4,2.0,3.2,4.0,2.0,2.9,3.75
3,4774,4769,4769,2008/2009,1,2008-08-09 00:00:00,483134,9829,9847,1,...,2.87,2.3,2.9,2.9,2.4,3.0,3.2,2.25,3.0,3.0
4,4775,4769,4769,2008/2009,1,2008-08-09 00:00:00,483135,8481,8639,0,...,3.2,2.15,2.88,3.25,2.1,3.1,3.8,2.1,3.0,3.3


## another strategy
- sort the matches by date
- for each row in player attributes
    - find the first row in player matches that has both the season and the player ID of the player attributes row
    - if there is, return its league id
    - else, return `None`
- I therefore need the player matches table
    - it has a column for player ID and match ID
    - each match ID from the matches table gets 22 rows, one for each player that started the match
    - each match id from the matches table has the season in which the match was played
    - each match id from the matches table has the league to which the match belongs

In [56]:
top_5_leagues_matches_df.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,LBA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA
0,4770,4769,4769,2008/2009,1,2008-08-09 00:00:00,483130,9827,7819,2,...,6.0,1.53,3.3,6.0,1.5,3.8,7.5,1.6,3.3,5.5
1,4771,4769,4769,2008/2009,1,2008-08-09 00:00:00,483131,9746,9831,1,...,2.87,2.3,2.8,3.0,2.4,3.1,3.1,2.3,2.9,3.0
2,4772,4769,4769,2008/2009,1,2008-08-09 00:00:00,483132,8682,8689,0,...,3.5,2.0,3.0,3.4,2.0,3.2,4.0,2.0,2.9,3.75
3,4774,4769,4769,2008/2009,1,2008-08-09 00:00:00,483134,9829,9847,1,...,2.87,2.3,2.9,2.9,2.4,3.0,3.2,2.25,3.0,3.0
4,4775,4769,4769,2008/2009,1,2008-08-09 00:00:00,483135,8481,8639,0,...,3.2,2.15,2.88,3.25,2.1,3.1,3.8,2.1,3.0,3.3


In [57]:
player_matches_df = pd.melt(top_5_leagues_matches_df, id_vars=["id"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
player_matches_df

Unnamed: 0,id,player_api_id
0,4770,30458.0
1,4771,41186.0
2,4772,
3,4774,145039.0
4,4775,26233.0
...,...,...
320667,24492,200917.0
320668,24495,179083.0
320669,24497,25462.0
320670,4705,47382.0


In [58]:
league_matches_df = pd.melt(top_5_leagues_matches_df, id_vars=["league_id"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
league_matches_df

Unnamed: 0,league_id,player_api_id
0,4769,30458.0
1,4769,41186.0
2,4769,
3,4769,145039.0
4,4769,26233.0
...,...,...
320667,21518,200917.0
320668,21518,179083.0
320669,21518,25462.0
320670,1729,47382.0


In [59]:
season_matches_df = pd.melt(top_5_leagues_matches_df, id_vars=["season"], value_vars=home_player_cols+away_player_cols, value_name="player_api_id").drop("variable", axis=1)
season_matches_df

Unnamed: 0,season,player_api_id
0,2008/2009,30458.0
1,2008/2009,41186.0
2,2008/2009,
3,2008/2009,145039.0
4,2008/2009,26233.0
...,...,...
320667,2015/2016,200917.0
320668,2015/2016,179083.0
320669,2015/2016,25462.0
320670,2015/2016,47382.0


In [60]:
player_matches_df = player_matches_df.merge(league_matches_df, left_index=True, right_index=True).merge(season_matches_df, left_index=True, right_index=True).drop(labels=["player_api_id_x", "player_api_id_y"], axis=1)
player_matches_df.head()

Unnamed: 0,id,league_id,season,player_api_id
0,4770,4769,2008/2009,30458.0
1,4771,4769,2008/2009,41186.0
2,4772,4769,2008/2009,
3,4774,4769,2008/2009,145039.0
4,4775,4769,2008/2009,26233.0


In [61]:
# preserve only the first occurrence of a player in a season
player_matches_df = player_matches_df[~(player_matches_df[['player_api_id', 'season']].duplicated())]
player_matches_df.head()

Unnamed: 0,id,league_id,season,player_api_id
0,4770,4769,2008/2009,30458.0
1,4771,4769,2008/2009,41186.0
2,4772,4769,2008/2009,
3,4774,4769,2008/2009,145039.0
4,4775,4769,2008/2009,26233.0


In [62]:
player_attributes_df = pd.merge(player_attributes_df, player_matches_df, left_on=['player_api_id', 'season'], right_on=['player_api_id', 'season'], how='left')
player_attributes_df

Unnamed: 0,id_x,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,season,id_y,league_id
0,6,189615,155782,2016-04-21,74.0,76.0,left,high,medium,80.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
1,7,189615,155782,2016-04-07,74.0,76.0,left,high,medium,80.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
2,8,189615,155782,2016-01-07,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
3,9,189615,155782,2015-12-24,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
4,10,189615,155782,2015-12-17,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121634,183974,102359,39902,2009-08-30,83.0,85.0,right,medium,low,84.0,...,31.0,30.0,9.0,20.0,84.0,20.0,20.0,2009/2010,8115.0,7809.0
121635,183975,102359,39902,2009-02-22,78.0,80.0,right,medium,low,74.0,...,31.0,30.0,9.0,20.0,73.0,20.0,20.0,2008/2009,8100.0,7809.0
121636,183976,102359,39902,2008-08-30,77.0,80.0,right,medium,low,74.0,...,31.0,30.0,9.0,20.0,73.0,20.0,20.0,2008/2009,8100.0,7809.0
121637,183977,102359,39902,2007-08-30,78.0,81.0,right,medium,low,74.0,...,32.0,30.0,9.0,20.0,73.0,20.0,20.0,,,


In [63]:
player_attributes_df = player_attributes_df[~(player_attributes_df.league_id.isna())]
player_attributes_df.reset_index(drop=True, inplace=True)
player_attributes_df

Unnamed: 0,id_x,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,season,id_y,league_id
0,6,189615,155782,2016-04-21,74.0,76.0,left,high,medium,80.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
1,7,189615,155782,2016-04-07,74.0,76.0,left,high,medium,80.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
2,8,189615,155782,2016-01-07,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
3,9,189615,155782,2015-12-24,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
4,10,189615,155782,2015-12-17,73.0,75.0,left,high,medium,79.0,...,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,4595.0,1729.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61637,183965,47058,35506,2009-02-22,70.0,78.0,right,,_0,48.0,...,74.0,65.0,14.0,25.0,51.0,25.0,25.0,2008/2009,1765.0,1729.0
61638,183966,47058,35506,2008-08-30,72.0,78.0,right,,_0,48.0,...,78.0,65.0,14.0,25.0,51.0,25.0,25.0,2008/2009,1765.0,1729.0
61639,183974,102359,39902,2009-08-30,83.0,85.0,right,medium,low,84.0,...,31.0,30.0,9.0,20.0,84.0,20.0,20.0,2009/2010,8115.0,7809.0
61640,183975,102359,39902,2009-02-22,78.0,80.0,right,medium,low,74.0,...,31.0,30.0,9.0,20.0,73.0,20.0,20.0,2008/2009,8100.0,7809.0


In [64]:
player_attributes_df = player_attributes_df.rename({"id_x": "id"}, axis=1).drop("id_y", axis=1)
player_attributes_df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,season,league_id
0,6,189615,155782,2016-04-21,74.0,76.0,left,high,medium,80.0,...,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,1729.0
1,7,189615,155782,2016-04-07,74.0,76.0,left,high,medium,80.0,...,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,1729.0
2,8,189615,155782,2016-01-07,73.0,75.0,left,high,medium,79.0,...,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,1729.0
3,9,189615,155782,2015-12-24,73.0,75.0,left,high,medium,79.0,...,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,1729.0
4,10,189615,155782,2015-12-17,73.0,75.0,left,high,medium,79.0,...,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,2015/2016,1729.0
