# Stage 1: Build a database

## Basic match dataframe setup

In [1]:
import sqlite3
import pandas as pd

# Open the SQLite database file; `conn` is reused by the later query in this notebook.
conn = sqlite3.connect("football_database.sqlite")
# Load the selected columns of the `match` table into `df`: identifiers, season,
# stage and date, both team ids and their goals, the 22 lineup slots
# (home_player_1..11 and away_player_1..11) and the per-match event columns
# (goal, shoton, shotoff, foulcommit, card, cross, corner, possession).
df = pd.read_sql_query("""
        SELECT id, country_id, league_id, season, stage, date, 
            match_api_id, home_team_api_id, away_team_api_id, 
            home_team_goal, away_team_goal, home_player_1, home_player_2, 
            home_player_3, home_player_4, home_player_5, 
            home_player_6, home_player_7, home_player_8, 
            home_player_9, home_player_10, home_player_11, 
            away_player_1, away_player_2, away_player_3, 
            away_player_4, away_player_5, away_player_6, 
            away_player_7, away_player_8, away_player_9, 
            away_player_10, away_player_11, goal, shoton, 
            shotoff, foulcommit, card, cross, corner, possession
        FROM match
        """, conn)


In [2]:
# Discard every row that has a missing value in any column, modifying `df` itself.
df.dropna(axis='index', how='any', inplace=True)

# (rows, columns) left after the drop
df.shape

(13325, 41)

In [3]:
# Encode the outcome of every match in `match_result`:
# 1 = home team scored more, 2 = away team scored more, 3 = equal goals.
home_goals = df['home_team_goal']
away_goals = df['away_team_goal']
outcome_masks = {
    1: home_goals > away_goals,
    2: home_goals < away_goals,
    3: home_goals == away_goals,
}
for outcome_code, outcome_mask in outcome_masks.items():
    df.loc[outcome_mask, 'match_result'] = outcome_code
# Rows not yet assigned are NaN while the column is being filled,
# so cast to int only once every row has its code.
df['match_result'] = df['match_result'].astype(int)

# Keep only the first four characters of `season` (the first year of the season).
df['season'] = df['season'].map(lambda season_label: season_label[:4])

df

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,match_result
1728,1729,1729,1729,2008,1,2008-08-17 00:00:00,489042,10260,10261,1,...,37799.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,3
1729,1730,1729,1729,2008,1,2008-08-16 00:00:00,489043,9825,8659,1,...,27267.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card />,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>65</comment><event...,1
1730,1731,1729,1729,2008,1,2008-08-16 00:00:00,489044,8472,8650,0,...,30853.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>45</comment><event...,2
1731,1732,1729,1729,2008,1,2008-08-16 00:00:00,489045,8654,8528,2,...,34466.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>50</comment><event...,1
1733,1734,1729,1729,2008,1,2008-08-16 00:00:00,489047,8668,8655,2,...,30646.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>46</comment><event...,2
1734,1735,1729,1729,2008,1,2008-08-16 00:00:00,489048,8549,8586,2,...,23949.0,<goal><value><comment>dg</comment><event_incid...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><corners>1</corners></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>49</comment><event...,1
1735,1736,1729,1729,2008,1,2008-08-16 00:00:00,489049,8559,10194,3,...,23314.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><event_incident_typefk>123</even...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>58</comment><event...,1
1736,1737,1729,1729,2008,1,2008-08-16 00:00:00,489050,8667,9879,2,...,24741.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>49</comment><event...,1
1737,1738,1729,1729,2008,1,2008-08-17 00:00:00,489051,8455,8462,4,...,30830.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>57</comment><event...,1
1738,1739,1729,1729,2008,10,2008-10-29 00:00:00,489132,10260,8654,2,...,24773.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><corners>1</corners></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>59</comment><event...,1


In [4]:
## Calculate lineup average age

# Fetch every player's id together with the year part of their birthday.
# NOTE(review): the lineup columns of `df` show values such as 37799.0 while this
# table is keyed by `id` -- confirm `id` is the key those columns refer to
# before joining on it.
birth_year = pd.read_sql_query(
    sql="""
        SELECT id, strftime("%Y", birthday) as year
        FROM player
""",
    con=conn,
    params=(),
)

# Preview the first five rows.
birth_year.head(5)

Unnamed: 0,id,year
0,1,1992
1,2,1989
2,3,1991
3,4,1982
4,5,1979


In [5]:
# Get the average age of a lineup

# Column names of the eleven home lineup slots: home_player_1 ... home_player_11.
home_fields = list(map('home_player_{}'.format, range(1, 12)))

def get_year(field):
    """Look up the birth year of the player in lineup column ``field`` for every match.

    Parameters
    ----------
    field : str
        Name of a column of the module-level ``df`` (e.g. ``'home_player_1'``)
        whose values are player ids.

    Returns
    -------
    pandas.Series
        Birth years aligned with ``df``'s index. Entries are NaN where the id
        has no matching row in ``birth_year``.

    Notes
    -----
    Assumes the values in ``df[field]`` are ids from ``birth_year['id']`` and that
    those ids are unique -- TODO confirm against the database schema.
    """
    # Comparing birth_year['id'] with df[field] directly raises ValueError because
    # the two Series are labelled differently; build an id -> year lookup and map
    # each match's player id through it instead.
    year_by_id = birth_year.set_index('id')['year'].astype(int)
    return df[field].map(year_by_id)

## TODO: Map the id to the 'birth_year' table and get the average
# sum([get_year(field) for field in home_fields])
# df['home_player_1'].astype(int)

In [6]:
# Create a helper function that counts team nodes
from collections import Counter
import xml.etree.ElementTree as ET

def count_teams(xml_string):
    """Tally the text of every ``<team>`` element found in an XML document.

    Parameters
    ----------
    xml_string : str
        XML document to parse.

    Returns
    -------
    collections.Counter
        Maps each ``<team>`` text to the number of ``<team>`` elements carrying
        it, searching the whole tree (the root element included).
    """
    document = ET.fromstring(xml_string)
    return Counter(team_node.text for team_node in document.iter('team'))

In [7]:
# For the first match in `df`, count the <team> entries per team in each event column.
for event_column in ('shoton', 'shotoff', 'foulcommit', 'card', 'corner'):
    print(f'{event_column}:', count_teams(df[event_column].iloc[0]))

# TODO: Add red cards and possession
# print('possession:', count_teams(df['possession'].iloc[0]))


shoton: Counter({'10260': 11, '10261': 1})
shotoff: Counter({'10260': 10, '10261': 9})
foulcommit: Counter({'10260': 16, '10261': 11})
card: Counter({'10260': 3})
corner: Counter({'10261': 6, '10260': 6})
