In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import sqlite3

%matplotlib inline

In [26]:
def get_dataframe_from_sql(query, db_path = 'football.sqlite', params = None):
    """
    Run a SQL query against a SQLite database and return the result
    as a pandas dataframe.

    Parameters
    ----------
    query : str
        SQL statement to execute. Prefer placeholders together with
        ``params`` over formatting values into the query string.
    db_path : str, optional
        Path of the SQLite database file passed to ``sqlite3.connect``.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query``. Forwarded unchanged
        to ``pd.read_sql_query``; ``None`` (the default) binds nothing.

    Returns
    -------
    pandas.DataFrame
        The dataframe produced by ``pd.read_sql_query`` for ``query``.

    Notes
    -----
    Any exception raised while running the query propagates to the caller;
    the connection is closed before it does.
    """
    # establish a connection to the database
    conn = sqlite3.connect(db_path)

    try:
        # load the query results into a pandas dataframe
        return pd.read_sql_query(query, conn, params=params)
    finally:
        # always release the connection, even when the query fails,
        # so a bad query in one cell does not leave the file handle open
        conn.close()

In [11]:
# List every table recorded in the database's sqlite_master catalogue.
get_dataframe_from_sql(
    "SELECT name FROM sqlite_master WHERE type='table';"
)

Unnamed: 0,name
0,sqlite_sequence
1,Player_Attributes
2,Player
3,Match
4,League
5,Country
6,Team
7,Team_Attributes


In [9]:
# Show the column definitions of Player_Attributes (at most the first 30 rows
# of the PRAGMA result).
(
    get_dataframe_from_sql("PRAGMA table_info(Player_Attributes);")
    .head(30)
)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,id,INTEGER,0,,1
1,1,player_fifa_api_id,INTEGER,0,,0
2,2,player_api_id,INTEGER,0,,0
3,3,date,TEXT,0,,0
4,4,overall_rating,INTEGER,0,,0
5,5,potential,INTEGER,0,,0
6,6,preferred_foot,TEXT,0,,0
7,7,attacking_work_rate,TEXT,0,,0
8,8,defensive_work_rate,TEXT,0,,0
9,9,crossing,INTEGER,0,,0


In [25]:
# Peek at the first rows of the Team table.
(
    get_dataframe_from_sql("SELECT * FROM Team;")
    .head()
)

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


#### Load the top 5 leagues from the `League` table

In [32]:
# Names of the leagues to keep, matched against the League.name column.
top_5_leagues_names = {
    "England Premier League",
    "France Ligue 1",
    "Germany 1. Bundesliga",
    "Italy Serie A",
    "Spain LIGA BBVA",
}

# Build the SQL string literals for the IN (...) list.
# - sorted(): a set has no stable iteration order, so sort to keep the
#   generated query text identical from run to run.
# - replace("'", "''"): SQL escapes a single quote inside a string literal by
#   doubling it, so a name containing an apostrophe cannot break the query.
league_name_values = [
    "'{}'".format(name.replace("'", "''"))
    for name in sorted(top_5_leagues_names)
]
leagues_df = get_dataframe_from_sql(
    f'SELECT * FROM League WHERE name IN ({", ".join(league_name_values)})'
)

# Fail loudly if a requested name is misspelled or absent from the database,
# instead of silently continuing with fewer leagues.
missing_leagues = top_5_leagues_names - set(leagues_df["name"])
assert not missing_leagues, f"leagues not found in the database: {sorted(missing_leagues)}"

leagues_df

Unnamed: 0,id,country_id,name
0,1729,1729,England Premier League
1,4769,4769,France Ligue 1
2,7809,7809,Germany 1. Bundesliga
3,10257,10257,Italy Serie A
4,21518,21518,Spain LIGA BBVA


#### Keep only the matches played in the top 5 leagues

In [33]:
# Load every match, then keep only the rows whose league_id is one of the ids
# in leagues_df. The filter runs in pandas, so the surviving rows keep the
# index they had in the full Match result.
top_5_leagues_matches = (
    get_dataframe_from_sql('SELECT * FROM Match')
    .query('league_id in @leagues_df.id')
)

# Preview a random subset; the fixed random_state makes the displayed rows
# reproducible when the notebook is re-run from a fresh kernel.
top_5_leagues_matches.sample(30, random_state=0)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
24087,24088,21518,21518,2014/2015,36,2015-05-10 00:00:00,1778384,10205,10268,1,...,,1.36,5.2,10.0,,,,,,
11139,11140,10257,10257,2010/2011,20,2011-01-16 00:00:00,888353,10233,8600,2,...,3.2,2.4,3.2,3.25,2.3,3.1,3.1,2.4,3.1,3.0
2841,2842,1729,1729,2010/2011,7,2010-10-02 00:00:00,839877,8654,9879,1,...,3.25,2.3,3.3,3.25,2.3,3.25,3.0,2.2,3.25,3.2
11331,11332,10257,10257,2010/2011,38,2011-05-22 00:00:00,888535,8636,8530,3,...,7.0,1.53,3.6,6.5,1.5,3.6,7.0,1.5,3.6,6.5
4536,4537,1729,1729,2015/2016,22,2016-01-18 00:00:00,1988928,10003,9817,1,...,,2.4,3.2,3.4,,,,,,
7200,7201,4769,4769,2014/2015,23,2015-01-31 00:00:00,1709919,8588,7794,1,...,,2.2,3.2,3.75,,,,,,
23376,23377,21518,21518,2012/2013,5,2012-09-24 00:00:00,1260044,8370,8633,0,...,1.33,11.0,5.75,1.3,10.0,5.0,1.3,9.0,5.0,1.33
6926,6927,4769,4769,2013/2014,32,2014-04-05 00:00:00,1468518,9827,9851,2,...,3.5,2.15,3.2,3.9,,,,,,
7450,7451,4769,4769,2015/2016,11,2015-10-23 00:00:00,1989880,7819,9830,0,...,,1.95,3.3,4.75,,,,,,
21549,21550,21518,21518,2008/2009,12,2008-11-23 00:00:00,530255,8634,8305,1,...,17.0,1.14,7.0,17.0,1.2,6.0,13.0,1.17,6.0,15.0
