In [None]:
import yaml
import requests
import pandas as pd
import json
import sqlalchemy
import pg8000
import pymysql 
from sqlalchemy import Table, Column, Integer, String, MetaData, Date
from sqlalchemy.sql import select, insert

In [None]:
# Read the API key and the database credentials from the local YAML file.
with open('footballYaml.yaml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Expose each setting under its own module-level name for the cells below.
football_key, db_user, db_pass, db_name, db_host = (
    config[setting]
    for setting in ("football_key", "db_user", "db_pass", "db_name", "db_host")
)

# Functions

### Function to get the results by seasons.

For this function we need to consider:

* The request needs two values: the season (the year we want the information for) and the league id.
* The premier league has the id 39.
* One request by year. 

In [None]:
def season_data(year, league_id=39):
    """Fetch every fixture of one league season from the football API.

    Sends a single GET request to the ``/fixtures`` endpoint.

    Args:
        year: Season to download, e.g. ``2019``.
        league_id: League identifier used by the API. Defaults to ``39``,
            the Premier League.

    Returns:
        A list with one dict per fixture, holding the keys ``season``,
        ``match_date``, ``match_id``, ``local_team``, ``local_team_id``,
        ``away_team``, ``away_team_id``, ``local_goals`` and ``away_goals``.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://v3.football.api-sports.io/fixtures"
    querystring = {"league": str(league_id), "season": year}
    headers = {
        'x-rapidapi-key': football_key,
        'x-rapidapi-host': 'v3.football.api-sports.io',
    }
    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.get(url, headers=headers, params=querystring, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    fixtures = response.json()["response"]

    return [
        {
            "season": row["league"]["season"],
            "match_date": row["fixture"]["date"],
            "match_id": row["fixture"]["id"],
            "local_team": row["teams"]["home"]["name"],
            "local_team_id": row["teams"]["home"]["id"],
            "away_team": row["teams"]["away"]["name"],
            "away_team_id": row["teams"]["away"]["id"],
            "local_goals": row["goals"]["home"],
            "away_goals": row["goals"]["away"],
        }
        for row in fixtures
    ]


### Function to get relevant statistic by player, team and seasons. 

Get players statistics. In order to get the information we need to consider:

* This endpoint returns the players for whom the profile and statistics data are available. Note that it is possible that a player has statistics for 2 teams in the same season in case of transfers. In that case the key is team_id and player_id
* The statistics are calculated according to the team id, league id and season.
* The players id are unique in the API.
* This endpoint uses a pagination system, you can navigate between the different pages thanks to the page parameter.
* The season 2019 has 33 pages, season 2020 has 37 pages and season 2021 has 39 pages.
* One request per page; each page holds the statistics of 20 different players.

In [None]:
def statistics(year, page):
    """Fetch one page of Premier League player statistics for a season.

    Sends a single GET request to the paginated ``/players`` endpoint
    (league id ``39``).

    Args:
        year: Season to download, e.g. ``2019``.
        page: Number of the result page to request.

    Returns:
        A list with one dict per player on the page. Each dict carries the
        page number, the season, the player's profile fields and the figures
        taken from the player's first statistics entry.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://v3.football.api-sports.io/players"
    querystring = {"league": "39", "season": year, "page": page}
    headers = {
        'x-rapidapi-host': "v3.football.api-sports.io",
        'x-rapidapi-key': football_key,
    }
    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.get(url, headers=headers, params=querystring, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()

    statistic_data = []
    for row in payload["response"]:
        player = row["player"]
        # Only the first statistics entry is read.
        # NOTE(review): a player transferred mid-season may carry further
        # entries for a second team - confirm whether those should be kept.
        stats = row["statistics"][0]
        games = stats["games"]
        statistic_data.append({
            "page": payload["paging"]["current"],
            "season": stats["league"]["season"],
            "player_id": player["id"],
            "player_name": player["name"],
            "age": player["age"],
            "height": player["height"],
            "weight": player["weight"],
            "injured": player["injured"],
            "team_id": stats["team"]["id"],
            "appearences": games["appearences"],
            "minutes": games["minutes"],
            "position": games["position"],
            "rating": games["rating"],
            "shots": stats["shots"]["total"],
            "goals": stats["goals"]["total"],
            "assists": stats["goals"]["assists"],
            "passes_accuracy": stats["passes"]["accuracy"],
            "total_duels": stats["duels"]["total"],
            "won_duels": stats["duels"]["won"],
        })

    return statistic_data

# Request to get information

We use the following code to get the information for the last 3 seasons of the Premier League.

In [None]:
# Download the fixtures of every season, keyed by the season's year
# (one API request per season).
seasons = [2019, 2020, 2021]
all_season_data = {year: season_data(year) for year in seasons}

**Statistics for 2019**

In [None]:
# Season 2019: request the first 3 result pages (pages 1-3).
statistic_2019 = []
for page in range(1, 4):
    data = statistics(2019, page)
    statistic_2019 += data
    print(page)  # progress indicator

In [None]:
# Season 2019: request the next 30 result pages (pages 4-33).
for page in range(4, 34):
    data = statistics(2019, page)
    statistic_2019 += data
    print(page)  # progress indicator

**Statistics for 2020**

In [None]:
# Season 2020: request the first 7 result pages (pages 1-7).
statistic_2020 = []
for page in range(1, 8):
    data = statistics(2020, page)
    statistic_2020 += data
    print(page)  # progress indicator

In [None]:
# Season 2020: request the next 30 result pages (pages 8-37).
# The range starts at 8 because page 7 was already downloaded by the previous
# cell; starting at 7 would store that page's players twice.
for page in range(8, 38):
    data = statistics(2020, page)
    statistic_2020 = statistic_2020 + data
    print(page)  # progress indicator

**Statistics for 2021**

In [None]:
# Season 2021: request the first 9 result pages (pages 1-9).
# The accumulator must be statistic_2021: resetting statistic_2020 here would
# discard the 2020 data and leave statistic_2021 undefined in the loop.
statistic_2021 = []
for page in range(1, 10):
    data = statistics(2021, page)
    statistic_2021 = statistic_2021 + data
    print(page)  # progress indicator

In [None]:
# Season 2021: request the next 30 result pages (pages 10-39).
# The range starts at 10 because page 9 was already downloaded by the previous
# cell; starting at 9 would store that page's players twice.
for page in range(10, 40):
    data = statistics(2021, page)
    statistic_2021 = statistic_2021 + data
    print(page)  # progress indicator

In [None]:
# Merge the three seasons into one flat list of player rows.
total_statistics = [*statistic_2019, *statistic_2020, *statistic_2021]

# Set up data base

As specified in the README, the idea is to use a Postgres database to manage and upload the data. So, we create a Postgres instance on GCP and define the connection as follows:

In [None]:
# db_host is either "host" or "host:port"; port 5432 is used when the
# setting carries no port.
host_args = db_host.split(":")
if len(host_args) == 1:
    db_hostname, db_port = db_host, 5432
elif len(host_args) == 2:
    db_hostname, db_port = host_args[0], int(host_args[1])
else:
    # Without this branch the names below would be undefined and the failure
    # would surface later as a confusing NameError.
    raise ValueError(
        f"db_host must look like 'host' or 'host:port', got {db_host!r}"
    )

# NOTE: despite its name, `conn` is a SQLAlchemy Engine, not a single
# open connection.
conn = sqlalchemy.create_engine(
    # Equivalent URL:
    # postgresql+pg8000://<db_user>:<db_pass>@<db_host>:<db_port>/<db_name>
    sqlalchemy.engine.url.URL.create(
        drivername="postgresql+pg8000",
        username=db_user,  # e.g. "my-database-user"
        password=db_pass,  # e.g. "my-database-password"
        host=db_hostname,  # e.g. "127.0.0.1"
        port=db_port,  # e.g. 5432
        database=db_name,  # e.g. "my-database-name"
    )
)

**Test the connection**

In [None]:
# Open a connection to prove the database is reachable, then release it
# straight away so the check does not leave a connection checked out.
with conn.connect():
    pass

We create two tables. _match_ table and _statistic_ table.

In [None]:
meta = MetaData()

# Fixture table. Every key of the row dicts built by season_data() needs a
# matching column key here; otherwise the insert is rejected with
# "Unconsumed column names".
match = Table(
    'match', meta,
    # Stored as SQL column "id" but addressed as "match_id", which is the key
    # season_data() uses for the fixture id.
    Column('id', Integer, primary_key=True, key='match_id'),
    Column('season', Integer),
    Column('match_date', Date),
    Column('local_team', String),
    Column('local_team_id', Integer),
    Column('away_team', String),
    Column('away_team_id', Integer),
    Column('local_goals', Integer),
    Column('away_goals', Integer),
)

In [None]:
# Emit CREATE TABLE for every table registered on `meta`.
meta.create_all(bind=conn)

# Insert data to a database

To use the data from the database, we first need to insert it. In this case, we created a generic function to insert the data.

In [None]:
def insert_data(conn, table, data):
    """Insert rows into a table.

    Args:
        conn: SQLAlchemy Engine or Connection used to run the statement.
        table: SQLAlchemy ``Table`` that receives the rows.
        data: A dict, or a list of dicts, mapping column keys to values.
            Nothing is executed when it is empty.
    """
    # An empty parameter list would otherwise be sent as an INSERT of
    # default values.
    if not data:
        return

    ins = table.insert()
    if isinstance(conn, sqlalchemy.engine.Engine):
        # Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
        # begin() opens a connection and commits when the block succeeds.
        with conn.begin() as connection:
            connection.execute(ins, data)
    else:
        conn.execute(ins, data)

Now, with the helper function insert_data we insert all the season information into the required table

In [None]:
# Store the fixtures of each downloaded season in the match table.
for season_rows in all_season_data.values():
    insert_data(conn, match, season_rows)

In [None]:
# Read back every stored fixture. The rows are fetched inside the `with`
# block so the connection is released, and `result` is a plain list that can
# be iterated more than once.
s = select(match)
with conn.connect() as connection:
    result = connection.execute(s).fetchall()

In [None]:
# Show each row returned by the select, one per line.
for match_row in result:
    print(match_row)

In [None]:
# Re-declaring a table with columns on a MetaData that already holds it raises
# InvalidRequestError. keep_existing=True returns the table already registered
# on `meta` (ignoring the columns listed here), so the cell is safe to re-run.
match = Table(
    'match', meta,
    Column('id', Integer, primary_key=True),
    Column('season', Integer),
    Column('match_date', Date),
    Column('local_team', String),
    Column('away_team', String),
    Column('local_goals', Integer),
    Column('away_goals', Integer),
    keep_existing=True,
)

In [None]:
# Player statistics table: one column per key of the row dicts built by
# statistics().
# NOTE(review): the value types returned by the API are not visible in this
# notebook. Fields whose type is uncertain (height, weight, rating,
# passes_accuracy) are stored as text - confirm and tighten.
# NOTE(review): the primary key assumes one row per player, team and season,
# as described in the notes on the players endpoint - confirm.
statistic = Table(
    'statistic', meta,
    Column('season', Integer, primary_key=True),
    Column('player_id', Integer, primary_key=True),
    Column('team_id', Integer, primary_key=True),
    Column('page', Integer),
    Column('player_name', String),
    Column('age', Integer),
    Column('height', String),
    Column('weight', String),
    Column('injured', sqlalchemy.Boolean),
    Column('appearences', Integer),
    Column('minutes', Integer),
    Column('position', String),
    Column('rating', String),
    Column('shots', Integer),
    Column('goals', Integer),
    Column('assists', Integer),
    Column('passes_accuracy', String),
    Column('total_duels', Integer),
    Column('won_duels', Integer),
    # Return the already-registered table when the cell is re-run.
    keep_existing=True,
)

In [None]:
#References
#https://cloud.google.com/sdk/gcloud/reference/sql/connect
#https://docs.sqlalchemy.org/en/14/core/tutorial.html#deletes
#https://cloud.google.com/sql/docs/postgres/connect-app-engine-standard#private-ip_1