In [49]:
import os
from os import walk

import duckdb
import numpy as np
import pandas

from src.club_names import name_dict


In [50]:
def get_players(data_dir="data/fifa/historical_data"):
    """Load every player CSV found under ``data_dir`` into a single DataFrame.

    The directory is walked recursively. Only files whose name ends in
    ``.csv`` (case-insensitive) are read, and each file is opened from the
    directory it was actually found in. Files are processed in sorted path
    order so the row order of the result is reproducible.

    Parameters
    ----------
    data_dir : str, optional
        Root directory to scan. Defaults to the location the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The columns ``club_name``, ``overall``, ``club_position`` and
        ``league_name`` read from each file, plus a ``season`` column holding
        the source file name without its extension. The index is a fresh
        0..n-1 range rather than the repeated per-file indices.

    Raises
    ------
    ValueError
        If no CSV file is found under ``data_dir``.
    """
    relevant_fields = ['club_name', 'overall', 'club_position', 'league_name']

    csv_paths = []
    for dirpath, _dirnames, filenames in walk(data_dir):
        for filename in filenames:
            # Skip stray non-CSV files (e.g. OS metadata) instead of feeding
            # them to read_csv.
            if filename.lower().endswith(".csv"):
                # Join with the directory the file was found in; walk() also
                # descends into sub-folders.
                csv_paths.append(os.path.join(dirpath, filename))

    # os.walk yields names in arbitrary order; sort for a reproducible result.
    csv_paths.sort()
    if not csv_paths:
        raise ValueError(f"No CSV files found under {data_dir!r}")

    dfs = []
    for path in csv_paths:
        new_df = pandas.read_csv(path, skipinitialspace=True, usecols=relevant_fields)
        # Season label = file name without extension.
        new_df["season"] = os.path.splitext(os.path.basename(path))[0]
        dfs.append(new_df)

    return pandas.concat(dfs, ignore_index=True)
    

In [51]:

# Player ratings: one row per player per season, assembled by get_players().
players = get_players()

# Match results table, read with an explicit ISO-8859-1 encoding.
results = pandas.read_csv(
    "data/results_premier/results.csv",
    encoding="ISO-8859-1",
)



In [52]:
# Average player rating per (club_name, season) for English Premier League
# rows of `players` that have a club_position. Each output column averages
# `overall` over a different subset of positions:
#   reserve_overall - club_position 'RES'
#   subs_overall    - club_position 'SUB'
#   titular_overall - every position other than 'RES' and 'SUB'
#   attack_overall  - the attacking positions listed in the query
#   defend_overall  - the goalkeeper/defensive positions listed in the query
# An aggregate is NULL for a club/season with no player in that subset.
#
# NOTE(review): the position lists are not symmetric: 'LM' and 'RF' count as
# attack while 'RM' and 'LF' appear in neither list, and no wing-back code is
# in the defence list. If those codes occur in the data they are silently left
# out of attack_overall/defend_overall - confirm whether that is intended.
query = """
    SELECT
        club_name
      , season
      , avg(overall) FILTER (WHERE club_position = 'RES') AS reserve_overall
      , avg(overall) FILTER (WHERE club_position = 'SUB') AS subs_overall
      , avg(overall) FILTER (WHERE club_position NOT IN ('RES', 'SUB')) AS titular_overall
      , avg(overall) FILTER (
            WHERE club_position IN ('ST', 'CAM', 'LS', 'LW', 'RW', 'CF', 'RS', 'LM', 'RF')
        ) AS attack_overall
      , avg(overall) FILTER (
            WHERE club_position IN ('GK', 'LCB', 'RCB', 'CDM', 'LDM', 'RDM', 'LB', 'RB', 'CB')
        ) AS defend_overall
    FROM players
    WHERE league_name = 'English Premier League'
      AND club_position IS NOT NULL
    GROUP BY club_name, season
"""
teams = duckdb.query(query).to_df()


In [53]:

# Build `matches` from the `results` DataFrame (DuckDB resolves the table name
# in the SQL to the Python variable of the same name).
# Keeps season, HomeTeam and AwayTeam, exposes FTHG/FTAG as
# goals_home/goals_away, and restricts the rows to the seven seasons
# 2014-15 through 2020-21 listed in the IN clause.
query = """
    SELECT 
        season ,
        HomeTeam,  
        AwayTeam,
        FTHG as goals_home,
        FTAG as goals_away
        
    FROM results
    where 
        season in (
            '2014-15',
            '2015-16',
            '2016-17', 
            '2017-18', 
            '2018-19',
            '2019-20',
            '2020-21'
            )
"""
matches = duckdb.query(query).to_df()

In [54]:
# Add the away_code / home_code columns by looking each team name from the
# results file up in name_dict; these codes are the club keys used when the
# matches are joined to the per-club ratings.
for code_column, team_column in (("away_code", "AwayTeam"), ("home_code", "HomeTeam")):
    matches[code_column] = matches[team_column].apply(lambda team: name_dict[team])

In [55]:
# Assemble the modelling table `df`: one row per match with the score and the
# rating aggregates of both clubs.
# `teams` is joined twice on (club code, season): alias `th` supplies the
# home_* columns and alias `ta` supplies the away_* columns.
# Both joins are LEFT joins, so a match whose club/season has no row in
# `teams` is kept with NULLs in the corresponding rating columns.
query = """
    SELECT 
        r.season,
        home_code as home
        ,away_code as away
        ,goals_home
        ,goals_away
        , th.reserve_overall as home_reserve_overall
        , th.subs_overall as home_subs_overall
        , th.titular_overall as home_titular_overall
        , th.attack_overall as home_attack_overall
        , th.defend_overall as home_defend_overall
        , ta.reserve_overall as away_reserve_overall
        , ta.subs_overall as away_subs_overall
        , ta.titular_overall as away_titular_overall
        , ta.attack_overall as away_attack_overall
        , ta.defend_overall as away_defend_overall
    FROM matches r
    Left Join teams th on (r.home_code, r.season) = (th.club_name, th.season)
    Left Join teams ta on (r.away_code, r.season) = (ta.club_name, ta.season)
"""
df = duckdb.query(query).to_df()

In [56]:
# Discard every match row that has a missing value in any column (for
# example, rating columns left NULL by the left joins onto `teams`).
# A boolean mask is used so the selection never mixes index labels with
# positions: the previous form fed labels into `df.index[...]`, which is only
# correct while `df` carries a default 0..n-1 index.
has_missing = df.isnull().any(axis=1)
rows_with_errors = df.index[has_missing]  # labels of the discarded rows, kept for inspection
df = df.loc[~has_missing]

In [57]:
# Write the cleaned table to disk without the DataFrame index.
# The output directory is created first so the notebook also runs on a fresh
# checkout where data/ml does not exist yet.
os.makedirs('data/ml', exist_ok=True)
df.to_csv('data/ml/df.csv', index=False)