In [11]:
import os
from os import walk

import duckdb
import numpy as np
import pandas

from src.club_names import name_dict


In [12]:
# Root directory of the FIFA player CSVs; the loaders read from its
# `historical_data/` and `current_season_data/` subdirectories.
FIFA_PATH = "data/fifa"
# CSV with the Premier League match results (loaded into `results` below).
PREMIER_PATH = "data/results_premier/results.csv"
# Directory the final `df_historical.csv` / `df_current.csv` files are written to.
ML_PATH = "data/ml"
# Season label for the current season; also the stem of its player CSV file name.
CURRENT_SEASON = "2021-22"
# Columns kept when reading the player CSVs (passed to `read_csv` as `usecols`).
relevant_fields = ['club_name', 'overall', 'club_position', 'league_name']

In [13]:
def get_historical_players():
    """Load every historical FIFA player CSV into a single DataFrame.

    Walks ``{FIFA_PATH}/historical_data`` and reads each ``*.csv`` file found,
    keeping only the columns listed in ``relevant_fields``. Each file's rows get
    a ``season`` column set to the file name without its ``.csv`` extension.

    Returns:
        pandas.DataFrame: rows of all files concatenated in sorted file-path
        order (each file keeps its own row index).

    Raises:
        ValueError: if no CSV file is found under the directory.
    """
    data_path = f"{FIFA_PATH}/historical_data"

    csv_files = []
    for dirpath, _, filenames in walk(data_path):
        for filename in filenames:
            # Skip stray non-CSV files (notes, OS metadata, checkpoints, ...).
            if filename.lower().endswith(".csv"):
                # walk() descends into subdirectories, so the path has to be
                # built from the directory the file was actually found in.
                csv_files.append((f"{dirpath}/{filename}", filename))

    if not csv_files:
        raise ValueError(f"No CSV files found under {data_path!r}")

    frames = []
    # walk() yields names in arbitrary order; sort for a reproducible result.
    for path, filename in sorted(csv_files):
        season_players = pandas.read_csv(
            path, skipinitialspace=True, usecols=relevant_fields
        )
        season_players["season"] = filename[:-4]  # strip the ".csv" suffix
        frames.append(season_players)

    return pandas.concat(frames)

def get_current_players():
    """Read the current season's player CSV.

    The file is ``{FIFA_PATH}/current_season_data/{CURRENT_SEASON}.csv``; only
    the columns in ``relevant_fields`` are loaded.

    Returns:
        pandas.DataFrame: the players, with an extra ``season`` column set to
        ``CURRENT_SEASON`` on every row.
    """
    current = pandas.read_csv(
        f"{FIFA_PATH}/current_season_data/{CURRENT_SEASON}.csv",
        usecols=relevant_fields,
        skipinitialspace=True,
    )
    return current.assign(season=CURRENT_SEASON)

In [14]:
def get_team_stats(players):
    """Aggregate player ratings into per-club, per-season averages.

    Only English Premier League players with a non-null ``club_position`` are
    considered. For each (club_name, season) the mean ``overall`` is computed
    over several position groups.

    Args:
        players: DataFrame with ``club_name``, ``season``, ``overall``,
            ``club_position`` and ``league_name`` columns. The parameter must
            keep this name: the SQL below refers to it as the table ``players``
            and DuckDB resolves that name to the Python variable.

    Returns:
        pandas.DataFrame: one row per (club_name, season) with the columns
        ``reserve_overall``, ``subs_overall``, ``titular_overall``,
        ``attack_overall`` and ``defend_overall``. A group with no matching
        players yields NULL/NaN.
    """
    # NOTE(review): the attack group lists 'LM' and 'RF' but not 'RM' or 'LF';
    # kept as in the original -- confirm whether that asymmetry is intended.
    query = """
        SELECT
            club_name,
            season,
            avg(overall) FILTER (WHERE club_position = 'RES') AS reserve_overall,
            avg(overall) FILTER (WHERE club_position = 'SUB') AS subs_overall,
            avg(overall) FILTER (WHERE club_position NOT IN ('RES', 'SUB')) AS titular_overall,
            avg(overall) FILTER (
                WHERE club_position IN ('ST', 'CAM', 'LS', 'LW', 'RW', 'CF', 'RS', 'LM', 'RF')
            ) AS attack_overall,
            avg(overall) FILTER (
                WHERE club_position IN ('GK', 'LCB', 'RCB', 'CB', 'LB', 'RB', 'CDM', 'LDM', 'RDM')
            ) AS defend_overall
        FROM players
        WHERE league_name = 'English Premier League'
          AND club_position IS NOT NULL
        GROUP BY club_name, season
    """
    return duckdb.query(query).to_df()


In [15]:


def get_matches_results(results, seasons):
    """Select the matches of the given seasons and attach club codes.

    Args:
        results: DataFrame of match results with ``season``, ``HomeTeam``,
            ``AwayTeam``, ``FTHG`` and ``FTAG`` columns. The parameter must keep
            this name: the SQL below refers to it as the table ``results`` and
            DuckDB resolves that name to the Python variable.
        seasons: iterable of season labels to keep. An empty iterable yields an
            empty result instead of a SQL syntax error.

    Returns:
        pandas.DataFrame: columns ``season``, ``HomeTeam``, ``AwayTeam``,
        ``goals_home``, ``goals_away``, ``away_code`` and ``home_code``, where
        the two ``*_code`` columns are the team names looked up in ``name_dict``.

    Raises:
        KeyError: if a team name in the results has no entry in ``name_dict``.
    """
    # Render each season as a SQL string literal; embedded single quotes are
    # doubled so a label can never terminate the literal early.
    season_literals = [
        "'" + str(season).replace("'", "''") + "'" for season in seasons
    ]
    if season_literals:
        season_filter = f"season in ({','.join(season_literals)})"
    else:
        # `season in ()` is not valid SQL; select nothing instead.
        season_filter = "1 = 0"

    query = f"""
        SELECT
            season,
            HomeTeam,
            AwayTeam,
            FTHG AS goals_home,
            FTAG AS goals_away
        FROM results
        WHERE {season_filter}
    """
    matches = duckdb.query(query).to_df()
    # Strict lookup on purpose: an unmapped team name should fail loudly here
    # rather than silently drop out of the later join.
    matches["away_code"] = matches["AwayTeam"].apply(lambda team: name_dict[team])
    matches["home_code"] = matches["HomeTeam"].apply(lambda team: name_dict[team])
    return matches

In [16]:

def get_database(matches, teams):
    """Join each match with the rating averages of its home and away clubs.

    Args:
        matches: DataFrame with ``season``, ``home_code``, ``away_code``,
            ``goals_home`` and ``goals_away`` columns.
        teams: DataFrame with ``club_name``, ``season`` and the five
            ``*_overall`` columns.

        Both parameters must keep their names: the SQL below refers to them as
        the tables ``matches`` and ``teams`` and DuckDB resolves those names to
        the Python variables.

    Returns:
        pandas.DataFrame: one row per match with ``season``, ``home``, ``away``,
        the goals and ``home_*`` / ``away_*`` rating columns. Rows containing
        any null value (for example a club with no matching team stats, or a
        rating group with no players) are dropped; the surviving rows keep
        their original index labels.
    """
    query = """
        SELECT
            r.season,
            home_code AS home,
            away_code AS away,
            goals_home,
            goals_away,
            th.reserve_overall AS home_reserve_overall,
            th.subs_overall AS home_subs_overall,
            th.titular_overall AS home_titular_overall,
            th.attack_overall AS home_attack_overall,
            th.defend_overall AS home_defend_overall,
            ta.reserve_overall AS away_reserve_overall,
            ta.subs_overall AS away_subs_overall,
            ta.titular_overall AS away_titular_overall,
            ta.attack_overall AS away_attack_overall,
            ta.defend_overall AS away_defend_overall
        FROM matches r
        LEFT JOIN teams th ON (r.home_code, r.season) = (th.club_name, th.season)
        LEFT JOIN teams ta ON (r.away_code, r.season) = (ta.club_name, ta.season)
    """
    joined = duckdb.query(query).to_df()
    # The left joins leave nulls where a club has no stats for that season;
    # such rows are unusable downstream, so drop any row with a null.
    return joined.dropna()

In [17]:
def get_matches_current_season(team_stats):
    """Generate every home/away pairing between the clubs in ``team_stats``.

    Args:
        team_stats: DataFrame with a ``club_name`` column.

    Returns:
        pandas.DataFrame: one row per ordered pair of distinct clubs, with
        ``season`` set to ``CURRENT_SEASON`` and both goal columns set to the
        placeholder value -1.
    """
    clubs = team_stats["club_name"].unique()
    fixtures = [
        {
            "season": CURRENT_SEASON,
            "home_code": home_club,
            "away_code": away_club,
            "goals_home": -1,
            "goals_away": -1,
        }
        for home_club in clubs
        for away_club in clubs
        if home_club != away_club  # a club never plays itself
    ]
    return pandas.DataFrame(fixtures)


In [18]:

# Seasons of the results file that go into the historical dataset.
historical_seasons = [
    '2014-15', '2015-16', '2016-17', '2017-18',
    '2018-19', '2019-20', '2020-21',
]

# Player ratings -> per-club, per-season averages.
players = get_historical_players()
team_stats = get_team_stats(players)

# Match results for those seasons, joined with the clubs' rating averages.
results = pandas.read_csv(PREMIER_PATH, encoding="ISO-8859-1")
matches = get_matches_results(results, historical_seasons)
df_historical = get_database(matches, team_stats)


In [19]:

# Build the current-season frame with the same steps used for the historical one.
current_players = get_current_players()
# NOTE: `team_stats` and `matches` are rebound here, replacing the historical
# values assigned earlier in the notebook; from this point on they refer to
# the current season only.
team_stats = get_team_stats(current_players)
# There are no real results for these rows: every home/away pairing is
# generated with placeholder goals of -1.
matches = get_matches_current_season(team_stats)
df_current = get_database(matches, team_stats)


In [20]:

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(ML_PATH, exist_ok=True)

# Persist both datasets without the DataFrame index column.
df_historical.to_csv(f'{ML_PATH}/df_historical.csv', index=False)
df_current.to_csv(f'{ML_PATH}/df_current.csv', index=False)
