In [27]:
import pandas as pd
from IPython.core.display_functions import display
from pandas import DataFrame, json_normalize
from utils import num_of_round, is_allowed_round, response_from_json

# Relative path of the JSON file holding the games to analyse.
FILE_NAME = 'data/league39/2021/games.json'
# Load the file through the project helper from `utils`.
# NOTE(review): presumably returns a list of nested fixture records, since it is
# later fed to `json_normalize` — confirm against `utils.response_from_json`.
response = response_from_json(file_name=FILE_NAME)

In [28]:
# Flatten the nested records into one row per game (nested keys become dotted
# column names) and store both goal counts with the nullable 'Int64' dtype.
# `errors='ignore'` leaves the frame unchanged if the cast fails.
all_response_data = json_normalize(data=response).astype(
    {'goals.away': 'Int64', 'goals.home': 'Int64'},
    errors='ignore',
)
all_response_data

Unnamed: 0,fixture.id,fixture.referee,fixture.timezone,fixture.date,fixture.timestamp,fixture.periods.first,fixture.periods.second,fixture.venue.id,fixture.venue.name,fixture.venue.city,...,goals.home,goals.away,score.halftime.home,score.halftime.away,score.fulltime.home,score.fulltime.away,score.extratime.home,score.extratime.away,score.penalty.home,score.penalty.away
0,710556,M. Oliver,UTC,2021-08-13T19:00:00+00:00,1628881200,1.628881e+09,1.628885e+09,10503,Brentford Community Stadium,"Brentford, Middlesex",...,2,0,1.0,0.0,2.0,0.0,,,,
1,710557,D. Coote,UTC,2021-08-14T14:00:00+00:00,1628949600,1.628950e+09,1.628953e+09,512,Turf Moor,Burnley,...,1,2,1.0,0.0,1.0,2.0,,,,
2,710558,J. Moss,UTC,2021-08-14T14:00:00+00:00,1628949600,1.628950e+09,1.628953e+09,519,Stamford Bridge,London,...,3,0,2.0,0.0,3.0,0.0,,,,
3,710559,A. Madley,UTC,2021-08-14T14:00:00+00:00,1628949600,1.628950e+09,1.628953e+09,8560,Goodison Park,Liverpool,...,3,1,0.0,1.0,3.0,1.0,,,,
4,710560,C. Pawson,UTC,2021-08-14T14:00:00+00:00,1628949600,1.628950e+09,1.628953e+09,547,King Power Stadium,"Leicester, Leicestershire",...,1,0,1.0,0.0,1.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,710931,,UTC,2022-05-22T15:00:00+00:00,1653231600,,,525,Selhurst Park,London,...,,,,,,,,,,
376,710932,,UTC,2022-05-22T15:00:00+00:00,1653231600,,,547,King Power Stadium,"Leicester, Leicestershire",...,,,,,,,,,,
377,710933,,UTC,2022-05-22T15:00:00+00:00,1653231600,,,550,Anfield,Liverpool,...,,,,,,,,,,
378,710934,,UTC,2022-05-22T15:00:00+00:00,1653231600,,,555,Etihad Stadium,Manchester,...,,,,,,,,,,


In [29]:
# Maps the flattened response column names to the short names used from here on.
COLUMN_NAMES_DICT = {
    'teams.home.name': 'host',
    'teams.away.name': 'guest',
    'goals.home': 'goals_host',
    'goals.away': 'goals_guest',
    'league.round': 'round',
}

# Keep only the mapped columns, rename them, then convert the round label
# with the project helper `num_of_round`.
all_games = (
    all_response_data[list(COLUMN_NAMES_DICT)]
    .rename(columns=COLUMN_NAMES_DICT)
    .assign(round=lambda games: games['round'].apply(num_of_round))
)
all_games

Unnamed: 0,host,guest,goals_host,goals_guest,round
0,Brentford,Arsenal,2,0,1
1,Burnley,Brighton,1,2,1
2,Chelsea,Crystal Palace,3,0,1
3,Everton,Southampton,3,1,1
4,Leicester,Wolves,1,0,1
...,...,...,...,...,...
375,Crystal Palace,Manchester United,,,38
376,Leicester,Southampton,,,38
377,Liverpool,Wolves,,,38
378,Manchester City,Aston Villa,,,38


In [30]:
# Keep only the games whose round number passes the project helper `is_allowed_round`.
games_allowed = all_games.loc[
    all_games["round"].apply(is_allowed_round)
]
games_allowed

Unnamed: 0,host,guest,goals_host,goals_guest,round
60,Brighton,Arsenal,0,0,7
61,Burnley,Norwich,0,0,7
62,Chelsea,Southampton,3,1,7
63,Crystal Palace,Leicester,2,2,7
64,Leeds,Watford,1,0,7
...,...,...,...,...,...
315,Manchester City,Liverpool,,,32
316,Newcastle,Wolves,,,32
317,Norwich,Burnley,,,32
318,Southampton,Chelsea,,,32


In [31]:
def single_team_db(predicted_round: int):
    """Sum goals scored and conceded per team over the rounds before ``predicted_round``.

    Reads the module-level ``games_allowed`` frame (columns ``host``, ``guest``,
    ``goals_host``, ``goals_guest``, ``round``).

    Args:
        predicted_round: Only games with ``round < predicted_round`` are counted.

    Returns:
        A DataFrame sorted by team name with a fresh 0..n-1 index and the columns
        ``team``, ``GF_as_any``, ``GA_as_any``, ``GF_as_host``, ``GA_as_host``,
        ``GF_as_guest``, ``GA_as_guest`` (GF = goals for, GA = goals against).
        A team that has so far played only at home (or only away) still gets a
        row; its totals for the missing role are 0.
    """
    goal_columns = ['goals_host', 'goals_guest']
    games_df = games_allowed[games_allowed['round'] < predicted_round]

    # Aggregate only the goal columns: summing the whole frame would also "sum"
    # the opposing team names and the round numbers.
    host_df = games_df.groupby('host')[goal_columns].sum()
    guest_df = games_df.groupby('guest')[goal_columns].sum()

    # Align both aggregates on the team name. Combining them by row position
    # credits goals to the wrong team as soon as the set of hosts differs from
    # the set of guests.
    teams = host_df.index.union(guest_df.index)
    host_df = host_df.reindex(teams, fill_value=0)
    guest_df = guest_df.reindex(teams, fill_value=0)

    tot_each_team_df = DataFrame(data={
        "GF_as_any": host_df['goals_host'] + guest_df['goals_guest'],
        "GA_as_any": host_df['goals_guest'] + guest_df['goals_host'],
        "GF_as_host": host_df['goals_host'],
        "GA_as_host": host_df['goals_guest'],
        "GF_as_guest": guest_df['goals_guest'],
        "GA_as_guest": guest_df['goals_host'],
    })
    # Move the team names from the index into the leading 'team' column.
    return tot_each_team_df.rename_axis('team').sort_index().reset_index()


# Passing "last allowed round + 1" makes the strict `<` filter inside
# single_team_db include every allowed round.
final_single_team_db = single_team_db(
    predicted_round=games_allowed['round'].max() + 1,
)
final_single_team_db

Unnamed: 0,team,GF_as_any,GA_as_any,GF_as_host,GA_as_host,GF_as_guest,GA_as_guest
0,Arsenal,28,15,14,5,14,10
1,Aston Villa,19,25,11,15,8,10
2,Brentford,18,33,9,13,9,20
3,Brighton,15,18,6,8,9,10
4,Burnley,11,16,6,4,5,12
5,Chelsea,36,16,21,9,15,7
6,Crystal Palace,25,25,17,14,8,11
7,Everton,12,28,7,15,5,13
8,Leeds,18,26,10,10,8,16
9,Leicester,27,27,16,13,11,14


In [32]:
# Renames applied to a team's totals when it is the host of a match (suffix "1").
HOST_COLUMNS = dict(
    team='team1',
    GF_as_any='GF1_as_any',
    GA_as_any='GA1_as_any',
    GF_as_host='GF1_as_host',
    GA_as_host='GA1_as_host',
)

# Totals dropped from the host's row: its record as a guest.
HOST_COLUMNS_TO_DEL = ['GF_as_guest', 'GA_as_guest']

# Renames applied to a team's totals when it is the guest of a match (suffix "2").
GUEST_COLUMNS = dict(
    team='team2',
    GF_as_any='GF2_as_any',
    GA_as_any='GA2_as_any',
    GF_as_guest='GF2_as_guest',
    GA_as_guest='GA2_as_guest',
)

# Totals dropped from the guest's row: its record as a host.
GUEST_COLUMNS_TO_DEL = ['GF_as_host', 'GA_as_host']

In [33]:
def matches_to_predict(predicted_round: int) -> DataFrame:
    """Return the rows of the module-level ``games_allowed`` frame for one round.

    Args:
        predicted_round: Round number to select (compared with ``==`` against
            the ``round`` column).

    Returns:
        The matching rows with their original index labels; an empty frame
        when no game belongs to that round.
    """
    # The return annotation must be a type. Annotating with the `games_allowed`
    # instance is evaluated at definition time and fails if that global does
    # not exist yet.
    return games_allowed[games_allowed['round'] == predicted_round]

In [40]:
# First round for which per-match feature rows are generated.
MIN_ROUND_PREDICTED = 8
# Highest round number present in `games_allowed`.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
MAX_ROUND_PREDICTED = games_allowed['round'].max()


def round_sums(predicted_round: int):
    """Build one feature row per match of ``predicted_round``.

    Each row joins the host's totals (renamed via ``HOST_COLUMNS``) with the
    guest's totals (renamed via ``GUEST_COLUMNS``), both taken from
    ``single_team_db(predicted_round)``.

    Args:
        predicted_round: Round whose matches are turned into feature rows.

    Returns:
        A DataFrame with the ``team1``/``…1…`` columns followed by the
        ``team2``/``…2…`` columns, one row per match. Every row carries index
        label 0, so callers reset the index. Returns ``None`` when no match of
        the round has statistics for both teams.
    """
    matches = matches_to_predict(predicted_round=predicted_round)
    single_team_sums = single_team_db(predicted_round=predicted_round)

    lines = []
    for host, guest in zip(matches['host'], matches['guest']):
        host_stats = (
            single_team_sums.loc[single_team_sums['team'] == host]
            .rename(columns=HOST_COLUMNS)
            .drop(columns=HOST_COLUMNS_TO_DEL)
        )
        guest_stats = (
            single_team_sums.loc[single_team_sums['team'] == guest]
            .rename(columns=GUEST_COLUMNS)
            .drop(columns=GUEST_COLUMNS_TO_DEL)
        )

        # There are no stats (sums) for initial rounds since not all teams play
        # in a single round. Skip the match when EITHER side is missing:
        # assigning a one-element index to an empty frame would raise.
        if host_stats.empty or guest_stats.empty:
            continue

        # Give both one-row frames the same label so they concatenate side by side.
        host_stats.index = [0]
        guest_stats.index = [0]
        lines.append(pd.concat([host_stats, guest_stats], axis=1))

    if not lines:
        return None
    # One concat at the end instead of re-copying the growing frame per match.
    return pd.concat(lines, axis=0)


PREDICTED_ROUND = 15

# Compute the feature rows of every round exactly once. MIN_ROUND_PREDICTED is
# always included, mirroring the original "initial value" step.
sums_by_round = {
    r: round_sums(predicted_round=r)
    for r in range(MIN_ROUND_PREDICTED, max(MIN_ROUND_PREDICTED, PREDICTED_ROUND) + 1)
}

# round_sums returns None for a round without usable matches; skip those and
# concatenate once instead of growing the frame inside a loop.
all_sums_up_to_round = pd.concat(
    [sums for sums in sums_by_round.values() if sums is not None],
    axis=0,
).reset_index(drop=True)
all_sums_up_to_round.to_excel('output.xlsx')

# Reuse the rows already computed for the predicted round when available.
final = (
    sums_by_round[PREDICTED_ROUND]
    if PREDICTED_ROUND in sums_by_round
    else round_sums(predicted_round=PREDICTED_ROUND)
)
if final is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError(
        f'round {PREDICTED_ROUND} has no matches with statistics for both teams')
final = final.reset_index(drop=True)
final.to_excel('round_output.xlsx')
final

Unnamed: 0,team1,GF1_as_any,GA1_as_any,GF1_as_host,GA1_as_host,team2,GF2_as_any,GA2_as_any,GF2_as_guest,GA2_as_guest
0,Aston Villa,10,16,6,9,Leicester,15,15,7,6
1,Everton,5,17,3,10,Arsenal,12,10,4,7
2,Leeds,7,6,4,2,Brentford,9,14,6,9
3,Manchester United,11,19,4,10,Crystal Palace,13,11,7,6
4,Newcastle,9,16,6,10,Burnley,9,9,3,5
5,Southampton,9,13,6,4,Brighton,5,10,3,5
6,Tottenham,9,8,6,5,Norwich,6,12,3,9
7,Watford,12,17,5,9,Manchester City,17,7,10,4
8,West Ham,12,9,6,5,Chelsea,21,4,9,1
9,Wolves,9,7,5,2,Liverpool,28,8,16,4
