In [1]:
import soccerdata as sd
import pandas as pd
import traceback
import matplotlib.pyplot as plt
from mplsoccer import Pitch,VerticalPitch
from datetime import datetime,timedelta
import requests,json,config
import matplotlib.patches as patches
import math,sys,os,random
from threading import Thread
import time,concurrent
import numpy as np
from math import isnan
# import mysql.connector
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
FOTMOB_URL = config.getFotMobUrls()
FOTMOB_LEAGUE = config.getFotMobLeagueDict()
CONFIG = config.getGeneralConfig()

In [3]:
FOTMOB_LEAGUE

{'ENG-Premier League': 47,
 'ESP-La Liga': 87,
 'ITA-Serie A': 55,
 'GER-Bundesliga': 54,
 'FRA-Ligue 1': 53,
 'INT-World Cup': 77,
 'UCL': 42,
 'UEL': 73,
 'UECL': 10216,
 'Eredivisie': 57,
 'Pro_League': 40,
 'Primeira_Liga': 61}

In [4]:
def getFotMobSeasonFixture(league,seasons): # Example [2016,2017] or "23_24"
    # # making season list
    if "list" in str(type(seasons)):
        # "%2F".join(['20'+str(x)for x in season.split('_')])
        season_name_str = [(f'{x%2000}-{(x+1)%2000}',f'20{str(x)}%2F20{str(x+1)}')for x in seasons]
        fixture_urls = [(season,FOTMOB_URL['fixture'].format(FOTMOB_LEAGUE[league],season_str)) for season,season_str in season_name_str]
        
    else:
        season_name_str = "%2F".join(['20'+str(x)for x in seasons.split('_')])
        fixture_urls = [FOTMOB_URL['fixture'].format(FOTMOB_LEAGUE[league],season_name)]
    all_fixtures = []
    for season_nm,fixture_url in fixture_urls:
        fixture_res = requests.get(fixture_url)
        # return
        df = pd.DataFrame()
        
        if fixture_res.status_code == 200 :
            try:
                fixtures = fixture_res.json()
                for match in fixtures :
                    i = df.shape[0]
                    df.loc[i, 'id']=match['id']
                    df.loc[i, 'league']=league.split('-')[-1].strip(' ')
                    df.loc[i, 'season']=season_nm
                    df.loc[i, 'url']=match['pageUrl']
                    df.loc[i, 'home_team']=match['home']['name']
                    df.loc[i, 'away_team']=match['away']['name']
                    df.loc[i, 'date']=match['status']['utcTime']
                all_fixtures.append(df)
            except JSONDecodeError as jde :
                print("Json Error ")
                print("Data Response ",fixture_res.content.decode())
                all_fixtures.append(df)
            except Exception as e :
                print("General Error ")
                print("Exception as  : ",str(e))
                all_fixtures.append(df)
    print(f"Returning {len(all_fixtures)} DFs combination")            
    return pd.concat(all_fixtures, ignore_index=True)    
# getFotMobSeasonFixture(league,season_name)
# getFotMobSeasonFixture(league,[2017,2016])
# getFotMobSeasonFixture(league,season)

In [5]:
def pre_process_dataframe_for_schedules(df_name,df_under):
    
    df_under = df_under
    if df_name == "WS":
        df_under = df_under.reset_index()
        df_under['date'] = pd.to_datetime(df_under['date']).dt.tz_localize(None).dt.tz_localize('UTC') + timedelta(hours=-1)
        df_under['season'] = df_under['season'].astype(str).apply(lambda x: x[:2] + '-' + x[2:])
    elif df_name == "FB" :
        df_under = df_under.reset_index()
        df_under['season'] = df_under['season'].astype(str).apply(lambda x: x[:2] + '-' + x[2:])
        df_under['date'] = df_under['date'].astype(str)
        df_under['time'] = df_under['time'].astype(str).replace('<NA>',"00:00")
        df_under['date'] = df_under['date']+' '+df_under['time']+":00"
        df_under['date'] = pd.to_datetime(df_under['date']).dt.tz_localize(None).dt.tz_localize('UTC') + timedelta(hours=-2)
        
    else:
        df_under = df_under.reset_index()
        df_under['date'] = pd.to_datetime(df_under['date'].str.split('Z').str[0].str.split(".").str[0],format='%Y-%m-%dT%H:%M:%S')
        # df_under['date'] = pd.to_datetime(df_under['date'].str.split('.').str[0],format='%Y-%m-%dT%H:%M:%SZ')
        # df_under['date'] = pd.to_datetime(df_under['date']) + timedelta(hours=1)
        # df_under['date'] = pd.to_datetime(df_under['date'].str.split('T').str[0])
        # df_under['date'] = pd.to_datetime(df_under['date'].str.split('T').str[0])
        df_under['date'] = df_under['date'].dt.tz_localize(None).dt.tz_localize('UTC')

    # df_under['date'] = df_under['date'].dt.date
    return df_under.sort_values(['date', 'home_team', 'away_team'])
# pre_process_dataframe_for_schedules('FM',fotmob_df),\
# pre_process_dataframe_for_schedules('FB',fbref_df),\
# pre_process_dataframe_for_schedules('WS',whoscored_df)

In [6]:
def update_team_alias(WS_df,FB_df,FM_df,file_path,write_mode=True):
    try:
        season_set = set([x.year for x in WS_df['date']])
        if write_mode or 1:
            with open(file_path,'r') as f :
                existing_alias = json.load(f)
        else:
            existing_alias = {}
        team_alias = existing_alias      
        for season in season_set:
            # print("")
            # print(team_alias)
            WS_mw1,FB_mw1,FM_mw1 = getFirstMatchweekOfSeason(WS_df,FB_df,FM_df,season)
            if WS_mw1.empty or FB_mw1.empty or FM_mw1.empty:
                continue
            # x = getFirstMatchweekOfSeason(WS_df,FB_df,FM_df,season)
            # WS = sorted(list(set(sortedlist(WS_mw1['home_team'].unique()) + list(WS_mw1['away_team'].unique())))))
            # FB = sorted(list(set(sorted(list(FB_mw1['home_team'].unique()) + list(FB_mw1['away_team'].unique())))))
            # FM = sorted(list(set(sorted(list(FM_mw1['home_team'].unique()) + list(FM_mw1['away_team'].unique())))))
            
            WS = list(WS_mw1['home_team'].unique()) + list(WS_mw1['away_team'].unique())
            FB = list(FB_mw1['home_team'].unique()) + list(FB_mw1['away_team'].unique())
            FM = list(FM_mw1['home_team'].unique()) + list(FM_mw1['away_team'].unique())
            print(len(WS),len(FB),len(FM))
            # print(WS)
            # print(FM)
            # print(FB)
            for i,team in enumerate(WS):
                flag = False
                if team not in team_alias.keys():
                    team_alias[team] = [team]
                    flag = True
                if flag : 
                    if team != FB[i] :
                        team_alias[team].append(FB[i])
                    if team != FM[i] :
                        team_alias[team].append(FM[i])
                team_alias[team] = list(set(team_alias[team]))
        merged_alias = {**existing_alias,**team_alias}
        print(team_alias)
        if write_mode :
            with open(file_path,'w') as f :
                f.write(json.dumps(merged_alias))
        return merged_alias
    except Exception as e:
        print("Err : update_team_alias : ",e)
        import sys,os
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        
    return False
# team_aliases = update_team_alias(WS,FB,FM,CONFIG['team_name_path'])
# team_aliases = update_team_alias(WS,FB,FM,CONFIG['team_name_path'],write_mode = False)


In [7]:
def getFirstMatchweekOfSeason(WS,FB,FM,season):
    print(f" ===================== SEASON NAME : {season} ===================== ")
    # start_border_date = datetime.strptime(f'25-07-{season}','%d-%m-%Y').date()
    # end_border_date = datetime.strptime(f'25-07-{season+1}','%d-%m-%Y').date()
    start_border_date = pd.to_datetime(datetime.strptime(f'25-07-{season}','%d-%m-%Y'), utc=True)
    end_border_date = pd.to_datetime(datetime.strptime(f'25-07-{season+1}','%d-%m-%Y'), utc=True)
    season = season%2000
    season_name = str(season)+'-'+str(season+1)
    print(season_name)
    # end_border_date = datetime.strptime(f'25-07-{season+1}','%d-%m-%Y')

    # season_wide_df_WS = WS[(WS['date'] > start_border_date) & (WS['date'] < end_border_date)]
    # season_wide_df_FM = FM[(FM['date'] > start_border_date) & (FM['date'] < end_border_date)]
    # season_wide_df_FB = FB[(FB['date'] > start_border_date) & (FB['date'] < end_border_date)]
    
    season_wide_df_WS = WS[WS['season'] == season_name]
    season_wide_df_FM = FM[FM['season'] == season_name]
    season_wide_df_FB = FB[FB['season'] == season_name]
    if season_wide_df_WS.empty or season_wide_df_FM.empty or season_wide_df_FB.empty:
        return pd.DataFrame(),pd.DataFrame(),pd.DataFrame()
    # season_wide_df_WS.to_csv("WS.csv")
    # season_wide_df_FM.to_csv("FM.csv")
    # season_wide_df_FB.to_csv("FB.csv")
    
    ht_team_count_arr =[len(list(season_wide_df_WS['home_team'].unique())),
        len(list(season_wide_df_FB['home_team'].unique())),
        len(list(season_wide_df_FM['home_team'].unique()))
    ]
    at_team_count_arr =[len(sorted(list(season_wide_df_WS['home_team'].unique()))),
        len(sorted(list(season_wide_df_FB['home_team'].unique()))),
        len(sorted(list(season_wide_df_FM['home_team'].unique())))
    ]
    
    ht_team_count = sum(ht_team_count_arr)/len(ht_team_count_arr)
    at_team_count = sum(at_team_count_arr)/len(at_team_count_arr)
    GW_WS = []
    GW_FB = []
    GW_FM = []
    if ht_team_count == at_team_count:
        team_list = list(season_wide_df_WS['home_team'].unique())
        i=0
        while len(team_list):
            # print(f"Left Teams : {len(team_list)}  | {team_list}")
            ht = season_wide_df_WS.iloc[i]['home_team'] 
            at = season_wide_df_WS.iloc[i]['away_team']
            c = 0
            if ht in team_list:
                # print("Removing HT... ",ht)
                team_list.remove(ht)
                c+=1
            if at in team_list:
                # print("Removing AT... ",at)
                team_list.remove(at)
                c+=1
            # print()
            if c != 0:
                # print(f"Adding Game : {ht} v {at} at index = {i}")
                # if i == 15:
                    # display(season_wide_df_WS.iloc[i])
                    # display(season_wide_df_FB.iloc[i])
                    # display(season_wide_df_FM.iloc[i])
                
                GW_WS.append(season_wide_df_WS.iloc[i])
                GW_FB.append(season_wide_df_FB.iloc[i])
                GW_FM.append(season_wide_df_FM.iloc[i])
                # print(f"Counts = WS : {len(GW_WS)} | FB : {len(GW_FB)} | FM : {len(GW_FM)}")
            else:
                # print(f"Can't add game : {ht} v {at} at index = {i}")
                # print("Kela and i hogya,",i, " With  HT : ",ht, " and AT : ",at)
                # print("Kela : ",team_list)
                # print(season_wide_df_WS.shape)
                if i > 120 :
                    break
            i+=1
            # print('=====================================')
    else:
        return f"Error : HT Count : {ht_team_count} | AT Count : {at_team_count}"
    # print(len(GW_WS),len(GW_FB),len(GW_FM))
    return pd.DataFrame(GW_WS),pd.DataFrame(GW_FB),pd.DataFrame(GW_FM)
# 
# a,b,c = getFirstMatchweekOfSeason(WS,FB,FM,2019)

In [8]:
# for i in range(a.shape[0]) :
#     print(i)
#     ht_ws = a.iloc[i]['home_team']
#     at_ws = a.iloc[i]['away_team']
#     dt_ws = a.iloc[i]['date']
    
#     ht_fb = b.iloc[i]['home_team']
#     at_fb = b.iloc[i]['away_team']
#     dt_fb = b.iloc[i]['date']
    
#     ht_fm = c.iloc[i]['home_team']
#     at_fm = c.iloc[i]['away_team']
#     dt_fm = c.iloc[i]['date']

#     print(f"WS : {ht_ws} v {at_ws} at {dt_ws}")
#     print(f"FB : {ht_fb} v {at_fb} at {dt_fb}")
#     print(f"FM : {ht_fm} v {at_fm} at {dt_fm}")

In [194]:
def combine_dataframes(WS_init,FB_init,FM_init,team_alias):
    WS = WS_init
    FB = FB_init
    FM = FM_init
    print(WS.shape)
    WS['source'] = "WhoScored"
    FM['source'] = "FotMob"
    FB['source'] = "FBref"
    try:
        WS['date'] = WS['date'].dt.date
        FM['date'] = FM['date'].dt.date
        FB['date'] = FB['date'].dt.date
    except :
        print("Ik about the dt thing")
        
    df = pd.DataFrame()
    dfs = [WS,FM,FB]
    # Find the DataFrame with the least number of rows
    loop_df = min(dfs, key=len)
    try:
        for x in WS.index:
            msg = None
            # Get the row data
            WS_row = WS.loc[x]
            # row = loop_df.loc[x]
            # print("Source is :",row['source'])
            row_home_team = WS_row['home_team']
            row_away_team = WS_row['away_team']
            row_date = WS_row['date']
            row_season = WS_row['season']
            row_round = str(WS_row['stage'])
            acceptable_rounds = ['Regular season', league, "None",None,'nan',league.split('-')[-1].strip(' '),league.replace('_',' '),
                                'Eredivisie ECL Playoff','Europa League Playoff', # Dutch League
                                 'Jupiler League','First Division A','Playoff Championship', # belgian League
                                "Champions League Group Stages","Champions League Final Stage", # UCL
                                 "Europa League Group Stages","Europa League Final Stage" #UEL
                                ]
            # print(acceptable_rounds)
            if row_round.strip(' ') not in acceptable_rounds:
                print("Row Round nikla : ",row_round)
                continue
    
            home_team_alias = team_alias[row_home_team]
            away_team_alias = team_alias[row_away_team]
    
            #Get DF ROWS
            # WS_row = WS[(WS['home_team'].isin(home_team_alias)) & (WS['away_team'].isin(away_team_alias)) & (WS['date'] == row_date)]
            FM_row = FM[(FM['home_team'].isin(home_team_alias)) & (FM['away_team'].isin(away_team_alias)) & (FM['season'] == row_season)]
            FB_row = FB[(FB['home_team'].isin(home_team_alias)) & (FB['away_team'].isin(away_team_alias)) & (FB['season'] == row_season)]
    
            if FB_row.empty or FM_row.empty or WS_row.empty:
                print("Guilty index : ",x)
                print(" HT ALias : ",home_team_alias)
                print(" AT ALias : ",away_team_alias)
                print()
                print(WS.loc[x])
                print()
                msg = f"Problem | Home : {row_home_team} | Away : {row_away_team} | Date : {row_date} \n "
                msg += f"DFs | WhoScored : {WS_row.empty} | FBRef : {FB_row.empty} | FotMob : {FM_row.empty}. \n"
                # msg += f"TIME | WhoScored : ({WS['date'][x]}) | FBRef : ({FM['date'][x]}) | FotMob : ({FB['date'][x]})."
                print(msg)
                if not FB_row.empty:
                    print("FB ROW")
                    if "Cancelled" in str(FB_row.iloc[0]['notes']):
                        continue
                return msg
            else:
                # Keeping only first row if all exist
                # print(WS_row.shape,FB_row.shape,FM_row.shape)
                try:
                    FB_row = FB_row.iloc[0]
                    FM_row = FM_row.iloc[0]
                except Exception as e:
                    print(f"DATE : {row_date} | HT : {row_home_team} | AT :{row_away_team}")
                    if "Cancelled" in str(FB_row['notes']):
                        continue
                    print("Excepitoin : ",e)
                    print("FB ROW IS  : ")
                    print(FB_row)
                    print()
                    print("FM ROW IS  : ")
                    print(FM_row)
                    print()
            # avoiding matches which are not played yet
            if str(FB_row['game_id']) == 'nan' or FB_row['match_report'] is None:
                continue
                
            i = df.shape[0]
            
            df.loc[i, 'competition']=WS_row['league']
            df.loc[i, 'season']=WS_row['season']
            df.loc[i, 'stage']=WS_row['stage']
            
            df.loc[i, 'date'] = row_date
            df.loc[i, 'home_team'] = row_home_team        
            df.loc[i, 'away_team'] = row_away_team
            # print("INDEX",df)
            df.loc[i, 'score'] = FB_row['score']
            
            df.loc[i, 'whoscored_id']=WS_row['game_id']
            df.loc[i, 'whoscored_url']=WS_row['url']
            
            df.loc[i, 'fbref_id']=FB_row['game_id']
            df.loc[i, 'fbref_url']=FB_row['match_report']
    
            df.loc[i, 'fotmob_id']=FM_row['id']
            df.loc[i, 'fotmob_url']=FM_row['url']
        return df
    except Exception as e:
        print("Merge DF Exception Occured : ",str(e))
        import sys,os
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        print('-------')
        print('-------', msg)
        raise e
combine_dataframes(WS,FB,FM,team_aliases)
# FM.to_csv("Test.csv")

(1232, 10)
Ik about the dt thing
Guilty index :  1488
 HT ALias :  ['Getafe']
 AT ALias :  ['Inter']

league                                                                                                UEL
season                                                                                              19-20
game                                                                              2020-03-19 Getafe-Inter
date                                                                                           2020-03-19
home_team                                                                                          Getafe
away_team                                                                                           Inter
game_id                                                                                           1456775
url          https://1xbet.whoscored.com/Matches/1456775/Show/Europe-Europa-League-2019-2020-Getafe-Inter
stage                                             

Unnamed: 0,competition,season,stage,date,home_team,away_team,score,whoscored_id,whoscored_url,fbref_id,fbref_url,fotmob_id,fotmob_url
0,UEL,17-18,Europa League Group Stages,2017-09-14,Apollon Limassol,Lyon,1–1,1239080.0,https://1xbet.whoscored.com/Matches/1239080/Live/Europe-Europa-League-2017-2018-Apollon-Limassol-Lyon,01a748dc,/en/matches/01a748dc/Apollon-Limassol-Lyon-September-14-2017-Europa-League,2621504,/matches/apollon-limassol-vs-lyon/2lt7yg#2621504
1,UEL,17-18,Europa League Group Stages,2017-09-14,Atalanta,Everton,3–0,1239077.0,https://1xbet.whoscored.com/Matches/1239077/Live/Europe-Europa-League-2017-2018-Atalanta-Everton,3e434452,/en/matches/3e434452/Atalanta-Everton-September-14-2017-Europa-League,2621503,/matches/atalanta-vs-everton/2fzuz4#2621503
2,UEL,17-18,Europa League Group Stages,2017-09-14,Austria Wien,AC Milan,1–5,1239095.0,https://1xbet.whoscored.com/Matches/1239095/Live/Europe-Europa-League-2017-2018-Austria-Wien-AC-Milan,03c2c6fe,/en/matches/03c2c6fe/Austria-Wien-Milan-September-14-2017-Europa-League,2621491,/matches/ac-milan-vs-austria-wien/2upzfw#2621491
3,UEL,17-18,Europa League Group Stages,2017-09-14,BSC Young Boys,Partizan Belgrade,1–1,1239169.0,https://1xbet.whoscored.com/Matches/1239169/Live/Europe-Europa-League-2017-2018-BSC-Young-Boys-Partizan-Belgrade,082d826c,/en/matches/082d826c/Young-Boys-Partizan-September-14-2017-Europa-League,2621468,/matches/partizan-beograd-vs-young-boys/2qia07#2621468
4,UEL,17-18,Europa League Group Stages,2017-09-14,Dynamo Kyiv,Skenderbeu,3–1,1239166.0,https://1xbet.whoscored.com/Matches/1239166/Live/Europe-Europa-League-2017-2018-Dynamo-Kyiv-Skenderbeu,b14eb0ac,/en/matches/b14eb0ac/Dynamo-Kyiv-Skenderbeu-Korce-September-14-2017-Europa-League,2621467,/matches/dynamo-kyiv-vs-skenderbeu/2wccdd#2621467
5,UEL,17-18,Europa League Group Stages,2017-09-14,FC Copenhagen,Lokomotiv Moscow,0–0,1239198.0,https://1xbet.whoscored.com/Matches/1239198/Live/Europe-Europa-League-2017-2018-FC-Copenhagen-Lokomotiv-Moscow,d579c154,/en/matches/d579c154/FC-Copenhagen-Lokomotiv-Moscow-September-14-2017-Europa-League,2621516,/matches/fc-kobenhavn-vs-lokomotiv-moscow/2f2f0h#2621516
6,UEL,17-18,Europa League Group Stages,2017-09-14,FC Zlin,FC Sheriff,0–0,1239078.0,https://1xbet.whoscored.com/Matches/1239078/Live/Europe-Europa-League-2017-2018-FC-Zlin-FC-Sheriff,133da4ef,/en/matches/133da4ef/Fastav-Zlin-Sheriff-Tiraspol-September-14-2017-Europa-League,2621515,/matches/fc-zlin-vs-fc-sheriff/13zn5c#2621515
7,UEL,17-18,Europa League Group Stages,2017-09-14,Hoffenheim,Braga,1–2,1239186.0,https://1xbet.whoscored.com/Matches/1239186/Live/Europe-Europa-League-2017-2018-Hoffenheim-Braga,175ffd65,/en/matches/175ffd65/Hoffenheim-Braga-September-14-2017-Europa-League,2621479,/matches/tsg-hoffenheim-vs-braga/2ts7o1#2621479
8,UEL,17-18,Europa League Group Stages,2017-09-14,Istanbul Basaksehir,Ludogorets Razgrad,0–0,1239187.0,https://1xbet.whoscored.com/Matches/1239187/Live/Europe-Europa-League-2017-2018-Istanbul-Basaksehir-Ludogorets-Razgrad,87924f0c,/en/matches/87924f0c/Istanbul-Basaksehir-Ludogorets-Razgrad-September-14-2017-Europa-League,2621480,/matches/istanbul-basaksehir-vs-ludogorets-razgrad/ac0ti0k#2621480
9,UEL,17-18,Europa League Group Stages,2017-09-14,Rijeka,AEK Athens,1–2,1239097.0,https://1xbet.whoscored.com/Matches/1239097/Live/Europe-Europa-League-2017-2018-Rijeka-AEK-Athens,b368942f,/en/matches/b368942f/Rijeka-AEK-Athens-September-14-2017-Europa-League,2621492,/matches/aek-athens-vs-rijeka/2wdy2a#2621492


In [186]:
w.to_csv("w.csv")

In [183]:
w = WS[WS['season'] == '19-20']
b = FB[FB['season'] == '19-20']
m = FM[FM['season'] == '19-20']

In [191]:
m[(m['home_team'] == 'Getafe' ) | ( m['away_team'] == 'Getafe' )]

Unnamed: 0,index,id,league,season,url,home_team,away_team,date,source
1314,1314,3171246,UEL,19-20,/matches/getafe-vs-trabzonspor/2p2net#3171246,Getafe,Trabzonspor,2019-09-19,FotMob
1347,1347,3171231,UEL,19-20,/matches/getafe-vs-fc-krasnodar/774tq9d#3171231,Krasnodar,Getafe,2019-10-03,FotMob
1373,1373,3171247,UEL,19-20,/matches/getafe-vs-basel/2r09yp#3171247,Getafe,Basel,2019-10-24,FotMob
1383,1383,3171249,UEL,19-20,/matches/getafe-vs-basel/2r09yp#3171249,Basel,Getafe,2019-11-07,FotMob
1406,1406,3171266,UEL,19-20,/matches/getafe-vs-trabzonspor/2p2net#3171266,Trabzonspor,Getafe,2019-11-28,FotMob
1434,1434,3171251,UEL,19-20,/matches/getafe-vs-fc-krasnodar/774tq9d#3171251,Getafe,Krasnodar,2019-12-12,FotMob
1455,1455,3236273,UEL,19-20,/matches/getafe-vs-ajax/2d0g44#3236273,Getafe,Ajax,2020-02-20,FotMob
1476,1476,3236274,UEL,19-20,/matches/getafe-vs-ajax/2d0g44#3236274,Ajax,Getafe,2020-02-27,FotMob
1488,1488,3284791,UEL,19-20,/matches/getafe-vs-inter/2dg1jb#3284791,Inter,Getafe,2020-03-12,FotMob
1494,1494,3284799,UEL,19-20,/matches/getafe-vs-inter/2dg1jb#3284799,Inter,Getafe,2020-08-05,FotMob


In [169]:
leagues = [
    # 'ENG-Premier League',
    # 'ESP-La Liga',
    # 'Eredivisie',
    # 'FRA-Ligue 1',
    # 'GER-Bundesliga',
    # 'ITA-Serie A',
    # 'Primeira_Liga',
    # 'Pro_League',
    # 'UCL',
    # 'UECL'
    'UEL'
]

In [170]:
league = leagues[0]
season = [x for x in range(17,24)]
# season = [x for x in range(21,24)] # For UECL ONLY
ws = sd.WhoScored(leagues=league, seasons=season)
fb = sd.FBref(leagues=league, seasons=season)

# # get all 3 dfs
# print("Fetching DataFrames ... ")
whoscored_df = ws.read_schedule()
fbref_df = fb.read_schedule()
fotmob_df = getFotMobSeasonFixture(league,season)

Returning 7 DFs combination


In [195]:
print("Preparing Fetched DataFrames ... ")
# # prepare DFs
WS = pre_process_dataframe_for_schedules('WS',whoscored_df)
FB = pre_process_dataframe_for_schedules('FB',fbref_df)
FM = pre_process_dataframe_for_schedules('FM',fotmob_df)
# 1/0
print(league)
if league == 'UCL' or league =='UEL' or league == 'UECL':
    FM_dfs = []
    WS_dfs = []
    for season in FM['season'].unique():
        temp_df = FM[FM['season'] == season]
        group_stage_start = min(FB[FB['season'] == season]['date'])
        print(f"Season {season} | Group Stage Start {group_stage_start}")
        sw_FM_df = temp_df[temp_df['date'].dt.date >=group_stage_start.date()]
        FM_dfs.append(sw_FM_df)
        if league == "UECL" :
            t_ws_df = WS[WS['season'] == season]
            print(t_ws_df.shape)
            sw_WS_df = t_ws_df[t_ws_df['date'] >=group_stage_start]
            print("MIN: ",min(t_ws_df['date']).date(),"MAX : ",max(t_ws_df['date']).date(), " VS  ",group_stage_start.date())
            WS_dfs.append(sw_WS_df)
        print(f"WS : {WS[WS['season'] == season].shape} | FB : {FB[FB['season'] == season].shape} | FM : {sw_FM_df.shape}")
    FM = pd.concat(FM_dfs)
    if league != "UECL":
        WS = WS[WS['stage'].str.contains('Final|Group Stages', case=False)]
    else:
        print([x.shape for x in WS_dfs])
        WS = pd.concat(WS_dfs)
     

Preparing Fetched DataFrames ... 
UEL
Season 17-18 | Group Stage Start 2017-09-14 17:00:00+00:00
WS : (473, 9) | FB : (205, 19) | FM : (205, 8)
Season 18-19 | Group Stage Start 2018-09-20 16:55:00+00:00
WS : (519, 9) | FB : (205, 19) | FM : (205, 8)
Season 19-20 | Group Stage Start 2019-09-19 16:55:00+00:00
WS : (513, 9) | FB : (199, 19) | FM : (199, 8)
Season 20-21 | Group Stage Start 2020-10-22 15:55:00+00:00
WS : (362, 9) | FB : (205, 19) | FM : (205, 8)
Season 21-22 | Group Stage Start 2021-09-15 15:30:00+00:00
WS : (177, 9) | FB : (141, 19) | FM : (141, 8)
Season 22-23 | Group Stage Start 2022-09-08 16:45:00+00:00
WS : (175, 9) | FB : (141, 19) | FM : (141, 8)
Season 23-24 | Group Stage Start 2023-09-21 16:45:00+00:00
WS : (170, 9) | FB : (136, 19) | FM : (141, 8)


In [196]:
# team_aliases = update_team_alias(WS,FB,FM,CONFIG['team_name_path'])
# team_aliases = update_team_alias(WS,FB,FM,CONFIG['team_name_path'],write_mode=False)


In [197]:
with open(CONFIG['team_name_path'],'r') as f :
    team_aliases = json.load(f)

In [198]:
# merge dfs
print("Merging Fetched DataFrames ... ")
merged_df = combine_dataframes(WS,FB,FM,team_aliases)
# print("DFGHJKL",str(merged_df))

Merging Fetched DataFrames ... 
(1232, 9)
Guilty index :  1488
 HT ALias :  ['Getafe']
 AT ALias :  ['Inter']

league                                                                                                UEL
season                                                                                              19-20
game                                                                              2020-03-19 Getafe-Inter
date                                                                                           2020-03-19
home_team                                                                                          Getafe
away_team                                                                                           Inter
game_id                                                                                           1456775
url          https://1xbet.whoscored.com/Matches/1456775/Show/Europe-Europa-League-2019-2020-Getafe-Inter
stage                                    

In [175]:
# merged_df.to_csv(rf"./combined_sch/combined_{league}.csv")

In [199]:
league

'UEL'

In [200]:
# merged_dfs = {}

In [201]:
merged_dfs[league] = merged_df

In [202]:
merged_dfs.keys()

dict_keys(['ENG-Premier League', 'ESP-La Liga', 'Eredivisie', 'FRA-Ligue 1', 'GER-Bundesliga', 'ITA-Serie A', 'Primeira_Liga', 'Pro_League', 'UCL', 'UEL'])

In [125]:
# # get missing data info
# print("Get missing games |DataFrames ... ")
# miss_WS = WS[~WS['game_id'].isin(merged_df['whoscored_id'])]
# miss_FM = FM[~FM['id'].isin(merged_df['fotmob_id'])]
# miss_FB = FB[~FB['game_id'].isin(merged_df['fbref_id'])]

# miss_WS.to_csv(rf"./missing_matches/miss_WS_{league}.csv")
# miss_FB.to_csv(rf"./missing_matches/miss_FB_{league}.csv")
# miss_FM.to_csv(rf"./missing_matches/miss_FM_{league}.csv")

Get missing games |DataFrames ... 


In [204]:
for k,v in merged_dfs.items():
    print(k," : ",max(v['date']))

ENG-Premier League  :  2024-03-17
ESP-La Liga  :  2024-03-17
Eredivisie  :  2024-03-17
FRA-Ligue 1  :  2024-03-17
GER-Bundesliga  :  2024-03-17
ITA-Serie A  :  2024-03-17
Primeira_Liga  :  2024-03-17
Pro_League  :  2024-03-30
UCL  :  2024-03-13
UEL  :  2024-03-14


In [205]:
all_merged = pd.concat(list(merged_dfs.values()))

In [206]:
all_merged.shape

(20370, 13)

In [None]:
all_merged.to_csv(r"./combined_sch/all_merged_2.csv)