# Standardize Team Names

In [1]:
import pathlib
import sys
import pickle

from typing import List

import numpy as np
import pandas as pd

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
# Add the project's 'src' directory to the import path so local modules can
# be imported from this notebook. sys.path entries must be plain strings:
# other types (including pathlib.Path) are ignored during import.
src_dir = pathlib.Path.cwd().resolve().parent / 'src'
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))
# import my class code from the source
# %aimport src-dir.filename

# Let pandas print wide frames without eliding columns
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)

# The project root is taken to be the parent of the current working
# directory; each data stage has its own sub-directory of <root>/data.
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
RAW_DATA_DIR = PROJECT_DIR.joinpath('data', '01-raw')
CLEANED_DATA_DIR = PROJECT_DIR.joinpath('data', '02-cleaned')
STDZED_DATA_DIR = PROJECT_DIR.joinpath('data', '03-standardized')
REF_DATA_DIR = PROJECT_DIR.joinpath('data', 'reference')

## Select League Data

In [2]:
# Each entry pairs a nation directory with one of its league directories.
# `nations` and `leagues` are parallel lists derived from these pairs, so a
# nation appears once per league (united-kingdom appears three times).
# Note: poland and switzerland are multileague
_nation_league_pairs = [
    ('germany', 'bundesliga'),
    ('united-kingdom', 'english-premier-league'),
    ('spain', 'la-liga'),
    ('italy', 'serie-a'),
    ('france', 'ligue-1'),
    ('united-kingdom', 'english-championship'),
    ('germany', 'bundesliga-2'),
    ('netherlands', 'eredivisie'),
    ('russian-federation', 'premier-league'),
    ('scotland', 'premiership'),
    ('portugal', 'primeira-liga'),
    ('switzerland', 'super-league'),
    ('belgium', 'first-division-a'),
    ('turkey', 'super-lig'),
    ('poland', 'ekstraklasa'),
    ('united-kingdom', 'one'),
]
nations = [pair[0] for pair in _nation_league_pairs]
leagues = [pair[1] for pair in _nation_league_pairs]

# Seasons 2000-2001 through 2017-2018, written as '<start>-<end>'
seasons = [f'{start}-{start + 1}' for start in range(2000, 2018)]

## Standardize football-data.co.uk

+ Load standardize dictionary
+ Use dictionary to form path to clean dataframes
+ Load dataframes
+ Standardize team names
+ save to standardized directory

In [3]:
# Maps a team-name slug (stripped, lower-cased, spaces replaced by '-') to
# the standard name used across all data sources. Keys must already be in
# slug form, because lookups are done on slugged names; values must carry
# no stray whitespace, because they are written out as-is.
epl_names_d = {'villa': 'aston-villa',
               'blackburn': 'blackburn-rovers',
               'birmingham': 'birmingham-city',
               'bolton': 'bolton-wanderers',
               'boro': 'middlesbrough',
               'brighton': 'brighton-and-hove-albion',
               # slug of 'Brighton & Hove Albion'
               'brighton-&-hove-albion': 'brighton-and-hove-albion',
               'cardiff': 'cardiff-city',
               'huddersfield': 'huddersfield-town',
               'hull': 'hull-city',
               'leicester': 'leicester-city',
               'leicester-cty': 'leicester-city',
               'man-city': 'manchester-city',
               'man-u': 'manchester-united',
               'man-utd': 'manchester-united',
               'manchester-utd': 'manchester-united',
               'man-united': 'manchester-united',
               'newcastle': 'newcastle-united',
               'newcastle-utd': 'newcastle-united',
               'norwich': 'norwich-city',
               'qpr': 'queens-park-rangers',
               'sheffield': 'sheffield-united',
               'stoke': 'stoke-city',
               'swansea': 'swansea-city',
               'tottenham': 'tottenham-hotspur',
               'west-brom': 'west-bromwich-albion',
               'west-ham': 'west-ham-united',
               'wigan': 'wigan-athletic',
               'wolves': 'wolverhampton-wanderers'}

In [4]:
# Persist the name-standardization dictionary with the reference data for
# this league.
fp = REF_DATA_DIR / 'united-kingdom' / 'english-premier-league' / 'english-premier-league_std_name_dict.pkl'

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before writing.
fp.parent.mkdir(parents=True, exist_ok=True)
with open(fp, 'wb') as handle:
    pickle.dump(epl_names_d, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# Read the pickled name dictionary back from `fp`.
# NOTE: only unpickle files produced by this project; pickle is not safe
# for untrusted data.
std_names_d = pickle.loads(fp.read_bytes())

## Use Dictionary to form path to cleaned dataframes dir

In [6]:
# Narrow the selection to a single nation/league pair for this run
nations, leagues = ['united-kingdom'], ['english-premier-league']
#sources = ['football-data-co-uk', 'indatabet-com', 'whoscored-com']

In [7]:
def form_fdcu_fps(CLEANED_DATA_DIR, nation, league, seasons):
    """
    Return the football-data-co-uk season files that exist on disk.

    For each season the candidate path is
    CLEANED_DATA_DIR / 'football-data-co-uk' / nation / league / season / '<season>.csv'.
    Only candidates that exist are returned, in the order of `seasons`.

    Note: unlike other versions of this helper, the signature takes no
    data-source argument - this still needs to be standardized.
    """
    source = 'football-data-co-uk'
    candidates = (
        CLEANED_DATA_DIR / source / nation / league / season / (season + '.csv')
        for season in seasons
    )
    return [candidate for candidate in candidates if candidate.exists()]

# Season file paths for the first selected nation/league pair only
fps = form_fdcu_fps(CLEANED_DATA_DIR, nations[0], leagues[0], seasons)

In [8]:
# Gather the existing season files of every selected (nation, league) pair
all_fps = [
    season_fp
    for nation_dir, league_dir in zip(nations, leagues)
    for season_fp in form_fdcu_fps(CLEANED_DATA_DIR, nation_dir, league_dir, seasons)
]

print(len(all_fps))

13


## Load DataFrames

In [9]:
# One DataFrame per season file, with the 'date' column parsed as datetime
season_dfs = [
    pd.read_csv(csv_fp, parse_dates=['date'], index_col=None)
    for csv_fp in all_fps
]
# Peek at the last loaded season
season_dfs[-1].head()

Unnamed: 0,a_corners,a_fouls,a_redCards,a_shots,a_shotsOnTarget,a_yellowCards,a,odds_awin_bet365,odds_draw_bet365,odds_hwin_bet365,odds_awin_BW,odds_draw_BW,odds_hwin_BW,n_Bb1X2,n_BbAsian,BbAsian_handicap,odds_ftgoalsu2.5_bbmean,odds_ftgoalso2.5_bbmean,odds_awin_bbmean,odds_asianaway_bbmean,odds_asianhome_bbmean,odds_draw_bbmean,odds_hwin_bbmean,odds_ftgoalsu2.5_bbmax,odds_ftgoalso2.5_bbmax,odds_awin_bbmax,odds_asianaway_bbmax,odds_asianhome_bbmax,odds_draw_bbmax,odds_hwin_bbmax,n_BbOU,date,a_ftGoals,h_ftGoals,ftResult,h_corners,h_fouls,h_redCards,h_shots,h_shotsOnTarget,a_htGoals,h_htGoals,h_yellowCards,h,odds_awin_IW,odds_draw_IW,odds_hwin_IW,odds_awin_LB,odds_draw_LB,odds_hwin_LB,odds_awin_VC,odds_draw_VC,odds_hwin_VC,odds_awin_WH,odds_draw_WH,odds_hwin_WH,league,nation,season,clodds_away_pinn,clodds_draw_pinn,clodds_hwin_pinn
0,4.0,12.0,0.0,6.0,3.0,1.0,Leicester,6.5,4.5,1.53,6.75,4.6,1.5,41.0,21.0,-1.0,2.32,1.61,6.44,2.02,1.85,4.43,1.51,2.43,1.65,6.89,2.1,1.91,4.6,1.55,37.0,2017-08-11,3.0,4.0,H,9.0,9.0,0.0,27.0,10.0,2.0,2.0,0.0,Arsenal,6.5,4.5,1.47,6.5,4.4,1.44,6.5,4.5,1.53,6.0,4.2,1.53,english-premier-league,united-kingdom,2017-2018,7.25,4.73,1.49
1,10.0,9.0,0.0,14.0,4.0,2.0,Man City,1.33,5.5,11.0,1.3,5.25,11.0,40.0,20.0,1.5,2.27,1.63,1.32,1.96,1.91,5.25,10.1,2.4,1.7,1.36,2.01,1.95,5.6,11.5,35.0,2017-08-12,2.0,0.0,A,3.0,6.0,0.0,6.0,2.0,0.0,0.0,0.0,Brighton,1.35,5.3,8.0,1.3,5.0,10.0,1.33,5.5,10.0,1.33,4.8,10.0,english-premier-league,united-kingdom,2017-2018,1.29,6.15,11.75
2,5.0,11.0,0.0,10.0,5.0,3.0,Burnley,15.0,6.5,1.25,12.5,6.5,1.22,41.0,20.0,-1.75,2.23,1.66,13.67,1.9,1.97,6.06,1.24,2.33,1.71,15.5,1.95,2.03,6.55,1.27,36.0,2017-08-12,3.0,2.0,A,8.0,16.0,2.0,19.0,6.0,3.0,0.0,3.0,Chelsea,13.5,6.2,1.22,15.0,5.75,1.25,15.0,6.25,1.25,13.0,5.5,1.25,english-premier-league,united-kingdom,2017-2018,12.25,5.4,1.33
3,9.0,19.0,0.0,8.0,6.0,3.0,Huddersfield,5.0,3.6,1.83,4.75,3.5,1.8,41.0,18.0,-0.75,1.72,2.11,4.82,1.83,2.05,3.5,1.81,1.79,2.19,5.11,1.86,2.1,3.65,1.86,36.0,2017-08-12,3.0,0.0,A,12.0,7.0,0.0,14.0,4.0,2.0,0.0,1.0,Crystal Palace,4.3,3.5,1.85,4.6,3.4,1.8,5.0,3.6,1.83,5.0,3.3,1.8,english-premier-league,united-kingdom,2017-2018,5.51,3.56,1.79
4,7.0,10.0,0.0,9.0,1.0,1.0,Stoke,5.75,3.8,1.7,5.5,3.6,1.7,40.0,19.0,-0.75,1.76,2.08,5.5,1.98,1.9,3.69,1.69,1.8,2.17,6.0,2.01,1.94,3.85,1.71,35.0,2017-08-12,0.0,1.0,H,6.0,13.0,0.0,9.0,4.0,0.0,1.0,1.0,Everton,5.0,3.7,1.7,5.25,3.6,1.67,5.75,3.8,1.7,5.5,3.5,1.7,english-premier-league,united-kingdom,2017-2018,5.42,3.49,1.82


## Standardize Team Names

In [10]:
def copy_data(src_fps, dest_fps):
    """
    Copy each source file to its paired destination unless it already exists.

    Accepts two parallel sequences of file paths, given as strings or
    pathlib.Path objects; the i-th source is copied to the i-th destination.
    Missing parent directories of a destination are created. Existing
    destination files are never overwritten, and source files are only read.
    The sequences are paired with zip, so surplus entries in the longer one
    are ignored.

    Returns a tuple (n_copied, n_exist): the number of files copied and the
    number of destinations that were already present.
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        # Coerce so plain string paths work as well as Path objects
        src_fp = pathlib.Path(src_fp)
        dest_fp = pathlib.Path(dest_fp)
        if dest_fp.exists():
            n_exist += 1
            continue
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Byte-for-byte copy; the original file is left untouched
        dest_fp.write_bytes(src_fp.read_bytes())
        n_copied += 1
    return n_copied, n_exist

In [11]:
def stdze_names_to_dict(df_orig, std_names_d):
    """
    Return a copy of df_orig with standardized home ('h') and away ('a') names.

    Each name is first turned into a slug (stripped, lower-cased, spaces
    replaced by '-'). Slugs that are keys of std_names_d are then replaced
    by the mapped standard name; all other slugs are kept. df_orig itself
    is not modified.
    """
    result = df_orig.copy(deep=True)
    for side in ('h', 'a'):
        slugs = result[side].str.strip().str.lower().str.replace(' ', '-')
        result[side] = slugs
        # Only rows whose slug has a standard name are overwritten
        known = slugs.isin(std_names_d.keys())
        result.loc[known, side] = slugs.map(std_names_d)
    return result

## Save to Standardized Directory

In [12]:
# Standardize the team names of every season and write the result to the
# mirrored location under the standardized data directory.
# TODO: keep lists of src and dest filepaths
for df, src_fp in zip(season_dfs, all_fps):
    new_df = stdze_names_to_dict(df, std_names_d)
    # Mirror the path relative to the cleaned directory rather than doing a
    # text replace on the whole path, which would also rewrite any other
    # occurrence of '02-cleaned' in it.
    dest_fp = STDZED_DATA_DIR / src_fp.relative_to(CLEANED_DATA_DIR)
    # to_csv does not create directories, so make sure the target exists
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    new_df.to_csv(dest_fp, index=False)
    

# Standardize indatabet.com

+ Use dictionary to form path to clean dataframes
+ Load dataframes
+ Standardize team names
+ save to standardized directory

In [13]:
def form_indatabet_fps(CLEANED_DATA_DIR, nation, league, seasons):
    """
    Return the indatabet-com season files that exist on disk.

    For each season the candidate path is
    CLEANED_DATA_DIR / 'indatabet-com' / nation / league / season / '<season>.csv'.
    Only candidates that exist are returned, in the order of `seasons`.

    Note: unlike other versions of this helper, the signature takes no
    data-source argument - this still needs to be standardized.
    """
    def season_csv(season):
        # <cleaned>/indatabet-com/<nation>/<league>/<season>/<season>.csv
        return CLEANED_DATA_DIR / 'indatabet-com' / nation / league / season / (season + '.csv')

    return [fp for fp in map(season_csv, seasons) if fp.exists()]

# Season file paths for the first selected nation/league pair only
fps = form_indatabet_fps(CLEANED_DATA_DIR, nations[0], leagues[0], seasons)

In [14]:
# Gather the existing indatabet season files of every selected pair
all_fps = [
    season_fp
    for nation_dir, league_dir in zip(nations, leagues)
    for season_fp in form_indatabet_fps(CLEANED_DATA_DIR, nation_dir, league_dir, seasons)
]

print(len(all_fps))

11


In [15]:
# One DataFrame per season file, with the 'date' column parsed as datetime
season_dfs = [
    pd.read_csv(csv_fp, parse_dates=['date'], index_col=None)
    for csv_fp in all_fps
]
# Peek at the first loaded season
season_dfs[0].head()

Unnamed: 0,date,id_fifa,nation,league,season,h,a,h_ftGoals,a_ftGoals,odds_hwin_pinn,odds_draw_pinn,odds_awin_pinn,odds_hwin_bet365,odds_draw_bet365,odds_awin_bet365
0,2007-08-11,eng-pl,united-kingdom,english-premier-league,2007-2008,sunderland,tottenham,1.0,0.0,3.4,3.35,2.2,2.8,3.25,2.25
1,2007-08-11,eng-pl,united-kingdom,english-premier-league,2007-2008,west-ham,manchester-city,0.0,2.0,2.16,3.25,3.6,1.83,3.2,4.0
2,2007-08-11,eng-pl,united-kingdom,english-premier-league,2007-2008,middlesbrough,blackburn,1.0,2.0,2.57,3.2,2.95,2.25,3.25,2.8
3,2007-08-11,eng-pl,united-kingdom,english-premier-league,2007-2008,everton,wigan,2.0,1.0,1.73,3.5,5.4,1.62,3.5,5.0
4,2007-08-11,eng-pl,united-kingdom,english-premier-league,2007-2008,derby,portsmouth,2.0,2.0,3.2,3.2,2.35,2.5,3.25,2.5


In [16]:
def stdze_names_to_dict(df_orig, std_names_d):
    """
    Return a copy of df_orig whose 'h' and 'a' columns hold standard names.

    Names are slugged (stripped, lower-cased, spaces replaced by '-'), and
    every slug that is a key of std_names_d is replaced by its mapped
    value. Slugs without a mapping are kept. The input frame is left
    unchanged.
    """
    out = df_orig.copy(deep=True)
    for col in ['h', 'a']:
        out[col] = out[col].str.strip().str.lower().str.replace(' ', '-')
        # Overwrite only the rows for which a standard name is defined
        has_std_name = out[col].isin(std_names_d.keys())
        out.loc[has_std_name, col] = out[col].map(std_names_d)
    return out

In [17]:
# Standardize the team names of every season and write the result to the
# mirrored location under the standardized data directory.
# TODO: keep lists of src and dest filepaths
for df, src_fp in zip(season_dfs, all_fps):
    new_df = stdze_names_to_dict(df, std_names_d)
    # Mirror the path relative to the cleaned directory rather than doing a
    # text replace on the whole path, which would also rewrite any other
    # occurrence of '02-cleaned' in it.
    dest_fp = STDZED_DATA_DIR / src_fp.relative_to(CLEANED_DATA_DIR)
    # to_csv does not create directories, so make sure the target exists
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    new_df.to_csv(dest_fp, index=False)

# Standardize whoscored.com

+ Use dictionary to form path to clean images
+ Load image filepaths
+ Standardize team names
+ save to standardized directory

In [18]:
# Nation and league directories whose whoscored images are processed below
nation, league = 'united-kingdom', 'english-premier-league'

In [19]:
# Gather every file that sits in a 'heatmaps' directory anywhere below this
# league's cleaned whoscored-com data.
# Directories are matched on Path.name: splitting str(path) on '/' only
# works with POSIX separators. rglob is already recursive, so '*' suffices.
heatmap_dirs = [d for d in (CLEANED_DATA_DIR / 'whoscored-com' / nation / league).rglob('*')
                if d.is_dir() and d.name == 'heatmaps']
heatmap_fps = []
for heatmap_dir in heatmap_dirs:
    heatmap_fps.extend(fp for fp in heatmap_dir.iterdir() if fp.is_file())

In [20]:
heatmap_fps[0:5]

[PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/02-cleaned/whoscored-com/united-kingdom/english-premier-league/2009-2010/heatmaps/Arsenal__Aston Villa.png'),
 PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/02-cleaned/whoscored-com/united-kingdom/english-premier-league/2009-2010/heatmaps/Arsenal__Birmingham.png'),
 PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/02-cleaned/whoscored-com/united-kingdom/english-premier-league/2009-2010/heatmaps/Arsenal__Blackburn.png'),
 PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/02-cleaned/whoscored-com/united-kingdom/english-premier-league/2009-2010/heatmaps/Arsenal__Bolton.png'),
 PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/02-cleaned/whoscored-com/united-kingdom/english-premier-league/2009-2010/heatmaps/Arsenal__Burnley.png')]

In [21]:
len(heatmap_fps)

3420

In [22]:
# Gather every file that sits in a 'shotmaps' directory anywhere below this
# league's cleaned whoscored-com data.
# Directories are matched on Path.name: splitting str(path) on '/' only
# works with POSIX separators. rglob is already recursive, so '*' suffices.
shotmap_dirs = [d for d in (CLEANED_DATA_DIR / 'whoscored-com' / nation / league).rglob('*')
                if d.is_dir() and d.name == 'shotmaps']
shotmap_fps = []
for shotmap_dir in shotmap_dirs:
    shotmap_fps.extend(fp for fp in shotmap_dir.iterdir() if fp.is_file())

In [23]:
len(shotmap_fps)

3420

In [24]:
# Build, for every heatmap, its destination path in the standardized data
# directory. dest_fps is parallel to heatmap_fps so both lists can be
# handed to copy_data together.
dest_fps = []
for heatmap_fp in heatmap_fps:
    # File names look like '<home>__<away>.<ext>'. Parse the stem only, so
    # neither a directory name containing '__' nor a '.' inside a team name
    # can corrupt the result.
    h, a = (name.lower().replace(' ', '-') for name in heatmap_fp.stem.split('__', 1))
    # Swap in the standard name where one is defined, else keep the slug
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the directory two levels above the image file
    season = heatmap_fp.parent.parent.name
    dest_fp_stub = (STDZED_DATA_DIR / heatmap_fp.relative_to(CLEANED_DATA_DIR)).parent
    # 'date' is a literal placeholder in the file name
    fn = season + '__' + 'date' + '__' + h + '__' + a + heatmap_fp.suffix
    dest_fps.append(dest_fp_stub / fn)

In [25]:
# Copy the heatmaps to their standardized names; destinations that already
# exist are counted but not overwritten.
n_copied, n_exist= copy_data(heatmap_fps, dest_fps)
print(n_copied, n_exist)

0 3420


In [26]:
# Build, for every shotmap, its destination path in the standardized data
# directory. dst_sm_fps is parallel to shotmap_fps so both lists can be
# handed to copy_data together.
dst_sm_fps = []
for shotmap_fp in shotmap_fps:
    # File names look like '<home>__<away>.<ext>'. Parse the stem only, so
    # neither a directory name containing '__' nor a '.' inside a team name
    # can corrupt the result.
    h, a = (name.lower().replace(' ', '-') for name in shotmap_fp.stem.split('__', 1))
    # Swap in the standard name where one is defined, else keep the slug
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the directory two levels above the image file
    season = shotmap_fp.parent.parent.name
    dest_sm_fp_stub = (STDZED_DATA_DIR / shotmap_fp.relative_to(CLEANED_DATA_DIR)).parent
    # 'date' is a literal placeholder in the file name
    fn = season + '__' + 'date' + '__' + h + '__' + a + shotmap_fp.suffix
    dst_sm_fps.append(dest_sm_fp_stub / fn)

In [27]:
# Copy the shotmaps to their standardized names; destinations that already
# exist are counted but not overwritten.
n_copied, n_exist= copy_data(shotmap_fps, dst_sm_fps)
print(n_copied, n_exist)

0 3420


In [28]:
# NOTE(review): `stop` is not defined, so this raises NameError and halts a
# "run all" here - presumably deliberate, as the cells below are unexecuted.
stop

NameError: name 'stop' is not defined

In [None]:
# For every (nation, league, season) collect the raw whoscored heatmap and
# shotmap images together with the paths they should get in the cleaned
# data directory. Each *_src_fps / *_dest_fps pair holds one sub-list per
# season. The two lists of a pair must stay parallel, because they are
# later flattened and paired element by element for copying.
hm_src_fps = []
hm_dest_fps = []
sm_src_fps = []
sm_dest_fps = []
for nation, league in zip(nations, leagues):
    print(nation, league)
    for season in seasons:
        src_stub = RAW_DATA_DIR / 'football-data' / nation / league / season / 'who-scored-com'
        dest_stub = CLEANED_DATA_DIR / 'whoscored-com' / nation / league / season

        hm_src_dir = src_stub / 'game-heatmaps'
        # Glob once and reuse the result for both sources and destinations
        hm_src_fps_sublist = list(hm_src_dir.glob('*.png')) if hm_src_dir.exists() else []
        if hm_src_fps_sublist:
            # Append the sources exactly once: a second append would shift
            # every later season against its destinations.
            hm_src_fps.append(hm_src_fps_sublist)
            hm_dest_fps.append([dest_stub / 'heatmaps' / fp.name for fp in hm_src_fps_sublist])

        sm_src_dir = src_stub / 'game-shotmaps'
        sm_src_fps_sublist = list(sm_src_dir.glob('*.png')) if sm_src_dir.exists() else []
        if sm_src_fps_sublist:
            sm_src_fps.append(sm_src_fps_sublist)
            sm_dest_fps.append([dest_stub / 'shotmaps' / fp.name for fp in sm_src_fps_sublist])
        
# hm_dest_fps[0:3]

In [None]:
def copy_data(src_fps, dest_fps):
    """
    Copy each source file to its paired destination unless it already exists.

    Accepts two parallel sequences of file paths, given as strings or
    pathlib.Path objects; the i-th source is copied to the i-th destination.
    Missing parent directories of a destination are created. Existing
    destination files are never overwritten, and source files are only read.
    The sequences are paired with zip, so surplus entries in the longer one
    are ignored.

    Returns a tuple (n_copied, n_exist): the number of files copied and the
    number of destinations that were already present.
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        # Coerce so plain string paths work as well as Path objects
        src_fp = pathlib.Path(src_fp)
        dest_fp = pathlib.Path(dest_fp)
        if dest_fp.exists():
            n_exist += 1
            continue
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Byte-for-byte copy; the original file is left untouched
        dest_fp.write_bytes(src_fp.read_bytes())
        n_copied += 1
    return n_copied, n_exist

In [None]:
# Flatten the per-season sub-lists, then copy source i to destination i
hm_src_flatlist = [src_fp for season_fps in hm_src_fps for src_fp in season_fps]
hm_dest_flatlist = [dst_fp for season_fps in hm_dest_fps for dst_fp in season_fps]
n_copied, n_exist = copy_data(hm_src_flatlist, hm_dest_flatlist)
print(n_copied, n_exist)

In [None]:
# Flatten the per-season sub-lists, then copy source i to destination i
sm_src_flatlist = [src_fp for season_fps in sm_src_fps for src_fp in season_fps]
sm_dest_flatlist = [dst_fp for season_fps in sm_dest_fps for dst_fp in season_fps]
n_copied, n_exist = copy_data(sm_src_flatlist, sm_dest_flatlist)
print(n_copied, n_exist)

In [None]:
# Work on a deep copy so `df` itself is not modified by the columns added below
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    """
    Copy the file at src_fp to dest_fp unless dest_fp already exists.

    Both arguments are pathlib.Path objects. Missing parent directories of
    dest_fp are created. An existing destination is left as it is, and the
    source file is only read. Returns None.
    """
    if dest_fp.exists():
        # Never overwrite a file that is already in place
        return
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    payload = src_fp.read_bytes()
    dest_fp.write_bytes(payload)

def form_image_fp(row, league=None, image_type=None):
    """
    Return the destination path of a match's heatmap, with PROJECT_DIR removed.

    row.name is expected to be a (season, timestamp) pair and row must hold
    the team names under 'h' and 'a'. The source image is looked up in the
    module-level `fps` dict under '<season>__date__<h>__<a>'; the
    destination file name is that key with 'date' replaced by the match
    date, plus the source file's suffix, inside
    INTERIM_DATA_DIR / league / 'heatmaps'.

    image_type is accepted but not used: the 'heatmaps' directory is
    hard-coded. Nothing is copied in this version (the copy call is
    commented out).
    """
    season = row.name[0]
    match_date = str(row.name[1].date())
    lookup_key = '__'.join([season, 'date', row['h'], row['a']])
    src_fp = fps[lookup_key]
    fn = str(lookup_key.replace('date', match_date) + src_fp.suffix)
    dest_fp = INTERIM_DATA_DIR / league / 'heatmaps' / fn
    # copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Compute the heatmap path row by row (axis=1) and store it as a new column
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()

In [None]:
# NOTE(review): `stop` is not defined, so this raises NameError and halts a
# "run all" here - presumably deliberate.
stop

In [None]:
# Load the interim football-data file of the league, parsing 'date' as datetime
league = 'english-premier-league'
fn = f'football-data-{league}.csv'
df = pd.read_csv(INTERIM_DATA_DIR / league / fn, parse_dates=['date'], index_col=None)
df.head()

In [None]:
# Name of the merged output file written at the end of the notebook
save_file_name = f'merge-1-{league}.csv'

## Standardize Team Names

In [None]:
# Maps a team-name slug (stripped, lower-cased, spaces replaced by '-') to
# the standard name used across all data sources. Keys must already be in
# slug form, because lookups are done on slugged names; values must carry
# no stray whitespace, because they are written out as-is.
epl_names_d = {'villa': 'aston-villa',
               'blackburn': 'blackburn-rovers',
               'birmingham': 'birmingham-city',
               'bolton': 'bolton-wanderers',
               'brighton': 'brighton-and-hove-albion',
               # slug of 'Brighton & Hove Albion'
               'brighton-&-hove-albion': 'brighton-and-hove-albion',
               'cardiff': 'cardiff-city',
               'huddersfield': 'huddersfield-town',
               'hull': 'hull-city',
               'leicester': 'leicester-city',
               'leicester-cty': 'leicester-city',
               'man-city': 'manchester-city',
               'man-u': 'manchester-united',
               'man-utd': 'manchester-united',
               'manchester-utd': 'manchester-united',
               'man-united': 'manchester-united',
               # spelled 'middlesbrough', matching the other name dictionary
               'boro': 'middlesbrough',
               'newcastle': 'newcastle-united',
               'newcastle-utd': 'newcastle-united',
               'norwich': 'norwich-city',
               'qpr': 'queens-park-rangers',
               'stoke': 'stoke-city',
               'swansea': 'swansea-city',
               'tottenham': 'tottenham-hotspur',
               'west-brom': 'west-bromwich-albion',
               'west-ham': 'west-ham-united',
               'wigan': 'wigan-athletic',
               'wolves': 'wolverhampton-wanderers'}

In [None]:
fn = 'english-premier-league_std_name_dict.pkl'

# Serialize the name dictionary and write it into the reference directory
(REF_DATA_DIR / fn).write_bytes(pickle.dumps(epl_names_d, protocol=pickle.HIGHEST_PROTOCOL))
    

In [None]:
# Read the pickled name dictionary back from the reference directory.
# NOTE: only unpickle files produced by this project; pickle is not safe
# for untrusted data.
std_names_d = pickle.loads((REF_DATA_DIR / fn).read_bytes())

In [None]:
# Standardize the team names in place: slug each name first, then replace
# the slugs that have an entry in std_names_d
for side in ('h', 'a'):
    df[side] = df[side].str.strip().str.lower().str.replace(' ', '-')
    df.loc[df[side].isin(std_names_d.keys()), side] = df[side].map(std_names_d)

In [None]:
df.head()

In [None]:
# Index the frame by (season, date), modifying `df` in place; each row's
# label then becomes a (season, date) pair.
df.set_index(['season', 'date'], inplace=True)
df.head()

In [None]:
### Name and Copy Heatmaps and Shotmaps

In [None]:
# Directory names of the country and league below RAW_DATA_DIR / 'soccer'
country, league_dir = 'england', 'premier'

In [None]:
# Gather every file that sits in a 'heatmaps' directory anywhere below the
# raw soccer data of this country/league.
# Directories are matched on Path.name: splitting str(path) on '/' only
# works with POSIX separators. rglob is already recursive, so '*' suffices.
heatmap_dirs = [d for d in (RAW_DATA_DIR / 'soccer' / country / league_dir).rglob('*')
                if d.is_dir() and d.name == 'heatmaps']
heatmap_fps = []
for heatmap_dir in heatmap_dirs:
    heatmap_fps.extend(fp for fp in heatmap_dir.iterdir() if fp.is_file())

In [None]:
# Gather every file that sits in a 'shotmaps' directory anywhere below the
# raw soccer data of this country/league.
# Directories are matched on Path.name: splitting str(path) on '/' only
# works with POSIX separators. rglob is already recursive, so '*' suffices.
shotmap_dirs = [d for d in (RAW_DATA_DIR / 'soccer' / country / league_dir).rglob('*')
                if d.is_dir() and d.name == 'shotmaps']
shotmap_fps = []
for shotmap_dir in shotmap_dirs:
    shotmap_fps.extend(fp for fp in shotmap_dir.iterdir() if fp.is_file())

In [None]:
# Map '<season>__date__<home>__<away>' to the heatmap file of that match.
# 'date' is a literal placeholder in the key.
fps = {}
for heatmap_fp in heatmap_fps:
    # File names look like '<home>__<away>.<ext>'. Parse the stem only, so
    # neither a directory name containing '__' nor a '.' inside a team name
    # can corrupt the result.
    h, a = (name.lower().replace(' ', '-') for name in heatmap_fp.stem.split('__', 1))
    # Swap in the standard name where one is defined, else keep the slug
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the directory two levels above the image file
    season = heatmap_fp.parent.parent.name
    fps[season + '__' + 'date' + '__' + h + '__' + a] = heatmap_fp

# Take a peek at the first few entries
for k in list(fps.keys())[0:5]:
    print(f'{k} : {fps[k]}')

In [None]:
# Map '<season>__date__<home>__<away>' to the shotmap file of that match.
# 'date' is a literal placeholder in the key.
fpss = {}
for shotmap_fp in shotmap_fps:
    # File names look like '<home>__<away>.<ext>'. Parse the stem only, so
    # neither a directory name containing '__' nor a '.' inside a team name
    # can corrupt the result.
    h, a = (name.lower().replace(' ', '-') for name in shotmap_fp.stem.split('__', 1))
    # Swap in the standard name where one is defined, else keep the slug
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the directory two levels above the image file
    season = shotmap_fp.parent.parent.name
    fpss[season + '__' + 'date' + '__' + h + '__' + a] = shotmap_fp

# Take a peek at the original shotmap filepaths
for k in list(fpss.keys())[0:5]:
    print(f'{k} : {fpss[k]}')

## Merge - Get Image Paths into DataFrame

In [None]:
# Work on a deep copy so `df` itself is not modified by the columns added below
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    """
    Copy the file at src_fp to dest_fp unless dest_fp already exists.

    Both arguments are pathlib.Path objects. Missing parent directories of
    dest_fp are created. An existing destination is left as it is, and the
    source file is only read. Returns None.
    """
    already_there = dest_fp.exists()
    if already_there:
        return
    target_dir = dest_fp.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    # Byte-for-byte copy; the original file is left untouched
    dest_fp.write_bytes(src_fp.read_bytes())

def form_image_fp(row, league=None, image_type=None):
    """
    Copy a match's heatmap into the interim directory and return its path.

    row.name is expected to be a (season, timestamp) pair and row must hold
    the team names under 'h' and 'a'. The source image is looked up in the
    module-level `fps` dict under '<season>__date__<h>__<a>'. It is copied
    (unless already present) to INTERIM_DATA_DIR / league / 'heatmaps',
    named after the key with 'date' replaced by the match date, plus the
    source file's suffix.

    Returns the destination path as a string with str(PROJECT_DIR) removed.
    image_type is accepted but not used: the 'heatmaps' directory is
    hard-coded.
    """
    season = row.name[0]
    match_date = str(row.name[1].date())
    home, away = row['h'], row['a']
    lookup_key = '__'.join((season, 'date', home, away))
    src_fp = fps[lookup_key]
    dated_name = str(lookup_key.replace('date', match_date) + src_fp.suffix)
    dest_fp = INTERIM_DATA_DIR / league / 'heatmaps' / dated_name
    copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Compute (and copy) the heatmap of each row (axis=1) and store the returned
# path as a new column
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()


In [None]:
def form_image_fp(row, league=None, image_type=None):
    """
    Copy a match's shotmap into the interim directory and return its path.

    row.name is expected to be a (season, timestamp) pair and row must hold
    the team names under 'h' and 'a'. The source image is looked up in the
    module-level `fpss` dict under '<season>__date__<h>__<a>'. It is copied
    (unless already present) to INTERIM_DATA_DIR / league / 'shotmaps',
    named after the key with 'date' replaced by the match date, plus the
    source file's suffix.

    Returns the destination path as a string with str(PROJECT_DIR) removed.
    image_type is accepted but not used: the 'shotmaps' directory is
    hard-coded.
    """
    season = row.name[0]
    match_date = str(row.name[1].date())
    home, away = row['h'], row['a']
    lookup_key = '__'.join((season, 'date', home, away))
    src_fp = fpss[lookup_key]
    dated_name = str(lookup_key.replace('date', match_date) + src_fp.suffix)
    dest_fp = INTERIM_DATA_DIR / league / 'shotmaps' / dated_name
    copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Compute (and copy) the shotmap of each row (axis=1) and store the returned
# path as a new column
df1.loc[:, 'shotmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='shotmaps', axis=1)
df1.head()


In [None]:
df1.iloc[0].loc['heatmap_path']

In [None]:
df1.iloc[0].loc['shotmap_path']

In [None]:
# Write the frame, including its index, to the league's interim directory
df1.to_csv(INTERIM_DATA_DIR / league / save_file_name, index=True)

In [None]:
## Tidy up code - leave image paths as relative to project dir