# Standardize Team Names

In [1]:
import pathlib
import sys
import pickle

from typing import List

import numpy as np
import pandas as pd

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
# add the 'src' directory to path to import modules
src_dir = pathlib.Path().cwd().resolve().parent / 'src'
#src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
# import my class code from the source
# %aimport src-dir.filename

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

PROJECT_DIR = pathlib.Path.cwd().resolve().parent
#DATA_DIR = PROJECT_DIR / 'data'
#RAW_DATA_DIR = PROJECT_DIR / 'data' / '01-raw'
CLEANED_DATA_DIR = PROJECT_DIR / 'data' / '03-cleaned'
STDZED_DATA_DIR = PROJECT_DIR / 'data' / '04-standardized'
REF_DATA_DIR = PROJECT_DIR / 'data' / 'reference'

## Select League Data

## Standardize Football- Data-co - uk

+ Load standardize dictionary
+ Use dictionary to form path to clean dataframes
+ Load dataframes
+ Standardize team names
+ save to standardized directory

In [2]:
def get_fps(top_level_dir, ext='csv'):
    """
    COMMON !!!!!!!!!!!
    """
    src_fps = list(top_level_dir.rglob('*.' + ext))
    return src_fps

def read_csvs(fps):
    """
    COMMON !!!!!!!!!!
    """
    dfs = [pd.read_csv(fp) for fp in fps]
    return dfs

def get_std_dict(df, std_dict_top_dir):
    """
    COMMON !!!!!!!!!!!!!!!!!!
    """
    nation = df['nation'].unique()[0]
    league = df['league'].unique()[0]
    fn = league + '.pkl'
    fp = std_dict_top_dir / nation / league / fn
    
    try:
        with open(fp, 'rb') as handle:
            std_dict = pickle.load(handle)
        return std_dict
    except:
        return {'key':'value'}
    
def get_std_dict_from_path(fp, std_dict_top_dir):
    #'/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/
    #03-cleaned/whoscored-com-shotmaps/england/english-premier-league/2009-2010/Arsenal__Aston Villa.png')
    fp_parts = str(fp).split('/')
    nation = fp_parts[-4]
    league = fp_parts[-3]
    fn = league + '.pkl'
    fp = std_dict_top_dir / nation / league / fn
    # print(fp)
    
    try:
        with open(fp, 'rb') as handle:
            std_dict = pickle.load(handle)
    except:
        std_dict = {'key':'value'}
    return std_dict

def standardize_team_names(df_orig, std_names_dict):
    """
    COMMON !!!!!!!!!!!!!!!!
    """
    df = df_orig.copy(deep=True)
    
    # If there is no standard dictioanry available yet, return an empty dataframe
    # so that we don't write a non-standardized dataframe to the standardized directory
    if std_names_dict == {'key':'value'}:
        df = pd.DataFrame()
    else:
        # Standardize the team names
        df['h'] = df['h'].str.strip().str.lower().str.replace(' ', '-')
        df.loc[df['h'].isin(std_names_dict.keys()), 'h'] = df['h'].map(std_names_dict)
        df['a'] = df['a'].str.strip().str.lower().str.replace(' ', '-')
        df.loc[df['a'].isin(std_names_dict.keys()), 'a'] = df['a'].map(std_names_dict)
        if df['h'].isnull().sum() + df['a'].isnull().sum() > 0:
            print(df.head(2))
    return df

def standardize_dfs(dfs, std_dict_top_dir):
    """
    COMMON !!!!!!!!!!!!
    """
    stdzed_dfs = []
    for df in dfs:
        std_dict = get_std_dict(df, std_dict_top_dir)
        df = standardize_team_names(df, std_dict)
        stdzed_dfs.append(df)
    return stdzed_dfs


def make_save_fps(top_level_dir, season_dfs, source = 'indatabet-com'):
    """
    COMMON !!!!!!!!!!!!
    """
    
    scoped_fdcuk_fps = []
    for season_df in season_dfs:
        nation = season_df['nation'].unique()[0]
        league = season_df['league'].unique()[0]
        season = season_df['season'].unique()[0]
        fn = str(season) + '.csv'
        save_fp = top_level_dir / source / nation / league / season / fn
        scoped_fdcuk_fps.append(save_fp)
    return scoped_fdcuk_fps


def save_dfs_to_fps(dfs, fps):
    """
    COMMON !!!!!!!!!!!!!
    """
    n = 0
    for df, fp in zip(dfs, fps):
        if not fp.exists():
            fp.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(fp, index=False)
        n += 1
    return n


def standardize_team_names_on_fp(fp, std_names_dict):
#     dest_fps = []
#     # Change to create parallel list, and then copy across with standard names
#     ######################## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #########################3
#     for heatmap_fp in heatmap_fps:
        #print(heatmap_fp)
    if std_names_dict == {'key': 'value'}:
        fn = None
    else:
        h = str(fp).split('__')[0].split('/')[-1].lower().replace(' ', '-')
        a = str(fp).split('__')[1].split('/')[0].split('.')[0].lower().replace(' ', '-')
        if h in std_names_dict.keys():
            h = std_names_dict[h]
        if a in std_names_dict.keys():
            a = std_names_dict[a]
        fn = h + '__' + a + fp.suffix
    return fn
#         season = str(heatmap_fp.parent.parent).split('/')[-1]
#         dest_fp_stub = pathlib.Path(str(heatmap_fp).replace('02-cleaned', '03-standardized')).parent
#         # print(dest_fp_stub)
#         fn = season + '__' + 'date' + '__' + h + '__' + a + heatmap_fp.suffix
#         dest_fp = dest_fp_stub / fn
#         #print(dest_fp)
#         dest_fps.append(dest_fp)

def standardize_fns(src_fps, std_dict_top_dir):
    stdzed_fns = []
    for fp in src_fps:
        std_dict = get_std_dict_from_path(fp, std_dict_top_dir)
        fn = standardize_team_names_on_fp(fp, std_dict)
        stdzed_fns.append(fn)
    return stdzed_fns
    

def standardize_fps(cleaned_src_fps, stdzed_fns, dest_top_dir, source_dir):
    # '/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/
    # 03-cleaned/whoscored-com-shotmaps/england/english-premier-league/2009-2010/Arsenal__Aston Villa.png'
    # stdzed_fn = 'arsenal__aston-villa.png'
    stdzed_dest_fps = []
    for cleaned_src_fp, stdzed_fn in zip(cleaned_src_fps, stdzed_fns):
        if stdzed_fn == None:
            stdzed_dest_fp = False
        else:
            fp_parts = str(cleaned_src_fp).split('/')
            nation = fp_parts[-4]
            league = fp_parts[-3]
            season = fp_parts[-2]
            stdzed_dest_fp = dest_top_dir / source_dir / nation / league / season / stdzed_fn
            # print(fp)
            # Compile dest file path
        stdzed_dest_fps.append(stdzed_dest_fp)
    return stdzed_dest_fps  
    

def copy_data(src_fps, dest_fps):
    """
    Accepts a list of strings representing filepaths
    Copies the files if they do not exist, otherwise counts if already there
    Returns the number of files copied, and number of files already existing
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        if not dest_fp.exists():
            dest_fp.parent.mkdir(parents=True, exist_ok=True)
            # Copy the original files without touching them
            dest_fp.write_bytes(src_fp.read_bytes())
            n_copied += 1
        else:
            n_exist += 1
    return n_copied, n_exist

for source_dir in ['football-data-co-uk', 'indatabet-com']:
    cleaned_fps = get_fps(CLEANED_DATA_DIR / source_dir, ext='csv')
    dfs = read_csvs(cleaned_fps)
    stdzed_dfs = standardize_dfs(dfs, REF_DATA_DIR)
    # Not all standardized dictionaries available yet
    stdzed_dfs = [df for df in stdzed_dfs if len(df) > 0]
    stdzed_dfs_fps = make_save_fps(STDZED_DATA_DIR,
                                    stdzed_dfs,
                                    source = source_dir)
    n_saved = save_dfs_to_fps(stdzed_dfs, stdzed_dfs_fps)
    print(n_saved)
    
for source_dir in ['whoscored-com-heatmaps', 'whoscored-com-shotmaps']:
    cleaned_src_fps = get_fps(CLEANED_DATA_DIR / source_dir, ext='png')
    stdzed_fns = standardize_fns(cleaned_src_fps, REF_DATA_DIR)
    stdzed_dest_fps = standardize_fps(cleaned_src_fps, stdzed_fns, STDZED_DATA_DIR, source_dir)
    # Because not all the standardized dictionaries are available
    # We only want to copy images that have actually been standardized
    cleaned_src_fps = [src_fp for src_fp, dest_fp in zip(cleaned_src_fps, stdzed_dest_fps) if dest_fp != False]
    stdzed_dest_fps = [dest_fp for src_fp, dest_fp in zip(cleaned_src_fps, stdzed_dest_fps) if dest_fp != False]
    
    #stdzed_dest_fps = fp for fp in stdzed_dest_fps if fp not None]
    n_copied, n_exist = copy_data(cleaned_src_fps, stdzed_dest_fps)
    print(n_copied, n_saved)
    
    
    


17
11
3420 11
3420 11


In [3]:
cleaned_src_fps[0]

PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/03-cleaned/whoscored-com-shotmaps/england/english-premier-league/2009-2010/Arsenal__Aston Villa.png')

In [4]:
stdzed_fns[0]

'arsenal__aston-villa.png'

In [5]:
stdzed_dest_fps[0]

PosixPath('/media/david/5C14F53A14F517AA/code/ana_py37/projects/soccer-predictions/data/04-standardized/whoscored-com-shotmaps/england/english-premier-league/2009-2010/arsenal__aston-villa.png')

In [6]:
stop

NameError: name 'stop' is not defined

In [None]:
dfs[0].head()

## Use Dictionary to form path to cleaned dataframes dir

In [None]:
nations = ['united-kingdom']
leagues = ['english-premier-league']
#sources = ['football-data-co-uk', 'indatabet-com', 'whoscored-com']

In [None]:
def form_fdcu_fps(CLEANED_DATA_DIR, nation, league, seasons):
    """
    Accepts directory names to enable reach into raw data directory
    Returns full filepaths of the data files
    This is different - different signature no source - to other versions - needs to be standardized
    """
    fps = []
    fdcu_specifics = ['football-data-co-uk', 'season-data']
    for season in seasons:
        fn = season + '.csv'
        stub = CLEANED_DATA_DIR / fdcu_specifics[0] / nation / league / season
        fp = stub / fn
        #if fp.is_file():
        if fp.exists():
            fps.append(fp)
    return fps

fps = form_fdcu_fps(CLEANED_DATA_DIR, nations[0], leagues[0], seasons)

In [None]:
all_fps = []
for nation, league in zip(nations, leagues):
    fps = form_fdcu_fps(CLEANED_DATA_DIR, nation, league, seasons)
    all_fps.extend(fps)

print(len(all_fps))

## Load DataFrames

In [None]:
season_dfs = []
for fp in all_fps:
    season_df = pd.read_csv(fp, parse_dates=['date'], index_col=None)
    season_dfs.append(season_df)
season_dfs[-1].head()

## Standardize Team Names

In [None]:
def copy_data(src_fps, dest_fps):
    """
    Accepts a list of strings representing filepaths
    Copies the files if they do not exist, otherwise counts if already there
    Returns the number of files copied, and number of files already existing
    """
#     dest_fps = []
#     for src_fp in src_fps:
#         #dest_fps.append(src_fp.replace(str(env_dir), str(project_dir)))
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
#         src_fp = pathlib.Path(src_fp)
#         dest_fp = pathlib.Path(dest_fp)
        if not dest_fp.exists():
            dest_fp.parent.mkdir(parents=True, exist_ok=True)
            # Copy the original files without touching them
            dest_fp.write_bytes(src_fp.read_bytes())
            n_copied += 1
        else:
            n_exist += 1
    return n_copied, n_exist

In [None]:
def stdze_names_to_dict(df_orig, std_names_d):
    df = df_orig.copy(deep=True)
    # Standardize the team names
    df['h'] = df['h'].str.strip().str.lower().str.replace(' ', '-')
    df.loc[df['h'].isin(std_names_d.keys()), 'h'] = df['h'].map(std_names_d)
    df['a'] = df['a'].str.strip().str.lower().str.replace(' ', '-')
    df.loc[df['a'].isin(std_names_d.keys()), 'a'] = df['a'].map(std_names_d)
    return df

## Save to Standardized Directory

In [None]:
#Change this to keep list of src and dest filepaths
for df, src_fp in zip(season_dfs, all_fps):
    new_df = stdze_names_to_dict(df, std_names_d)
    # Write to new directory
#     print(src_fp)
    dest_fp = pathlib.Path(str(src_fp).replace('02-cleaned', '03-standardized'))
#     print(dest_fp)
    new_df.to_csv(dest_fp, index=None)
    # copy_data([src_fp], [dest_fp])
    

# Standardize Indatabet com

+ Use dictionary to form path to clean dataframes
+ Load dataframes
+ Standardize team names
+ save to standardized directory

In [None]:
def form_indatabet_fps(CLEANED_DATA_DIR, nation, league, seasons):
    """
    Accepts directory names to enable reach into raw data directory
    Returns full filepaths of the data files
    This is different - different signature no source - to other versions - needs to be standardized
    """
    fps = []
    indatabet_specifics = ['indatabet-com', 'season-data']
    for season in seasons:
        fn = season + '.csv'
        stub = CLEANED_DATA_DIR / indatabet_specifics[0] / nation / league / season
        fp = stub / fn
        #if fp.is_file():
        if fp.exists():
            fps.append(fp)
    return fps

fps = form_indatabet_fps(CLEANED_DATA_DIR, nations[0], leagues[0], seasons)

In [None]:
all_fps = []
for nation, league in zip(nations, leagues):
    fps = form_indatabet_fps(CLEANED_DATA_DIR, nation, league, seasons)
    all_fps.extend(fps)

print(len(all_fps))

In [None]:
season_dfs = []
for fp in all_fps:
    season_df = pd.read_csv(fp, parse_dates=['date'], index_col=None)
    season_dfs.append(season_df)
season_dfs[0].head()

In [None]:
def stdze_names_to_dict(df_orig, std_names_d):
    df = df_orig.copy(deep=True)
    # Standardize the team names
    df['h'] = df['h'].str.strip().str.lower().str.replace(' ', '-')
    df.loc[df['h'].isin(std_names_d.keys()), 'h'] = df['h'].map(std_names_d)
    df['a'] = df['a'].str.strip().str.lower().str.replace(' ', '-')
    df.loc[df['a'].isin(std_names_d.keys()), 'a'] = df['a'].map(std_names_d)
#     print(sorted(df['h'].unique()))
#     print(sorted(df['a'].unique()))
#     print('\n\n')
    return df

In [None]:
#Change this to keep list of src and dest filepaths
for df, src_fp in zip(season_dfs, all_fps):
    new_df = stdze_names_to_dict(df, std_names_d)
    # Write to new directory
#     print(src_fp)
    dest_fp = pathlib.Path(str(src_fp).replace('02-cleaned', '03-standardized'))
#     print(dest_fp)
    # copy_data([src_fp], [dest_fp])
    new_df.to_csv(dest_fp, index=None)

# Standardize Whoscored com

+ Use dictionary to form path to clean images
+ Load image filepaths
+ Standardize team names
+ save to standardized directory

In [None]:
nation = 'united-kingdom'
league = 'english-premier-league'

In [None]:
heatmap_fps = []
heatmap_dirs = [d for d in pathlib.Path(CLEANED_DATA_DIR /'whoscored-com' / nation / league).rglob('**/*')
               if d.is_dir() and str(d).split('/')[-1] =='heatmaps']
for heatmap_dir in heatmap_dirs:
    heatmaps = [fp for fp in heatmap_dir.iterdir() if fp.is_file()]
    heatmap_fps.extend(heatmaps)

In [None]:
heatmap_fps[0:5]

In [None]:
len(heatmap_fps)

In [None]:
shotmap_fps = []
shotmap_dirs = [d for d in pathlib.Path(CLEANED_DATA_DIR /'whoscored-com'/ nation / league).rglob('**/*')
               if d.is_dir() and str(d).split('/')[-1] =='shotmaps']
for shotmap_dir in shotmap_dirs:
    shotmaps = [fp for fp in shotmap_dir.iterdir() if fp.is_file()]
    shotmap_fps.extend(shotmaps)

In [None]:
len(shotmap_fps)

In [None]:
dest_fps = []
# Change to create parallel list, and then copy across with standard names
######################## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #########################3
for heatmap_fp in heatmap_fps:
    #print(heatmap_fp)
    h = str(heatmap_fp).split('__')[0].split('/')[-1].lower().replace(' ', '-')
    a = str(heatmap_fp).split('__')[1].split('/')[0].split('.')[0].lower().replace(' ', '-')
    if h in std_names_d.keys():
        h = std_names_d[h]
    if a in std_names_d.keys():
        a = std_names_d[a]
    season = str(heatmap_fp.parent.parent).split('/')[-1]
    dest_fp_stub = pathlib.Path(str(heatmap_fp).replace('02-cleaned', '03-standardized')).parent
    # print(dest_fp_stub)
    fn = season + '__' + 'date' + '__' + h + '__' + a + heatmap_fp.suffix
    dest_fp = dest_fp_stub / fn
    #print(dest_fp)
    dest_fps.append(dest_fp)
    #break
#     fps[season + '__' + 'date' + '__' + h + '__' + a] = heatmap_fp
# for k in list(fps.keys())[0:5]:
#     print(f'{k} : {fps[k]}')
#print(fps.items()[0:5])

In [None]:
n_copied, n_exist= copy_data(heatmap_fps, dest_fps)
print(n_copied, n_exist)

In [None]:
dst_sm_fps = []
for shotmap_fp in shotmap_fps:
#     print(heatmap_fp)
    h = str(shotmap_fp).split('__')[0].split('/')[-1].lower().replace(' ', '-')
    a = str(shotmap_fp).split('__')[1].split('/')[0].split('.')[0].lower().replace(' ', '-')
    if h in std_names_d.keys():
        h = std_names_d[h]
    if a in std_names_d.keys():
        a = std_names_d[a]
    season = str(shotmap_fp.parent.parent).split('/')[-1] 
    dest_sm_fp_stub = pathlib.Path(str(shotmap_fp).replace('02-cleaned', '03-standardized')).parent
    # print(dest_fp_stub)
    fn = season + '__' + 'date' + '__' + h + '__' + a + shotmap_fp.suffix
    dst_sm_fp = dest_sm_fp_stub / fn
    #print(dest_fp)
    dst_sm_fps.append(dst_sm_fp)

#     # Take a peek at the original shotmap filepaths 
# for k in list(fpss.keys())[0:5]:
#     print(f'{k} : {fpss[k]}')

In [None]:
n_copied, n_exist= copy_data(shotmap_fps, dst_sm_fps)
print(n_copied, n_exist)

In [None]:
stop

In [None]:
hm_src_fps = [] ; hm_dest_fps = []
sm_src_fps = [] ; sm_dest_fps = []
for nation, league in zip(nations, leagues):
    print(nation, league)
    for season in seasons:
        src_stub = RAW_DATA_DIR / 'football-data' / nation / league / season / 'who-scored-com'
        dest_stub = CLEANED_DATA_DIR / 'whoscored-com' / nation / league / season 
#         print(stub)
        hm_src_dir = src_stub / 'game-heatmaps'
        sm_src_dir = src_stub / 'game-shotmaps'
        if hm_src_dir.exists() and len(list(hm_src_dir.glob('*.png'))) > 0:
            hm_src_fps_sublist = list(hm_src_dir.glob('*.png'))
            hm_src_fps.append(hm_src_fps_sublist)
            hm_src_fps.append(list(hm_src_dir.glob('*.png')))
            #print(hm_src_fps[0:3], '\n')
            hm_dest_fps.append([dest_stub / 'heatmaps' / fp.name for fp in hm_src_fps_sublist])
            
        if sm_src_dir.exists() and len(list(sm_src_dir.glob('*.png'))) > 0:
            sm_src_fps_sublist = list(sm_src_dir.glob('*.png'))
            sm_src_fps.append(sm_src_fps_sublist)
            sm_src_fps.append(list(sm_src_dir.glob('*.png')))
            #print(hm_src_fps[0:3], '\n')
            sm_dest_fps.append([dest_stub / 'shotmaps' / fp.name for fp in sm_src_fps_sublist])
        
# hm_dest_fps[0:3]

In [None]:
def copy_data(src_fps, dest_fps):
    """
    Accepts a list of strings representing filepaths
    Copies the files if they do not exist, otherwise counts if already there
    Returns the number of files copied, and number of files already existing
    """
#     dest_fps = []
#     for src_fp in src_fps:
#         #dest_fps.append(src_fp.replace(str(env_dir), str(project_dir)))
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
#         src_fp = pathlib.Path(src_fp)
#         dest_fp = pathlib.Path(dest_fp)
        if not dest_fp.exists():
            dest_fp.parent.mkdir(parents=True, exist_ok=True)
            # Copy the original files without touching them
            dest_fp.write_bytes(src_fp.read_bytes())
            n_copied += 1
        else:
            n_exist += 1
    return n_copied, n_exist

In [None]:
hm_src_flatlist = [item for sublist in hm_src_fps for item in sublist]
hm_dest_flatlist = [item for sublist in hm_dest_fps for item in sublist]
n_copied, n_exist = copy_data(hm_src_flatlist, hm_dest_flatlist)
print(n_copied, n_exist)

In [None]:
sm_src_flatlist = [item for sublist in sm_src_fps for item in sublist]
sm_dest_flatlist = [item for sublist in sm_dest_fps for item in sublist]
n_copied, n_exist = copy_data(sm_src_flatlist, sm_dest_flatlist)
print(n_copied, n_exist)

In [None]:
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    if not dest_fp.exists():
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Copy the original files without touching them
        dest_fp.write_bytes(src_fp.read_bytes())

def form_image_fp(row, league=None, image_type=None):
    row = row.copy(deep=True)
    season = row.name[0]
    date = str(row.name[1].date())
    h = row['h']
    a = row['a']
    key = season + '__' + 'date' + '__' + h + '__' + a
    src_fp = fps[key]
    ext = src_fp.suffix
    dest_fp = INTERIM_DATA_DIR / league/ 'heatmaps' / str(key.replace('date', date) + src_fp.suffix) 
    # copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()

In [None]:
stop

In [None]:
league = 'english-premier-league'
fn = str('football-data-' + league + '.csv')
df = pd.read_csv(INTERIM_DATA_DIR / league / fn, parse_dates=['date'], index_col=None)
df.head()

In [None]:
save_file_name =  str('merge-1-' + league + '.csv')

## Standardize Team Names

In [None]:
epl_names_d = {'villa': 'aston-villa',
              'blackburn': 'blackburn-rovers',
              'birmingham': 'birmingham-city',
              'bolton': 'bolton-wanderers',
              'brighton': 'brighton-and-hove-albion',
              'brighton-&-Hove Albion': ' brighton-and-hove-albion',
              'cardiff': 'cardiff-city',
              'huddersfield': 'huddersfield-town',
              'hull': 'hull-city',
              'leicester': 'leicester-city',
              'leicester-cty': 'leicester-city',
              'man-city': 'manchester-city',
              'man-u': 'manchester-united',
              'man-utd': 'manchester-united',
              'manchester-utd': 'manchester-united',
              'man-united': 'manchester-united',
              'boro': 'middlesborough',
              'newcastle': 'newcastle-united',
              'newcastle-utd': 'newcastle-united',
              'norwich': 'norwich-city',
              'qpr': 'queens-park-rangers',
              'stoke': 'stoke-city',
              'swansea': 'swansea-city',
              'tottenham': 'tottenham-hotspur',
              'west-brom': 'west-bromwich-albion',
              'west-ham': 'west-ham-united',
              'wigan': 'wigan-athletic',
              'wolves': 'wolverhampton-wanderers'}

In [None]:
fn = 'english-premier-league_std_name_dict.pkl'

with open(REF_DATA_DIR / fn, 'wb') as handle:
    pickle.dump(epl_names_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [None]:
with open(REF_DATA_DIR / fn, 'rb') as handle:
    std_names_d = pickle.load(handle)

In [None]:
# Standardize the team names
df['h'] = df['h'].str.strip().str.lower().str.replace(' ', '-')
df.loc[df['h'].isin(std_names_d.keys()), 'h'] = df['h'].map(std_names_d)
df['a'] = df['a'].str.strip().str.lower().str.replace(' ', '-')
df.loc[df['a'].isin(std_names_d.keys()), 'a'] = df['a'].map(std_names_d)

In [None]:
df.head()

In [None]:
df.set_index(['season', 'date'], inplace=True)
df.head()

In [None]:
### Name and Copy Heatmaps and Shotmaps

In [None]:
country = 'england'
league_dir = 'premier'

In [None]:
heatmap_fps = []
heatmap_dirs = [d for d in pathlib.Path(RAW_DATA_DIR /'soccer' / country / league_dir).rglob('**/*')
               if d.is_dir() and str(d).split('/')[-1] =='heatmaps']
for heatmap_dir in heatmap_dirs:
    heatmaps = [fp for fp in heatmap_dir.iterdir() if fp.is_file()]
    heatmap_fps.extend(heatmaps)

In [None]:
shotmap_fps = []
shotmap_dirs = [d for d in pathlib.Path(RAW_DATA_DIR /'soccer'/ country / league_dir).rglob('**/*')
               if d.is_dir() and str(d).split('/')[-1] =='shotmaps']
for shotmap_dir in shotmap_dirs:
    shotmaps = [fp for fp in shotmap_dir.iterdir() if fp.is_file()]
    shotmap_fps.extend(shotmaps)

In [None]:
fps = {}
for heatmap_fp in heatmap_fps:
#     print(heatmap_fp)
    h = str(heatmap_fp).split('__')[0].split('/')[-1].lower().replace(' ', '-')
    a = str(heatmap_fp).split('__')[1].split('/')[0].split('.')[0].lower().replace(' ', '-')
    if h in std_names_d.keys():
        h = std_names_d[h]
    if a in std_names_d.keys():
        a = std_names_d[a]
    season = str(heatmap_fp.parent.parent).split('/')[-1] 
    fps[season + '__' + 'date' + '__' + h + '__' + a] = heatmap_fp
for k in list(fps.keys())[0:5]:
    print(f'{k} : {fps[k]}')
#print(fps.items()[0:5])

In [None]:
fpss = {}
for shotmap_fp in shotmap_fps:
#     print(heatmap_fp)
    h = str(shotmap_fp).split('__')[0].split('/')[-1].lower().replace(' ', '-')
    a = str(shotmap_fp).split('__')[1].split('/')[0].split('.')[0].lower().replace(' ', '-')
    if h in std_names_d.keys():
        h = std_names_d[h]
    if a in std_names_d.keys():
        a = std_names_d[a]
    season = str(shotmap_fp.parent.parent).split('/')[-1] 
    fpss[season + '__' + 'date' + '__' + h + '__' + a] = shotmap_fp

    # Take a peek at the original shotmap filepaths 
for k in list(fpss.keys())[0:5]:
    print(f'{k} : {fpss[k]}')

## Merge - Get Image Paths into DataFrame

In [None]:
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    if not dest_fp.exists():
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Copy the original files without touching them
        dest_fp.write_bytes(src_fp.read_bytes())

def form_image_fp(row, league=None, image_type=None):
    row = row.copy(deep=True)
    season = row.name[0]
    date = str(row.name[1].date())
    h = row['h']
    a = row['a']
    key = season + '__' + 'date' + '__' + h + '__' + a
    src_fp = fps[key]
    ext = src_fp.suffix
    dest_fp = INTERIM_DATA_DIR / league/ 'heatmaps' / str(key.replace('date', date) + src_fp.suffix) 
    copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()


In [None]:
def form_image_fp(row, league=None, image_type=None):
    row = row.copy(deep=True)
    season = row.name[0]
    date = str(row.name[1].date())
    h = row['h']
    a = row['a']
    key = season + '__' + 'date' + '__' + h + '__' + a
    src_fp = fpss[key]
    ext = src_fp.suffix
    dest_fp = INTERIM_DATA_DIR / league/ 'shotmaps' / str(key.replace('date', date) + src_fp.suffix) 
    copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
df1.loc[:, 'shotmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='shotmaps', axis=1)
df1.head()


In [None]:
df1.iloc[0].loc['heatmap_path']

In [None]:
df1.iloc[0].loc['shotmap_path']

In [None]:
df1.to_csv(INTERIM_DATA_DIR / league / save_file_name, index=True)

In [None]:
## Tidy up code - leave image paths as relative to project dir