# Copy Whoscored Images

In [1]:
import pathlib
import sys
import pickle

from typing import List

import numpy as np
import pandas as pd

# Load the "autoreload" IPython extension
%load_ext autoreload
# Mode 1: only modules registered with "%aimport" are reloaded before a cell runs
%autoreload 1
# add the 'src' directory to path to import modules
src_dir = pathlib.Path.cwd().resolve().parent / 'src'
# sys.path entries must be plain strings: the import system ignores every
# other type, so appending the pathlib.Path itself would silently do nothing.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))
# import my class code from the source
# %aimport src-dir.filename

# Let pandas print up to 500 columns on a 1000-character wide console
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# PROJECT_DIR is the parent of the current working directory
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
#DATA_DIR = PROJECT_DIR / 'data'
RAW_DATA_DIR = PROJECT_DIR.joinpath('data', '01-raw')
SCOPED_DATA_DIR = PROJECT_DIR.joinpath('data', '02-scoped')
# NOTE(review): cells further down still use INTERIM_DATA_DIR and REF_DATA_DIR,
# which are only present here as comments -- confirm the intended locations.
#INTERIM_DATA_DIR = DATA_DIR / '02-interim' / 'football-data'
#REF_DATA_DIR = DATA_DIR / 'reference'

In [2]:
# Scope of the copy: 'nations' and 'leagues' are parallel lists (entry i of one
# belongs to entry i of the other), 'seasons' applies to every pair.
# Each (nation, league) directory pair is written on one line and the two
# parallel lists are obtained by unzipping the pairs.
scope_data = dict(
    zip(
        ('nations', 'leagues'),
        map(list, zip(
            ('germany', 'bundesliga'),
            ('england', 'english-premier-league'),
            ('spain', 'la-liga'),
            ('italy', 'serie-a'),
            ('france', 'ligue-1'),
            ('england', 'english-championship'),
            ('germany', 'bundesliga-2'),
            ('netherlands', 'eredivisie'),
            ('russian-federation', 'premier-league'),
            ('scotland', 'premiership'),
            ('portugal', 'primeira-liga'),
            ('switzerland', 'super-league'),
            ('belgium', 'first-division-a'),
            ('turkey', 'super-lig'),
            ('poland', 'ekstraklasa'),
            ('england', 'one'),
        )),
    ),
    # '2000-2001' up to and including '2017-2018'
    seasons=[f'{start}-{start + 1}' for start in range(2000, 2018)],
)

In [3]:
def make_whoscored_image_src_fps(top_level_dir, scope_data, image_type='game-heatmaps'):
    """
    Collect the who-scored-com png files of one image type for every scoped league season.

    For each (nation, league) pair -- taken position-wise from
    scope_data['nations'] and scope_data['leagues'] -- and each entry of
    scope_data['seasons'], the directory
        top_level_dir/football-data/<nation>/<league>/<season>/who-scored-com/<image_type>
    is searched for '*.png' files.
    Returns one flat list of the paths found; directories that are missing or
    contain no png files contribute nothing.
    """
    football_root = top_level_dir / 'football-data'
    found = []
    for nation, league in zip(scope_data['nations'], scope_data['leagues']):
        league_root = football_root / nation / league
        for season in scope_data['seasons']:
            image_dir = league_root / season / 'who-scored-com' / image_type
            pngs = list(image_dir.glob('*.png'))
            if not pngs or not image_dir.exists():
                continue
            found.extend(pngs)
    return found
            
def make_whoscored_image_dest_fps(src_top_level_dir, dest_top_level_dir, src_fps, image_type='game-heatmaps'):
    """
    Build the destination path for every source image path.

    A source path must end in
        .../<nation>/<league>/<season>/<provider>/<image dir>/<file name>
    for example
        data/01-raw/football-data/germany/bundesliga/2009-2010/who-scored-com/game-heatmaps/Bayern__Bochum.png

    and the destination should look like
        dest_top_level_dir/<source>/<nation>/<league>/<season>/<file name>
    for example
        data/02-scoped/whoscored-com-heatmaps/germany/bundesliga/2009-2010/Bayern__Bochum.png

    <source> joins the first two hyphen-separated words of the provider
    directory, appends its last word, then the last word of image_type
    ('who-scored-com' + 'game-heatmaps' -> 'whoscored-com-heatmaps').

    src_top_level_dir  : not used; kept so existing callers keep working
    dest_top_level_dir : path object the destinations are rooted at
    src_fps            : iterable of pathlib path objects
    image_type         : hyphenated image type; only its last word is used

    Returns the list of destination paths, in the order of src_fps.
    Raises IndexError if a source path has fewer components than described
    above, or if the provider directory name contains no hyphen.
    """
    image_kind = image_type.split('-')[-1]
    dest_fps = []
    for src_fp in src_fps:
        # Path.parts splits on the path flavour's own separator, so this also
        # works where str(path) is not '/'-separated (e.g. Windows paths).
        parts = src_fp.parts
        provider_words = parts[-3].split('-')
        source = f'{provider_words[0]}{provider_words[1]}-{provider_words[-1]}-{image_kind}'
        nation, league, season = parts[-6], parts[-5], parts[-4]
        dest_fps.append(dest_top_level_dir / source / nation / league / season / src_fp.name)

    return dest_fps

def copy_data(src_fps, dest_fps):
    """
    Copy each source file to its paired destination unless the destination exists.

    src_fps and dest_fps are parallel iterables of file paths, given either as
    strings or as pathlib.Path objects. They are paired position-wise; surplus
    entries of the longer iterable are ignored.
    Missing destination directories are created.
    Returns the number of files copied, and number of files already existing,
    as the tuple (n_copied, n_exist).
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        # Accept plain strings as well as Path objects
        src_fp = pathlib.Path(src_fp)
        dest_fp = pathlib.Path(dest_fp)
        if dest_fp.exists():
            n_exist += 1
            continue
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Copy the original files without touching them
        dest_fp.write_bytes(src_fp.read_bytes())
        n_copied += 1
    return n_copied, n_exist


# Copy both kinds of who-scored-com images from the raw tree into the scoped tree
for image_type in ('game-heatmaps', 'game-shotmaps'):
    src_fps = make_whoscored_image_src_fps(RAW_DATA_DIR, scope_data, image_type=image_type)
    dest_fps = make_whoscored_image_dest_fps(
        RAW_DATA_DIR, SCOPED_DATA_DIR, src_fps, image_type=image_type)
    # After the loop these counts describe the last image type only
    n_copied, n_exist = copy_data(src_fps, dest_fps)


In [4]:
# Undefined name: evaluating it raises NameError (presumably a deliberate way
# to halt "Run All" before the older cells below -- TODO confirm)
stop

NameError: name 'stop' is not defined

In [None]:
# Per-season sublists of heatmap (hm) and shotmap (sm) paths. Source and
# destination lists are built in lockstep: sublist i of *_src_fps pairs with
# sublist i of *_dest_fps, so each must receive exactly one append per season.
hm_src_fps = []
hm_dest_fps = []
sm_src_fps = []
sm_dest_fps = []
# NOTE(review): `nations`, `leagues`, `seasons` and `CLEANED_DATA_DIR` are not
# defined in the visible cells -- presumably left over from an earlier
# session; confirm before running this cell.
for nation, league in zip(nations, leagues):
    print(nation, league)
    for season in seasons:
        src_stub = RAW_DATA_DIR / 'football-data' / nation / league / season / 'who-scored-com'
        dest_stub = CLEANED_DATA_DIR / 'whoscored-com' / nation / league / season
        hm_src_dir = src_stub / 'game-heatmaps'
        sm_src_dir = src_stub / 'game-shotmaps'

        # Glob each directory once. The source sublist used to be appended
        # twice while the destination sublist was appended once, which shifted
        # every later source/destination pairing after flattening.
        hm_src_fps_sublist = list(hm_src_dir.glob('*.png'))
        if hm_src_dir.exists() and hm_src_fps_sublist:
            hm_src_fps.append(hm_src_fps_sublist)
            hm_dest_fps.append([dest_stub / 'heatmaps' / fp.name for fp in hm_src_fps_sublist])

        sm_src_fps_sublist = list(sm_src_dir.glob('*.png'))
        if sm_src_dir.exists() and sm_src_fps_sublist:
            sm_src_fps.append(sm_src_fps_sublist)
            sm_dest_fps.append([dest_stub / 'shotmaps' / fp.name for fp in sm_src_fps_sublist])

In [None]:
def copy_data(src_fps, dest_fps):
    """
    Copy each source file to its paired destination unless the destination exists.

    src_fps and dest_fps are parallel iterables of file paths, given either as
    strings or as pathlib.Path objects. They are paired position-wise; surplus
    entries of the longer iterable are ignored.
    Missing destination directories are created.
    Returns the number of files copied, and number of files already existing,
    as the tuple (n_copied, n_exist).
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        # Accept plain strings as well as Path objects
        src_fp = pathlib.Path(src_fp)
        dest_fp = pathlib.Path(dest_fp)
        if dest_fp.exists():
            n_exist += 1
            continue
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Copy the original files without touching them
        dest_fp.write_bytes(src_fp.read_bytes())
        n_copied += 1
    return n_copied, n_exist

In [None]:
# Flatten the per-season heatmap sublists into two parallel flat lists, then copy
hm_src_flatlist = [fp for season_fps in hm_src_fps for fp in season_fps]
hm_dest_flatlist = [fp for season_fps in hm_dest_fps for fp in season_fps]
n_copied, n_exist = copy_data(hm_src_flatlist, hm_dest_flatlist)
print(f'{n_copied} {n_exist}')

In [None]:
# Flatten the per-season shotmap sublists into two parallel flat lists, then copy
sm_src_flatlist = [fp for season_fps in sm_src_fps for fp in season_fps]
sm_dest_flatlist = [fp for season_fps in sm_dest_fps for fp in season_fps]
n_copied, n_exist = copy_data(sm_src_flatlist, sm_dest_flatlist)
print(f'{n_copied} {n_exist}')

In [None]:
# Undefined name: evaluating it raises NameError (presumably a deliberate way
# to halt execution before the cells below -- TODO confirm)
stop

In [None]:
# Load the football-data csv of one league, parsing the 'date' column as dates
# NOTE(review): INTERIM_DATA_DIR only appears as a comment in the setup cell --
# confirm it is defined before running.
league = 'english-premier-league'
fn = f'football-data-{league}.csv'
df = pd.read_csv(INTERIM_DATA_DIR / league / fn, index_col=None, parse_dates=['date'])
df.head()

In [None]:
# File name under which the merged frame is written at the end of the notebook
save_file_name = f'merge-1-{league}.csv'

## Standardize Team Names

In [None]:
# Maps short or variant English Premier League team names to one standard name.
# Keys must already be in the normalised form the lookups use (stripped,
# lower-case, spaces replaced by hyphens); a key in any other form can never
# match. Values are the standard names and carry no surrounding whitespace.
epl_names_d = {
    'villa': 'aston-villa',
    'blackburn': 'blackburn-rovers',
    'birmingham': 'birmingham-city',
    'bolton': 'bolton-wanderers',
    'brighton': 'brighton-and-hove-albion',
    # was 'brighton-&-Hove Albion' -> ' brighton-and-hove-albion': the key was
    # not normalised and the value started with a space
    'brighton-&-hove-albion': 'brighton-and-hove-albion',
    'cardiff': 'cardiff-city',
    'huddersfield': 'huddersfield-town',
    'hull': 'hull-city',
    'leicester': 'leicester-city',
    'leicester-cty': 'leicester-city',
    'man-city': 'manchester-city',
    'man-u': 'manchester-united',
    'man-utd': 'manchester-united',
    'manchester-utd': 'manchester-united',
    'man-united': 'manchester-united',
    # NOTE(review): the club is usually spelled 'middlesbrough'; left unchanged
    # because it has to match the spelling used in the match data -- confirm.
    'boro': 'middlesborough',
    'newcastle': 'newcastle-united',
    'newcastle-utd': 'newcastle-united',
    'norwich': 'norwich-city',
    'qpr': 'queens-park-rangers',
    'stoke': 'stoke-city',
    'swansea': 'swansea-city',
    'tottenham': 'tottenham-hotspur',
    'west-brom': 'west-bromwich-albion',
    'west-ham': 'west-ham-united',
    'wigan': 'wigan-athletic',
    'wolves': 'wolverhampton-wanderers',
}

In [None]:
# Persist the name mapping as a pickle in the reference-data directory
fn = 'english-premier-league_std_name_dict.pkl'

with (REF_DATA_DIR / fn).open('wb') as handle:
    pickle.dump(epl_names_d, handle, pickle.HIGHEST_PROTOCOL)
    

In [None]:
# Read the name mapping back from the pickle file REF_DATA_DIR / fn
with (REF_DATA_DIR / fn).open('rb') as handle:
    std_names_d = pickle.load(handle)

In [None]:
# Standardize the team names in the 'h' and 'a' columns (presumably home and away)
for side in ('h', 'a'):
    # First bring every name into stripped, lower-case, hyphen-separated form ...
    df[side] = df[side].str.strip().str.lower().str.replace(' ', '-')
    # ... then replace the names that have an entry in std_names_d; all other
    # names stay as they are
    has_std_name = df[side].isin(std_names_d.keys())
    df.loc[has_std_name, side] = df[side].map(std_names_d)

In [None]:
df.head()

In [None]:
# Index the frame in place by (season, date); form_image_fp further down reads
# these two index levels through row.name[0] and row.name[1]
df.set_index(['season', 'date'], inplace=True)
df.head()

In [None]:
### Name and Copy Heatmaps and Shotmaps

In [None]:
# Directory names used by the next cells to locate the raw images under
# RAW_DATA_DIR / 'soccer' / country / league_dir
country = 'england'
league_dir = 'premier'

In [None]:
# Collect every file that sits directly inside a directory named 'heatmaps',
# at any depth below RAW_DATA_DIR / 'soccer' / country / league_dir.
heatmap_fps = []
# rglob('*') already walks the whole tree. The directory name is compared via
# Path.name instead of splitting str(path) on '/', which only works with
# POSIX separators.
heatmap_dirs = [d for d in (RAW_DATA_DIR / 'soccer' / country / league_dir).rglob('*')
                if d.is_dir() and d.name == 'heatmaps']
for heatmap_dir in heatmap_dirs:
    heatmaps = [fp for fp in heatmap_dir.iterdir() if fp.is_file()]
    heatmap_fps.extend(heatmaps)

In [None]:
# Collect every file that sits directly inside a directory named 'shotmaps',
# at any depth below RAW_DATA_DIR / 'soccer' / country / league_dir.
shotmap_fps = []
# rglob('*') already walks the whole tree. The directory name is compared via
# Path.name instead of splitting str(path) on '/', which only works with
# POSIX separators.
shotmap_dirs = [d for d in (RAW_DATA_DIR / 'soccer' / country / league_dir).rglob('*')
                if d.is_dir() and d.name == 'shotmaps']
for shotmap_dir in shotmap_dirs:
    shotmaps = [fp for fp in shotmap_dir.iterdir() if fp.is_file()]
    shotmap_fps.extend(shotmaps)

In [None]:
# Map a lookup key '<season>__date__<home>__<away>' to each heatmap file.
# The literal word 'date' is a placeholder: the file names carry no date, it is
# substituted when the destination name is formed from a match row.
fps = {}
for heatmap_fp in heatmap_fps:
    # File names are expected to look like '<home>__<away>.<ext>'. Only the
    # file's own stem is parsed, so a '__' in a parent directory, a '.' inside
    # a team name, or a non-POSIX path separator cannot corrupt the names.
    name_parts = heatmap_fp.stem.split('__')
    h = name_parts[0].lower().replace(' ', '-')
    a = name_parts[1].lower().replace(' ', '-')
    # Use the standard team name where one is known
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the name of the directory two levels above the file
    season = heatmap_fp.parent.parent.name
    fps[season + '__' + 'date' + '__' + h + '__' + a] = heatmap_fp

# Take a peek at the first few keys and their heatmap filepaths
for k in list(fps)[0:5]:
    print(f'{k} : {fps[k]}')

In [None]:
# Map a lookup key '<season>__date__<home>__<away>' to each shotmap file.
# The literal word 'date' is a placeholder: the file names carry no date, it is
# substituted when the destination name is formed from a match row.
fpss = {}
for shotmap_fp in shotmap_fps:
    # File names are expected to look like '<home>__<away>.<ext>'. Only the
    # file's own stem is parsed, so a '__' in a parent directory, a '.' inside
    # a team name, or a non-POSIX path separator cannot corrupt the names.
    name_parts = shotmap_fp.stem.split('__')
    h = name_parts[0].lower().replace(' ', '-')
    a = name_parts[1].lower().replace(' ', '-')
    # Use the standard team name where one is known
    h = std_names_d.get(h, h)
    a = std_names_d.get(a, a)
    # The season is the name of the directory two levels above the file
    season = shotmap_fp.parent.parent.name
    fpss[season + '__' + 'date' + '__' + h + '__' + a] = shotmap_fp

# Take a peek at the first few keys and their shotmap filepaths
for k in list(fpss)[0:5]:
    print(f'{k} : {fpss[k]}')

In [None]:
# Work on a deep copy so the image-path columns added below do not touch df
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    """
    Copy the file at src_fp to dest_fp unless dest_fp already exists.

    Both arguments are used through pathlib.Path methods. Missing parent
    directories of dest_fp are created. Returns None.
    """
    if dest_fp.exists():
        return
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    # The source file is only read, never modified
    payload = src_fp.read_bytes()
    dest_fp.write_bytes(payload)

def form_image_fp(row, league=None, image_type=None):
    """
    Copy the image belonging to one match row into the interim data directory
    and return where the copy lives.

    row        : one row of the match frame; row.name is (season, date) and the
                 columns 'h' and 'a' hold the standardised team names
    league     : league directory name below INTERIM_DATA_DIR
    image_type : 'heatmaps' or 'shotmaps'; None is treated as 'heatmaps'.
                 It selects both the lookup of source files (fps / fpss) and
                 the destination sub-directory.

    Returns str(dest path) with the text of str(PROJECT_DIR) removed.
    Raises KeyError if no image is registered for the match, and ValueError
    for an image_type other than the two listed above.
    """
    if image_type is None:
        image_type = 'heatmaps'
    # The lookups are resolved lazily so only the one actually needed has to exist
    if image_type == 'heatmaps':
        src_lookup = fps
    elif image_type == 'shotmaps':
        src_lookup = fpss
    else:
        raise ValueError(f"image_type must be 'heatmaps' or 'shotmaps', got {image_type!r}")

    season = row.name[0]
    date = str(row.name[1].date())
    h = row['h']
    a = row['a']
    # Lookup keys carry the literal placeholder 'date' (file names have no date)
    src_fp = src_lookup[season + '__' + 'date' + '__' + h + '__' + a]
    # Build the dated name directly instead of str.replace('date', ...) on the
    # key, which would also rewrite a 'date' occurring inside a team name
    dest_name = f'{season}__{date}__{h}__{a}{src_fp.suffix}'
    dest_fp = INTERIM_DATA_DIR / league / image_type / dest_name
    copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
# Row by row: copy the match's heatmap via form_image_fp and store the returned
# path string in the 'heatmap_path' column
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()


In [None]:
def form_image_fp(row, league=None, image_type=None):
    """
    Copy the image belonging to one match row into the interim data directory
    and return where the copy lives.

    row        : one row of the match frame; row.name is (season, date) and the
                 columns 'h' and 'a' hold the standardised team names
    league     : league directory name below INTERIM_DATA_DIR
    image_type : 'shotmaps' or 'heatmaps'; None is treated as 'shotmaps'.
                 It selects both the lookup of source files (fpss / fps) and
                 the destination sub-directory.

    Returns str(dest path) with the text of str(PROJECT_DIR) removed.
    Raises KeyError if no image is registered for the match, and ValueError
    for an image_type other than the two listed above.
    """
    if image_type is None:
        image_type = 'shotmaps'
    # The lookups are resolved lazily so only the one actually needed has to exist
    if image_type == 'shotmaps':
        src_lookup = fpss
    elif image_type == 'heatmaps':
        src_lookup = fps
    else:
        raise ValueError(f"image_type must be 'heatmaps' or 'shotmaps', got {image_type!r}")

    season = row.name[0]
    date = str(row.name[1].date())
    h = row['h']
    a = row['a']
    # Lookup keys carry the literal placeholder 'date' (file names have no date)
    src_fp = src_lookup[season + '__' + 'date' + '__' + h + '__' + a]
    # Build the dated name directly instead of str.replace('date', ...) on the
    # key, which would also rewrite a 'date' occurring inside a team name
    dest_name = f'{season}__{date}__{h}__{a}{src_fp.suffix}'
    dest_fp = INTERIM_DATA_DIR / league / image_type / dest_name
    copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
# Row by row: copy the match's shotmap via form_image_fp and store the returned
# path string in the 'shotmap_path' column
df1.loc[:, 'shotmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='shotmaps', axis=1)
df1.head()


In [None]:
df1.iloc[0].loc['heatmap_path']

In [None]:
df1.iloc[0].loc['shotmap_path']

In [None]:
# Write the frame, including its (season, date) index, next to the league's other interim files
df1.to_csv(INTERIM_DATA_DIR / league / save_file_name, index=True)

In [None]:
## Tidy up code - leave image paths as relative to project dir