# Standardize Team Names

In [2]:
import pathlib
import sys
import pickle

from typing import List

import numpy as np
import pandas as pd

# Load the "autoreload" extension (IPython magic: only valid inside a notebook / IPython session)
%load_ext autoreload
# Mode 1: always reload modules marked with "%aimport" before executing code
%autoreload 1
# Add the project's 'src' directory (a sibling of the current working
# directory's parent) to the import path so local modules can be imported.
# sys.path entries must be strings -- the import system ignores any other
# type -- so the Path is converted before being appended, and it is only
# appended once even if this cell is re-run.
src_dir = pathlib.Path().cwd().resolve().parent / 'src'
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))
# import my class code from the source
# %aimport src-dir.filename

# Let pandas display up to 500 columns and use a 1000-character wide console.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# The project root is taken to be the parent of the current working directory;
# the standardized and merged data folders live under <root>/data.
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
STDZED_DATA_DIR = PROJECT_DIR.joinpath('data', '04-standardized')
MERGED_DATA_DIR = PROJECT_DIR.joinpath('data', '05-merged')

In [66]:
def get_fps(top_level_dir, ext='csv'):
    """
    Recursively list every file under ``top_level_dir`` whose name ends in
    ``'.' + ext``.

    Marked COMMON by the author (presumably a candidate for a shared module).

    Parameters
    ----------
    top_level_dir : path object providing ``rglob``
    ext : str, file extension without the leading dot

    Returns
    -------
    list of the matching paths, in the order ``rglob`` yields them
    """
    pattern = '*.' + ext
    return list(top_level_dir.rglob(pattern))

def read_csvs(fps):
    """
    Read each CSV file in ``fps`` into its own DataFrame.

    Marked COMMON by the author (presumably a candidate for a shared module).

    Parameters
    ----------
    fps : iterable of file paths accepted by ``pd.read_csv``

    Returns
    -------
    list of DataFrames, one per path and in the same order
    """
    frames = []
    for csv_path in fps:
        frames.append(pd.read_csv(csv_path))
    return frames

def get_std_dict(df, std_dict_top_dir):
    """
    Load the pickled standard-names dictionary for the league held in ``df``.

    Marked COMMON by the author (presumably a candidate for a shared module).

    The file is looked up at
    ``std_dict_top_dir / <nation> / <league> / <league>.pkl`` where
    ``<nation>`` and ``<league>`` are the first unique values of the frame's
    'nation' and 'league' columns.

    Parameters
    ----------
    df : DataFrame with 'nation' and 'league' columns
    std_dict_top_dir : path object, root of the standard-dictionary tree

    Returns
    -------
    The unpickled object, or the placeholder ``{'key': 'value'}`` when the
    file cannot be opened or unpickled.
    """
    nation = df['nation'].unique()[0]
    league = df['league'].unique()[0]
    fp = std_dict_top_dir / nation / league / (league + '.pkl')

    try:
        with open(fp, 'rb') as handle:
            # NOTE(review): pickle.load can execute arbitrary code; only
            # point this at dictionary files produced by this project.
            return pickle.load(handle)
    except Exception:
        # Best-effort lookup: any load failure yields the placeholder, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return {'key': 'value'}
    
def get_std_dict_from_path(fp, std_dict_top_dir):
    """
    Load the pickled standard-names dictionary for the league encoded in a
    file path.

    ``fp`` is expected to end in ``<nation>/<league>/<season>/<file>``, e.g.
    ``.../03-cleaned/whoscored-com-shotmaps/england/english-premier-league/2009-2010/Arsenal__Aston Villa.png``.
    The dictionary is read from
    ``std_dict_top_dir / <nation> / <league> / <league>.pkl``.

    Parameters
    ----------
    fp : str or pathlib path of a data file
    std_dict_top_dir : path object, root of the standard-dictionary tree

    Returns
    -------
    The unpickled object, or the placeholder ``{'key': 'value'}`` when the
    file cannot be opened or unpickled.

    Raises
    ------
    IndexError if ``fp`` has fewer than four path components.
    """
    # Use path components rather than splitting on '/', so Windows-style
    # path objects are handled as well as POSIX ones.
    if isinstance(fp, pathlib.PurePath):
        fp_parts = fp.parts
    else:
        fp_parts = pathlib.PurePath(fp).parts
    nation = fp_parts[-4]
    league = fp_parts[-3]
    dict_fp = std_dict_top_dir / nation / league / (league + '.pkl')

    try:
        with open(dict_fp, 'rb') as handle:
            # NOTE(review): pickle.load can execute arbitrary code; only
            # point this at dictionary files produced by this project.
            std_dict = pickle.load(handle)
    except Exception:
        # Best-effort lookup: any load failure yields the placeholder, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        std_dict = {'key': 'value'}
    return std_dict

def standardize_team_names(df_orig, std_names_dict):
    """
    Return a copy of ``df_orig`` whose home ('h') and away ('a') team names
    are normalized and mapped to their standard spelling.

    Marked COMMON by the author (presumably a candidate for a shared module).

    Names are stripped, lower-cased and have spaces replaced by '-'; names
    that are keys of ``std_names_dict`` are then replaced by the mapped value.
    The input frame is not modified.

    If ``std_names_dict`` is the placeholder ``{'key': 'value'}`` (no standard
    dictionary available yet) an empty DataFrame is returned, so that a
    non-standardized frame is never written to the standardized directory.
    """
    result = df_orig.copy(deep=True)

    if std_names_dict == {'key':'value'}:
        return pd.DataFrame()

    for side in ('h', 'a'):
        result[side] = result[side].str.strip().str.lower().str.replace(' ', '-')
        is_known = result[side].isin(std_names_dict.keys())
        result.loc[is_known, side] = result[side].map(std_names_dict)

    # Show the top of the frame when any team name ended up missing.
    n_missing = result['h'].isnull().sum() + result['a'].isnull().sum()
    if n_missing > 0:
        print(result.head(2))
    return result

def standardize_dfs(dfs, std_dict_top_dir):
    """
    Standardize the team names of every DataFrame in ``dfs``.

    Marked COMMON by the author (presumably a candidate for a shared module).

    For each frame the matching dictionary is fetched with ``get_std_dict``
    and applied with ``standardize_team_names``.

    Returns a list with one standardized frame per input frame, in order.
    """
    return [
        standardize_team_names(frame, get_std_dict(frame, std_dict_top_dir))
        for frame in dfs
    ]


def make_save_fps(top_level_dir, season_dfs, source='indatabet-com'):
    """
    Build the destination CSV path for each season DataFrame.

    Marked COMMON by the author (presumably a candidate for a shared module).
    NOTE: a function with the same name is defined again further down this
    file; the later definition is the one in effect once the cell has run.

    Each path has the form
    ``top_level_dir / source / <nation> / <league> / <season> / <season>.csv``
    where nation, league and season are the first unique values of the
    frame's 'nation', 'league' and 'season' columns.

    Parameters
    ----------
    top_level_dir : path object, root of the output tree
    season_dfs : iterable of DataFrames with 'nation', 'league', 'season'
    source : str, sub-directory naming the data source

    Returns
    -------
    list of paths, one per frame and in the same order
    """
    save_fps = []
    for season_df in season_dfs:
        nation = season_df['nation'].unique()[0]
        league = season_df['league'].unique()[0]
        # The season may be parsed as a number (e.g. 2010); path segments
        # must be strings, so convert once and use it for both the directory
        # and the file name.
        season = str(season_df['season'].unique()[0])
        save_fps.append(
            top_level_dir / source / nation / league / season / (season + '.csv')
        )
    return save_fps


def save_dfs_to_fps(dfs, fps):
    """
    Write each DataFrame to its paired path as CSV, without the index.

    Marked COMMON by the author (presumably a candidate for a shared module).

    Frames and paths are paired with ``zip``, so surplus items on the longer
    side are ignored. Missing parent directories are created; existing files
    are overwritten.

    Returns the number of files written.
    """
    written = 0
    for frame, target in zip(dfs, fps):
        if not target.exists():
            target.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(target, index=False)
        written = written + 1
    return written


def standardize_team_names_on_fp(fp, std_names_dict):
    """
    Build the standardized file name for a ``<home>__<away>.<ext>`` file.

    Both team names are taken from the file stem (the file name without its
    final suffix), lower-cased and have spaces replaced by '-'; names that
    are keys of ``std_names_dict`` are replaced by the mapped value.

    Only the file name is parsed, so '__' or '.' appearing in a directory
    name, or '.' inside a team name (e.g. 'St. Johnstone'), no longer
    truncates the result.

    Parameters
    ----------
    fp : pathlib path object (``stem`` and ``suffix`` are used)
    std_names_dict : dict mapping normalized names to standard names, or the
        placeholder ``{'key': 'value'}`` when no dictionary is available

    Returns
    -------
    ``'<home>__<away><suffix>'``, or None for the placeholder dictionary.

    Raises
    ------
    IndexError if the file stem contains no '__' separator.
    """
    if std_names_dict == {'key': 'value'}:
        return None

    name_parts = fp.stem.split('__')
    h = name_parts[0].lower().replace(' ', '-')
    a = name_parts[1].lower().replace(' ', '-')
    h = std_names_dict.get(h, h)
    a = std_names_dict.get(a, a)
    return h + '__' + a + fp.suffix


def standardize_fns(src_fps, std_dict_top_dir):
    """
    Compute the standardized file name for every path in ``src_fps``.

    For each path the league dictionary is fetched with
    ``get_std_dict_from_path`` and applied with
    ``standardize_team_names_on_fp``.

    Returns a list with one entry per input path, in order.
    """
    return [
        standardize_team_names_on_fp(
            src_fp, get_std_dict_from_path(src_fp, std_dict_top_dir)
        )
        for src_fp in src_fps
    ]
    

def standardize_fps(cleaned_src_fps, stdzed_fns, dest_top_dir, source_dir):
    """
    Build the destination path for each cleaned source file.

    A source path is expected to end in
    ``<nation>/<league>/<season>/<file>``, e.g.
    ``.../03-cleaned/whoscored-com-shotmaps/england/english-premier-league/2009-2010/Arsenal__Aston Villa.png``
    with standardized name ``'arsenal__aston-villa.png'``. The destination is
    ``dest_top_dir / source_dir / <nation> / <league> / <season> / <stdzed_fn>``.

    Parameters
    ----------
    cleaned_src_fps : iterable of str or pathlib paths
    stdzed_fns : iterable of standardized file names, None where no name
        could be produced
    dest_top_dir : path object, root of the destination tree
    source_dir : str, sub-directory naming the data source

    Returns
    -------
    list with one entry per (source path, file name) pair: the destination
    path, or False where the standardized file name is None.
    """
    stdzed_dest_fps = []
    for cleaned_src_fp, stdzed_fn in zip(cleaned_src_fps, stdzed_fns):
        if stdzed_fn is None:
            # Keep the positional placeholder so the output stays aligned
            # with the input lists.
            stdzed_dest_fps.append(False)
            continue
        # Use path components rather than splitting on '/', so Windows-style
        # path objects are handled as well as POSIX ones.
        if isinstance(cleaned_src_fp, pathlib.PurePath):
            fp_parts = cleaned_src_fp.parts
        else:
            fp_parts = pathlib.PurePath(cleaned_src_fp).parts
        nation = fp_parts[-4]
        league = fp_parts[-3]
        season = fp_parts[-2]
        stdzed_dest_fps.append(
            dest_top_dir / source_dir / nation / league / season / stdzed_fn
        )
    return stdzed_dest_fps
    

def copy_data(src_fps, dest_fps):
    """
    Copy each source file to its paired destination unless it already exists.

    Parameters
    ----------
    src_fps : iterable of source file paths (str or pathlib paths)
    dest_fps : iterable of destination file paths (str or pathlib paths);
        entries that are False or None (the placeholders produced by
        ``standardize_fps`` when no standardized name exists) are skipped
        and counted in neither total

    Returns
    -------
    (n_copied, n_exist) : number of files copied and number of destinations
    that already existed.
    """
    n_copied = 0
    n_exist = 0
    for src_fp, dest_fp in zip(src_fps, dest_fps):
        if dest_fp is None or dest_fp is False:
            continue
        # Accept plain strings as well as path objects.
        src_fp = pathlib.Path(src_fp)
        dest_fp = pathlib.Path(dest_fp)
        if dest_fp.exists():
            n_exist += 1
            continue
        dest_fp.parent.mkdir(parents=True, exist_ok=True)
        # Copy the original files without touching them
        dest_fp.write_bytes(src_fp.read_bytes())
        n_copied += 1
    return n_copied, n_exist

def merger(football_df, odds_df2):
    """
    As-of merge of match data with odds data on the match date.

    Rows are matched when home team, away team, home goals and away goals
    ('h', 'a', 'h_ftGoals', 'a_ftGoals') are equal and the dates differ by at
    most two days; the nearest date wins. ``merge_asof`` performs a left
    join, so every row of ``football_df`` is kept (put the longer frame on
    the left to keep the most data). Overlapping non-key columns get the
    suffixes '_ic' (left) and '_fdcu' (right).

    Parameters
    ----------
    football_df : DataFrame, left side, with a datetime 'date' column
    odds_df2 : DataFrame, right side, with a datetime 'date' column

    Returns
    -------
    Merged DataFrame sorted by ascending 'date'. The inputs are not modified.
    """
    # merge_asof requires both frames to be sorted by the 'on' key and raises
    # otherwise; a stable sort keeps the relative order of same-day rows.
    left = football_df.sort_values(by='date', kind='mergesort')
    right = odds_df2.sort_values(by='date', kind='mergesort')
    merged = pd.merge_asof(left, right,
                           on='date',
                           by=['h', 'a', 'h_ftGoals', 'a_ftGoals'],
                           suffixes=('_ic', '_fdcu'),
                           tolerance=pd.Timedelta(days=2),
                           direction='nearest'
                           )
    merged.sort_values(by='date', ascending=True, inplace=True)
    return merged


def _prepare_for_asof(df, numeric_cols):
    """Return a date-sorted copy of ``df`` with parsed dates and float goal columns."""
    prepared = df.copy()
    prepared['date'] = pd.to_datetime(prepared['date'])
    # Cast the goal columns to float on both sides: the frames cannot be
    # joined when one side holds integers and the other floats.
    prepared[numeric_cols] = prepared[numeric_cols].apply(
        pd.to_numeric, errors='coerce', downcast='float')
    # merge_asof requires the 'on' key to be sorted.
    return prepared.sort_values(by='date', kind='mergesort')


def do_merge(left_dfs, right_dfs):
    """
    As-of merge each left DataFrame with its paired right DataFrame.

    Frames are paired with ``zip``. Each pair is merged on 'date' (nearest
    match within two days) where 'h', 'a', 'h_ftGoals', 'a_ftGoals',
    'nation', 'league', 'season' and 'result' are all equal. Overlapping
    non-key columns get the suffixes '_ic' (left) and '_fdcu' (right).

    The input frames are not modified: date parsing, the float cast of the
    goal columns and the sort by date are done on copies.
    (This preparation should eventually move into the cleaning stage.)

    Returns
    -------
    list of merged DataFrames sorted by ascending 'date'. A pair whose merge
    fails is reported on stdout and skipped, so the result can be shorter
    than the inputs.
    """
    goal_cols = ['h_ftGoals', 'a_ftGoals']
    merged_dfs = []
    for left_df, right_df in zip(left_dfs, right_dfs):
        left = _prepare_for_asof(left_df, goal_cols)
        right = _prepare_for_asof(right_df, goal_cols)

        try:
            merged_df = pd.merge_asof(left, right,
                                      on='date',
                                      by=['h', 'a', 'h_ftGoals', 'a_ftGoals',
                                          'nation', 'league', 'season', 'result'],
                                      suffixes=('_ic', '_fdcu'),
                                      tolerance=pd.Timedelta(days=2),
                                      direction='nearest')
        except Exception as err:
            # Best effort: report the failing pair and carry on with the rest.
            print("Unexpected error:", type(err), err)
        else:
            merged_df.sort_values(by='date', ascending=True, inplace=True)
            merged_dfs.append(merged_df)
    return merged_dfs


def get_matching_fps(left_df_fps, right_df_fps, left_source, right_source):
    """
    Keep only the files that exist for both sources, as aligned pairs.

    A left path matches when replacing ``left_source`` by ``right_source``
    in its string form gives a path contained in ``right_df_fps``.

    Parameters
    ----------
    left_df_fps : iterable of pathlib paths of the left source
    right_df_fps : iterable of pathlib paths of the right source
    left_source : str, source name appearing in the left paths
    right_source : str, source name appearing in the right paths

    Returns
    -------
    (left_fps, right_fps) : two lists of equal length in which
    ``right_fps[i]`` is the counterpart of ``left_fps[i]``; the order follows
    ``left_df_fps``.
    """
    # Set membership keeps the lookup O(1) per file.
    available = set(right_df_fps)
    left_fps = []
    right_fps = []
    for left_fp in left_df_fps:
        counterpart = pathlib.Path(str(left_fp).replace(left_source, right_source))
        if counterpart in available:
            left_fps.append(left_fp)
            right_fps.append(counterpart)
    return left_fps, right_fps
    
    
def make_save_fps(top_level_dir, season_dfs, source='indatabet-com'):
    """
    Build the destination CSV path for each season DataFrame.

    Marked COMMON by the author (presumably a candidate for a shared module).
    NOTE: this redefines a function of the same name that appears earlier in
    this file; this later definition is the one in effect.

    Each path has the form
    ``top_level_dir / source / <nation> / <league> / <season> / <season>.csv``
    where nation, league and season are the first unique values of the
    frame's 'nation', 'league' and 'season' columns.

    Parameters
    ----------
    top_level_dir : path object, root of the output tree
    season_dfs : iterable of DataFrames with 'nation', 'league', 'season'
    source : str, sub-directory naming the data source

    Returns
    -------
    list of paths, one per frame and in the same order
    """
    save_fps = []
    for season_df in season_dfs:
        nation = season_df['nation'].unique()[0]
        league = season_df['league'].unique()[0]
        # The season may be parsed as a number (e.g. 2010); path segments
        # must be strings, so convert once and use it for both the directory
        # and the file name.
        season = str(season_df['season'].unique()[0])
        save_fps.append(
            top_level_dir / source / nation / league / season / (season + '.csv')
        )
    return save_fps

# Merge the standardized data of the two sources and save the result.
left_source = 'football-data-co-uk'
right_source = 'indatabet-com'
# Collect every standardized CSV file of each source.
left_df_fps = get_fps(STDZED_DATA_DIR / left_source, ext='csv')
right_df_fps = get_fps(STDZED_DATA_DIR / right_source, ext='csv')
# Filter the two file lists with get_matching_fps before loading them.
left_df_fps, right_df_fps = get_matching_fps(left_df_fps, right_df_fps, left_source, right_source)
left_dfs = read_csvs(left_df_fps)
right_dfs = read_csvs(right_df_fps)
merged_dfs = do_merge(left_dfs, right_dfs)
#merged_dfs[0].head()

# Build one output path per merged frame under MERGED_DATA_DIR
# (source='' adds no extra directory level), write the CSVs and
# print how many files were written.
merged_fps = make_save_fps(MERGED_DATA_DIR, merged_dfs, source= '')
n_saved = save_dfs_to_fps(merged_dfs, merged_fps)
print(n_saved)
# TODO: next step -- incorporate the images into the DataFrame
# (original code below).



    
    
    


11


## Merge - Get Image Paths into DataFrame

In [None]:
# Work on a deep copy so the frame bound to `df` is left untouched.
# NOTE(review): `df` is not defined in the visible part of this notebook -- confirm where it comes from.
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    """
    Copy the bytes of ``src_fp`` to ``dest_fp`` unless ``dest_fp`` already
    exists. Missing parent directories of the destination are created and
    the source file is only read, never modified.
    """
    if dest_fp.exists():
        return
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    payload = src_fp.read_bytes()
    dest_fp.write_bytes(payload)

def form_image_fp(row, league=None, image_type=None):
    """
    Copy the heatmap image belonging to a match row into the interim data
    directory and return its path with the project-root prefix removed.

    The source file is looked up in the module-level ``fps`` mapping under
    the key ``'<season>__date__<h>__<a>'`` (the word 'date' is a literal
    placeholder). The destination is
    ``INTERIM_DATA_DIR / league / 'heatmaps' / <key with 'date' replaced by
    the match date><source suffix>``.

    ``row.name`` is read as (season, timestamp) -- assumes the frame is
    indexed that way; TODO confirm. ``image_type`` is accepted but not used;
    the 'heatmaps' folder is fixed.
    """
    row = row.copy(deep=True)
    season, timestamp = row.name[0], row.name[1]
    date = str(timestamp.date())
    key = '__'.join([season, 'date', row['h'], row['a']])
    src_fp = fps[key]
    ext = src_fp.suffix
    dest_name = key.replace('date', date) + ext
    dest_fp = INTERIM_DATA_DIR / league / 'heatmaps' / dest_name
    copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Call form_image_fp on every row (axis=1) and store the returned relative
# paths in the 'heatmap_path' column, then preview the frame.
# NOTE(review): `league` is not defined in the visible part of this notebook.
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()


In [None]:
def form_image_fp(row, league=None, image_type=None):
    """
    Copy the shotmap image belonging to a match row into the interim data
    directory and return its path with the project-root prefix removed.

    The source file is looked up in the module-level ``fpss`` mapping under
    the key ``'<season>__date__<h>__<a>'`` (the word 'date' is a literal
    placeholder). The destination is
    ``INTERIM_DATA_DIR / league / 'shotmaps' / <key with 'date' replaced by
    the match date><source suffix>``.

    ``row.name`` is read as (season, timestamp) -- assumes the frame is
    indexed that way; TODO confirm. ``image_type`` is accepted but not used;
    the 'shotmaps' folder is fixed.
    """
    row = row.copy(deep=True)
    season, timestamp = row.name[0], row.name[1]
    date = str(timestamp.date())
    key = '__'.join([season, 'date', row['h'], row['a']])
    src_fp = fpss[key]
    ext = src_fp.suffix
    dest_name = key.replace('date', date) + ext
    dest_fp = INTERIM_DATA_DIR / league / 'shotmaps' / dest_name
    copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Call form_image_fp on every row (axis=1) and store the returned relative
# paths in the 'shotmap_path' column, then preview the frame.
# NOTE(review): `league` is not defined in the visible part of this notebook.
df1.loc[:, 'shotmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='shotmaps', axis=1)
df1.head()
