# Join Within Season

In [1]:
import pathlib
import sys
import pickle

from typing import List

import numpy as np
import pandas as pd

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
# add the 'src' directory to path to import modules
src_dir = pathlib.Path().cwd().resolve().parent / 'src'
#src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(str(src_dir))
# import my class code from the source
# %aimport src-dir.filename

from data.pfuncs import (get_filepaths,
                         read_csvs_to_dfs,
                         write_dfs_to_filepaths,
                         make_filepaths_from_dfs,
                         get_matching_filepaths,
                         copy_files,
                         make_equiv_image_dest_fps)

# Raise pandas' display limits so wide frames are shown with more columns
# and a wider line width.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# The project root is taken to be the parent of the current working directory.
PROJECT_DIR = pathlib.Path.cwd().resolve().parent
# Directory the standardized per-source CSVs and images are read from.
STDZED_DIR = PROJECT_DIR / 'data' / '04-standardized'
# Directory the within-season joined CSVs and copied images are written to.
MERGED_DIR = PROJECT_DIR / 'data' / '05-joinedWithinSeason'

In [2]:
def merger(football_df, odds_df2):
    """Attach odds rows to football rows by nearest match date.

    A row of ``odds_df2`` is attached to a row of ``football_df`` when home
    team, away team and both full-time goal counts are identical and the two
    ``date`` values are at most two days apart; the closest date wins.
    ``pd.merge_asof`` is a left join, so every row of ``football_df`` is kept
    and rows without a match get missing values in the right-hand columns.

    Args:
        football_df: Left frame. Must already be sorted by ``date``
            (a ``pd.merge_asof`` requirement).
        odds_df2: Right frame with the same key columns, also sorted by
            ``date``.

    Returns:
        The merged frame ordered by ascending ``date``. Non-key columns that
        exist in both inputs are suffixed ``_ic`` (left) and ``_fdcu`` (right).
    """
    match_keys = ['h', 'a', 'h_ftGoals', 'a_ftGoals']
    max_gap = pd.Timedelta(days=2)
    joined = pd.merge_asof(
        left=football_df,
        right=odds_df2,
        on='date',
        by=match_keys,
        suffixes=('_ic', '_fdcu'),
        tolerance=max_gap,
        direction='nearest',
    )
    return joined.sort_values(by='date', ascending=True)


def _prepare_merge_frame(df):
    """Return a copy of ``df`` that is ready to be fed to ``pd.merge_asof``."""
    # Work on a copy so the caller's frame is left untouched.
    prepared = df.copy()
    prepared['date'] = pd.to_datetime(prepared['date'])

    # merge_asof refuses key columns whose dtypes differ between the two
    # sides (e.g. integer goals on one side, float on the other), so both
    # sides are coerced to float. Unparseable values become NaN.
    # TODO: this conversion belongs in the cleaning stage.
    goal_cols = ['h_ftGoals', 'a_ftGoals']
    prepared[goal_cols] = prepared[goal_cols].apply(
        pd.to_numeric, errors='coerce', downcast='float')

    # merge_asof requires the `on` key to be sorted. A stable sort leaves
    # input that is already ordered exactly as it was.
    return prepared.sort_values(by='date', kind='mergesort')


def do_merge(left_dfs, right_dfs):
    """Join each left frame to its positional counterpart in ``right_dfs``.

    Frames are paired with ``zip``, so surplus frames in the longer list are
    ignored. Within a pair, a right row is attached to a left row when all of
    ``h``, ``a``, ``h_ftGoals``, ``a_ftGoals``, ``nation``, ``league``,
    ``season`` and ``result`` are equal and the ``date`` values are at most
    two days apart (nearest date wins). The join is a left join.

    Args:
        left_dfs: Iterable of left-hand DataFrames. Not modified.
        right_dfs: Iterable of right-hand DataFrames. Not modified.

    Returns:
        List of merged frames, each sorted by ascending ``date``. A pair whose
        merge raises is reported on stdout and left out of the list, so the
        result may be shorter than the inputs.
    """
    by_cols = ['h', 'a', 'h_ftGoals', 'a_ftGoals',
               'nation', 'league', 'season', 'result']
    merged_dfs = []
    for left_df, right_df in zip(left_dfs, right_dfs):
        left_ready = _prepare_merge_frame(left_df)
        right_ready = _prepare_merge_frame(right_df)
        try:
            merged_df = pd.merge_asof(left_ready, right_ready,
                                      on='date',
                                      by=by_cols,
                                      suffixes=('_ic', '_fdcu'),
                                      tolerance=pd.Timedelta(days=2),
                                      direction='nearest')
            merged_df = merged_df.sort_values(by='date', ascending=True)
        except Exception as err:
            # Best effort: skip the failing pair but keep going. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Unexpected error:", type(err), err)
            continue
        merged_dfs.append(merged_df)
    return merged_dfs


def join_within_league_seasons(sources):
    """Join the standardized CSVs of two sources and save the merged frames.

    The CSV file paths of both sources are collected under ``STDZED_DIR``,
    reduced to the matching pairs by ``get_matching_filepaths``, read, merged
    with ``do_merge`` and written under ``MERGED_DIR``. The value returned by
    ``write_dfs_to_filepaths`` is printed.

    Args:
        sources: Two-item sequence ``(left_source, right_source)`` naming the
            sub-directories of ``STDZED_DIR`` to join. The first is the
            left-hand side of the join.

    Returns:
        None.

    Raises:
        ValueError: If ``sources`` does not contain exactly two items.
    """
    # Take the source names from the argument; the function previously
    # ignored `sources` and silently read module-level globals instead.
    left_source, right_source = sources

    left_df_fps = get_filepaths(STDZED_DIR / left_source, ext='csv')
    right_df_fps = get_filepaths(STDZED_DIR / right_source, ext='csv')
    left_df_fps, right_df_fps = get_matching_filepaths(
        left_df_fps, right_df_fps, left_source, right_source)

    left_dfs = read_csvs_to_dfs(left_df_fps)
    right_dfs = read_csvs_to_dfs(right_df_fps)
    merged_dfs = do_merge(left_dfs, right_dfs)

    merged_fps = make_filepaths_from_dfs(MERGED_DIR, merged_dfs, '')
    n_saved = write_dfs_to_filepaths(merged_dfs, merged_fps)
    print(n_saved)


# Sub-directory names under STDZED_DIR: the first entry is the left-hand side
# of the join, the second the right-hand side.
sources = ['football-data-co-uk', 'indatabet-com']
left_source, right_source = sources

join_within_league_seasons(sources)



11


In [3]:
def form_image_fp(row, image_type=None):
    """Build the relative image path for one match row.

    The path has the form
    ``<nation>/<league>/<season>/<image_type>/<h>__<a>.png``.

    Args:
        row: Record supporting ``row[key]`` lookups (e.g. a DataFrame row)
            with string values under 'nation', 'league', 'season', 'h'
            and 'a'.
        image_type: Name of the directory placed directly above the file
            name. It must be supplied: the default ``None`` cannot be joined
            into a path and raises ``TypeError``.

    Returns:
        The relative path as a string, using the platform's path separator.
    """
    # The row is only read, never modified, so no defensive copy is made.
    file_name = '__'.join((row['h'], row['a'])) + '.png'
    image_dir = (pathlib.Path(row['nation']) / row['league']
                 / row['season'] / image_type)
    return str(image_dir / file_name)


def insert_filepaths(dfs, image_type):
    """Add a relative image-path column to every frame, in place.

    Args:
        dfs: Iterable of DataFrames; each one is modified in place.
        image_type: Used both as the name of the new column and as the
            ``image_type`` argument handed to ``form_image_fp``.

    Returns:
        The same ``dfs`` object that was passed in.
    """
    for frame in dfs:
        frame[image_type] = frame.apply(
            lambda match_row: form_image_fp(match_row, image_type=image_type),
            axis=1,
        )
    return dfs

    
def check_filepath_exists(dfs, source_dir, image_type):
    """Overwrite each frame's ``image_type`` column with a per-row check.

    Every frame in ``dfs`` is modified in place: ``check_image_fp`` is applied
    row-wise and its result is stored in the ``image_type`` column.

    NOTE(review): ``check_image_fp`` is not defined in the visible part of
    this notebook; it must exist before this function is called with a
    non-empty ``dfs``.

    Args:
        dfs: Iterable of DataFrames to update in place.
        source_dir: Currently unused.
        image_type: Column to overwrite; also forwarded to ``check_image_fp``
            as its ``image_type`` keyword.

    Returns:
        None.
    """
    # Iterate the `dfs` argument; the loop previously read `for df in df`,
    # which raised UnboundLocalError on every call.
    for df in dfs:
        df[image_type] = df.apply(check_image_fp, image_type=image_type, axis=1)
    
    
def insert_rel_path_into_dfs(image_type):
    """Load the merged frames and add a relative image-path column to each.

    Args:
        image_type: Name of the column to add; forwarded to
            ``insert_filepaths``.

    Returns:
        The frames read from ``MERGED_DIR`` with the new column inserted.
    """
    # NOTE(review): unlike the other get_filepaths calls in this notebook, no
    # ext='csv' is passed here although images are copied into MERGED_DIR as
    # well - confirm the helper's default only returns CSV files.
    merged_frames = read_csvs_to_dfs(get_filepaths(MERGED_DIR))
    return insert_filepaths(merged_frames, image_type)


def make_merged_image_filepaths(existing_image_fps):
    """Derive destination paths in the merged tree for existing images.

    Starting from the paths produced by ``make_equiv_image_dest_fps``, the
    text ``top_dir`` is removed from each path and ``image_type`` is inserted
    as the directory directly above the file name.

    NOTE: ``top_dir`` and ``image_type`` are not parameters; they are read
    from module-level names that must be set before this function is called.

    Args:
        existing_image_fps: Paths of the source images.

    Returns:
        List of ``pathlib.Path`` destinations, one per source image.
    """
    dest_fps = make_equiv_image_dest_fps(STDZED_DIR, MERGED_DIR, existing_image_fps)
    final_image_fps = []
    for dest_fp in dest_fps:
        # Paths are handled as '/'-separated text here.
        parts = str(dest_fp).replace(top_dir, '').split('/')
        # Slot the image type in just before the file name.
        parts.insert(-1, image_type)
        final_image_fps.append(pathlib.Path('/'.join(parts)))
    return final_image_fps
    

def update_for_image_file_exists(df_origs, image_type):
    """Flag, per row, whether the referenced image exists under MERGED_DIR.

    Each input frame is deep-copied; the copy gains an integer column named
    ``<image_type>_exists`` holding 1 when ``MERGED_DIR / <relative path>``
    exists on disk and 0 otherwise. The relative paths are read from the
    ``image_type`` column.

    Args:
        df_origs: Iterable of DataFrames that contain an ``image_type``
            column. They are not modified.
        image_type: Name of the column holding the relative image paths.

    Returns:
        List of the updated copies, in input order.
    """
    exists_col = image_type + '_exists'
    updated_dfs = []
    for original in df_origs:
        frame = original.copy(deep=True)
        found = [(MERGED_DIR / rel_path).exists()
                 for rel_path in frame[image_type].values]
        frame[exists_col] = found
        # Store the flags as 0/1 integers rather than booleans.
        frame[exists_col] = frame[exists_col].astype(int)
        updated_dfs.append(frame)
    return updated_dfs
        
    
# For each image type: add relative image paths to the merged CSVs, copy the
# images from the standardized tree into the merged tree, flag which paths
# exist on disk, and write the CSVs back.
image_types = ['heatmap', 'shotmap']
# NOTE: `image_type` and `top_dir` must stay module-level names -
# make_merged_image_filepaths reads both of them as globals.
for image_type in image_types:
    # Re-read the merged CSVs and add the relative-path column for this type.
    dfs = insert_rel_path_into_dfs(image_type)
    # Source directory name under STDZED_DIR, e.g. 'whoscored-com-heatmaps'.
    top_dir = 'whoscored-com-' + image_type + 's'
    existing_image_fps = get_filepaths(STDZED_DIR / top_dir, ext='png')
    new_image_fps = make_merged_image_filepaths(existing_image_fps)
    # The copy has to happen before the existence check below.
    n_copied = copy_files(existing_image_fps, new_image_fps)
    print(n_copied)
    updated_dfs = update_for_image_file_exists(dfs, image_type)
    save_fps = make_filepaths_from_dfs(MERGED_DIR, updated_dfs, '')
    n_written = write_dfs_to_filepaths(updated_dfs, save_fps)
    print(n_written)


3420
11
3420
11


In [None]:
stop

In [69]:
updated_dfs[0].head()

Unnamed: 0,date,h,a,h_ftGoals,a_ftGoals,h_htGoals,a_htGoals,h_shots,a_shots,h_shotsOnTarget,a_shotsOnTarget,h_fouls,a_fouls,h_corners,a_corners,h_yellowCards,a_yellowCards,h_redCards,a_redCards,hwinOddsBet365,drawOddsBet365,awinOddsBet365,hwinOddsBwa,drawOddsBwa,awinOddsBwa,hwinOddsGb,drawOddsGb,awinOddsGb,hwinOddsIw,drawOddsIw,awinOddsIw,hwinOddsLb,drawOddsLb,awinOddsLb,hwinOddsSb,drawOddsSb,awinOddsSb,hwinOddsWh,drawOddsWh,awinOddsWh,hwinOddsSj,drawOddsSj,awinOddsSj,hwinOddsVc,drawOddsVc,awinOddsVc,hwinOddsBsa,drawOddsBsa,awinOddsBsa,n_Bb1X2,hwinOddsBbMax,hwinOddsBbMean,drawOddsBbMax,drawOddsBbMean,awinOddsBbMax,awinOddsBbMean,n_BbOU,ftGoalsO2.5OddsBbMax,ftGoalsO2.5OddsBbMean,ftGoalsU2.5OddsBbMax,ftGoalsU2.5OddsBbMean,n_BbAsian,BbAsianHandicap,asianHomeOddsBbMax,oddsAsianHomeBbMean,asianAwayOddsBbMax,oddsAsianAwayBbMean,nation,league,season,result,id_fifa,h_htgoals,a_htgoals,hwinOddsPinnIndatabet,drawOddsPinnIndatabet,awinOddsPinnIndatabet,hwinOddsBet365Indatabet,drawOddsBet365Indatabet,awinOddsBet365Indatabet,heatmap,heatmap_exists
0,2007-08-11,aston-villa,liverpool,1.0,2.0,0,1,10,17,6,7,18,11,4,2,4,2,0,0,4.0,3.25,1.9,3.65,3.2,1.95,4.4,3.5,2.04,3.6,3.2,1.9,3.5,3.2,1.9,3.75,3.2,1.91,,,,3.4,3.25,2.0,3.5,3.2,2.0,3.4,3.25,2.0,43,4.0,3.67,3.4,3.23,2.1,1.95,39,2.3,2.14,1.7,1.63,26,0.5,1.95,1.85,2.07,1.95,england,english-premier-league,2007-2008,,,,,,,,,,,england/english-premier-league/2007-2008/heatm...,0
1,2007-08-11,bolton-wanderers,newcastle-united,1.0,3.0,0,3,13,7,9,5,15,16,4,3,1,1,0,0,2.5,3.2,2.75,2.4,3.2,2.7,2.4,3.25,2.8,2.2,3.1,3.0,2.37,3.0,2.75,2.4,3.2,2.7,,,,2.38,3.2,2.75,2.3,3.25,2.9,2.4,3.2,2.7,45,2.6,2.41,3.25,3.18,3.05,2.8,40,2.33,2.1,1.75,1.65,26,0.0,1.83,1.75,2.18,2.04,england,english-premier-league,2007-2008,,,,,,,,,,,england/english-premier-league/2007-2008/heatm...,0
2,2007-08-11,derby,portsmouth,2.0,2.0,1,1,12,12,5,6,14,17,6,6,1,2,0,0,2.8,3.25,2.4,2.95,3.15,2.25,2.8,3.25,2.4,2.7,3.1,2.4,2.6,3.2,2.37,2.7,3.2,2.4,,,,2.75,3.2,2.38,2.7,3.25,2.4,2.7,3.2,2.4,43,3.25,2.85,3.3,3.19,2.55,2.38,39,2.35,2.13,1.9,1.63,25,0.0,2.26,2.09,1.76,1.71,england,english-premier-league,2007-2008,,,,,,,,,,,england/english-premier-league/2007-2008/heatm...,0
3,2007-08-11,everton,wigan-athletic,2.0,1.0,1,0,12,14,8,4,8,13,6,2,0,0,0,0,1.66,3.4,5.5,1.65,3.4,5.0,1.68,3.5,5.0,1.65,3.5,4.5,1.61,3.5,4.5,1.62,3.4,5.25,,,,1.57,3.5,5.5,1.7,3.4,5.0,1.67,3.3,5.0,44,1.75,1.66,3.65,3.45,6.0,5.25,40,2.2,2.0,1.83,1.73,26,-0.75,1.97,1.94,2.02,1.94,england,english-premier-league,2007-2008,,,,,,,,,,,england/english-premier-league/2007-2008/heatm...,0
4,2007-08-11,middlesbrough,blackburn-rovers,1.0,2.0,1,0,10,4,6,4,16,16,13,3,3,4,0,0,2.37,3.25,2.87,2.35,3.15,2.8,2.4,3.25,2.8,2.3,3.1,2.8,2.25,3.2,2.75,2.4,3.2,2.7,,,,2.25,3.25,2.88,2.5,3.25,2.65,2.3,3.2,2.8,43,2.7,2.41,3.25,3.18,2.95,2.81,39,2.3,2.08,1.75,1.67,26,0.0,1.9,1.77,2.1,1.97,england,english-premier-league,2007-2008,,,,,,,,,,,england/english-premier-league/2007-2008/heatm...,0


In [None]:
stop

## Merge - Get Image Paths into DataFrame

In [None]:
# Work on a deep copy so the path columns added below do not touch `df`.
# NOTE(review): `df` is not defined in the visible cells - confirm its origin.
df1 = df.copy(deep=True)

def copy_src_to_dest(src_fp, dest_fp):
    """Copy one file to ``dest_fp`` unless the destination already exists.

    Missing parent directories of ``dest_fp`` are created. An existing
    destination is left untouched, and the source file is only read.

    Args:
        src_fp: ``pathlib.Path`` of the file to copy.
        dest_fp: ``pathlib.Path`` to copy it to.

    Returns:
        None.
    """
    if dest_fp.exists():
        return
    dest_fp.parent.mkdir(parents=True, exist_ok=True)
    payload = src_fp.read_bytes()
    dest_fp.write_bytes(payload)

def form_image_fp(row, league=None, image_type=None):
    """Copy one match's heatmap image into the interim tree and return its path.

    The source image is looked up in the module-level ``fps`` mapping under
    the key ``<season>__date__<h>__<a>``. It is copied to
    ``INTERIM_DATA_DIR/<league>/heatmaps/`` under the same name with the text
    ``date`` replaced by the match date.

    Args:
        row: DataFrame row whose ``name`` is a ``(season, timestamp)`` pair
            and which has string values under 'h' and 'a'.
        league: Directory name placed under ``INTERIM_DATA_DIR``.
        image_type: Accepted for call compatibility but not used; the
            'heatmaps' directory is hard-coded.

    Returns:
        The destination path as a string with ``str(PROJECT_DIR)`` removed.
    """
    season = row.name[0]
    date = str(row.name[1].date())
    # Lookup keys carry the literal placeholder 'date'; the real date is
    # only substituted into the destination file name.
    key = season + '__' + 'date' + '__' + row['h'] + '__' + row['a']
    src_fp = fps[key]
    dest_fp = INTERIM_DATA_DIR / league / 'heatmaps' / str(key.replace('date', date) + src_fp.suffix)
    # Single-file copy: use copy_src_to_dest, as the shotmap version of this
    # function does. copy_files is called elsewhere with lists of paths.
    copy_src_to_dest(src_fp, dest_fp)
    rel_path = str(dest_fp).replace(str(PROJECT_DIR), '')
    return rel_path
    
# Copy each row's heatmap image and store the returned path in a new
# 'heatmap_path' column.
# NOTE(review): `league` is not defined in the visible cells - confirm where
# it is set.
df1.loc[:, 'heatmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='heatmaps', axis=1)
df1.head()


In [None]:
def form_image_fp(row, league=None, image_type=None):
    """Copy one match's shotmap image into the interim tree and return its path.

    The source image is looked up in the module-level ``fpss`` mapping under
    the key ``<season>__date__<h>__<a>``. It is copied with
    ``copy_src_to_dest`` to ``INTERIM_DATA_DIR/<league>/shotmaps/`` under the
    same name with the text ``date`` replaced by the match date.

    Args:
        row: DataFrame row whose ``name`` is a ``(season, timestamp)`` pair
            and which has string values under 'h' and 'a'.
        league: Directory name placed under ``INTERIM_DATA_DIR``.
        image_type: Accepted but not used; the 'shotmaps' directory is
            hard-coded.

    Returns:
        The destination path as a string with ``str(PROJECT_DIR)`` removed.
    """
    row = row.copy(deep=True)
    season, match_day = row.name[0], row.name[1]
    date = str(match_day.date())
    # Lookup keys carry the literal placeholder 'date', not the real date.
    key = '__'.join([season, 'date', row['h'], row['a']])
    src_fp = fpss[key]
    suffix = src_fp.suffix
    dest_name = key.replace('date', date) + suffix
    dest_fp = INTERIM_DATA_DIR / league / 'shotmaps' / str(dest_name)
    copy_src_to_dest(src_fp, dest_fp)
    return str(dest_fp).replace(str(PROJECT_DIR), '')
    
# Copy each row's shotmap image and store the returned path in a new
# 'shotmap_path' column.
# NOTE(review): `league` is not defined in the visible cells - confirm where
# it is set.
df1.loc[:, 'shotmap_path'] = df1.apply(form_image_fp,
                                       league=league,
                                       image_type='shotmaps', axis=1)
df1.head()
