In [136]:
import pandas as pd
import scipy
import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import os
from unidecode import unidecode
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('max_colwidth', 100)

In [137]:
#event data
event_df = pd.read_csv("C:/MyDevelopment/Goalscorers/clean_data/event_data.csv")

#expectancies data
exp_df = pd.read_csv("C:/MyDevelopment/Goalscorers/clean_data/match_expectancies.csv")

In [138]:
#get unique teams list for fbref and betfair, unidecode names
event_teams = pd.Series(pd.unique(event_df[['home_team', 'away_team']].values.ravel('K'))).transform(lambda x: unidecode(x))
exp_teams = pd.Series(pd.unique(exp_df[['home_team', 'away_team']].values.ravel('K'))).transform(lambda x: unidecode(x))

In [139]:
#helper functions
def transform_dict(input_dict):
    output_dict = {}
    for key, value_list in input_dict.items():
        for value in value_list:
            output_dict[value] = key
    return output_dict

def match_words(A, B):
    result = {}
    for a in A:
        words = set(a.split())
        words = [w for w in words if len(w) > 2 and w not in ["City","Town","United","Real", "West"]]
        matching_elements = [b for b in B if any(word in b for word in words)]
        result[a] = matching_elements
    return result

def filter_function(row):
    flag = row['player'].split()[-1].split('-')[-1] in [word for part in row['runner_name'].split() for word in part.split('-')]
    return flag

def get_scores(row):
    score = fuzz.ratio(row["player"], row["runner_name"])
    return score

In [140]:
#current mapping (key:fbref -> val:betfair)
mapping = {
    'ADO Den Haag': ['Den Haag'],
    'Arminia': ['Bielefeld'],
    'Athletic Club': ['Ath Bilbao'],
    'Atletico Madrid': ['Ath Madrid'],
    'Bayer Leverkusen': ['Leverkusen'],
    'Belenenses SAD': ['Belenenses'],
    'Birmingham City': ['Birmingham'],
    'Blackburn Rovers': ['Blackburn'],
    'Bolton Wanderers': ['Bolton'],
    'Braga': ['Sp Braga'],
    'Brighton & Hove Albion': ['Brighton'],
    'Cardiff City': ['Cardiff'],
    'Celta Vigo': ['Celta'],
    'Charlton Athletic': ['Charlton'],
    'Clermont Foot': ['Clermont'],
    'Coventry City': ['Coventry'],
    'De Graafschap': ['Graafschap'],
    'Deportivo La Coruna': ['La Coruna'],
    'Derby County': ['Derby'],
    'Dusseldorf': ['Fortuna Dusseldorf'],
    'Eintracht Frankfurt': ['Ein Frankfurt'],
    'Emmen': ['FC Emmen'],
    'Espanyol': ['Espanol'],
    'Fortuna Sittard': ['For Sittard'],
    'Gil Vicente FC': ['Gil Vicente'],
    'Hamburger SV': ['Hamburg'],
    'Hannover 96': ['Hannover'],
    'Hellas Verona': ['Verona'],
    'Heracles Almelo': ['Heracles'],
    'Hertha BSC': ['Hertha'],
    'Huddersfield Town': ['Huddersfield'],
    'Hull City': ['Hull'],
    'Internazionale': ['Inter'],
    'Ipswich Town': ['Ipswich'],
    'Koln': ['FC Koln'],
    'Leeds United': ['Leeds'],
    'Leicester City': ['Leicester'],
    'Luton Town': ['Luton'],
    'Mainz 05': ['Mainz'],
    'Manchester City': ['Man City'],
    'Manchester United': ['Man United'],
    'Monchengladbach': ["M'gladbach"],
    'NEC Nijmegen': ['Nijmegen'],
    'Newcastle United': ['Newcastle'],
    'Norwich City': ['Norwich'],
    'Nottingham Forest': ["Nott'm Forest"],
    'Pacos de Ferreira': ['Pacos Ferreira'],
    'Paderborn 07': ['Paderborn'],
    'Paris Saint-Germain': ['Paris SG'],
    'Peterborough United': ['Peterboro'],
    'Preston North End': ['Preston'],
    'Queens Park Rangers': ['QPR'],
    'RKC Waalwijk': ['Waalwijk'],
    'Rayo Vallecano': ['Vallecano'],
    'Real Betis': ['Betis'],
    'Real Sociedad': ['Sociedad'],
    'Rotherham United': ['Rotherham'],
    'SPAL': ['Spal'],
    'Saint-Etienne': ['St Etienne'],
    'Sheffield Wednesday': ['Sheffield Weds'],
    'Sporting CP': ['Sp Lisbon'],
    'Stoke City': ['Stoke'],
    'Swansea City': ['Swansea'],
    'Tottenham Hotspur': ['Tottenham'],
    'VVV-Venlo': ['VVV Venlo'],
    'Vitoria Guimaraes': ['Guimaraes'],
    'Vitoria Setubal': ['Setubal'],
    'West Bromwich Albion': ['West Brom'],
    'West Ham United': ['West Ham'],
    'Wigan Athletic': ['Wigan'],
    'Wolverhampton Wanderers': [],
    'Wycombe Wanderers': ['Wycombe']
}

team_mapping = transform_dict(mapping)
exp_teams = exp_teams.transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)

In [141]:
intersection = np.intersect1d(event_teams, exp_teams)
outersection = np.setdiff1d(event_teams, exp_teams)
assert(len(outersection) + len(intersection) == len(event_teams))

print(outersection)

['Wolverhampton Wanderers']


In [142]:
#automatically search for team name pairs that share a word (see match_words function def), if raw_map contains key, val pairs,
#check if they are correct and update the mapping above and re run.
raw_map = match_words(outersection, exp_teams)
raw_map

{'Wolverhampton Wanderers': ['Bolton Wanderers', 'Wycombe Wanderers']}

In [143]:
exp_teams[exp_teams.str.contains("Wol")]

10       Wolves
90    Wolfsburg
dtype: object

In [175]:
event_df['home_team_ud'] = event_df['home_team'].transform(lambda x: unidecode(x))
event_df['away_team_ud'] = event_df['away_team'].transform(lambda x: unidecode(x))

exp_df['home_team_ud'] = exp_df['home_team'].transform(lambda x: unidecode(x))
exp_df['away_team_ud'] = exp_df['away_team'].transform(lambda x: unidecode(x))

exp_df['home_team'] = exp_df['home_team'].transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)
exp_df['away_team'] = exp_df['away_team'].transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)

event_df.datetime = pd.to_datetime(event_df.datetime)
event_df["date"] = pd.to_datetime(event_df.datetime.dt.date)
exp_df["date"] = pd.to_datetime(exp_df.date)

#merge matches on matchdate, home_team and away_team (left merge)
event_matches = event_df[['home_team_ud', 'away_team_ud','home_team', 'away_team', 'date']].drop_duplicates()
exp_matches = exp_df[['home_team_ud', 'away_team_ud', 'date',"home_exp","away_exp"]].drop_duplicates()

matches = pd.merge(event_matches, exp_matches, on=['date', 'home_team_ud', 'away_team_ud'], how='inner')

In [176]:
matches.head(5)

Unnamed: 0,home_team_ud,away_team_ud,home_team,away_team,date,home_exp,away_exp
0,Juventus,Cagliari,Juventus,Cagliari,2017-08-19,2.684264,0.459997
1,Hellas Verona,Napoli,Hellas Verona,Napoli,2017-08-19,0.819507,2.221524
2,Atalanta,Roma,Atalanta,Roma,2017-08-20,1.295109,1.524255
3,Udinese,Chievo,Udinese,Chievo,2017-08-20,1.402496,0.967474
4,Internazionale,Fiorentina,Internazionale,Fiorentina,2017-08-20,1.990541,0.905028


In [177]:
#check duplicates
matches[["home_team", "away_team", "date"]].duplicated(keep=False).sum()

0

In [178]:
#check missing matches
missing_matches = pd.merge(event_matches, matches,
                           on=["home_team_ud","away_team_ud","home_team","away_team","date"],
                           how='left',
                           indicator=True)

print(len(missing_matches[missing_matches._merge == 'left_only']))
missing_matches[missing_matches._merge == 'left_only'].head(5)

240


Unnamed: 0,home_team_ud,away_team_ud,home_team,away_team,date,home_exp,away_exp,_merge
2377,Aston Villa,West Bromwich Albion,Aston Villa,West Bromwich Albion,2019-05-11,,,left_only
2378,Derby County,Leeds United,Derby County,Leeds United,2019-05-11,,,left_only
2379,West Bromwich Albion,Aston Villa,West Bromwich Albion,Aston Villa,2019-05-14,,,left_only
2380,Leeds United,Derby County,Leeds United,Derby County,2019-05-15,,,left_only
2381,Aston Villa,Derby County,Aston Villa,Derby County,2019-05-27,,,left_only


In [179]:
matches.drop(columns=["home_team_ud","away_team_ud"], inplace=True)
matches.to_csv("C:/MyDevelopment/Goalscorers/goal_expectancies/fbref_matched_expectancies.csv", index=False)