In [24]:
import bz2
import os
import tarfile
import json
from datetime import datetime
import pandas as pd
from unidecode import unidecode
import numpy as np
from fuzzywuzzy import fuzz
import sys
sys.path.insert(0,'C:/MyDevelopment/goalscorer-model')

import data_cleaning as dc

In [25]:
#import betfair data
betfair_df = pd.read_csv("betfairhistoricalprices/goalscorer_price_data.csv")

In [26]:
#import fbref data
fbref_df = dc.load_data(seasons_to_load=None, leagues_to_load=None)

In [27]:
#make deep copy of fbref data
event_df = fbref_df.copy(deep=True)

In [28]:
betfair_df.head()

Unnamed: 0,operation_type,publish_time,event_name,event_id,market_name,open_date,market_time,inplay,runner_name,runner_id,ltp
0,mcm,2017-01-01 12:45:30.823,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,8.0
1,mcm,2017-01-01 13:00:30.805,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,11.5
2,mcm,2017-01-01 13:12:30.539,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Abdoulaye Doucoure,7647245,11.0
3,mcm,2017-01-01 13:53:30.864,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,True,Adlene Guedioura,4506604,16.0
4,mcm,2016-12-30 17:09:59.154,Watford v Tottenham,28054213,To Score,2017-01-01T13:30:00.000Z,2017-01-01T13:30:00.000Z,False,Christian Eriksen,4540367,2.98


In [45]:
#helper functions
def transform_dict(input_dict):
    output_dict = {}
    for key, value_list in input_dict.items():
        for value in value_list:
            output_dict[value] = key
    return output_dict

def match_words(A, B):
    result = {}
    for a in A:
        words = set(a.split())
        words = [w for w in words if len(w) > 2 and w not in ["City","Town","United","Real", "West"]]
        matching_elements = [b for b in B if any(word in b for word in words)]
        result[a] = matching_elements
    return result

def filter_function(row):
    flag = row['player'].split()[-1].split('-')[-1] in [word for part in row['runner_name'].split() for word in part.split('-')]
    return flag

def get_scores(row):
    score = fuzz.ratio(row["player"], row["runner_name"])
    return score

## 1. Map team names

In [46]:
#split event name into home away teams
betfair_df[['home_team', 'away_team']] = betfair_df['event_name'].str.split(' v ', n=1, expand=True)

#get unique teams list for fbref and betfair, unidecode names
event_df_teams = pd.Series(pd.unique(event_df[['home_team', 'away_team']].values.ravel('K'))).transform(lambda x: unidecode(x))
betfair_df_teams = pd.Series(pd.unique(betfair_df[['home_team', 'away_team']].values.ravel('K'))).transform(lambda x: unidecode(x))

In [47]:
#current mapping (key:fbref -> val:betfair)
mapping = {
    'Ajaccio': ['AC Ajaccio'],
    'Arminia': ['Arminia Bielefeld'],
    'Arouca': [],
    'Athletic Club': ['Athletic Bilbao'],
    'Aves': [],
    'Bayer Leverkusen': ['Leverkusen'],
    'Belenenses SAD': [],
    'Birmingham City': ['Birmingham'],
    'Blackburn Rovers': ['Blackburn'],
    'Boavista': [],
    'Bolton Wanderers': ['Bolton'],
    'Brighton & Hove Albion': ['Brighton'],
    'Cambuur': [],
    'Cardiff City': ['Cardiff'],
    'Casa Pia': [],
    'Charlton Athletic': ['Charlton'],
    'Chaves': [],
    'Clermont Foot': ['Clermont'],
    'Coventry City': ['Coventry'],
    'Cremonese': ['US Cremonese'],
    'De Graafschap': [],
    'Deportivo La Coruna': ['Deportivo'],
    'Derby County': ['Derby'],
    'Dusseldorf': ['Fortuna Dusseldorf'],
    'Emmen': [],
    'Estoril': [],
    'Famalicao': [],
    'Farense': [],
    'Feirense': [],
    'Fortuna Sittard': [],
    'Gil Vicente FC': [],
    'Go Ahead Eagles': [],
    'Hannover 96': ['Hannover'],
    'Heerenveen': [],
    'Hellas Verona': ['Verona'],
    'Heracles Almelo': [],
    'Hertha BSC': ['Hertha Berlin'],
    'Internazionale': ['Inter Milan'],
    'Ipswich Town': ['Ipswich'],
    'Koln': ['FC Koln'],
    'Leeds United': ['Leeds', 'Leeds Utd'],
    'Leicester City': ['Leicester'],
    'Luton Town': ['Luton'],
    'Mainz 05': ['Mainz', 'FSV Mainz 05'],
    'Manchester City': ['Man City'],
    'Manchester United': ['Man Utd'],
    'Maritimo': [],
    'Milan': ['AC Milan'],
    'Monchengladbach': ['Mgladbach'],
    'Monza': ['AC Monza'],
    'Moreirense': [],
    'NAC Breda': [],
    'NEC Nijmegen': [],
    'Nacional': [],
    'Newcastle United': ['Newcastle'],
    'Norwich City': ['Norwich'],
    'Nottingham Forest': ['Nottm Forest'],
    'PSV Eindhoven': ['PSV'],
    'Pacos de Ferreira': [],
    'Paderborn 07': ['Paderborn'],
    'Paris Saint-Germain': ['Paris St-G'],
    'Peterborough United': ['Peterborough'],
    'Portimonense': [],
    'Preston North End': ['Preston'],
    'Queens Park Rangers': ['Rangers'],
    'RKC Waalwijk': [],
    'Real Betis': ['Betis'],
    'Rio Ave': [],
    'Rotherham United': ['Rotherham'],
    'Saint-Etienne': ['St Etienne'],
    'Santa Clara': [],
    'Sheffield United': ['Sheff Utd'],
    'Sheffield Wednesday': ['Sheff Wed'],
    'Sparta Rotterdam': [],
    'Sporting CP': ['Sporting Lisbon'],
    'Swansea City': ['Swansea'],
    'Tondela': [],
    'Tottenham Hotspur': ['Tottenham'],
    'Troyes': ['ESTAC Troyes'],
    'VVV-Venlo': [],
    'Vitoria Guimaraes': ['Guimaraes'],
    'Vitoria Setubal': [],
    'Vizela': [],
    'Volendam': [],
    'West Bromwich Albion': ['West Brom'],
    'West Ham United': ['West Ham'],
    'Wigan Athletic': ['Wigan'],
    'Willem II': [],
    'Wolverhampton Wanderers': ['Wolves'],
    'Wycombe Wanderers': ['Wycombe']
}

team_mapping = transform_dict(mapping)
betfair_df_teams = betfair_df_teams.transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)

In [48]:
intersection = np.intersect1d(event_df_teams, betfair_df_teams)
outersection = np.setdiff1d(event_df_teams, betfair_df_teams)
assert(len(outersection) + len(intersection) == len(event_df_teams))

#print(intersection)
print(outersection)

['Arouca' 'Aves' 'Belenenses SAD' 'Cambuur' 'Casa Pia' 'De Graafschap'
 'Emmen' 'Estoril' 'Famalicao' 'Farense' 'Fortuna Sittard'
 'Gil Vicente FC' 'Go Ahead Eagles' 'Heracles Almelo' 'Maritimo'
 'NAC Breda' 'Nacional' 'Pacos de Ferreira' 'Portimonense' 'RKC Waalwijk'
 'Rio Ave' 'Santa Clara' 'VVV-Venlo' 'Vitoria Setubal' 'Vizela' 'Volendam']


In [49]:
#automatically search for team name pairs that share a word (see match_words function def), if raw_map contains key, val pairs,
#check if they are correct and update the mapping above and re run.
raw_map = match_words(outersection, betfair_df_teams)
raw_map

{'Arouca': [],
 'Aves': [],
 'Belenenses SAD': [],
 'Cambuur': [],
 'Casa Pia': [],
 'De Graafschap': [],
 'Emmen': [],
 'Estoril': [],
 'Famalicao': [],
 'Farense': [],
 'Fortuna Sittard': [],
 'Gil Vicente FC': ['Gillingham', 'Union St Gilloise'],
 'Go Ahead Eagles': [],
 'Heracles Almelo': ['Heracles'],
 'Maritimo': [],
 'NAC Breda': ['Breda'],
 'Nacional': [],
 'Pacos de Ferreira': [],
 'Portimonense': [],
 'RKC Waalwijk': [],
 'Rio Ave': [],
 'Santa Clara': [],
 'VVV-Venlo': [],
 'Vitoria Setubal': ['Setubal', 'Vitoria Guimaraes'],
 'Vizela': [],
 'Volendam': []}

## 2. Map fbref and betfair matches and players

### 2.1 Matches

In [50]:
#apply pre-processing to event_df and betfair_df (unidecode and map betfair names to fbref names)
event_df['home_team'] = event_df['home_team'].transform(lambda x: unidecode(x))
event_df['away_team'] = event_df['away_team'].transform(lambda x: unidecode(x))

betfair_df['home_team'] = betfair_df['home_team'].transform(lambda x: unidecode(x))
betfair_df['away_team'] = betfair_df['away_team'].transform(lambda x: unidecode(x))

#map betfair team names to fbref team names
betfair_df['home_team'] = betfair_df['home_team'].transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)
betfair_df['away_team'] = betfair_df['away_team'].transform(lambda x: team_mapping[x] if x in team_mapping.keys() else x)

#rename betfair open_date as datetime, and convert both to string datetimes,remove timezone from utc betfair datetime
betfair_df["datetime"] = betfair_df.open_date.copy()
betfair_df.datetime = pd.to_datetime(betfair_df.datetime).dt.tz_localize(None)
event_df.datetime = pd.to_datetime(event_df.datetime)

#create date columns
betfair_df["date"] = pd.to_datetime(betfair_df.datetime.dt.date)
event_df["date"] = pd.to_datetime(event_df.datetime.dt.date)

#merge matches on matchdate, home_team and away_team (left merge)
betfair_matches = betfair_df[["event_name", "date", "home_team", "away_team"]].drop_duplicates()
event_matches = event_df[['home_team', 'away_team', 'date']].drop_duplicates()

#temporary pre-2023 and post-2017 filter!!
#event_matches = event_matches[(event_matches.date < '2023-01-01') & (event_matches.date > '2018-01-01')]

matches = pd.merge(event_matches, betfair_matches, on=['date', 'home_team', 'away_team'])

### 2.2 Players

In [51]:
#repeat the merge but this time include the players
#merge matches on matchdate, home_team and away_team (left merge)
betfair_players = betfair_df[["date", "home_team", "away_team", "event_id","runner_id", "runner_name"]].drop_duplicates()
event_players = event_df[['home_team', 'away_team', 'date',"player_id", "player"]].drop_duplicates()

betfair_players.runner_name = betfair_players.runner_name.transform(lambda x: unidecode(x))
event_players.player = event_players.player.transform(lambda x: unidecode(x))

#temporary pre-2023 and post-2017 filter!!
#event_players = event_players[(event_players.date < '2023-01-01') & (event_players.date > '2018-01-01')]

#use only matches that exist both in betfair and fbref datasets (from 2.1)
matched_players_fbref = pd.merge(matches, event_players, on=['date', 'home_team', 'away_team'], how='left')
matched_players = pd.merge(matched_players_fbref, betfair_players, on=['date', 'home_team', 'away_team'], how='left')

In [52]:
#filter for rows where fbref player last name is one of the names in runner_name 
filtered_df = matched_players[matched_players.apply(lambda row: filter_function(row), axis=1)].copy(deep=True)

#score each row for similarity between player and runner_name
filtered_df["str_scores"] = filtered_df.apply(get_scores, axis=1)

#check duplicates
print(f"N. duplicates = {len(filtered_df[filtered_df[['home_team', 'away_team', 'date', 'player']].duplicated(keep=False)])}")

#rank rows by similarity and for each "player" keep only the best match
indices =  filtered_df.groupby(["home_team", "away_team", "date", "player"])['str_scores'].idxmax()
filtered_df = filtered_df.loc[indices]

#check duplicates
print(f"N. duplicates after removal = {len(filtered_df[filtered_df[['home_team', 'away_team', 'date', 'player']].duplicated(keep=False)])}")

N. duplicates = 1914
N. duplicates after removal = 0


## 3. Check missing values

### 3.1 Check missing matches

In [53]:
print(f"N. betfair matches = {len(betfair_matches)}")
print(f"N. fbref matches = {len(event_matches)}")
print(f"N. of 'matched' matches = {len(matches)}")

N. betfair matches = 13427
N. fbref matches = 16645
N. of 'matched' matches = 7688


In [54]:
all_matches = event_matches.merge(matches, on=["home_team","away_team","date"], how='left')
all_matches['year'] = all_matches['date'].dt.year

# Group by year and count
result = all_matches.groupby('year').agg(
    total_rows=pd.NamedAgg(column='date', aggfunc='size'),
    null_event_rows=pd.NamedAgg(column='event_name', aggfunc=lambda x: x.isnull().sum())
)

display(result.head(10))

# # Group by leaue and count
# result = all_matches.groupby('league_name').agg(
#     total_rows=pd.NamedAgg(column='date', aggfunc='size'),
#     null_event_rows=pd.NamedAgg(column='event_name', aggfunc=lambda x: x.isnull().sum())
# )

# display(result.head(10))

# # Group by leaue and year, and count
# result = all_matches.groupby(['league_name','year']).agg(
#     total_rows=pd.NamedAgg(column='date', aggfunc='size'),
#     null_event_rows=pd.NamedAgg(column='event_name', aggfunc=lambda x: x.isnull().sum())
# )

# display(result.head(80))

Unnamed: 0_level_0,total_rows,null_event_rows
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,909,547
2018,2387,1509
2019,3005,1976
2020,2562,1488
2021,3259,1592
2022,2829,1229
2023,1694,616


In [55]:
#all_matches[(all_matches.league_name == "La Liga") & (all_matches.event_name.isnull())].head(3)

### 3.2 Check missing players (for matched matches)

In [56]:
#check missing values
print(matched_players_fbref.shape)
print(filtered_df.shape)

# all_matches = matched_players_fbref.merge(filtered_df,
#                          on = ["home_team","away_team","date","league_name","season","event_name","player_id","player"],
#                          how='left')

# Group by year and count
# result = all_matches.groupby('season').agg(
#     total_rows=pd.NamedAgg(column='date', aggfunc='size'),
#     null_event_rows=pd.NamedAgg(column='runner_name', aggfunc=lambda x: x.isnull().sum())
# )

# display(result.head(10))

# Group by leaue and count
# result = all_matches.groupby('position').agg(
#     total_rows=pd.NamedAgg(column='date', aggfunc='size'),
#     null_event_rows=pd.NamedAgg(column='runner_name', aggfunc=lambda x: x.isnull().sum())
# )

# display(result.head(15))

(225353, 8)
(86424, 12)


# Export data

In [57]:
event_df["home_team_original"] = fbref_df["home_team"].values
event_df["away_team_original"] = fbref_df["away_team"].values

event_df_mapped = event_df.merge(filtered_df[["home_team", "away_team", "date","player_id", "event_id", "runner_id"]],
                                 on=["home_team", "away_team", "date","player_id"])



#select only key columns
event_df_mapped = event_df_mapped[["home_team_original","away_team_original","datetime","player_id","event_id","runner_id"]]
event_df_mapped = event_df_mapped.rename(columns={"home_team_original":"home_team","away_team_original":"away_team"})

In [58]:
event_df_mapped.to_csv("betfairhistoricalprices/fbref_betfair_mapping.csv", index=False)

In [59]:
event_df_mapped.head()

Unnamed: 0,home_team,away_team,datetime,player_id,event_id,runner_id
0,Hellas Verona,Napoli,2017-08-19 18:45:00,5370cba7,28332149,8851417
1,Atalanta,Roma,2017-08-20 16:00:00,6e4df551,28332152,14098432
2,Torino,Sassuolo,2017-08-27 16:00:00,d705ba44,28346754,8783691
3,Napoli,Atalanta,2017-08-27 18:45:00,89b2c8a9,28346750,7673127
4,Napoli,Atalanta,2017-08-27 18:45:00,2f557579,28346750,6637779


In [44]:
event_df_mapped.shape

(86424, 6)

In [60]:
event_df_mapped.shape

(86424, 6)