# **Football Goals Prediction Model**

In [1]:
!pip install -q unidecode
!pip install -q rapidfuzz pandas 

In [2]:
import pandas as pd
import json, unidecode, ast, unicodedata
from rapidfuzz import process, fuzz
from functools import lru_cache

## **1.1 Data Preprocessing Stage**

* Datasets we have: `clubs_team_players_v1.csv`, `national_team_players_v1.csv`, `matches_v1.json` and `players_rating_v1.csv`

* First, we need to attach each team's players to the matches in `matches_v1.json`

In [44]:
# Load the raw inputs: the two roster tables (club and national teams), the
# match records (JSON) and the player ratings table.
clubs = pd.read_csv('data/clubs_team_players_v1.csv')
national = pd.read_csv('data/national_team_players_v1.csv')
with open('data/matches_v1.json', 'r', encoding='utf-8') as f:
    matches_data = json.load(f)
ratings = pd.read_csv('data/players_rating_v1.csv')

In [35]:
# Quick schema check of the club roster table: columns, dtypes, non-null counts.
clubs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   team     194 non-null    object
 1   players  194 non-null    object
dtypes: object(2)
memory usage: 3.2+ KB


In [53]:
def normalize_team_name(name):
    """Return a canonical form of a team name for use as a lookup key.

    The name is transliterated with ``unidecode.unidecode``, then surrounding
    whitespace is removed and the result is lower-cased.
    """
    transliterated = unidecode.unidecode(name)
    trimmed = transliterated.strip()
    return trimmed.lower()

def fix_players_list(player_list):
    """Parse the textual ``players`` cell of a roster row into a list of names.

    Well-formed Python list literals (e.g. ``"['A B', 'C D']"``) are parsed with
    ``ast.literal_eval`` and returned as-is. Anything that is not a valid
    literal falls back to a tolerant parser that splits on commas.

    Args:
        player_list: String representation of a list of player names.

    Returns:
        The parsed list of player names.

    Raises:
        ValueError: If ``player_list`` is not a string and cannot be evaluated.
    """
    try:
        return ast.literal_eval(player_list)
    except (SyntaxError, ValueError):
        # SyntaxError: unquoted multi-word names or stray apostrophes
        #              (e.g. "[A B, C D]" or "['N'Golo Kante']").
        # ValueError:  syntactically valid but non-literal content such as
        #              bare identifiers (e.g. "[Rodri, Gavi]").
        if not isinstance(player_list, str):
            # Only text can be re-parsed by hand; keep the original error.
            raise

    inner = player_list.strip().strip('[]')
    players = []
    for raw_item in inner.split(','):
        name = raw_item.strip()
        # Drop one pair of matching surrounding quotes so already-quoted items
        # do not keep literal quote characters in the player name.
        if len(name) >= 2 and name[0] == name[-1] and name[0] in ('"', "'"):
            name = name[1:-1]
        if name:
            players.append(name)
    return players

def normalize_text(text):
    """Lower-case, trim and strip accents from ``text``.

    The string is decomposed with Unicode NFD and every combining mark
    (category ``'Mn'``) is dropped, so accented letters collapse to their base
    letter. Characters that have no decomposition are left unchanged.
    """
    decomposed = unicodedata.normalize('NFD', text.lower().strip())
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

In [38]:
# Bring team names in the match records into the same canonical form that is
# used for the roster keys below.
for match in matches_data:
    for side in ('home_team', 'away_team'):
        match[side] = normalize_team_name(match[side])

clubs['team'] = clubs['team'].map(normalize_team_name)
national['team'] = national['team'].map(normalize_team_name)

# One {team: [players]} mapping per roster table; national entries override
# club entries if a team name appears in both.
clubs_team_to_players = {
    team: fix_players_list(players)
    for team, players in zip(clubs['team'], clubs['players'])
}
national_team_to_playes = {
    team: fix_players_list(players)
    for team, players in zip(national['team'], national['players'])
}
team_to_players = dict(clubs_team_to_players)
team_to_players.update(national_team_to_playes)

next(iter(team_to_players.items()))  # peek at the first (team, players) entry

('bsc young boys',
 ['Meschack Elia',
  'Aurèle Amenda',
  'Filip Ugrinic',
  'Kastriot Imeri',
  'Joel Mvuka',
  'Darian Males',
  'Mohamed Camara',
  'Cheikh Niasse',
  'Cedric Itten',
  'Joël Monteiro',
  'Ebrima Colley'])

In [39]:
# Team names are normalised again here; the operation is idempotent, so
# repeating it on already-normalised names changes nothing.
for match in matches_data:
    match['home_team'] = normalize_team_name(match['home_team'])
    match['away_team'] = normalize_team_name(match['away_team'])


# Attach up to eleven players per side as <side>_p1 .. <side>_p11.
# Teams that are absent from team_to_players get no player keys at all.
for match in matches_data:
    sides = (('home_team', match['home_team']), ('away_team', match['away_team']))
    for side, team in sides:
        if team not in team_to_players:
            continue
        for slot, player in enumerate(team_to_players[team][:11], start=1):
            match[f'{side}_p{slot}'] = player

with open("data/matches_with_players.json", "w", encoding='utf-8') as file:
    json.dump(matches_data, file, ensure_ascii=False, indent=4)

# Sanity check: show the last and the first enriched match.
print(matches_data[-1])
print(matches_data[0])

{'home_team': 'netherlands', 'away_team': 'spain', 'home_team_score': 0, 'away_team_score': 0, 'match_type': 'country', 'home_team_p1': 'Xavi Simons', 'home_team_p2': 'Frenkie de Jong', 'home_team_p3': 'Matthijs de Ligt', 'home_team_p4': 'Cody Gakpo', 'home_team_p5': 'Jeremie Frimpong', 'home_team_p6': 'Micky van de Ven', 'home_team_p7': 'Nathan Aké', 'home_team_p8': 'Donyell Malen', 'home_team_p9': 'Teun Koopmeiners', 'home_team_p10': 'Ryan Gravenberch', 'home_team_p11': 'Virgil van Dijk', 'away_team_p1': 'Rodri', 'away_team_p2': 'Lamine Yamal', 'away_team_p3': 'Dani Olmo', 'away_team_p4': 'Mikel Merino', 'away_team_p5': 'Mikel Oyarzabal', 'away_team_p6': 'Martín Zubimendi', 'away_team_p7': 'Nico Williams', 'away_team_p8': 'Alejandro Grimaldo', 'away_team_p9': 'Pedro Porro', 'away_team_p10': 'Robin Le Normand', 'away_team_p11': 'David Raya'}
{'home_team': 'atletico de madrid', 'away_team': 'inter milan', 'home_team_score': 5, 'away_team_score': 5, 'match_type': 'club', 'home_team_p1':

In [61]:
# Read the player-augmented match records back from disk into `matches`.
with open('data/matches_with_players.json', 'r', encoding='utf-8') as file:
    matches = json.load(file)

Awesome! Now let's get the player ratings.
- First, we normalize the player names in both datasets.
- Then we use fuzzy matching to map each player name to its corresponding rating.

In [48]:
# Put both name columns of the ratings table through the same normalisation
# that is applied to the player names in the match records.
ratings['name'] = ratings['name'].map(normalize_text)
ratings['full_name'] = ratings['full_name'].map(normalize_text)

In [49]:
# Lookup tables from (normalised) short name / full name to overall rating.
# When a name occurs more than once, the last row wins.
name_to_rating = ratings.set_index('name')['overall_rating'].to_dict()
fullname_to_rating = ratings.set_index('full_name')['overall_rating'].to_dict()

In [62]:
# Normalise every player name stored on a match so it can be compared with the
# normalised names in the ratings table. Home and away slots are handled
# independently: an away player is normalised even when the same-numbered home
# slot is missing.
for match in matches:
    for side in ('home_team', 'away_team'):
        for i in range(1, 12):
            player_key = f'{side}_p{i}'
            if player_key in match:
                match[player_key] = normalize_text(match[player_key])

In [63]:
@lru_cache(maxsize=None)
def get_ratings(player_name):
    """Return the overall rating of the best fuzzy match for ``player_name``.

    The name is matched against the keys of ``name_to_rating`` and of
    ``fullname_to_rating`` using ``fuzz.WRatio`` with a score cutoff of 90.
    The rating belonging to the higher-scoring hit is returned; on a tie the
    short-name hit wins. ``None`` is returned when neither table yields a hit.

    Results are memoised per ``player_name``, so later changes to the lookup
    tables are not reflected for names that were already queried.

    NOTE(review): WRatio also rewards partial matches, so a short name may be
    accepted for an unrelated longer one - confirm the cutoff is strict enough.
    """
    scored_ratings = []
    # Short names are queried first so that they win ties in max() below.
    for table in (name_to_rating, fullname_to_rating):
        hit = process.extractOne(player_name, table.keys(), scorer=fuzz.WRatio, score_cutoff=90)
        if hit:
            scored_ratings.append((hit[1], table[hit[0]]))

    if not scored_ratings:
        return None
    # max() keeps the first of equally scored entries.
    return max(scored_ratings, key=lambda pair: pair[0])[1]

def process_match(match):
    """Add a ``<player_key>_rating`` entry for every rated player of a match.

    For each slot 1..11 and each side (home first, then away) the player name
    stored under ``<side>_p<i>`` is looked up with ``get_ratings``. When a
    rating is found it is stored under ``<side>_p<i>_rating``; players without
    a match get no rating key.

    Args:
        match: Match record (dict); it is modified in place.

    Returns:
        The same ``match`` dict, for convenient use in comprehensions.
    """
    for i in range(1, 12):
        for side in ('home_team', 'away_team'):
            player_key = f'{side}_p{i}'
            if player_key not in match:
                continue
            rating = get_ratings(match[player_key])
            # Test against None explicitly: a plain truthiness check would
            # silently drop a legitimate rating of 0.
            if rating is not None:
                match[f'{player_key}_rating'] = rating
    return match
# Attach ratings to every match and persist the result.
matches = [process_match(match) for match in matches]
# Write into data/ because the cleaning stage reads
# 'data/football_matches_dataset_v1.json'; writing to the working directory
# would leave that stage reading a stale or missing file.
with open('data/football_matches_dataset_v1.json', 'w', encoding='utf-8') as file:
    json.dump(matches, file, ensure_ascii=False, indent=4)

print(matches[0])
print(matches[-1])

{'home_team': 'atletico de madrid', 'away_team': 'inter milan', 'home_team_score': 5, 'away_team_score': 5, 'match_type': 'club', 'home_team_p1': 'nahuel molina', 'home_team_p2': 'jan oblak', 'home_team_p3': 'rodrigo de paul', 'home_team_p4': 'marcos llorente', 'home_team_p5': 'arthur vermeeren', 'home_team_p6': 'antoine griezmann', 'home_team_p7': 'jose maria gimenez', 'home_team_p8': 'mario hermoso', 'home_team_p9': 'rodrigo riquelme', 'home_team_p10': 'samuel lino', 'home_team_p11': 'pablo barrios', 'away_team_p1': 'lautaro martinez', 'away_team_p2': 'nicolo barella', 'away_team_p3': 'alessandro bastoni', 'away_team_p4': 'marcus thuram', 'away_team_p5': 'federico dimarco', 'away_team_p6': 'benjamin pavard', 'away_team_p7': 'hakan calhanoglu', 'away_team_p8': 'davide frattesi', 'away_team_p9': 'denzel dumfries', 'away_team_p10': 'carlos augusto', 'away_team_p11': 'kristjan asllani', 'home_team_p1_rating': 66, 'away_team_p1_rating': 79, 'home_team_p2_rating': 90, 'away_team_p2_rating'

## **1.2 Data Cleaning Stage**

We will clean our data using this strategy:
 1. Handling missing values by imputing the mean rating for missing player ratings.








In [4]:
# Load the matches dataset from JSON and flatten it into a DataFrame.
with open('data/football_matches_dataset_v1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

df = pd.json_normalize(data)
# Preview the first rows.
df.head()

Unnamed: 0,home_team,away_team,home_team_score,away_team_score,match_type,home_team_p1,home_team_p2,home_team_p3,home_team_p4,home_team_p5,...,home_team_p7_rating,away_team_p7_rating,home_team_p8_rating,away_team_p8_rating,home_team_p9_rating,away_team_p9_rating,home_team_p10_rating,away_team_p10_rating,home_team_p11_rating,away_team_p11_rating
0,atletico de madrid,inter milan,5.0,5.0,club,nahuel molina,jan oblak,rodrigo de paul,marcos llorente,arthur vermeeren,...,70.0,80.0,77.0,66.0,70.0,76.0,62.0,66.0,77.0,
1,borussia dortmund,psv eindhoven,2.0,2.0,club,julian brandt,gregor kobel,donyell malen,nico schlotterbeck,ian maatsen,...,,,79.0,54.0,62.0,,,63.0,84.0,
2,fc barcelona,ssc napoli,3.0,3.0,club,gavi,pedri,lamine yamal,frenkie de jong,ronald araujo,...,75.0,52.0,80.0,69.0,61.0,73.0,,65.0,73.0,70.0
3,arsenal fc,fc porto,4.0,4.0,club,bukayo saka,declan rice,martin ødegaard,william saliba,gabriel martinelli,...,78.0,87.0,81.0,73.0,61.0,,74.0,79.0,,61.0
4,real madrid,rb leipzig,1.0,1.0,club,jude bellingham,vinicius junior,federico valverde,rodrygo,aurelien tchouameni,...,79.0,76.0,69.0,70.0,89.0,76.0,63.0,60.0,82.0,66.0


In [5]:
# Inspect dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1178 entries, 0 to 1177
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   home_team             1177 non-null   object 
 1   away_team             1177 non-null   object 
 2   home_team_score       1177 non-null   float64
 3   away_team_score       1177 non-null   float64
 4   match_type            1177 non-null   object 
 5   home_team_p1          1141 non-null   object 
 6   home_team_p2          1141 non-null   object 
 7   home_team_p3          1141 non-null   object 
 8   home_team_p4          1141 non-null   object 
 9   home_team_p5          1141 non-null   object 
 10  home_team_p6          1141 non-null   object 
 11  home_team_p7          1141 non-null   object 
 12  home_team_p8          1141 non-null   object 
 13  home_team_p9          1141 non-null   object 
 14  home_team_p10         1141 non-null   object 
 15  home_team_p11        

In [8]:
def impute_missing_ratings(df):
    """Fill missing player ratings with per-team group means.

    Every column whose name contains ``'rating'`` is processed in two passes:
    missing values are first replaced by the column mean within the row's
    ``home_team`` group, then any that remain by the column mean within the
    row's ``away_team`` group. Values still missing after both passes are left
    as NaN.

    Only missing cells are written. Existing ratings are never overwritten,
    including on rows whose ``home_team`` / ``away_team`` is itself missing
    (such rows belong to no group and therefore receive no group mean).

    NOTE(review): each rating column is grouped by both sides' team columns,
    so a home player's rating can be filled from the away-team grouping -
    confirm this cross-side fallback is intended.

    Args:
        df: Matches DataFrame; it is modified in place.

    Returns:
        The same DataFrame, with rating columns imputed.
    """
    rating_columns = [col for col in df.columns if 'rating' in col]
    for col in rating_columns:
        for team_col in ('home_team', 'away_team'):
            # Recomputed per pass so the second pass sees first-pass fills.
            group_means = df.groupby(team_col)[col].transform('mean')
            df[col] = df[col].fillna(group_means)

    return df

# Scores are loaded as floats; replace missing scores with 0 and cast to int.
# NOTE(review): a missing score becomes indistinguishable from a real 0 -
# confirm this is intended rather than dropping such rows.
for score_col in ('home_team_score', 'away_team_score'):
    df[score_col] = df[score_col].fillna(0).astype(int)

df = impute_missing_ratings(df)

## **1.3 Feature Engineering Stage**

1. Build team-level features by aggregating player ratings.




In [70]:
def aggregate_team_ratings(df):
    """Add the per-match average player rating for each side.

    For the home and the away side, all columns whose name contains both
    ``'<side>_team_p'`` and ``'rating'`` are averaged row-wise and stored in
    ``home_team_avg_rating`` / ``away_team_avg_rating``. The frame is modified
    in place and also returned.
    """
    def _rating_columns(prefix):
        # Same substring semantics as DataFrame.filter(like=...).
        return [c for c in df.columns if prefix in str(c) and 'rating' in str(c)]

    # Resolve both column sets before adding the new average columns.
    home_cols = _rating_columns('home_team_p')
    away_cols = _rating_columns('away_team_p')

    df['home_team_avg_rating'] = df[home_cols].mean(axis=1)
    df['away_team_avg_rating'] = df[away_cols].mean(axis=1)

    return df

# Add the team-average rating columns and save the feature table as CSV
# (without the index column).
df = aggregate_team_ratings(df)
df.to_csv('data/football_matches_dataset_v2.csv', index=False)