In [1]:
import os
import pandas as pd
import json
import re

### Functions to Process Raw Data

In [2]:
def load_json(file_path):
    """Read a JSON document from ``file_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the file does not exist or
        cannot be decoded as JSON (a message is printed in either case).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Bytes that are not valid UTF-8 are reported the same way as
        # malformed JSON instead of escaping as an unrelated exception.
        print(f"Invalid JSON format in file: {file_path}")
        return None

In [3]:
def parse_events_data(json_data, team_name_dict):
    """Turn raw event records into a DataFrame with snake_case columns.

    Parameters
    ----------
    json_data : list of dict or None
        Event records as loaded from the events JSON file.
    team_name_dict : dict
        Lookup from team id to team name.

    Returns
    -------
    pandas.DataFrame
        One row per event that has a non-empty ``tags`` entry; an empty
        DataFrame when ``json_data`` is ``None``.
    """
    if json_data is None:
        return pd.DataFrame()

    def _to_row(entry):
        # Build the flat record for a single event.
        team_id = entry.get("teamId")
        tag_ids = [tag.get("id") for tag in entry.get("tags", [])]
        return {
            "match_id": entry.get("matchId"),
            "match_period": entry.get("matchPeriod"),
            "event_sec": entry.get("eventSec"),
            "event_name": entry.get("eventName"),
            "player_id": entry.get("playerId"),
            "team_id": team_id,
            # Name of the team that caused the event; "Unknown" if the id is not in the lookup
            "team_caused_event": team_name_dict.get(team_id, "Unknown"),
            "sub_event_name": entry.get("subEventName"),
            "positions": entry.get("positions", []),
            "tags": tag_ids,
        }

    rows = []
    for entry in json_data:
        # Events with a missing or empty tag list are skipped.
        if not entry.get("tags"):
            continue
        rows.append(_to_row(entry))

    return pd.json_normalize(rows, sep='_')

In [4]:
def adjust_time_and_create_breaks(group, break_duration=900, break_step=3):
    """Insert synthetic half-time rows and put second-half times on one clock.

    Intended for ``DataFrame.groupby('match_id').apply(...)``: the match id
    written on the synthetic rows is read from ``group.name``.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of a single match with at least ``match_period`` and
        ``event_sec`` columns.
    break_duration : int, default 900
        Length of the half-time break in seconds.
    break_step : int, default 3
        Spacing in seconds between consecutive synthetic break rows.

    Returns
    -------
    pandas.DataFrame
        The original rows plus the break rows, sorted by ``event_sec``.
        Columns the break rows do not define are left as NaN for them.

    Notes
    -----
    If the group has no ``'1H'`` rows the end-of-half time is NaN, and that
    NaN propagates to the break rows and the shifted second-half times.
    """
    halftime_end = group[group['match_period'] == '1H']['event_sec'].max()
    break_events = [
        {
            'match_id': group.name,
            'match_period': 'Break',
            'event_sec': halftime_end + sec,
            'event_name': 'Break Time',
            'sub_event_name': 'Break Time',
            'positions': [{'y': 0, 'x': 0}, {'y': 0, 'x': 0}],
            'tags': ['Break Time']
        }
        for sec in range(break_step, break_duration + break_step, break_step)
    ]

    break_df = pd.DataFrame(break_events)
    group = pd.concat([group, break_df], ignore_index=True)

    # The second half restarts one step after the last break row, so it is
    # shifted by the end of the first half plus break_duration + break_step.
    second_half_offset = halftime_end + break_duration + break_step
    group.loc[group['match_period'] == '2H', 'event_sec'] += second_half_offset

    # A stable sort keeps events that share a timestamp in their original order.
    return group.sort_values(by='event_sec', kind='mergesort')

In [5]:
def _team_id_for_side(match, side):
    """Return the ``teamId`` of the ``teamsData`` entry playing on ``side``, or 0 if none."""
    teams_data = match.get("teamsData") or {}
    for team in teams_data.values():
        if team.get("side") == side:
            return team.get("teamId", 0)
    return 0


def parse_matches_data(json_data, team_name_dict):
    """Build one row per match with its date, winner and home/away teams.

    Parameters
    ----------
    json_data : list of dict or None
        Match records as loaded from the matches JSON file.
    team_name_dict : dict
        Lookup from team id to team name.

    Returns
    -------
    pandas.DataFrame
        Columns ``match_id``, ``date``, ``winner``, ``match_name``,
        ``home_team``, ``away_team``, ``home_team_id`` and ``away_team_id``.
        A side that cannot be resolved (including a match without
        ``teamsData``) gets id ``0`` and the name ``"Unknown"`` unless the
        lookup has an entry for ``0``. An empty DataFrame is returned when
        ``json_data`` is ``None``.
    """
    if json_data is None:
        return pd.DataFrame()

    parsed_data = []
    for match in json_data:
        # Resolve each side once and reuse the id for both the id and name columns.
        home_team_id = _team_id_for_side(match, 'home')
        away_team_id = _team_id_for_side(match, 'away')
        parsed_data.append({
            "match_id": match.get("wyId"),
            "date": match.get("dateutc"),
            "winner": match.get("winner"),
            "match_name": match.get("label"),
            "home_team": team_name_dict.get(home_team_id, "Unknown"),
            "away_team": team_name_dict.get(away_team_id, "Unknown"),
            "home_team_id": home_team_id,
            "away_team_id": away_team_id,
        })

    return pd.DataFrame(parsed_data)


In [6]:
# Assign which team caused event (home/away team)
def adjust_team_caused_event(row):
    """Label a row by which side's id equals its ``team_id``.

    Returns ``'home_team'`` when ``team_id`` equals ``home_team_id``,
    ``'away_team'`` when it equals ``away_team_id`` and ``'Unknown'``
    otherwise. The home comparison is made first.
    """
    side_columns = (('home_team_id', 'home_team'), ('away_team_id', 'away_team'))
    for id_column, side_label in side_columns:
        if row['team_id'] == row[id_column]:
            return side_label
    return 'Unknown'
    
# Map team names from the team id     
def map_team_names(json_data_team):
    """Build a ``{team id: team name}`` lookup from the team records.

    A record without ``wyId`` is stored under key ``0`` and a record without
    ``name`` is stored with the value ``"Draw"``. Later records overwrite
    earlier ones that share an id.
    """
    team_name_dict = {}
    for item in json_data_team:
        team_id = item.get("wyId", 0)
        team_name_dict[team_id] = item.get("name", "Draw")
    return team_name_dict

# Making match names consistent to merge with betting data
def reformat_match_info(s):
    """Rewrite ``"<team> - <team>, <n> - <n>"`` as ``"<team> v <team>"``.

    Text that does not match the pattern is returned unchanged; text around a
    match is preserved.
    """
    def _as_fixture(found):
        # Keep the two captured team names and drop the score.
        return f"{found.group(1)} v {found.group(2)}"

    return re.sub(r'(.+?)\s*-\s*(.+?),\s*\d+\s*-\s*\d+', _as_fixture, s)

# Map tag IDs to descriptions
def load_tags_mapping(file_path):
    """Read the tag lookup CSV and return a ``{Tag: Label}`` dictionary.

    The CSV must provide ``Tag`` and ``Label`` columns; when a tag appears
    more than once the last label wins.
    """
    tags_table = pd.read_csv(file_path)
    return dict(zip(tags_table['Tag'], tags_table['Label']))

def map_ids_to_descriptions(ids, tags_decode_dict):
    """Translate each entry of ``ids`` through ``tags_decode_dict``.

    Entries that are not keys of the dictionary become ``None``; the output
    list has the same length and order as ``ids``.
    """
    lookup = tags_decode_dict.get
    return [lookup(tag_id) for tag_id in ids]

#### Applying Data Processing Functions

In [7]:
# File paths (change DATA_DIR to point the notebook at another copy of the dataset)
DATA_DIR = "D:/ARU Modules/Final Project/dataset"
events_file_path = f"{DATA_DIR}/events_England.json" # Contains events info of the match
matches_file_path = f"{DATA_DIR}/matches_England.json" # Info for date and time of match
teams_file_path = f"{DATA_DIR}/teams.json" # list of team ID and team names
tags_file_path = f"{DATA_DIR}/tags2name.csv" #to find tag ID info for events

# Load JSON data
events_json = load_json(events_file_path)
matches_json = load_json(matches_file_path)
teams_json = load_json(teams_file_path)

# Create dict for team IDs and team names
team_name_dict = map_team_names(teams_json)

# Parse events data
df_events = parse_events_data(events_json,team_name_dict)

# Load tags mapping and decode the numeric tag IDs.
# This must happen BEFORE the synthetic break rows are added: those rows already
# carry the label 'Break Time', which is not a key of the tag lookup and would
# otherwise be turned into None.
tags_decode_dict = load_tags_mapping(tags_file_path)
df_events['tags'] = df_events['tags'].apply(lambda ids: map_ids_to_descriptions(ids, tags_decode_dict))

# Insert half-time break rows and put both halves on one continuous clock
df_events = df_events.groupby('match_id').apply(adjust_time_and_create_breaks).reset_index(drop=True)

# Mapping Match & Team names
df_matches = parse_matches_data(matches_json, team_name_dict)

# Apply transformations to matches data
# (winner ids without an entry in team_name_dict become NaN here)
df_matches['winner'] = df_matches['winner'].map(team_name_dict)
df_matches['match_name'] = df_matches['match_name'].apply(reformat_match_info)

# Merge events and matches data
merged_df = pd.merge(df_events, df_matches, how='left', on='match_id')
merged_df['date'] = pd.to_datetime(merged_df['date'])

# Replace the team name in 'team_caused_event' with 'home_team' / 'away_team' / 'Unknown'
merged_df['team_caused_event'] = merged_df.apply(adjust_team_caused_event, axis=1)

  df_events = df_events.groupby('match_id').apply(adjust_time_and_create_breaks).reset_index(drop=True)


### Fix for Handling Draw as Winner

In [17]:
# The teams JSON file has no id for a draw, so mapping winner ids to team names
# leaves NaN in the winner column for drawn matches. As a hot fix, fill those
# NaN values with the string "Draw", then generate the outcome labels
# (home_winner / away_winner / draw_winner) used later by the ML model.
merged_df["winner"] = merged_df["winner"].fillna("Draw")

# Row-wise comparison gives the same flags as inspecting one row per match,
# because winner / home_team / away_team come from the match-level merge and are
# therefore constant within a match_id.
# Precedence is home, then away, then draw; rows matching none keep all flags 0.
home_won = merged_df['winner'] == merged_df['home_team']
away_won = ~home_won & (merged_df['winner'] == merged_df['away_team'])
match_drawn = ~home_won & ~away_won & (merged_df['winner'] == "Draw")

merged_df['home_winner'] = home_won.astype('int64')
merged_df['away_winner'] = away_won.astype('int64')
merged_df['draw_winner'] = match_drawn.astype('int64')


### Adding the publishTime to dataset

In [21]:
# Timestamp of every event: the match 'date' plus 'event_sec' seconds.
# publishTime is the key used to align this events dataset with the betting dataset.
merged_df['publishTime'] = merged_df['date'] + pd.to_timedelta(merged_df['event_sec'], unit='s')

## Feature Engineering

In [25]:
class MatchMetrics:
    """Running home/away statistics for one match, built event by event.

    Feed the events in order through :meth:`update_metrics`, then call
    :meth:`append_metrics_to_df` to attach one value per event to the match
    DataFrame. Attributes prefixed ``ht_`` / ``at_`` are updated by events
    whose ``team_caused_event`` is ``"home_team"`` / ``"away_team"``; events
    with any other value only advance the clock and pad the per-event lists.
    """

    # An event counts as an attacking effort when its sub-event is listed here
    # AND at least one of its tags is an attacking tag.
    _ATTACKING_EVENTS = ('Ground attacking duel', 'Shot', 'Cross', 'Corner', 'Free Kick')
    _ATTACKING_TAGS = ('Goal', 'assist', 'keyPass', 'direct', 'free kick shot')

    # Same rule for defensive efforts.
    _DEFENSIVE_EVENTS = ('Ground defending duel', 'Air duel', 'Interception', 'Clearance')
    _DEFENSIVE_TAGS = ('interception', 'clearance', 'won')

    # Sub-events counted in the "other_fouls" tally.
    _FOUL_SUB_EVENTS = ('Foul', 'Violent Foul', 'Late card foul', 'Out of game foul', 'Penalty')

    def __init__(self):
        # Metrics initialization
        self.reset_metrics()

    def reset_metrics(self):
        """Reset every counter and per-event list to its empty state."""
        # Per-event 0/1 flags (one entry per processed event)
        self.ht_attack_intensity = []
        self.at_attack_intensity = []
        self.ht_defense_intensity = []
        self.at_defense_intensity = []

        # Running totals
        self.ht_total_possession_time = 0
        self.at_total_possession_time = 0
        self.ht_passes = {'accurate': 0, 'total': 0}
        self.at_passes = {'accurate': 0, 'total': 0}
        self.last_event_time = 0

        # Per-event snapshots of possession share and pass accuracy
        self.ht_pos = []
        self.at_pos = []
        self.ht_pass_accuracy = []
        self.at_pass_accuracy = []

        # Disciplinary logs: lists of (event_sec, running count) tuples
        self.ht_red_cards = []
        self.at_red_cards = []
        self.ht_yellow_cards = []
        self.at_yellow_cards = []
        self.ht_other_fouls = []
        self.at_other_fouls = []

        # Goal logs: lists of (event_sec, running count) tuples, plus the counters
        self.ht_goals = []
        self.at_goals = []
        self.ht_goal_count = 0
        self.at_goal_count = 0

    def update_metrics(self, event, team_caused_event, sub_event, tags, event_sec):
        """Fold a single event into all running metrics.

        Parameters
        ----------
        event : str
            Event name; only ``"Pass"`` triggers the pass counters.
        team_caused_event : str
            ``"home_team"`` or ``"away_team"``; any other value updates neither side.
        sub_event : str
            Sub-event name, used for the attack/defense/foul classification.
        tags : iterable
            Decoded tag labels of the event.
        event_sec : number
            Timestamp of the event on the match clock.
        """
        # Calculate time since the last event for dynamic updates
        time_since_last_event = event_sec - self.last_event_time
        self.last_event_time = event_sec

        # Pass counters are updated first so that the pass-accuracy snapshot
        # taken in _calculate_metrics already includes this event.
        if event == "Pass":
            self._update_passes(team_caused_event, tags)
        self._update_attack_intensity(team_caused_event, sub_event, tags)
        self._calculate_metrics(time_since_last_event, team_caused_event)
        self._update_defense_intensity(team_caused_event, sub_event, tags)
        self._update_disciplinary_actions(team_caused_event, sub_event, tags, event_sec)
        self._update_goals(team_caused_event, tags, event_sec)

    def _update_goals(self, team_caused_event, tags, event_sec):
        """Record goals: a 'Goal' + 'accurate' event scores for the acting team,
        an 'own_goal' event scores for the opposing team."""
        if 'Goal' in tags and 'accurate' in tags:
            if team_caused_event == "home_team":
                self.ht_goal_count += 1
                # Append current goal count with the timestamp of the event
                self.ht_goals.append((event_sec, self.ht_goal_count))
            elif team_caused_event == "away_team":
                self.at_goal_count += 1
                self.at_goals.append((event_sec, self.at_goal_count))

        if 'own_goal' in tags:
            # An own goal is credited to the other side.
            if team_caused_event == "home_team":
                self.at_goal_count += 1
                self.at_goals.append((event_sec, self.at_goal_count))
            elif team_caused_event == "away_team":
                self.ht_goal_count += 1
                self.ht_goals.append((event_sec, self.ht_goal_count))

    def _update_disciplinary_actions(self, team_caused_event, sub_event, tags, event_sec):
        """Append (event_sec, running count) to each disciplinary log the event qualifies for."""
        if team_caused_event == "home_team":
            prefix = 'ht'
        elif team_caused_event == "away_team":
            prefix = 'at'
        else:
            return

        # A second yellow card is counted as a red card.
        action_increment = {
            'red_cards': 'red_card' in tags or 'second_yellow_card' in tags,
            'yellow_cards': 'yellow_card' in tags,
            'other_fouls': sub_event in self._FOUL_SUB_EVENTS,
        }

        for action, condition in action_increment.items():
            if condition:
                log = getattr(self, f'{prefix}_{action}')
                log.append((event_sec, len(log) + 1))

    def _update_passes(self, team_caused_event, tags):
        """Count a pass for the acting team, and an accurate pass if tagged 'accurate'."""
        if team_caused_event == "home_team":
            passes = self.ht_passes
        elif team_caused_event == "away_team":
            passes = self.at_passes
        else:
            return

        passes['total'] += 1
        if 'accurate' in tags:
            passes['accurate'] += 1

    def _update_attack_intensity(self, team_caused_event, sub_event, tags):
        """Append a 0/1 attacking flag for each side (both lists grow on every event)."""
        is_attacking_event = (
            sub_event in self._ATTACKING_EVENTS
            and any(tag in self._ATTACKING_TAGS for tag in tags)
        )

        self.ht_attack_intensity.append(
            1 if team_caused_event == "home_team" and is_attacking_event else 0
        )
        self.at_attack_intensity.append(
            1 if team_caused_event == "away_team" and is_attacking_event else 0
        )

    def _update_defense_intensity(self, team_caused_event, sub_event, tags):
        """Append a 0/1 defensive flag for each side (both lists grow on every event)."""
        is_defensive_event = (
            sub_event in self._DEFENSIVE_EVENTS
            and any(tag in self._DEFENSIVE_TAGS for tag in tags)
        )

        self.ht_defense_intensity.append(
            1 if team_caused_event == "home_team" and is_defensive_event else 0
        )
        self.at_defense_intensity.append(
            1 if team_caused_event == "away_team" and is_defensive_event else 0
        )

    def _calculate_metrics(self, time_since_last_event, team_caused_event):
        """Credit the elapsed time to the acting team and snapshot possession
        share and pass accuracy (both in percent) for this event."""
        # The time elapsed since the previous event is attributed to the team
        # that caused the current event.
        if team_caused_event == "home_team":
            self.ht_total_possession_time += time_since_last_event
        elif team_caused_event == "away_team":
            self.at_total_possession_time += time_since_last_event

        # Possession defaults to 50/50 until some time has been credited;
        # pass accuracy defaults to 0 until the side has attempted a pass.
        current_total_time = self.ht_total_possession_time + self.at_total_possession_time
        ht_pos_pct = (self.ht_total_possession_time / current_total_time) * 100 if current_total_time > 0 else 50
        at_pos_pct = (self.at_total_possession_time / current_total_time) * 100 if current_total_time > 0 else 50
        ht_pass_acc = (self.ht_passes['accurate'] / self.ht_passes['total']) * 100 if self.ht_passes['total'] > 0 else 0
        at_pass_acc = (self.at_passes['accurate'] / self.at_passes['total']) * 100 if self.at_passes['total'] > 0 else 0

        # Append the current metrics to their respective lists
        self.ht_pos.append(ht_pos_pct)
        self.at_pos.append(at_pos_pct)
        self.ht_pass_accuracy.append(ht_pass_acc)
        self.at_pass_accuracy.append(at_pass_acc)

    def append_metrics_to_df(self, match_df):
        """Write the collected metrics into ``match_df`` as new columns.

        ``match_df`` is modified in place and also returned. It must have as
        many rows as events were passed to :meth:`update_metrics`, and an
        ``event_sec`` column.
        """
        # Cumulative attack/defense flags as a percentage of all events
        match_df['ht_attack_intensity'] = self._normalize_intensity(self.ht_attack_intensity)
        match_df['at_attack_intensity'] = self._normalize_intensity(self.at_attack_intensity)
        match_df['ht_defense_intensity'] = self._normalize_intensity(self.ht_defense_intensity)
        match_df['at_defense_intensity'] = self._normalize_intensity(self.at_defense_intensity)

        # Append other dynamic metrics
        match_df['ht_pos_pct'] = self.ht_pos
        match_df['at_pos_pct'] = self.at_pos
        match_df['ht_pass_accuracy'] = self.ht_pass_accuracy
        match_df['at_pass_accuracy'] = self.at_pass_accuracy

        event_secs = match_df['event_sec']

        # Goals scored up to (and including) each row's timestamp
        match_df['ht_goals'] = self._count_until(self.ht_goals, event_secs)
        match_df['at_goals'] = self._count_until(self.at_goals, event_secs)

        # Cards and fouls up to (and including) each row's timestamp
        for action in ['red_cards', 'yellow_cards', 'other_fouls']:
            match_df[f'ht_{action}'] = self._count_until(getattr(self, f'ht_{action}'), event_secs)
            match_df[f'at_{action}'] = self._count_until(getattr(self, f'at_{action}'), event_secs)

        return match_df

    @staticmethod
    def _count_until(log, event_secs):
        """For each value in ``event_secs`` count the log entries whose timestamp is <= it."""
        stamps = [stamp for stamp, _ in log]
        return [sum(1 for stamp in stamps if stamp <= event_sec) for event_sec in event_secs]

    def _normalize_intensity(self, intensity_list):
        """Return the running sum of ``intensity_list`` as a percentage of its length.

        NOTE(review): the denominator is the total number of events in the
        list, so every value depends on the final event count of the match.
        """
        from itertools import accumulate

        total_events = len(intensity_list)
        # A single running sum replaces re-summing the prefix for every index.
        return [running / total_events * 100 for running in accumulate(intensity_list)]

def process_match(match_df):
    """Compute the running match metrics for one match and attach them as columns.

    Parameters
    ----------
    match_df : pandas.DataFrame
        Events of a single match, in the order they should be processed, with
        ``event_name``, ``team_caused_event``, ``sub_event_name``, ``tags`` and
        ``event_sec`` columns.

    Returns
    -------
    pandas.DataFrame
        Whatever ``MatchMetrics.append_metrics_to_df`` returns for ``match_df``.
    """
    metrics_calculator = MatchMetrics()

    # Walk the five needed columns in lockstep instead of building a Series
    # per row with iterrows(); the values passed on are the same.
    event_fields = zip(
        match_df['event_name'],
        match_df['team_caused_event'],
        match_df['sub_event_name'],
        match_df['tags'],
        match_df['event_sec'],
    )
    for event_name, team_caused_event, sub_event_name, tags, event_sec in event_fields:
        metrics_calculator.update_metrics(event_name, team_caused_event, sub_event_name, tags, event_sec)

    return metrics_calculator.append_metrics_to_df(match_df)

# Run the metric computation match by match and flatten the result into one frame
processed_df = (
    merged_df
    .groupby('match_id')
    .apply(process_match)
    .reset_index(drop=True)
)

  processed_df = merged_df.groupby('match_id').apply(process_match).reset_index(drop=True)


In [278]:
# Save the processed events; create the output folder first so the export does
# not fail when "processed_df/" does not exist yet.
os.makedirs("processed_df", exist_ok=True)
processed_df.to_csv("processed_df/event_df.csv", index=False)

### Sorting Home/Away Team names

In [279]:
"""
Since we don't have home/away team info in the betting dataset, we will use this dataframe to extract the home team and away team.
We will use this to map the team names in the betting dataset. For this we will validate the matches with the match_id and the publishTime.
"""
# Convert 'publishTime' to datetime if it's not already, and then extract the date part
#processed_df['publishTime'] = pd.to_datetime(processed_df['publishTime']).dt.date

# One row per match: keep the first occurrence of each 'match_id' and only the
# columns needed to map home/away team names.
unique_matches_df = (
    processed_df
    .drop_duplicates(subset=['match_id'], keep='first')
    [['home_team', 'away_team', 'match_id', 'publishTime']]
)

# Create the output folder first so the export does not fail when it is missing.
os.makedirs("processed_df", exist_ok=True)
unique_matches_df.to_csv("processed_df/map_home_away_team.csv", index=False)