## Building datasets to feed into a GNN

### Load dataframe

In [1]:
import pandas as pd

# Location of the raw CSV. NOTE(review): "@@@" looks like a placeholder or
# redacted file name — confirm the real file name before running.
DATA_PATH = "data/raw/@@@.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first rows; as the last expression of the cell, this is what
# the notebook displays.
data.head()

Unnamed: 0,name,ID,stat_date,height,weight,preferred_foot,age,ball_control,dribbling,marking,...,long_shots,curve,fk_acc,penalties,volleys,gk_positioning,gk_diving,gk_handling,gk_kicking,gk_reflexes
0,Kevin De Bruyne,192985,"May 19, 2022",181,70,Right,30,90,88,68,...,91,85,83,83,83,10,15,13,5,13
1,Kevin De Bruyne,192985,"May 16, 2022",181,70,Right,30,90,88,68,...,91,85,83,83,83,10,15,13,5,13
2,Kevin De Bruyne,192985,"May 13, 2022",181,70,Right,30,90,88,68,...,91,85,83,83,83,10,15,13,5,13
3,Kevin De Bruyne,192985,"May 12, 2022",181,70,Right,30,90,88,68,...,91,85,83,83,83,10,15,13,5,13
4,Kevin De Bruyne,192985,"May 9, 2022",181,70,Right,30,90,88,68,...,91,85,83,83,83,10,15,13,5,13


### General Information About the Data

In [2]:
# Table dimensions as (number of rows, number of columns).
data.shape

(184509, 41)

In [4]:
# Disabled by the author (left commented out): data['result'].value_counts()
# Count rows per preferred foot. The preview above shows the same player on
# several rows (one per stat_date), so these are row counts, not player counts.
data['preferred_foot'].value_counts()

Right    137293
Left      47216
Name: preferred_foot, dtype: int64

### Check the PyTorch build and install dependencies

In [7]:
import torch

# Print the installed torch version and the CUDA version it was built with.
# The install cell below pins its wheel index URL to this version/build.
print(torch.__version__)
# Prints None for the CPU-only build shown in the output below.
print(torch.version.cuda)

1.11.0+cpu
None


In [14]:
%%capture
# Install PyTorch Geometric and its companion packages from the wheel index
# for torch 1.11.0 (CPU); the URL must match the torch build printed above.
%pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cpu.html

### Generate a Dataset

In [15]:
import itertools
import os

import pandas as pd
import torch
from torch_geometric.data import Dataset, Data
from tqdm import tqdm

class ProcessedDataset(Dataset):
    """On-disk PyG dataset that stores one graph per match.

    Every row of the raw CSV (``filename`` inside ``raw_dir``) is one match.
    A match is converted into a graph whose nodes are the players on the
    pitch and whose edges follow the two formations:

    * node ``0`` is the home goalkeeper,
    * then the home lines from defence to attack,
    * then the visiting lines from attack to defence,
    * the last node is the visiting goalkeeper.

    Each graph is saved to ``processed_dir`` as ``data_{i}.pt`` (or
    ``data_test_{i}.pt`` when ``test=True``).
    """

    # Edge-type codes stored in ``edge_attr`` (one value per edge).
    EDGE_SIDE_BY_SIDE = 0
    EDGE_FRONT_TO_BACK = 1
    EDGE_GOALKEEPER_TO_DEFENDER = 2
    EDGE_BETWEEN_OPPONENTS = -1

    # Columns of a match row that this class reads as match metadata; they
    # are never treated as player names.
    MATCH_METADATA_COLUMNS = (
        'date', 'home_formation', 'visiting_formation', 'result',
    )

    # Player-stat columns, in the order they appear in the node feature vector.
    NODE_FEATURE_COLUMNS = (
        # 1~4: general
        'height', 'weight', 'preferred_foot', 'age',
        # 5~6: ball skills
        'ball_control', 'dribbling',
        # 7~9: defense
        'marking', 'slide_tackle', 'stand_tackle',
        # 10~15: mental
        'aggression', 'reactions', 'att_position', 'interceptions',
        'vision', 'composure',
        # 16~18: passing
        'crossing', 'short_pass', 'long_pass',
        # 19~25: physical
        'acceleration', 'stamina', 'strength', 'balance', 'sprint_speed',
        'agility', 'jumping',
        # 26~33: shooting
        'heading', 'shot_power', 'finishing', 'long_shots', 'curve',
        'fk_acc', 'penalties', 'volleys',
        # 34~38: goalkeeper
        'gk_positioning', 'gk_diving', 'gk_handling', 'gk_kicking',
        'gk_reflexes',
    )

    def __init__(self, root, filename, test=False, transform=None,
                 pre_transform=None, player_columns=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        filename = Name of the match CSV inside raw_dir.
        test = If True, processed files are named ``data_test_{i}.pt``.
        player_columns = Optional ordered list of the match columns that hold
        player names (in node order). When omitted, every column that is not
        match metadata is treated as a player column.
        """
        # These attributes must exist before the base constructor runs,
        # because it may call process() straight away.
        self.test = test
        self.filename = filename
        self.player_columns = player_columns
        self.data = None
        self._player_stats = None
        self._player_stat_dates = None
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)
        """
        return self.filename

    @property
    def processed_file_names(self):
        """ If these files are found in processed_dir, processing is skipped"""
        matches = self._load_matches()
        return [self._processed_name(i) for i in matches.index]

    def download(self):
        """No-op: the raw CSV is expected to be placed in raw_dir by hand."""
        pass

    def process(self):
        """Convert every match row into a ``Data`` graph and save it to disk.

        Raises:
            ValueError: if the number of players found for a match differs
                from the number of nodes implied by the two formations.
        """
        matches = self._load_matches()
        for index, match in tqdm(matches.iterrows(), total=matches.shape[0]):
            home = match['home_formation']
            visiting = match['visiting_formation']

            # Get node features
            node_feats = self._get_node_features(match)
            # Get edge features(adjacent, front-back, opponent etc.)
            edge_feats = self._get_edge_features(home, visiting)
            # Get adjacency info(whether or not there's an edge)
            edge_index = self._get_adjacency_info(home, visiting)
            # Get labels info
            label = self._get_labels(match["result"])

            # Every node id used by an edge needs a row in the feature
            # matrix, otherwise the saved graph is unusable downstream.
            if edge_index.numel():
                expected_nodes = int(edge_index.max()) + 1
                if node_feats.size(0) != expected_nodes:
                    raise ValueError(
                        f'match {index}: found {node_feats.size(0)} players '
                        f'but formations {home!r} vs {visiting!r} imply '
                        f'{expected_nodes} nodes'
                    )

            # Create data object
            data = Data(x=node_feats,
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        match=match
                        )
            torch.save(data,
                       os.path.join(self.processed_dir,
                                    self._processed_name(index)))

    def _processed_name(self, idx):
        """Return the processed file name for the match at position ``idx``."""
        if self.test:
            return f'data_test_{idx}.pt'
        return f'data_{idx}.pt'

    def _load_matches(self):
        """Read the raw match CSV once and keep it in ``self.data``."""
        if self.data is None:
            self.data = pd.read_csv(self.raw_paths[0])
        return self.data

    def _load_player_stats(self):
        """Read ``player_stats.csv`` once and cache it with parsed dates.

        Returns a ``(stats, stat_dates)`` pair where ``stat_dates`` is the
        ``date`` column parsed to datetimes and aligned with ``stats``.
        The file is opened relative to the current working directory and must
        provide ``player_name``, ``date`` and the NODE_FEATURE_COLUMNS.
        """
        if self._player_stats is None:
            stats = pd.read_csv('player_stats.csv', encoding='utf-8-sig')
            # Parse once so dates are compared chronologically, not as text.
            self._player_stat_dates = pd.to_datetime(stats['date'])
            self._player_stats = stats
        return self._player_stats, self._player_stat_dates

    def _get_most_recent_player_stat(self, name, date):
        """
        Returns the most recent player stat for a given player

        The result is a single row (``pandas.Series``): the latest snapshot
        dated on or before ``date``. When the player has no snapshot that
        early, the closest one (the earliest available) is returned instead.

        Raises:
            LookupError: if the player does not appear in the stats file.
        """
        stats, stat_dates = self._load_player_stats()

        # Get player stats for a given player
        is_player = stats['player_name'] == name
        if not is_player.any():
            raise LookupError(f'no stats found for player {name!r}')
        player_dates = stat_dates[is_player]

        # Get the most recent date
        cutoff = pd.to_datetime(date)
        on_or_before = player_dates[player_dates <= cutoff]
        if on_or_before.empty:
            # Get the closest date: nothing precedes the match, so the
            # earliest later snapshot is the nearest one.
            chosen = player_dates.idxmin()
        else:
            chosen = on_or_before.idxmax()

        # Get the most recent player stat
        return stats.loc[chosen]

    def _get_player_names(self, match):
        """Return the player names of a match row, in node order.

        NOTE(review): without ``player_columns`` this assumes that every
        column other than MATCH_METADATA_COLUMNS (and unnamed index columns)
        holds a player name, ordered like the graph nodes — confirm against
        the match CSV or pass ``player_columns`` explicitly.
        """
        if self.player_columns is not None:
            return [match[column] for column in self.player_columns]
        return [
            value for column, value in match.items()
            if column not in self.MATCH_METADATA_COLUMNS
            and not str(column).startswith('Unnamed')
        ]

    def _get_node_features(self, match):
        """
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]

        # features:
        general: height, weight, preferred_foot, age
        ball skills: ball_control, dribbling
        defense: marking, slide_tackle, stand_tackle
        mental: aggression, reactions, att_position, interceptions, vision, composure
        passing: crossing, short_pass, long_pass
        physical: acceleration, stamina, strength, balance, sprint_speed, agility, jumping
        shooting: heading, shot_power, finishing, long_shots, curve, fk_acc, penalties, volleys
        goalkeeper: gk_positioning, gk_diving, gk_handling, gk_kicking, gk_reflexes

        ``preferred_foot`` is encoded as 1 for 'Right' and 0 otherwise.
        """
        all_node_feats = []

        for player in self._get_player_names(match):
            # Get player stats
            player_stats = self._get_most_recent_player_stat(player, match['date'])

            node_feats = []
            for column in self.NODE_FEATURE_COLUMNS:
                value = player_stats[column]
                if column == 'preferred_foot':
                    node_feats.append(1.0 if value == 'Right' else 0.0)
                else:
                    node_feats.append(float(value))

            # Append node features to matrix
            all_node_feats.append(node_feats)

        return torch.tensor(all_node_feats, dtype=torch.float)

    @staticmethod
    def _parse_formation(formation):
        """Turn a formation string such as '4-3-3' into ``[4, 3, 3]``."""
        return [int(line) for line in str(formation).split('-')]

    def _iter_edges(self, home_formation, visiting_formation):
        """Return the undirected edges of a match as ``(src, dst, kind)``.

        Single source of truth for both the edge index and the edge
        features, so their order always matches.

        i.e. for 4-2-3-1 vs 4-3-3 the node levels are
        [[0], [1, 2, 3, 4], [5, 6], [7, 8, 9], [10], [11, 12, 13],
         [14, 15, 16], [17, 18, 19, 20], [21]]
        """
        home_lines = self._parse_formation(home_formation)
        visiting_lines = self._parse_formation(visiting_formation)
        # Reverse whole lines (not characters) so the visiting side is
        # listed from attack to defence.
        level_sizes = [1] + home_lines + visiting_lines[::-1] + [1]

        # Get the level lists( = the index of nodes for each level)
        level_lists = []
        start = 0
        for size in level_sizes:
            level_lists.append(list(range(start, start + size)))
            start += size

        last_home_level = len(home_lines)
        last_outfield_level = len(level_lists) - 2

        edges = []
        for i, level in enumerate(level_lists[:-1]):
            # side-by-side
            for node in level[:-1]:
                edges.append((node, node + 1, self.EDGE_SIDE_BY_SIDE))

            # Edges to the next level; the kind depends on what it links.
            if i == 0 or i == last_outfield_level:
                kind = self.EDGE_GOALKEEPER_TO_DEFENDER
            elif i == last_home_level:
                kind = self.EDGE_BETWEEN_OPPONENTS
            else:
                kind = self.EDGE_FRONT_TO_BACK
            for src, dst in itertools.product(level, level_lists[i + 1]):
                edges.append((src, dst, kind))
        return edges

    def _get_edge_features(self, home_formation, visiting_formation):
        """
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]

        The edge features should be based on the relative position of each player
        0: side-by-side
        1: front-to-back
        2: goalkeeper-to-defender
        -1: between opponents

        Each undirected edge contributes two rows (one per direction).
        """
        all_edge_feats = []
        for _, _, kind in self._iter_edges(home_formation, visiting_formation):
            all_edge_feats += [[kind], [kind]]
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, home_formation, visiting_formation):
        """
        The return should be in COO format.
        Make sure that the order of the indices
        matches the order of the edge features!

        Returns a long tensor of shape [2, Number of edges]; every undirected
        edge is stored in both directions.
        """
        edge_indices = []
        for src, dst, _ in self._iter_edges(home_formation, visiting_formation):
            edge_indices += [[src, dst], [dst, src]]

        edge_indices = torch.tensor(edge_indices, dtype=torch.long)
        return edge_indices.t().contiguous().view(2, -1)

    def _get_labels(self, label):
        """Wrap the match result in a 1-element int64 tensor."""
        return torch.tensor([label], dtype=torch.int64)

    def len(self):
        """Number of matches (= number of graphs) in the dataset."""
        return self._load_matches().shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        return torch.load(os.path.join(self.processed_dir,
                                       self._processed_name(idx)))

