<a href="https://colab.research.google.com/github/DManiscalco/MMA-Matchups/blob/main/Graph_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Can use a Graph Neural Network if we have data of stats of specific fights (nodes are fighters and edges are fights)

In [1]:
%%capture
!pip install torch_geometric
!pip install kagglehub --upgrade

In [2]:
import kagglehub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import Dataset  #, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Libraries for the graph model
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import train_test_split_edges
from torch.nn import functional as F
import torch.optim as optim
import torch.nn as nn

In [3]:
# Download the dataset from kaggle
path = kagglehub.dataset_download('calmdownkarm/ufcdataset')

# Use $ to keep python variable in the terminal command
!ls $path  # make sure there are files in the path as we expect
!cp -r $path/* /content/  # move to /content folder

data.csv


In [4]:
# Read CSV file to a pandas df
data_csv = pd.read_csv('/content/data.csv')

In [5]:
# Fight information for the edges
fight_info_cols = ['Event_ID', 'Fight_ID', 'Last_round', 'Max_round', 'winby', 'winner', 'B_ID', 'R_ID', 'B_Age', 'R_Age']  # don't include 'Date' for now
for col in data_csv.columns:
  # Using the below to keep the order of the last round and max round columns
  if 'round' in col.lower() and col not in ['Last_round', 'Max_round']:
    fight_info_cols.append(col)

fight_info_df = data_csv[fight_info_cols]
fight_info_df.loc[:, 'winner'] = fight_info_df.apply(lambda x: x['B_ID'] if x['winner'] == 'blue' else (x['R_ID'] if x['winner'] == 'red' else x['winner']), axis=1)
fight_info_df = fight_info_df[(fight_info_df['winner'] != 'draw') & (fight_info_df['winner'] != 'no contest')]  # get rid of rows with no contest or draw

# Fighter information for the nodes
fighter_info_cols = ['B_Height',	'B_HomeTown',	'B_ID', 'B_Location', 'B_Name', 'R_Height', 'R_HomeTown', 'R_ID', 'R_Location', 'R_Name']
fighter_info_init = data_csv[fighter_info_cols]

# Make df of fighter names and information for red and blue
fighter_info_red = fighter_info_init[['R_Height', 'R_HomeTown', 'R_ID', 'R_Location', 'R_Name']]
fighter_info_blue = fighter_info_init[['B_Height',	'B_HomeTown',	'B_ID',	'B_Location',	'B_Name']]

# Rename the cols to be the same for both dfs
fighter_col_names = ['Height', 'HomeTown', 'ID', 'Location', 'Name']
fighter_info_red.columns = fighter_col_names
fighter_info_blue.columns = fighter_col_names

# Concat the cols and drop duplicates
fighter_info_concat = pd.concat([fighter_info_blue, fighter_info_red])

In [6]:
# Fill in NaN values - 'Unknown' for string values and average fighter height for unknown height
na_fill_values_fighter = {'Height': fighter_info_concat['Height'].mean(), 'HomeTown': 'Parts Unknown', 'Location': 'Parts Unknown'}
fighter_info_concat = fighter_info_concat.fillna(value=na_fill_values_fighter).infer_objects(copy=False)

# Fill in the NaN values for the fight non-round cols
na_fill_values_fight = {'winby': 'UNK', 'B_Age': round(fight_info_df['B_Age'].mean(), 1), 'R_Age': round(fight_info_df['R_Age'].mean(), 1)}
fight_info_df = fight_info_df.fillna(value=na_fill_values_fight).infer_objects(copy=False)

# Fill in the NaN values for the fight 'round' cols
fight_info_df = fight_info_df.fillna(0).infer_objects(copy=False)

In [7]:
# For fighters, check if there are any duplicates between names and IDs
dupe_df = fighter_info_concat.drop_duplicates(subset=['ID'])
# dupe_df[dupe_df.duplicated(['Name'], keep=False)]  # uncomment this to show duplicates

# Duplicate is Dong Hyun Kim with ID of 455 and 2709 - change one of them
fighter_info_concat.loc[fighter_info_concat['ID'] == 2709, 'ID'] = 455

In [8]:
# Drop duplicates of fighter IDs
fighter_info_df = fighter_info_concat.drop_duplicates(subset=['ID'])

In [9]:
# Change anything with the ID of 2709 to be 455 since these are the same fighter
fight_info_df.loc[fight_info_df['R_ID'] == 2709, 'R_ID'] = 455
fight_info_df.loc[fight_info_df['B_ID'] == 2709, 'B_ID'] = 455
fight_info_df.loc[fight_info_df['winner'] == 2709, 'winner'] = 455

### Start setting up the model

In [10]:
# Use label encoding because we have some text columns - fighter cols first
fighter_info_df = fighter_info_df.copy()  # use to avoid warnings

for col in fighter_info_df.select_dtypes(exclude=['number']).columns:
  fighter_info_df[col] = LabelEncoder().fit_transform(fighter_info_df[col])
  fighter_info_df[col] = fighter_info_df[col].astype(int)  # convert to integer

# Label encoding for the fights
fight_info_df = fight_info_df.copy()  # use to avoid warnings
fight_info_df['winner'] = fight_info_df['winner'].astype(int)  # convert to integer so it doesn't get labeled

for col in fight_info_df.select_dtypes(exclude=['number']).columns:
  fight_info_df[col] = fight_info_df[col].astype(str)  # change to str - we can't have ints and str for the encoder
  fight_info_df[col] = LabelEncoder().fit_transform(fight_info_df[col])
  fight_info_df[col] = fight_info_df[col].astype(int)  # convert to integer

In [11]:
# Connect each fighter ID to a node and then specify which node is the fight winner
fighter_id_to_node_idx = {id: idx for idx, id in enumerate(fighter_info_df['ID'])}
fight_info_df['winner_node'] = fight_info_df['winner'].map(fighter_id_to_node_idx)

In [12]:
# Scale node features and edge features
scaler = StandardScaler()

# Node features into tensors - drop Fighter ID from node df because it isn't a feature
node_features_scaled = scaler.fit_transform(fighter_info_df.drop('ID', axis=1).values)
node_features = torch.tensor(node_features_scaled, dtype=torch.float)

# Edge indices into tensors
edge_index = torch.tensor([[fighter_id_to_node_idx[blue], fighter_id_to_node_idx[red]] for blue, red in zip(fight_info_df['B_ID'], fight_info_df['R_ID'])], dtype=torch.long).t().contiguous()

# Edge features to tensors
fight_edge_cols = fight_info_df.columns[np.r_[2:6, 8:len(fight_info_df.columns)-1]]  # take out event ID, fight ID, blue ID, and red ID cols (not features)
edge_features_scaled = scaler.fit_transform(fight_info_df[fight_edge_cols].values)
edge_features = torch.tensor(edge_features_scaled, dtype=torch.float)

# Edge labels for naming which node won
edge_labels = torch.tensor(fight_info_df['winner_node'].values, dtype=torch.float)

In [13]:
# Creating a geometric object from PyTorch
data = Data(
  x=node_features,
  edge_index=edge_index,
  edge_attr=edge_features,
  y=edge_labels)

In [14]:
# Defining the GNN (Graph Neural Network) model
class GNNModel(nn.Module):
  def __init__(self, input_dim, edge_dim, hidden_dim, num_nodes):
    super(GNNModel, self).__init__()
    self.conv1 = GCNConv(input_dim, hidden_dim)
    self.conv2 = GCNConv(hidden_dim, hidden_dim)
    self.fc1 = nn.Linear(hidden_dim + edge_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, num_nodes)

  def forward(self, data):
    x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
    x = self.conv1(x, edge_index)
    x = torch.relu(x)
    x = self.conv2(x, edge_index)
    x = torch.relu(x)

    # Concatenate node embeddings with edge features
    edge_embeddings = torch.cat([
      x[data.edge_index[0]],  # node embeddings for source nodes
      edge_attr], dim=1)

    edge_embeddings = self.fc1(edge_embeddings)
    edge_embeddings = torch.relu(edge_embeddings)
    out = self.fc2(edge_embeddings)
    return out

In [15]:
# Define the split transformation for train/test data
split = RandomLinkSplit(
  num_val=0.1,  # validation ratio
  num_test=0.1,  # test ratio
  is_undirected=True,  # graph is undirected
  add_negative_train_samples=False)  # don't add negative samples to the training set

# Apply the transformation
train_data, val_data, test_data = split(data)

# Change train_data.y to be an integer because it gives an error if it is a float
train_data.y = train_data.y.long()

In [16]:
# Setting optimizer, loss, and training the model thorugh epochs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNNModel(input_dim=node_features.shape[1], edge_dim=edge_features.shape[1],
                 hidden_dim=16, num_nodes=len(fighter_info_df)).to(device)
data = data.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for epoch in range(200):
  model.train()
  optimizer.zero_grad()
  out = model(train_data)
  loss = criterion(out, train_data.y)
  loss.backward()
  optimizer.step()

  if (epoch + 1) == 1 or (epoch + 1) % 10 == 0:
    print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}')

Epoch 1, Loss: 7.0279
Epoch 10, Loss: 5.1316
Epoch 20, Loss: 3.5316
Epoch 30, Loss: 2.4470
Epoch 40, Loss: 1.7132
Epoch 50, Loss: 1.1635
Epoch 60, Loss: 0.7405
Epoch 70, Loss: 0.4569
Epoch 80, Loss: 0.2772
Epoch 90, Loss: 0.1710
Epoch 100, Loss: 0.1090
Epoch 110, Loss: 0.0727
Epoch 120, Loss: 0.0508
Epoch 130, Loss: 0.0369
Epoch 140, Loss: 0.0279
Epoch 150, Loss: 0.0220
Epoch 160, Loss: 0.0178
Epoch 170, Loss: 0.0148
Epoch 180, Loss: 0.0126
Epoch 190, Loss: 0.0108
Epoch 200, Loss: 0.0094


In [20]:
# Evaluate the model
model.eval()
with torch.no_grad():
  val_out = model(val_data)
  val_preds = val_out.argmax(dim=1)
  val_correct = (val_preds == val_data.y).sum().item()
  val_acc = val_correct / val_data.y.size(0)
  print(f'Validation Accuracy: {val_acc:.4f}')

  test_out = model(test_data)
  test_preds = test_out.argmax(dim=1)
  test_correct = (test_preds == test_data.y).sum().item()
  test_acc = test_correct / test_data.y.size(0)
  print(f'Test Accuracy: {test_acc:.4f}')

Validation Accuracy: 1.0000
Test Accuracy: 0.8816


In [None]:
# Looks like a problem with the validation accuracy - to go back and check