<a href="https://colab.research.google.com/github/DManiscalco/MMA-Matchups/blob/main/Pairwise_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pairwise Model (Siamese Model to compare fighter past stats at each fight)

In [None]:
import kagglehub
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# For pairwise network model
import torch.nn as nn
import torch.optim as optim

# The below dataset got taken down so had to use another one and collapse this section

## Download the dataset and bring in the dataframe

In [None]:
# Download the dataset from kaggle
path = kagglehub.dataset_download('danmcinerney/mma-differentials-and-elo')

# Use $ to keep python variable in the terminal command
!ls $path  # make sure there are files in the path as we expect
!cp -r $path/* /content/  # move to /content folder

In [None]:
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the CSV file
mma_df = spark.read.csv('/content/masterdataframe.csv', header=True, inferSchema=True)

In [None]:
# Show the first few rows of the DataFrame
# mma_df.show(10)

## Start preprocessing the data

In [None]:
## Drop any column that is a url:
# Get the list of column names where the first entry starts with 'http:' and drop them
first_row = mma_df.first()  # Get first row of data from dataframe
columns_with_http = [col for col in mma_df.columns if str(first_row[col]).startswith("http:")]
mma_df_reduced = mma_df.drop(*columns_with_http)  # Drop these columns

# Dropping 'time' col - it is the time that we run df and unrelated to the data
mma_df_reduced = mma_df_reduced.drop('time')

# 'age' col refers to age in 2021 so we calculate true age at time of fight
mma_df_reduced = mma_df_reduced.withColumn(
  'age',
  F.floor(F.months_between('date', 'dob') / 12)  # returns NULL if a col is NULL
)

In [None]:
# mma_df_reduced.show(5)

In [None]:
## Split the dataframe into a training and testing set based on specific fighters
## and the dates of their fights
# Proportions for training and testing sets
train_ratio = 0.7  # 70% for training
test_ratio = 0.3   # 30% for testing

# Calculate the total row count per fighter then cutoff counts based on proportions
row_counts = mma_df_reduced.groupBy('fighter').count().withColumnRenamed('count', 'total_rows')
cutoffs = row_counts.withColumn('train_cutoff', (F.col('total_rows') * train_ratio).cast('int'))
cutoffs = cutoffs.withColumn('test_cutoff', F.col('total_rows') - F.col('train_cutoff'))

# Join the cutoffs back to the original dataframe for reference
mma_df_reduced = mma_df_reduced.join(cutoffs.select('fighter', 'train_cutoff', 'test_cutoff'), on='fighter', how='left')

# Order rows by date within each person's group and assign row numbers
window = Window.partitionBy('fighter').orderBy('date')
mma_df_reduced = mma_df_reduced.withColumn('row_num', F.row_number().over(window))

# Create train and test dfs based on row numbers and cutoffs
train_df = mma_df_reduced.filter(F.col('row_num') <= F.col('train_cutoff')).drop('row_num', 'train_cutoff', 'test_cutoff')
test_df = mma_df_reduced.filter(F.col('row_num') > F.col('train_cutoff')).drop('row_num', 'train_cutoff', 'test_cutoff')

In [None]:
# train_df.show(5)

In [None]:
# Use the previously trained BERT model to classify our columns as striking or grappling (or neither)
# Load fine-tuned model and tokenizer from where we saved it
model = BertForSequenceClassification.from_pretrained('./fine_tuned_bert')
tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert')

# Tokenize the entire batch of column names at once instead of looping one at a time
inputs = tokenizer(train_df.columns, padding=True, truncation=True, return_tensors='pt')

# Make predictions for the entire batch
with torch.no_grad():
  outputs = model(**inputs)
  predicted_classes = torch.argmax(outputs.logits, dim=1).tolist()

## Short detour to create a metric that shows whether a fighter is more of a striker, a grappler, or balanced

In [None]:
# Dictionary that contains column names and grappler/striker category
fight_type_dict = {}

# 0 for striking, 1 for grappling, 2 for neither
for column, pred_class in zip(train_df.columns, predicted_classes):
  fight_type_dict[column] = pred_class

# Make list of striking and grappling column names
striking_names = []
grappling_names = []

for col_name in fight_type_dict:
  if fight_type_dict[col_name] == 0:
    striking_names.append(col_name)
  elif fight_type_dict[col_name] == 1:
    grappling_names.append(col_name)
  else:
    pass

In [None]:
## The below is too slow when using PySpark so try Pandas
# # Make a copy that we can do calculations on for convenience
# train_df_copy = train_df.select('*')

# # Normalize each column if it is labeled 0 or 1; skip if 2
# for col in train_df_copy.columns:

#   if fight_type_dict[col] != 2:
#     min_col = train_df_copy.agg(F.min(col)).collect()[0][0]
#     max_col = train_df_copy.agg(F.max(col)).collect()[0][0]
#     train_df_copy = train_df_copy.withColumn(col, (F.col(col) - min_col) / (max_col - min_col))

#   else:
#     pass

# train_df_copy.show(5)

In [None]:
# Convert the spark dataframe to a pandas df for easier use
train_df_pd = train_df.toPandas()

In [None]:
train_df_pd.head(10)

# Make a copy that we can do calculations on for convenience
train_df_pd_copy = train_df_pd.copy()

# Normalize each column if it is labeled 0 or 1; skip if 2
for col in train_df_pd_copy.columns:

  if fight_type_dict[col] != 2:
    col_max = train_df_pd_copy[col].max()
    col_min = train_df_pd_copy[col].min()
    train_df_pd_copy[col] = (train_df_pd_copy[col] - col_min) / (col_max - col_min)

  # Keep fighter name column but drop anything else that isn't relevant
  else:
    if col == 'fighter':
      pass
    else:
      train_df_pd_copy = train_df_pd_copy.drop(col, axis=1)

train_df_pd_copy.head(5)

Unnamed: 0,fighter,knockdowns,sub_attempts,reversals,control,takedowns_landed,takedowns_attempts,sig_strikes_landed,sig_strikes_attempts,total_strikes_landed,...,recent_avg_clinch_strikes_attempts_per_min,precomp_recent_avg_clinch_strikes_attempts_per_min,avg_ground_strikes_landed_per_min,precomp_avg_ground_strikes_landed_per_min,recent_avg_ground_strikes_landed_per_min,precomp_recent_avg_ground_strikes_landed_per_min,avg_ground_strikes_attempts_per_min,precomp_avg_ground_strikes_attempts_per_min,recent_avg_ground_strikes_attempts_per_min,precomp_recent_avg_ground_strikes_attempts_per_min
0,Aalon Cruz,0.0,0.0,0.0,0.003726,0.0,0.030303,0.008403,0.024242,0.00554,...,,,0.0,,,,0.0,,,
1,Aaron Phillips,0.0,0.1,0.2,0.050671,0.0,0.0,0.07563,0.080808,0.301939,...,,,0.008333,,,,0.01037,,,
2,Aaron Phillips,0.0,0.0,0.0,0.0,0.0,0.030303,0.159664,0.109091,0.637119,...,,,0.009028,0.008333,,,0.012315,0.01037,,
3,Aaron Rosa,0.0,0.0,0.0,0.005961,0.0,0.030303,0.306723,0.361616,0.373961,...,,,0.0,,,,0.0,,,
4,Aaron Rosa,0.0,0.0,0.0,0.038748,0.0,0.0,0.247899,0.2,0.709141,...,,,0.0,0.0,,,0.0,0.0,,


In [None]:
# Group by fighter rows
train_df_fighter = train_df_pd_copy.groupby('fighter').mean()
train_df_fighter.head(5)

Unnamed: 0_level_0,knockdowns,sub_attempts,reversals,control,takedowns_landed,takedowns_attempts,sig_strikes_landed,sig_strikes_attempts,total_strikes_landed,total_strikes_attempts,...,recent_avg_clinch_strikes_attempts_per_min,precomp_recent_avg_clinch_strikes_attempts_per_min,avg_ground_strikes_landed_per_min,precomp_avg_ground_strikes_landed_per_min,recent_avg_ground_strikes_landed_per_min,precomp_recent_avg_ground_strikes_landed_per_min,avg_ground_strikes_attempts_per_min,precomp_avg_ground_strikes_attempts_per_min,recent_avg_ground_strikes_attempts_per_min,precomp_recent_avg_ground_strikes_attempts_per_min
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aalon Cruz,0.0,0.0,0.0,0.003726,0.0,0.030303,0.008403,0.024242,0.00554,0.022901,...,,,0.0,,,,0.0,,,
Aaron Phillips,0.0,0.05,0.1,0.025335,0.0,0.015152,0.117647,0.094949,0.469529,0.396947,...,,,0.008681,0.008333,,,0.011343,0.01037,,
Aaron Riley,0.0,0.016667,0.0,0.10072,0.039683,0.075758,0.161765,0.194949,0.193906,0.249682,...,0.098108,0.101986,0.003615,0.003366,0.017178,0.018323,0.004433,0.004109,0.016012,0.017268
Aaron Rosa,0.0,0.0,0.0,0.022355,0.0,0.015152,0.277311,0.280808,0.541551,0.542939,...,,,0.0,0.0,,,0.0,0.0,,
Aaron Simpson,0.057143,0.042857,0.0,0.251011,0.170068,0.311688,0.159064,0.150649,0.203799,0.220556,...,0.125313,0.119597,0.066633,0.072011,0.070328,0.079321,0.113326,0.12282,0.083525,0.096636


In [None]:
# Strike score and grapple score by fighter (make sure to divide by num cols to normalize)
train_df_fighter['strike score'] = train_df_fighter[striking_names].sum(axis=1) / len(striking_names)
train_df_fighter['grapple score'] = train_df_fighter[grappling_names].sum(axis=1) / len(grappling_names)

# Get the fight type score (close to 0 for striking, close to 1 for grappling)
train_df_fighter['fight type'] = train_df_fighter['grapple score'] / (train_df_fighter['strike score'] + train_df_fighter['grapple score'])

# Defragment so we don't get an error
train_df_fighter = train_df_fighter.copy()

In [None]:
# Test our score using Charles Oliveira (grappler) and Max Holloway (striker)
print(f"Charles Oliveira score is: {train_df_fighter.loc['Charles Oliveira']['fight type']}")
print(f"Max Holloway score is: {train_df_fighter.loc['Max Holloway']['fight type']}")

Charles Oliveira score is: 0.46198129584102104
Max Holloway score is: 0.3129558063353952


# Using new dataset for the UFC matchups

In [None]:
# Download the dataset from kaggle
path = kagglehub.dataset_download('rajeevw/ufcdata')

# Use $ to keep python variable in the terminal command
!ls $path  # make sure there are files in the path as we expect
!cp -r $path/* /content/  # move to /content folder

data.csv  preprocessed_data.csv  raw_fighter_details.csv  raw_total_fight_data.csv


In [None]:
# Read CSV file to a pandas df
data_csv = pd.read_csv('/content/data.csv')

In [None]:
# Drop cols that aren't relevant (need to keep date col so we can have fights be in time order)
smaller_data_csv = data_csv.drop(['Referee', 'location'], axis=1)

## Change the format of the dataframe so that indices are fighter names and result (dependent var) will be win/loss
# Red fighter processing
red_fighter_data = smaller_data_csv.set_index('R_fighter')
red_fighter_data = red_fighter_data.rename(columns={'B_fighter': 'Opp_'})
red_fighter_data['Ftr_Fight Result'] = red_fighter_data.apply(lambda x: 'Win' if x['Winner'] == 'Red' else ('Draw' if x['Winner'] == 'Draw' else 'Loss'), axis=1)
red_fighter_data.columns = ['Ftr_' + col[2:] if col[:2] == 'R_' else ('Opp_' + col[2:] if col[:2] == 'B_' else col) for col in red_fighter_data.columns]
red_fighter_data = red_fighter_data.drop(['Winner'], axis=1)

# Blue fighter processing
blue_fighter_data = smaller_data_csv.set_index('B_fighter')
blue_fighter_data = blue_fighter_data.rename(columns={'R_fighter': 'Opp_'})
blue_fighter_data['Ftr_Fight Result'] = blue_fighter_data.apply(lambda x: 'Win' if x['Winner'] == 'Blue' else ('Draw' if x['Winner'] == 'Draw' else 'Loss'), axis=1)
blue_fighter_data.columns = ['Ftr_' + col[2:] if col[:2] == 'B_' else ('Opp_' + col[2:] if col[:2] == 'R_' else col) for col in blue_fighter_data.columns]
blue_fighter_data = blue_fighter_data.drop(['Winner'], axis=1)

blue_fighter_data['Opp_Fight Result'] = red_fighter_data['Ftr_Fight Result'].values
red_fighter_data['Opp_Fight Result'] = blue_fighter_data['Ftr_Fight Result'].values

# Concat the two dataframes
fighter_data = pd.concat([red_fighter_data, blue_fighter_data], axis=0)

# Some more preprocessing
fighter_data = fighter_data.rename(columns={'title_bout':'Opp_title_bout', 'weight_class':'Opp_weight_class', 'date':'Opp_date'})
fighter_data['Ftr_title_bout'] = fighter_data['Opp_title_bout']
fighter_data['Ftr_weight_class'] = fighter_data['Opp_weight_class']
fighter_data['Ftr_date'] = fighter_data['Opp_date']

# Dates to datetime and dropping any fights without a date
fighter_data.dropna(subset=['Ftr_date', 'Opp_date'], axis=0)  # drop all rows for fights that don't have a date
fighter_data['Ftr_date'] = pd.to_datetime(fighter_data['Ftr_date'])
fighter_data['Opp_date'] = pd.to_datetime(fighter_data['Opp_date'])

# Bring the index into the df as a col so we have fighter names
fighter_data.index.name = 'Ftr_'
fighter_data = fighter_data.reset_index(drop=False)

fighter_data = fighter_data.dropna()
fighter_data.head()

Unnamed: 0,Ftr_,Opp_,Opp_date,Opp_title_bout,Opp_weight_class,Opp_avg_KD,Opp_avg_opp_KD,Opp_avg_SIG_STR_pct,Opp_avg_opp_SIG_STR_pct,Opp_avg_TD_pct,...,Ftr_Height_cms,Ftr_Reach_cms,Ftr_Weight_lbs,Opp_age,Ftr_age,Ftr_Fight Result,Opp_Fight Result,Ftr_title_bout,Ftr_weight_class,Ftr_date
0,Adrian Yanez,Gustavo Lopez,2021-03-20,False,Bantamweight,0.0,0.0,0.42,0.495,0.33,...,170.18,177.8,135.0,31.0,27.0,Win,Loss,False,Bantamweight,2021-03-20
1,Trevin Giles,Roman Dolidze,2021-03-20,False,Middleweight,0.5,0.0,0.66,0.305,0.3,...,182.88,187.96,185.0,32.0,28.0,Win,Loss,False,Middleweight,2021-03-20
4,Marion Reneau,Macy Chiasson,2021-03-20,False,WomenBantamweight,0.125,0.0,0.535625,0.57875,0.185,...,167.64,172.72,135.0,29.0,43.0,Loss,Win,False,WomenBantamweight,2021-03-20
5,Leonardo Santos,Grant Dawson,2021-03-20,False,Lightweight,0.0,0.0,0.515,0.47375,0.435,...,182.88,190.5,155.0,27.0,41.0,Loss,Win,False,Lightweight,2021-03-20
6,Song Kenan,Max Griffin,2021-03-20,False,Welterweight,0.046875,0.125,0.459277,0.404687,0.322188,...,182.88,180.34,170.0,35.0,31.0,Loss,Win,False,Welterweight,2021-03-20


### Pairwise model for predicting fight based on fighter stats up to the day of the fight

In [None]:
class SiameseFightModel(nn.Module):
  def __init__(self, input_dim, hidden_dim):
    super(SiameseFightModel, self).__init__()
    # Sub-network for processing fighter stats
    self.shared_network = nn.Sequential(
      nn.Linear(input_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, hidden_dim),
      nn.ReLU()
    )
    # Output layers for combined features
    self.out_layer = nn.Sequential(
      nn.Linear(hidden_dim * 2, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, 3),  # Three outputs
      nn.Softmax(dim=1)  # Normalize outputs
    )

  def forward(self, fighter_stats, opponent_stats):
    # Pass both fighter and opponent stats through the shared network
    fighter_embedding = self.shared_network(fighter_stats)
    opponent_embedding = self.shared_network(opponent_stats)
    # Concatenate embeddings
    combined = torch.cat((fighter_embedding, opponent_embedding), dim=1)
    # Predict outcome
    return self.out_layer(combined)

class FightDataset(Dataset):
  def __init__(self, fighter_stats, opponent_stats, results):
    self.fighter_stats = fighter_stats
    self.opponent_stats = opponent_stats
    self.results = results

  def __len__(self):
    return len(self.results)

  def __getitem__(self, idx):
    return self.fighter_stats[idx], self.opponent_stats[idx], self.results[idx]

In [None]:
# Convert datetime series values for the neural network
fighter_data['Ftr_date'] = fighter_data['Ftr_date'].astype('int64') // 10**9
fighter_data['Opp_date'] = fighter_data['Opp_date'].astype('int64') // 10**9

# Use label encoding because we have some text columns
for col in fighter_data.select_dtypes(exclude=['number']).columns:
  fighter_data[col] = LabelEncoder().fit_transform(fighter_data[col])

# Separate data into fighter and opponent stats
fight_results = fighter_data['Ftr_Fight Result']
fighter_features = fighter_data[[col for col in fighter_data.columns if col[:3] == 'Ftr']].drop('Ftr_Fight Result', axis=1)
opponent_features = fighter_data[[col for col in fighter_data.columns if col[:3] == 'Opp']].drop('Opp_Fight Result', axis=1)

# Standardize data sets (first must have column names match)
opponent_features.columns = ['Ftr_' + col[4:] for col in opponent_features.columns]  # match col names
opponent_features = opponent_features[fighter_features.columns]  # get cols in same order

scaler = StandardScaler()
fighter_features = scaler.fit_transform(fighter_features)
opponent_features = scaler.transform(opponent_features)

# Split data into train and test sets
X_train_f, X_test_f, X_train_o, X_test_o, y_train, y_test = train_test_split(
    fighter_features, opponent_features, fight_results, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
X_train_f, X_test_f = torch.tensor(X_train_f, dtype=torch.float32), torch.tensor(X_test_f, dtype=torch.float32)
X_train_o, X_test_o = torch.tensor(X_train_o, dtype=torch.float32), torch.tensor(X_test_o, dtype=torch.float32)
y_train, y_test = torch.tensor(y_train.values, dtype=torch.long), torch.tensor(y_test.values, dtype=torch.long)

In [None]:
# Create DataLoader
train_dataset = FightDataset(X_train_f, X_train_o, y_train)
test_dataset = FightDataset(X_test_f, X_test_o, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Create instance of model
input_dim = X_train_f.shape[1]
hidden_dim = 256
model = SiameseFightModel(input_dim, hidden_dim)

In [None]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

# Training loop
epochs = 150
for epoch in range(epochs):
  model.train()
  total_loss = 0

  for fighter_stats, opponent_stats, labels in train_loader:
    optimizer.zero_grad()
    outputs = model(fighter_stats, opponent_stats)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  if (epoch + 1) % 10 == 0:
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}')

Epoch 10/150, Loss: 73.0932
Epoch 20/150, Loss: 69.1161
Epoch 30/150, Loss: 68.4110
Epoch 40/150, Loss: 66.7697
Epoch 50/150, Loss: 66.1089
Epoch 60/150, Loss: 65.4254
Epoch 70/150, Loss: 65.3254
Epoch 80/150, Loss: 65.2983
Epoch 90/150, Loss: 64.8239
Epoch 100/150, Loss: 64.3507
Epoch 110/150, Loss: 64.8030
Epoch 120/150, Loss: 64.0668
Epoch 130/150, Loss: 63.0399
Epoch 140/150, Loss: 63.7213
Epoch 150/150, Loss: 63.0088


In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for fighter_stats, opponent_stats, labels in test_loader:
        outputs = model(fighter_stats, opponent_stats)
        predictions = torch.argmax(outputs, dim=1)  # Get predicted class
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.80
