<a href="https://colab.research.google.com/github/DManiscalco/MMA-Matchups/blob/main/MMA_Matchups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# The dataset below was taken down, so this section is collapsed and a different dataset is used instead

## Download the dataset and bring in the dataframe

In [None]:
# Download the dataset from kaggle
# NOTE: the saved output of this cell is a 403 (permission 'datasets.get' denied),
# so this download and the cells that read its files cannot currently be re-run
path = kagglehub.dataset_download('danmcinerney/mma-differentials-and-elo')

# Use $ to keep python variable in the terminal command
!ls $path  # make sure there are files in the path as we expect
!cp -r $path/* /content/  # move to /content folder

KaggleApiHTTPError: 403 Client Error.

You don't have permission to access resource at URL: https://www.kaggle.com/datasets/danmcinerney/mma-differentials-and-elo. The server reported the following issues: Permission 'datasets.get' was denied
Please make sure you are authenticated if you are trying to access a private resource or a resource requiring consent.

In [None]:
# Get the active SparkSession, creating one if none exists yet
spark = (
  SparkSession.builder
  .getOrCreate()
)

In [None]:
# Load the master CSV into Spark: the first line supplies the column names
# and the column types are inferred from the data
master_csv_path = '/content/masterdataframe.csv'
mma_df = spark.read.csv(master_csv_path, header=True, inferSchema=True)

In [None]:
# Show the first few rows of the DataFrame
# mma_df.show(10)

## Start preprocessing the data

In [None]:
## Drop any column that is a url:
# The first data row is used as a sample: any column whose value there starts
# with 'http:' or 'https:' is treated as a url column and dropped
first_row = mma_df.first()  # None when the dataframe has no rows
url_prefixes = ('http:', 'https:')
columns_with_http = (
  [col for col in mma_df.columns if str(first_row[col]).startswith(url_prefixes)]
  if first_row is not None
  else []  # nothing to sample from, so nothing is classified as a url column
)
mma_df_reduced = mma_df.drop(*columns_with_http)  # Drop these columns

# Dropping 'time' col - it is the time that we run df and unrelated to the data
mma_df_reduced = mma_df_reduced.drop('time')

# 'age' col refers to age in 2021 so we calculate true age at time of fight
mma_df_reduced = mma_df_reduced.withColumn(
  'age',
  F.floor(F.months_between('date', 'dob') / 12)  # returns NULL if a col is NULL
)

In [None]:
# mma_df_reduced.show(5)

In [None]:
## Split the dataframe into a training and testing set based on specific fighters
## and the dates of their fights
# Proportions for training and testing sets
train_ratio = 0.7  # 70% for training
test_ratio = 0.3   # 30% for testing (informational: the test set is whatever is left after the training cutoff)

# This cell adds helper columns to mma_df_reduced. Drop any that are left over from a
# previous run first (a no-op when they are absent) so that re-running the cell does
# not join a second copy of the cutoff columns onto the dataframe
split_helper_cols = ['train_cutoff', 'test_cutoff', 'row_num']
mma_df_reduced = mma_df_reduced.drop(*split_helper_cols)

# Calculate the total row count per fighter then cutoff counts based on proportions
row_counts = mma_df_reduced.groupBy('fighter').count().withColumnRenamed('count', 'total_rows')
cutoffs = row_counts.withColumn('train_cutoff', (F.col('total_rows') * train_ratio).cast('int'))
cutoffs = cutoffs.withColumn('test_cutoff', F.col('total_rows') - F.col('train_cutoff'))

# Join the cutoffs back to the original dataframe for reference
mma_df_reduced = mma_df_reduced.join(cutoffs.select('fighter', 'train_cutoff', 'test_cutoff'), on='fighter', how='left')

# Order rows by date within each fighter's group and assign row numbers
# NOTE(review): rows sharing the same fighter and date have no tie-breaker here -
# confirm whether that can occur in this data
window = Window.partitionBy('fighter').orderBy('date')
mma_df_reduced = mma_df_reduced.withColumn('row_num', F.row_number().over(window))

# Create train and test dfs based on row numbers and cutoffs:
# each fighter's earliest `train_cutoff` fights go to training, the rest to testing
train_df = mma_df_reduced.filter(F.col('row_num') <= F.col('train_cutoff')).drop('row_num', 'train_cutoff', 'test_cutoff')
test_df = mma_df_reduced.filter(F.col('row_num') > F.col('train_cutoff')).drop('row_num', 'train_cutoff', 'test_cutoff')

In [None]:
# train_df.show(5)

In [None]:
# Classify every column name as striking, grappling or neither using the
# previously fine-tuned BERT model, loaded from where it was saved
bert_dir = './fine_tuned_bert'
tokenizer = BertTokenizer.from_pretrained(bert_dir)
model = BertForSequenceClassification.from_pretrained(bert_dir)

# One padded batch holding all of the column names (no per-name loop)
inputs = tokenizer(train_df.columns, padding=True, truncation=True, return_tensors='pt')

# Inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
  outputs = model(**inputs)

# Highest-scoring class index for each column name
predicted_classes = outputs.logits.argmax(dim=1).tolist()

## Short detour to create a metric that shows whether a fighter is more of a striker, a grappler, or balanced

In [None]:
# Map each column name to its predicted class:
# 0 for striking, 1 for grappling, 2 for neither
fight_type_dict = dict(zip(train_df.columns, predicted_classes))

# Column names split out by predicted class (class 2 goes in neither list)
striking_names = [name for name, label in fight_type_dict.items() if label == 0]
grappling_names = [name for name, label in fight_type_dict.items() if label == 1]

In [None]:
## The below is too slow when using PySpark so try Pandas
# # Make a copy that we can do calculations on for convenience
# train_df_copy = train_df.select('*')

# # Normalize each column if it is labeled 0 or 1; skip if 2
# for col in train_df_copy.columns:

#   if fight_type_dict[col] != 2:
#     min_col = train_df_copy.agg(F.min(col)).collect()[0][0]
#     max_col = train_df_copy.agg(F.max(col)).collect()[0][0]
#     train_df_copy = train_df_copy.withColumn(col, (F.col(col) - min_col) / (max_col - min_col))

#   else:
#     pass

# train_df_copy.show(5)

In [None]:
# Convert the spark dataframe to a pandas df for easier use
# (the column-by-column normalization was too slow in PySpark, so it is done on this pandas copy)
train_df_pd = train_df.toPandas()

In [None]:
# Build a working copy so train_df_pd itself stays untouched.
# Columns labeled 2 (neither) are not relevant and are dropped in one go,
# except the fighter name, which is kept so rows can be grouped by fighter
irrelevant_cols = [
  col for col in train_df_pd.columns
  if fight_type_dict[col] == 2 and col != 'fighter'
]
train_df_pd_copy = train_df_pd.drop(columns=irrelevant_cols)

# Min-max normalize each column that is labeled 0 or 1; skip if 2
for col in train_df_pd.columns:
  if fight_type_dict[col] == 2:
    continue

  col_min = train_df_pd_copy[col].min()
  col_max = train_df_pd_copy[col].max()
  col_range = col_max - col_min
  shifted = train_df_pd_copy[col] - col_min

  # A constant column has zero range: leave it at 0 rather than dividing 0 by 0,
  # which would turn the whole column into NaN (missing values stay NaN either way)
  train_df_pd_copy[col] = shifted / col_range if col_range != 0 else shifted.astype(float)

train_df_pd_copy.head(5)

Unnamed: 0,fighter,knockdowns,sub_attempts,reversals,control,takedowns_landed,takedowns_attempts,sig_strikes_landed,sig_strikes_attempts,total_strikes_landed,...,recent_avg_clinch_strikes_attempts_per_min,precomp_recent_avg_clinch_strikes_attempts_per_min,avg_ground_strikes_landed_per_min,precomp_avg_ground_strikes_landed_per_min,recent_avg_ground_strikes_landed_per_min,precomp_recent_avg_ground_strikes_landed_per_min,avg_ground_strikes_attempts_per_min,precomp_avg_ground_strikes_attempts_per_min,recent_avg_ground_strikes_attempts_per_min,precomp_recent_avg_ground_strikes_attempts_per_min
0,Aalon Cruz,0.0,0.0,0.0,0.003726,0.0,0.030303,0.008403,0.024242,0.00554,...,,,0.0,,,,0.0,,,
1,Aaron Phillips,0.0,0.1,0.2,0.050671,0.0,0.0,0.07563,0.080808,0.301939,...,,,0.008333,,,,0.01037,,,
2,Aaron Phillips,0.0,0.0,0.0,0.0,0.0,0.030303,0.159664,0.109091,0.637119,...,,,0.009028,0.008333,,,0.012315,0.01037,,
3,Aaron Rosa,0.0,0.0,0.0,0.005961,0.0,0.030303,0.306723,0.361616,0.373961,...,,,0.0,,,,0.0,,,
4,Aaron Rosa,0.0,0.0,0.0,0.038748,0.0,0.0,0.247899,0.2,0.709141,...,,,0.0,0.0,,,0.0,0.0,,


In [None]:
# Average each fighter's rows into a single row per fighter
fights_by_fighter = train_df_pd_copy.groupby('fighter')
train_df_fighter = fights_by_fighter.mean()
train_df_fighter.head(5)

Unnamed: 0_level_0,knockdowns,sub_attempts,reversals,control,takedowns_landed,takedowns_attempts,sig_strikes_landed,sig_strikes_attempts,total_strikes_landed,total_strikes_attempts,...,recent_avg_clinch_strikes_attempts_per_min,precomp_recent_avg_clinch_strikes_attempts_per_min,avg_ground_strikes_landed_per_min,precomp_avg_ground_strikes_landed_per_min,recent_avg_ground_strikes_landed_per_min,precomp_recent_avg_ground_strikes_landed_per_min,avg_ground_strikes_attempts_per_min,precomp_avg_ground_strikes_attempts_per_min,recent_avg_ground_strikes_attempts_per_min,precomp_recent_avg_ground_strikes_attempts_per_min
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aalon Cruz,0.0,0.0,0.0,0.003726,0.0,0.030303,0.008403,0.024242,0.00554,0.022901,...,,,0.0,,,,0.0,,,
Aaron Phillips,0.0,0.05,0.1,0.025335,0.0,0.015152,0.117647,0.094949,0.469529,0.396947,...,,,0.008681,0.008333,,,0.011343,0.01037,,
Aaron Riley,0.0,0.016667,0.0,0.10072,0.039683,0.075758,0.161765,0.194949,0.193906,0.249682,...,0.098108,0.101986,0.003615,0.003366,0.017178,0.018323,0.004433,0.004109,0.016012,0.017268
Aaron Rosa,0.0,0.0,0.0,0.022355,0.0,0.015152,0.277311,0.280808,0.541551,0.542939,...,,,0.0,0.0,,,0.0,0.0,,
Aaron Simpson,0.057143,0.042857,0.0,0.251011,0.170068,0.311688,0.159064,0.150649,0.203799,0.220556,...,0.125313,0.119597,0.066633,0.072011,0.070328,0.079321,0.113326,0.12282,0.083525,0.096636


In [None]:
# Per-fighter strike and grapple scores: the row sum over each group of columns,
# divided by the number of columns in that group to normalize
n_striking = len(striking_names)
n_grappling = len(grappling_names)
strike_score = train_df_fighter[striking_names].sum(axis=1) / n_striking
grapple_score = train_df_fighter[grappling_names].sum(axis=1) / n_grappling

# Fight type score: close to 0 for striking, close to 1 for grappling
fight_type = grapple_score / (strike_score + grapple_score)

# Append the three score columns, then copy so the resulting frame is defragmented
train_df_fighter = train_df_fighter.assign(**{
  'strike score': strike_score,
  'grapple score': grapple_score,
  'fight type': fight_type,
}).copy()

In [None]:
# Sanity check of the score against two known styles:
# Charles Oliveira (grappler) and Max Holloway (striker)
oliveira_score = train_df_fighter.loc['Charles Oliveira', 'fight type']
holloway_score = train_df_fighter.loc['Max Holloway', 'fight type']

print(f"Charles Oliveira score is: {oliveira_score}")
print(f"Max Holloway score is: {holloway_score}")

Charles Oliveira score is: 0.46198129584102104
Max Holloway score is: 0.3129558063353952


# Using new dataset for the UFC matchups

In [2]:
# Download the dataset from kaggle
path = kagglehub.dataset_download('rajeevw/ufcdata')

# Use $ to keep python variable in the terminal command
!ls $path  # make sure there are files in the path as we expect
!cp -r $path/* /content/  # move to /content folder

Downloading from https://www.kaggle.com/api/v1/datasets/download/rajeevw/ufcdata?dataset_version_number=2...


100%|██████████| 3.70M/3.70M [00:00<00:00, 99.2MB/s]

Extracting files...





data.csv  preprocessed_data.csv  raw_fighter_details.csv  raw_total_fight_data.csv


In [3]:
# Load the main UFC fight table into a pandas df
ufc_data_path = '/content/data.csv'
data_csv = pd.read_csv(ufc_data_path)

In [81]:
# Drop cols that aren't relevant
smaller_data_csv = data_csv.drop(['Referee', 'date', 'location'], axis=1)


def _fighter_perspective(fights, own_prefix, opp_prefix, own_corner):
  """Re-express every fight from the point of view of one corner.

  Args:
    fights: DataFrame with one row per fight, `R_`/`B_` prefixed columns
      (including `R_fighter` and `B_fighter`) and a 'Winner' column.
    own_prefix: column prefix of the corner being described ('R_' or 'B_').
    opp_prefix: column prefix of the other corner.
    own_corner: the 'Winner' value that means this corner won ('Red' or 'Blue').

  Returns:
    A new DataFrame indexed by this corner's fighter name. The other fighter's
    name is in 'Opp', this corner's columns are prefixed 'Ftr_', the other
    corner's 'Opp_', 'Winner' is removed and 'Fight Result' is the last column.
  """
  def _relabel(col):
    # Swap the corner prefix for a perspective prefix; other columns are untouched
    if col.startswith(own_prefix):
      return 'Ftr_' + col[len(own_prefix):]
    if col.startswith(opp_prefix):
      return 'Opp_' + col[len(opp_prefix):]
    return col

  perspective = (
    fights
    .set_index(own_prefix + 'fighter')
    .rename(columns={opp_prefix + 'fighter': 'Opp'})
  )

  # Win when this corner is the winner, Draw for a draw, Loss for anything else
  # (values missing from the mapping come back as NaN and are filled with 'Loss')
  fight_result = perspective['Winner'].map({own_corner: 'Win', 'Draw': 'Draw'}).fillna('Loss')

  return (
    perspective
    .drop(columns=['Winner'])
    .rename(columns=_relabel)
    # Plain array: the index holds repeated fighter names, so skip label alignment
    .assign(**{'Fight Result': fight_result.to_numpy()})
  )


## Change the format of the dataframe so that indices are fighter names and result (dependent var) will be win/loss
red_fighter_data = _fighter_perspective(smaller_data_csv, 'R_', 'B_', 'Red')
blue_fighter_data = _fighter_perspective(smaller_data_csv, 'B_', 'R_', 'Blue')

# Concat the two dataframes: every fight now appears twice, once per fighter
fighter_data = pd.concat([red_fighter_data, blue_fighter_data], axis=0)

In [82]:
# Show the combined frame: one row per fighter per fight, indexed by fighter name
fighter_data

Unnamed: 0,Opp,title_bout,weight_class,Opp_avg_KD,Opp_avg_opp_KD,Opp_avg_SIG_STR_pct,Opp_avg_opp_SIG_STR_pct,Opp_avg_TD_pct,Opp_avg_opp_TD_pct,Opp_avg_SUB_ATT,...,Ftr_win_by_KO/TKO,Ftr_win_by_Submission,Ftr_win_by_TKO_Doctor_Stoppage,Ftr_Stance,Ftr_Height_cms,Ftr_Reach_cms,Ftr_Weight_lbs,Opp_age,Ftr_age,Fight Result
Adrian Yanez,Gustavo Lopez,False,Bantamweight,0.000,0.0,0.420000,0.49500,0.330,0.36000,0.500,...,1,0,0,Orthodox,170.18,177.80,135.0,31.0,27.0,Win
Trevin Giles,Roman Dolidze,False,Middleweight,0.500,0.0,0.660000,0.30500,0.300,0.50000,1.500,...,3,0,0,Orthodox,182.88,187.96,185.0,32.0,28.0,Win
Tai Tuivasa,Harry Hunsucker,False,Heavyweight,,,,,,,,...,3,0,0,Southpaw,187.96,190.50,264.0,32.0,28.0,Win
Cheyanne Buys,Montserrat Conejo,False,WomenStrawweight,,,,,,,,...,0,0,0,Switch,160.02,160.02,115.0,28.0,25.0,Loss
Marion Reneau,Macy Chiasson,False,WomenBantamweight,0.125,0.0,0.535625,0.57875,0.185,0.16625,0.125,...,2,2,0,Orthodox,167.64,172.72,135.0,29.0,43.0,Loss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Alberta Cerra Leon,Remco Pardoel,False,OpenWeight,,,,,,,,...,0,0,0,,172.72,,238.0,,,Loss
Robert Lucarelli,Orlando Wiet,False,OpenWeight,,,,,,,,...,0,0,0,,187.96,,245.0,,,Loss
David Levicki,Johnny Rhodes,False,OpenWeight,,,,,,,,...,0,0,0,,195.58,,275.0,,,Loss
Ray Wizard,Patrick Smith,False,OpenWeight,,,,,,,,...,0,0,0,,,,,30.0,,Loss


## Looking to solve what I'm calling the "Circular Sport Problem": to determine how good a competitor is, we must take into account the skill of their opponents. However, to determine the skill of those opponents, we must in turn consider the skill of their opponents, and so on.