# Fine Tuning Third Iteration


In [36]:
!pip install -U transformers



In [37]:
!pip install datasets scikit-learn



In [38]:
# importing libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [39]:
# loading matchup file
file_path = '/content/final_matchups.csv'
df_matchups = pd.read_csv(file_path)

df_matchups.head(5)

Unnamed: 0,fighter_a,fighter_b,input_text,label
0,Hamdy Abdelwahab,Ildemar Alcantara,Fighter A: Hamdy Abdelwahab | Height: 74.0 in ...,0
1,Daichi Abe,Rostem Akman,Fighter A: Daichi Abe | Height: 71.0 in | Reac...,1
2,Leon Aliu,Jailton Almeida,Fighter A: Leon Aliu | Height: 72.0 in | Reach...,1
3,John Adajar,Arnold Allen,Fighter A: John Adajar | Height: 69.0 in | Rea...,1
4,Jose Alday,Jailton Almeida,Fighter A: Jose Alday | Height: 67.0 in | Reac...,1


In [40]:
# final preparation
matchups = df_matchups[['input_text', 'label']]

train_texts, val_texts, train_labels, val_labels = train_test_split(
    matchups['input_text'].tolist(),
    matchups['label'].tolist(),
    test_size = 0.1,
    random_state = 42
)

In [41]:
# load Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# tokenize datasets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [42]:
# convert to HuggingFace dataset
class MatchupDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MatchupDataset(train_encodings, train_labels)
val_dataset = MatchupDataset(val_encodings, val_labels)

In [43]:
# load model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# define training arguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=4,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=50,
#     evaluation_strategy="epoch",
#     logging_dir="./logs",
#     logging_steps=10,
#     save_strategy="epoch",
# )

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# training the model
trainer.train()

# save the model
model.save_pretrained("/content/fine_tuned_ufc_model")
tokenizer.save_pretrained("/content/fine_tuned_ufc_model")


Step,Training Loss
10,0.6883
20,0.69
30,0.7069
40,0.7452
50,0.6995
60,0.6223
70,0.7157
80,0.682
90,0.6725
100,0.6954


('/content/fine_tuned_ufc_model/tokenizer_config.json',
 '/content/fine_tuned_ufc_model/special_tokens_map.json',
 '/content/fine_tuned_ufc_model/vocab.txt',
 '/content/fine_tuned_ufc_model/added_tokens.json')

In [45]:
from sklearn.metrics import accuracy_score, classification_report

In [46]:
# Evaluation: Accuracy and Classification Report

# Get predictions on validation set
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
labels = torch.tensor(predictions.label_ids)

# Calculate accuracy
accuracy = accuracy_score(labels, preds)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8800


In [47]:
# Detailed classification report
print(classification_report(labels, preds, target_names=["Fighter A wins", "Fighter B wins"]))

                precision    recall  f1-score   support

Fighter A wins       0.89      0.81      0.85        42
Fighter B wins       0.87      0.93      0.90        58

      accuracy                           0.88       100
     macro avg       0.88      0.87      0.88       100
  weighted avg       0.88      0.88      0.88       100



In [48]:
# Inference Script: Predict Winner

def predict_winner(fighter_a_stats, fighter_b_stats, model, tokenizer):
    input_text = (
        f"Fighter A: {fighter_a_stats} || Fighter B: {fighter_b_stats}"
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs, dim=1).item()
    return {"Fighter A wins": float(probs[0][0]), "Fighter B wins": float(probs[0][1])}, pred

In [49]:
fighter_a = "Height: 73 in | Reach: 80 in | Str. Acc: 0.57 | Str. Def: 0.58 | SLpM: 4.25 | SApM: 2.12"
fighter_b = "Height: 70 in | Reach: 71 in | Str. Acc: 0.49 | Str. Def: 0.55 | SLpM: 4.00 | SApM: 3.00"

probs, winner = predict_winner(fighter_a, fighter_b, model, tokenizer)
print(probs, "Winner Label (0=A, 1=B):", winner)

{'Fighter A wins': 0.03644789755344391, 'Fighter B wins': 0.9635520577430725} Winner Label (0=A, 1=B): 1


In [51]:
# Zip and Download
!zip -r /content/fine_tuned_ufc_model.zip /content/fine_tuned_ufc_model
from google.colab import files
files.download('/content/fine_tuned_ufc_model.zip')

updating: content/fine_tuned_ufc_model/ (stored 0%)
updating: content/fine_tuned_ufc_model/model.safetensors (deflated 8%)
updating: content/fine_tuned_ufc_model/vocab.txt (deflated 53%)
updating: content/fine_tuned_ufc_model/tokenizer_config.json (deflated 75%)
updating: content/fine_tuned_ufc_model/special_tokens_map.json (deflated 42%)
updating: content/fine_tuned_ufc_model/config.json (deflated 45%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Script to Reload Model Later for Inference

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Reload model and tokenizer
reloaded_model = DistilBertForSequenceClassification.from_pretrained("/content/fine_tuned_ufc_model")
reloaded_tokenizer = DistilBertTokenizer.from_pretrained("/content/fine_tuned_ufc_model")

In [53]:
# Demo: Pick 5 Random Real Matchups and Predict
import random

sampled_matchups = random.sample(val_texts, 5)

print("\n DEMO: 5 Random Matchup Predictions \n")
for idx, matchup in enumerate(sampled_matchups, 1):
    inputs = tokenizer(matchup, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs, dim=1).item()

    print(f"Matchup {idx}:\nInput: {matchup}\n")
    print(f"Predicted Winner: {'Fighter A' if pred == 0 else 'Fighter B'}")
    print(f"Probability - Fighter A: {probs[0][0]:.4f}, Fighter B: {probs[0][1]:.4f}\n")


 DEMO: 5 Random Matchup Predictions 

Matchup 1:
Input: Fighter A: Mansur Abdul-Malik | Height: 74.0 in | Reach: 79.0 in | Str. Acc: 0.54 | Str. Def: 0.49 | SLpm: 5.29 | SApM: 3.84 || Fighter B: Yoshihiro Akiyama | Height: 70.0 in | Reach: 75.0 in | Str. Acc: 0.41 | Str. Def: 0.57 | SLpm: 3.1 | SApM: 3.62

Predicted Winner: Fighter A
Probability - Fighter A: 0.9861, Fighter B: 0.0139

Matchup 2:
Input: Fighter A: Hamdy Abdelwahab | Height: 74.0 in | Reach: 72.0 in | Str. Acc: 0.51 | Str. Def: 0.51 | SLpm: 3.4 | SApM: 3.87 || Fighter B: Ricardo Almeida | Height: 72.0 in | Reach: 74.0 in | Str. Acc: 0.49 | Str. Def: 0.61 | SLpm: 2.02 | SApM: 1.2

Predicted Winner: Fighter A
Probability - Fighter A: 0.9877, Fighter B: 0.0123

Matchup 3:
Input: Fighter A: Mostapha Al-Turk | Height: 74.0 in | Reach: 77.0 in | Str. Acc: 0.19 | Str. Def: 0.53 | SLpm: 1.36 | SApM: 3.61 || Fighter B: John Dave Almanza | Height: 67.0 in | Reach: 67.0 in | Str. Acc: 0.41 | Str. Def: 0.27 | SLpm: 1.92 | SApM: 5.7