<a href="https://colab.research.google.com/github/DManiscalco/MMA-Matchups/blob/main/MMA_Column_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [3]:
class ColumnDataset(Dataset):
    """Map-style dataset that tokenizes ``(text, label)`` pairs on access.

    Each item is returned as a dict holding the tokenizer's output fields plus
    a ``labels`` entry, which is the layout the Hugging Face Trainer consumes.
    """

    def __init__(self, data, tokenizer, max_length=16):
        """Store the samples and the tokenization settings.

        Args:
            data: Indexable collection of ``(text, label)`` pairs.
            tokenizer: Callable used to encode each text; it is invoked with
                ``padding``, ``truncation``, ``max_length`` and
                ``return_tensors`` keyword arguments.
            max_length: Length every encoded text is padded/truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of ``(text, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the pair at ``idx`` and return it as a dict of tensors."""
        column_name, class_id = self.data[idx]

        # Ask for PyTorch tensors padded/truncated to a fixed length so every
        # sample has the same size
        encoding = self.tokenizer(
            column_name,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop the leading size-1 batch dimension from each field so the
        # batching code can stack individual samples itself
        sample = {}
        for field, tensor in encoding.items():
            sample[field] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(class_id, dtype=torch.long)
        return sample

In [4]:
# Start from the pretrained uncased BERT checkpoint and attach a 3-way
# classification head (striker, grappler, neither). The head's weights are
# newly initialized, so the model has to be fine-tuned before it is useful.
checkpoint = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
# Labelled sample column names used to fine-tune the BERT model.
# Class ids: 0 = striking, 1 = grappling, 2 = neither.
STRIKING, GRAPPLING, NEITHER = 0, 1, 2

# NOTE(review): 'age', 'knockdowns' and 'submission_attempts' each appear twice
# below, which weights those names double during training - confirm whether
# that is intended. The ground_strikes_* columns are labelled as grappling.
training_data = [
    # Hand-written seed examples
    ('strikes_landed', STRIKING),
    ('takedowns_attempted', GRAPPLING),
    ('submission_attempts', GRAPPLING),
    ('punch_accuracy', STRIKING),
    ('kicks_landed', STRIKING),
    ('rounds_fought', NEITHER),
    ('fight_duration', NEITHER),
    ('takedown_defense', GRAPPLING),
    ('punches_thrown', STRIKING),
    ('grappling_control_time', GRAPPLING),
    ('knockdowns', STRIKING),
    ('age', NEITHER),
    ('weight_class', NEITHER),

    # Fighter / bout metadata
    ('fighter', NEITHER),
    ('date', NEITHER),
    ('result', NEITHER),
    ('opponent', NEITHER),
    ('division', NEITHER),
    ('stance', NEITHER),
    ('dob', NEITHER),
    ('method', NEITHER),
    ('total_comp_time', NEITHER),
    ('round', NEITHER),
    ('referee', NEITHER),
    ('time_format', NEITHER),
    ('reach', NEITHER),
    ('height', NEITHER),
    ('age', NEITHER),

    # Per-fight statistics
    ('knockdowns', STRIKING),
    ('sub_attempts', GRAPPLING),
    ('reversals', GRAPPLING),
    ('control', GRAPPLING),
    ('takedown', GRAPPLING),
    ('subs', GRAPPLING),
    ('sub', GRAPPLING),
    ('submission', GRAPPLING),
    ('submission_attempts', GRAPPLING),
    ('submissions', GRAPPLING),
    ('takedowns_landed', GRAPPLING),
    ('takedowns_attempts', GRAPPLING),
    ('sig_strikes_landed', STRIKING),
    ('sig_strikes_attempts', STRIKING),
    ('total_strikes_landed', STRIKING),
    ('total_strikes_attempts', STRIKING),
    ('head_strikes_landed', STRIKING),
    ('head_strikes_attempts', STRIKING),
    ('body_strikes_landed', STRIKING),
    ('body_strikes_attempts', STRIKING),
    ('leg_strikes_landed', STRIKING),
    ('leg_strikes_attempts', STRIKING),
    ('distance_strikes_landed', STRIKING),
    ('distance_strikes_attempts', STRIKING),
    ('clinch_strikes_landed', STRIKING),
    ('clinch_strikes_attempts', STRIKING),
    ('ground_strikes_landed', GRAPPLING),
    ('ground_strikes_attempts', GRAPPLING),
    ('takedowns_accuracy', GRAPPLING),
    ('sig_strikes_accuracy', STRIKING),
]

In [20]:
# Wrap the labelled column names in ColumnDataset so each sample is tokenized
# into PyTorch tensors when it is accessed (max_length is left at its default)
dataset = ColumnDataset(data=training_data, tokenizer=tokenizer)

In [21]:
# Hyperparameters for the Hugging Face Trainer. The set of column names is
# small, so these values may need tuning if the first results are poor.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir='./results',
    logging_dir='./logs',
    # Optimization schedule
    learning_rate=2e-5,
    num_train_epochs=7,
    per_device_train_batch_size=8,
    # No evaluation during training; 'none' turns off reporting integrations,
    # which gets rid of the Weights & Biases login attempts
    eval_strategy='no',
    report_to='none',
)

In [22]:
# Combine the BERT model, the training arguments and the tokenized column
# dataset into a Trainer (no evaluation dataset is supplied)
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run the fine-tuning; keeping the call as the last expression makes the cell
# display the returned training summary
trainer.train()

Step,Training Loss


TrainOutput(global_step=56, training_loss=0.5356804643358503, metrics={'train_runtime': 145.5998, 'train_samples_per_second': 2.788, 'train_steps_per_second': 0.385, 'total_flos': 3338251487424.0, 'train_loss': 0.5356804643358503, 'epoch': 7.0})

In [23]:
# Persist the fine-tuned weights and the tokenizer files side by side so they
# can be reloaded later without repeating the training run
fine_tuned_dir = './fine_tuned_bert'
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json')

In [24]:
# Column names to run through the model for labelling - only a small subset of
# the columns in the true dataset.
# NOTE(review): every name listed here also appears in training_data, so the
# predictions for this list are on names the model was fine-tuned on rather
# than on unseen columns.
col_names = [
    # Fighter / bout metadata
    'fighter', 'date', 'result', 'opponent', 'division',
    'stance', 'dob', 'method', 'total_comp_time', 'round',
    'referee', 'time_format', 'reach', 'height', 'age',
    # Per-fight statistics
    'knockdowns', 'sub_attempts', 'reversals', 'control',
    'takedowns_landed', 'takedowns_attempts',
    'sig_strikes_landed', 'sig_strikes_attempts', 'total_strikes_landed',
]

In [25]:
# Reload the fine-tuned model and its tokenizer from the directory they were
# saved to
saved_dir = './fine_tuned_bert'
model = BertForSequenceClassification.from_pretrained(saved_dir)
tokenizer = BertTokenizer.from_pretrained(saved_dir)

# Encode all column names in one batch (padded to the longest name) rather
# than tokenizing them one at a time.
# NOTE(review): no max_length is passed here, whereas ColumnDataset encodes the
# training samples with max_length=16 - confirm whether inference should use
# the same cap.
inputs = tokenizer(col_names, return_tensors='pt', padding=True, truncation=True)

# Forward pass without gradient tracking, since this is inference only
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class per column: 0 for striking, 1 for grappling, 2 for neither
predicted_classes = outputs.logits.argmax(dim=1).tolist()

for column, pred_class in zip(col_names, predicted_classes):
    print(f'Column: {column}, Predicted class: {pred_class}')

Column: fighter, Predicted class: 2
Column: date, Predicted class: 2
Column: result, Predicted class: 2
Column: opponent, Predicted class: 2
Column: division, Predicted class: 2
Column: stance, Predicted class: 2
Column: dob, Predicted class: 2
Column: method, Predicted class: 2
Column: total_comp_time, Predicted class: 2
Column: round, Predicted class: 2
Column: referee, Predicted class: 2
Column: time_format, Predicted class: 2
Column: reach, Predicted class: 2
Column: height, Predicted class: 2
Column: age, Predicted class: 2
Column: knockdowns, Predicted class: 0
Column: sub_attempts, Predicted class: 1
Column: reversals, Predicted class: 1
Column: control, Predicted class: 1
Column: takedowns_landed, Predicted class: 1
Column: takedowns_attempts, Predicted class: 1
Column: sig_strikes_landed, Predicted class: 0
Column: sig_strikes_attempts, Predicted class: 0
Column: total_strikes_landed, Predicted class: 0
