In [1]:
import torch
import torch.nn as nn

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# !pip install transformers evaluate datasets

In [3]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer,  TrainingArguments
from transformers import BertModel, BertConfig

import datasets
import evaluate

In [4]:
data = pd.read_csv('../data/data.csv')

In [5]:
data.head(1)

Unnamed: 0,antigen,TCR,interaction
0,AAGIGILTV,CAISEVGVGQPQHF,1


In [6]:
# The 'interaction' column is not used by the masked-LM pipeline below, which only
# consumes the 'antigen' and 'TCR' sequences.
# Rebinding (rather than inplace=True) together with errors='ignore' makes this cell
# safe to re-run: a second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['interaction'], errors='ignore')

In [7]:
# Put a single space between consecutive characters of every sequence so that each
# character becomes its own whitespace-delimited token (e.g. "AAG" -> "A A G").
# NOTE(review): re-running this cell on already-spaced strings inserts extra spaces.
data['antigen'] = data['antigen'].map(lambda residues: ' '.join(residues))
data['TCR'] = data['TCR'].map(lambda residues: ' '.join(residues))

In [8]:
data

Unnamed: 0,antigen,TCR
0,A A G I G I L T V,C A I S E V G V G Q P Q H F
1,A A G I G I L T V,C A S S L S F G T E A F F
2,A A R A V F L A L,C A S L G A Q N N E Q F
3,A A R A V F L A L,C A S S Y S T G D E Q Y F
4,A I M D K N I I L,C A S S V D G G S Q P Q H F
...,...,...
130466,Y M G V S Y E M,C A S S P N H G G H S P L H F
130467,Y M G V S Y E M,C A S S P M G G H S P L H F
130468,F L G I Y T V T V V,C A S F L G G T G T E A F F
130469,F L G I Y T V T V V,C A S S A L R L S Q P G E Q Y F


In [9]:
# Convert the DataFrame to a Hugging Face dataset
# Wrap the pandas DataFrame in a Hugging Face `datasets.Dataset` so it can be split
# and tokenized with the `datasets` API in the following cells.
dataset = datasets.Dataset.from_pandas(data)

In [10]:
# Hold out 20% of the rows as the "test" split used for evaluation.
# The split is random, so a fixed seed is passed to make the train/test partition
# (and therefore every downstream number) reproducible across kernel restarts.
# NOTE(review): `dataset` is rebound from Dataset to DatasetDict here, so this cell
# must not be executed twice without re-running the cell that builds `dataset`.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['antigen', 'TCR'],
        num_rows: 104376
    })
    test: Dataset({
        features: ['antigen', 'TCR'],
        num_rows: 26095
    })
})

In [11]:
dataset['train'].features

{'antigen': Value(dtype='string', id=None),
 'TCR': Value(dtype='string', id=None)}

In [12]:
# Hyper-parameters of the BERT model that is built from scratch further down.
BERT_CONFIG = BertConfig(
    # --- input side ---
    vocab_size=25,               # presumably matches the tokenizer's vocabulary size — TODO confirm
    type_vocab_size=2,           # two segment ids (inputs are encoded as sequence pairs)
    max_position_embeddings=64,  # longest sequence the position embeddings can index
    # --- transformer stack ---
    hidden_size=512,
    num_hidden_layers=8,
    num_attention_heads=8,
    intermediate_size=2048,
    # --- head ---
    num_labels=2,
)

In [13]:
# Alias for BERT_CONFIG (same object, not a copy); used when loading the tokenizer.
config = BERT_CONFIG

In [14]:
# Load the tokenizer from the "antigen" identifier/path.
# NOTE(review): "antigen" looks like a local directory relative to the notebook — confirm.
tokenizer = AutoTokenizer.from_pretrained("antigen", config=config)
# Take the length limit from the model config instead of repeating the literal 64, so
# the tokenizer's maximum length cannot drift away from the model's position-embedding size.
tokenizer.model_max_length = config.max_position_embeddings


In [15]:
# Build a masked-LM model from BERT_CONFIG via `from_config` (not `from_pretrained`),
# i.e. no checkpoint is loaded in this cell.
# NOTE(review): no RNG seed is set before this call in the visible cells — confirm
# whether reproducible initial weights are required.
model = AutoModelForMaskedLM.from_config(BERT_CONFIG)

In [16]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(25, 512, padding_idx=0)
      (position_embeddings): Embedding(64, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=

In [17]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 25
}

In [18]:
# Names of the raw text columns of the training split, in feature order.
column_names = [feature_name for feature_name in dataset["train"].features]

In [19]:
column_names

['antigen', 'TCR']

Each antigen/TCR pair is encoded as one BERT sentence pair: `[CLS] antigen [SEP] TCR [SEP]`

In [20]:
def tokenize_function(examples):
    """Tokenize a batch of rows as sequence pairs.

    The first entry of the module-level ``column_names`` is passed as the first
    sequence and the second entry as the second sequence to the module-level
    ``tokenizer``.

    Args:
        examples: Mapping from column name to the values for that column
            (the batched form handed over by ``Dataset.map(batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for the pair of columns, requested as
        PyTorch tensors, padded with ``padding='longest'`` and truncated with
        ``truncation='longest_first'``; no special-tokens mask is requested.
    """
    first_column, second_column = column_names[0], column_names[1]
    encoding_options = {
        "return_special_tokens_mask": False,
        "padding": 'longest',
        "truncation": 'longest_first',
        "return_tensors": "pt",
    }
    return tokenizer(examples[first_column], examples[second_column], **encoding_options)

In [21]:
# Tokenize every split batch-wise; the raw text columns are removed so that only the
# tokenizer outputs remain in the resulting datasets.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    desc="Running tokenizer on every text in dataset",
    remove_columns=column_names,
)

Running tokenizer on every text in dataset:   0%|          | 0/104376 [00:00<?, ? examples/s]

Running tokenizer on every text in dataset:   0%|          | 0/26095 [00:00<?, ? examples/s]

In [22]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 104376
})

In [23]:
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

'[CLS] T P R V T G G G A M [SEP] C A S S I G L A E T Y N E Q F F [SEP] [PAD] [PAD] [PAD] [PAD]'

In [None]:
# Tokenized "train" split; used as the Trainer's training set.
train_dataset = tokenized_datasets["train"]

In [None]:
# Tokenized "test" split; used as the Trainer's evaluation set.
eval_dataset = tokenized_datasets["test"]

In [None]:
# Accuracy metric from the `evaluate` library; used by `compute_metrics`.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_preds):
    """Compute accuracy over the positions that carry a real label.

    Positions whose label is ``-100`` are excluded before the module-level
    ``metric`` is evaluated.

    Args:
        eval_preds: A ``(preds, labels)`` pair. ``preds`` may either already
            hold predicted token ids with the same shape as ``labels`` (for
            example when an argmax hook such as ``preprocess_logits_for_metrics``
            was applied upstream), or raw logits with one extra trailing
            dimension. A tuple of outputs is also accepted, in which case its
            first element is used.

    Returns:
        The result of ``metric.compute`` on the unmasked predictions and labels.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some models return extra outputs next to the logits; only the first is scored.
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.ndim == labels.ndim + 1:
        # Raw logits were passed (no argmax hook upstream): reduce them to token ids
        # here, otherwise the flattened arrays would have mismatched lengths below.
        preds = preds.argmax(axis=-1)
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    # -100 marks positions without a label; keep only the labelled ones.
    mask = labels != -100
    return metric.compute(predictions=preds[mask], references=labels[mask])

In [None]:
mlm_probability = 0.15 # Probability (0-1) handed to the MLM data collator as `mlm_probability`

In [None]:
# Collator that batches the tokenized examples for masked-language-model training,
# using `mlm_probability` as the masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_probability,
)

In [None]:
# Training hyper-parameters: one epoch, evaluation at the end of each epoch,
# checkpoints and metrics written below ./results.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce model logits to predicted token ids before they are gathered for metrics.

    `compute_metrics` expects predictions with the same shape as the labels, so the
    argmax over the last (vocabulary) dimension is taken here.

    Args:
        logits: Model output logits, or a tuple whose first element is the logits.
        labels: Labels for the batch (unused; part of the hook's signature).

    Returns:
        Tensor of predicted token ids (argmax over the last dimension).
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Initialize our Trainer.
# `compute_metrics` is passed so that evaluation reports masked-token accuracy in
# addition to the loss; without it the `metric` / `compute_metrics` cells are never used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
# Run training; the returned object carries the training metrics read in the next cell.
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

In [None]:
# Persist the training metrics, extended with the number of training examples.
metrics = train_result.metrics
metrics.update(train_samples=len(train_dataset))
trainer.save_metrics("train", metrics)


In [None]:
# Evaluate on the held-out split and persist the metrics, extended with the number
# of evaluation examples. Note that `metrics` now refers to the evaluation metrics.
metrics = trainer.evaluate()
metrics.update(eval_samples=len(eval_dataset))
trainer.save_metrics("eval", metrics)

In [None]:
# Persist the Trainer's state alongside the saved model and metrics.
trainer.save_state()

In [None]:
import math

In [None]:
# Perplexity is exp(eval_loss); if the exponential overflows, record it as infinity.
try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = math.inf
metrics.update(perplexity=perplexity)

In [None]:
# Save the evaluation metrics again, now that "perplexity" has been added to them.
trainer.save_metrics("eval", metrics)