# Code example 3: Finetuning

<a href="https://colab.research.google.com/github/BioGeMT/MALTAomics-Summer-School/blob/main/Day4_WorkshopVII_DeepLearningForProteinStructure/maltaomics_ex3_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch] datasets

Collecting transformers[torch]
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from tra

In [2]:
HF_DATASET_NAME = 'roa7n/maltaomics_dataset_clustered'
MODEL_NAME = 'Rostlab/prot_bert_bfd'

## 1. Load the Dataset:

In [3]:
from datasets import Dataset, load_dataset

##########################
# TODO: Load the Dataset #
##########################

dss = load_dataset(HF_DATASET_NAME)
dss

Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/421k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/106k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['seq', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['seq', 'label'],
        num_rows: 400
    })
})

## 2. Load the model and tokenizer:

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

######################################
# TODO: Load the Tokenizer and Model #
######################################

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='Rostlab/prot_bert_bfd', vocab_size=30, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model

Downloading pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30, 1024, padding_idx=0)
      (position_embeddings): Embedding(40000, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-29): 30 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

Tokenize the dataset:

In [6]:
def tokenize_function(s):
  seq_split = ' '.join(s['seq'])
  return tokenizer(seq_split)

tokenize_function({'seq': 'DENCA'})

{'input_ids': [2, 14, 9, 17, 23, 6, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

`input_ids`: list of integers that represent the numerical representation of the input text (integer == token in the vocabulary of the pre-trained model)

`token_type_ids`: list of integers that indicate the type of each token in the input sequence (e.g. in a sequence classification task, the first token of the input sequence could be marked as type 0, and the second token as type 1)

`ttention_mask`: list of 1's and 0's that indicate which tokens should be attended to by the pre-trained model (1) and which should be ignored (0)

In [7]:
##############################
# TODO: Tokenize the Dataset #
##############################

tokenized_datasets = dss.map(tokenize_function, remove_columns=['seq'], num_proc=4)
tokenized_datasets.set_format('pt')

print(tokenized_datasets)
print(tokenized_datasets['train'][0])

Map (num_proc=4):   0%|          | 0/1600 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 400
    })
})
{'label': tensor(1), 'input_ids': tensor([ 2, 21, 23, 14, 22,  8, 14, 17, 11,  6,  9, 15, 10, 15, 14, 20, 16, 22,
        22, 13,  5, 15, 14, 14, 13, 16, 14, 18,  8, 19, 15,  8, 13, 15, 16, 13,
        11, 11,  6,  6, 13, 12,  5,  5, 12, 10,  6,  7, 13, 13, 12,  9, 12, 12,
        19,  5,  8,  9,  7, 19, 17,  7,  8,  9,  7,  6,  5, 10,  6,  7,  8,  6,
        12,  9,  8, 19,  8,  6, 14, 13,  6, 24,  7, 12, 19, 10, 15,  5, 13,  9,
        10,  6, 13,  9, 18, 13, 11, 16,  8, 10,  8, 11, 14, 14, 12,  6,  6, 10,
         6,  5, 10,  9, 15,  8, 10, 22, 10,  7,  8, 19,  6, 15, 23, 18,  5,  5,
        10, 15, 10,  8,  9, 10, 23, 11, 17, 22,  6, 13, 22,  7, 23,  7,  5,  8,
         6,  8,  7, 11, 14, 21, 10, 14, 16,  7, 17,  

## 3. Define evaluation metrics:

In [8]:
####################################
# TODO: Define the compute metrics #
####################################

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, recall_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    specificity=recall_score(labels, preds, pos_label=0)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall (TPR)': recall,
        'specificity (TNR)': specificity
    }

## 4. Set the values of hyperparameters:

In [9]:
from transformers import TrainingArguments

###################################################################
# TODO: Define training arguments (you can get inspiration at     #
# https://huggingface.co/docs/transformers/v4.18.0/en/performance #
###################################################################

# taken from https://huggingface.co/docs/transformers/v4.18.0/en/performance
training_args = TrainingArguments(output_dir='finetuning_output',
                                  learning_rate=1e-5,
                                  warmup_ratio=0.1,
                                  lr_scheduler_type='cosine',
                                  fp16=True,
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=8,
                                  num_train_epochs=1,
                                  load_best_model_at_end=True,
                                  save_total_limit=1,
                                  weight_decay=0.01,
                                  report_to='none',
                                  gradient_checkpointing=True,
                                  optim='adafactor')

## 5. Train the model:

In [10]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall (tpr),Specificity (tnr)
1,No log,0.121307,0.995,0.995215,0.990476,1.0,0.989583


TrainOutput(global_step=200, training_loss=0.3245862579345703, metrics={'train_runtime': 382.412, 'train_samples_per_second': 4.184, 'train_steps_per_second': 0.523, 'total_flos': 941081096843772.0, 'train_loss': 0.3245862579345703, 'epoch': 1.0})

In [11]:
trainer.evaluate()

{'eval_loss': 0.12130722403526306,
 'eval_accuracy': 0.995,
 'eval_f1': 0.9952153110047847,
 'eval_precision': 0.9904761904761905,
 'eval_recall (TPR)': 1.0,
 'eval_specificity (TNR)': 0.9895833333333334,
 'eval_runtime': 19.0088,
 'eval_samples_per_second': 21.043,
 'eval_steps_per_second': 21.043,
 'epoch': 1.0}