In [1]:
# show the GPU attached to this session (driver / CUDA version, memory and utilisation)
!nvidia-smi

Thu Mar 17 04:25:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import re
import random
import numpy as np
import pandas as pd
import torch
from Bio import SeqIO
from matplotlib import pyplot as plt
from sklearn import metrics
from torch.utils.data import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification

In [3]:
# setup seed for experiment reproducibility
def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for repeatable runs.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ask cuDNN for deterministic kernels ...
    torch.backends.cudnn.deterministic = True
    # ... and switch off the autotuner, which may otherwise pick a different
    # algorithm from run to run
    torch.backends.cudnn.benchmark = False

setup_seed(42)

In [4]:
# protein dataset
class ProteinDataset(Dataset):
    """Torch dataset of protein sequences read from FASTA files.

    Every item is a dict holding the tokenizer outputs as tensors plus a
    ``labels`` tensor with the class index of the sequence.

    Args:
        split: ``"train"`` or ``"val"`` load the matching file; any other value
            loads the test file, whose records all receive the dummy
            ``"nonarg"`` label.
        tokenizer_name: name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: length each tokenized sequence is truncated / padded to.
        data_dir: folder containing ``train.fasta``, ``val.fasta`` and
            ``test.fasta``. Defaults to ``DEFAULT_DATA_DIR``.
    """

    # default location of the FASTA files
    DEFAULT_DATA_DIR = '/kaggle/input/aist4010-a2/data/'

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert_bfd', max_length=1024,
                 data_dir=None):
        self.max_length = max_length

        # define tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        # define label mapping (class name -> index) and its inverse
        self.arg_dict = {"aminoglycoside": 0, "macrolide-lincosamide-streptogramin": 1,
                         "polymyxin": 2, "fosfomycin": 3, "trimethoprim": 4, "bacitracin": 5,
                         "quinolone": 6, "multidrug": 7, "chloramphenicol": 8,
                         "tetracycline": 9, "rifampin": 10, "beta_lactam": 11,
                         "sulfonamide": 12, "glycopeptide": 13, "nonarg": 14}
        self.labels_dic = {index: tag for tag, index in self.arg_dict.items()}

        # define folder paths
        self.datasetFolderPath = self.DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'train.fasta')
        self.valFilePath = os.path.join(self.datasetFolderPath, 'val.fasta')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'test.fasta')

        # load dataset from suitable file
        if split == "train":
            self.seqs, self.labels = self.load_dataset(self.trainFilePath, max_length)
        elif split == "val":
            self.seqs, self.labels = self.load_dataset(self.valFilePath, max_length)
        else:
            self.seqs, self.labels = self.load_dataset(self.testFilePath, max_length, True)

    # load dataset in 'FASTA' format
    def load_dataset(self, path, max_len, test=False):
        """Read sequences and labels from a FASTA file.

        The label is taken from the fourth ``|``-separated field of the record
        id. Records whose first field is ``"sp"`` (presumably Swiss-Prot
        entries -- confirm against the data) and every record of a test file
        get the ``"nonarg"`` label.

        Args:
            path: FASTA file to parse.
            max_len: unused; kept so existing callers keep working (truncation
                happens in ``__getitem__``).
            test: if true, assign the dummy label to every record and keep the
                file order; otherwise the records are shuffled.

        Returns:
            ``(sequences, labels)`` as two parallel lists.

        Raises:
            KeyError: if a record carries a class name missing from ``arg_dict``.
            IndexError: if a labelled record id has fewer than four fields.
        """
        dummy_label = self.arg_dict["nonarg"]
        strs = []
        labels = []
        for record in SeqIO.parse(path, "fasta"):
            fields = record.id.split("|")
            if test or fields[0] == "sp":
                label = dummy_label
            else:
                label = self.arg_dict[fields[3]]
            strs.append(str(record.seq))
            labels.append(label)
        if test:
            # keep file order so predictions can be matched back to the records
            return strs, labels
        # shuffle the training and validation dataset
        total = len(strs)
        order = random.sample(list(range(total)), total)
        return [strs[i] for i in order], [labels[i] for i in order]

    def __len__(self):
        """Return the number of sequences in the split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sequence ``idx`` and return it together with its label.

        Returns:
            dict mapping each tokenizer output key to a tensor, plus
            ``"labels"`` holding the class index as a tensor.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # drop any whitespace, then put a space between residues so the
        # tokenizer sees one residue per word
        seq = " ".join("".join(self.seqs[idx].split()))
        # substitute rare amino acid by 'X'
        seq = re.sub(r"[UZOB]", "X", seq)
        # preprocess sequence with tokenizer
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)
        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [5]:
# checkpoint of the pre-trained ProtBert-BFD model (also used for the tokenizer)
model_name = 'Rostlab/prot_bert_bfd'

# build the train / val / test datasets, in that order, with identical settings
train_dataset, val_dataset, test_dataset = (
    ProteinDataset(split=split_name, tokenizer_name=model_name, max_length=400)
    for split_name in ("train", "val", "test")
)

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
# compute different metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: prediction object exposing ``label_ids`` (true class indices) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = metrics.accuracy_score(labels, preds)
    # zero_division=0 scores a class without predicted / true samples as 0,
    # the same value the default produces, but without an
    # UndefinedMetricWarning on every evaluation
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
# initialize the model
def model_init():
    """Create a fresh 15-label sequence classifier from the checkpoint named by ``model_name``."""
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=15)
    return classifier

In [8]:
# define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=1,   # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    learning_rate=5e-5,              # learning rate
    lr_scheduler_type="linear",      # learning rate decay
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=5e-3,               # strength of weight decay
    logging_strategy="epoch",        # log after each epoch
    save_strategy="no",              # do not save in the middle of training
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # accumulate gradients over 64 batches per optimizer step (effective batch size 1 x 64)
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # Apex optimization level: letter "O" + digit, not "02"
    report_to="none",                # no integrations for result report
    run_name="ProBert-BFD-MS",       # experiment name
    seed=42                          # seed for experiment reproducibility
)

# initialize trainer
trainer = Trainer(
    model_init=model_init,                # factory that builds a fresh model to be trained
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics,      # evaluation metrics
)

# fine-tuning the pre-trained model
trainer.train()

loading configuration file https://huggingface.co/Rostlab/prot_bert_bfd/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/67f460acc7e7e147ff828e909ffe419d00d66ce679c682bc4ab715c107bcbe41.baf557855a8618d0ddfb6c23bfd135bfc38ccf8c3fb099b8df45eb110ccf05e9
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert_bfd",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 1

Downloading:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

storing https://huggingface.co/Rostlab/prot_bert_bfd/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0a05878f9e3a0d39834dc6f21b88471696d7453a07bac7246152a6ef307c9af4.c5b9869da882baaf70e8e70cf32d81500803511e3220e24457115a03445fa65f
creating metadata file for /root/.cache/huggingface/transformers/0a05878f9e3a0d39834dc6f21b88471696d7453a07bac7246152a6ef307c9af4.c5b9869da882baaf70e8e70cf32d81500803511e3220e24457115a03445fa65f
loading weights file https://huggingface.co/Rostlab/prot_bert_bfd/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0a05878f9e3a0d39834dc6f21b88471696d7453a07bac7246152a6ef307c9af4.c5b9869da882baaf70e8e70cf32d81500803511e3220e24457115a03445fa65f
Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.dec

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.4922,0.165568,0.967632,0.705353,0.71181,0.704908
1,0.1115,0.103859,0.981536,0.871949,0.87959,0.871615
2,0.0585,0.079554,0.988375,0.959785,0.973369,0.948042
3,0.0343,0.068231,0.989742,0.967727,0.974476,0.962261
4,0.0224,0.066807,0.989742,0.968221,0.975802,0.961898


  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 4387
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 4387
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 4387
  Batch size = 32
***** Running Evaluation *****
  Num examples = 4387
  Batch size = 32
***** Running Evaluation *****
  Num examples = 4387
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1655, training_loss=0.14377552994788592, metrics={'train_runtime': 25999.0198, 'train_samples_per_second': 4.079, 'train_steps_per_second': 0.064, 'total_flos': 9.6424021998864e+16, 'train_loss': 0.14377552994788592, 'epoch': 5.0})

In [9]:
# make predictions using the trained model
# NOTE: the third result is deliberately not called `metrics`: that name is the
# imported sklearn module used inside compute_metrics(), and rebinding it here
# would break any later evaluation or prediction run in this kernel.
# The test labels are dummy values, so `test_metrics` carries no real signal.
predictions, label_ids, test_metrics = trainer.predict(test_dataset)
predictions_max = np.argmax(predictions, axis=1)

# output the predictions to the csv file (ids SEQ0, SEQ1, ... in test-file order)
output = {"id": np.array(["SEQ" + str(i) for i in range(len(predictions_max))]), "label": predictions_max}
output_df = pd.DataFrame(output).set_index('id')
output_df.to_csv("output.csv")

# save the trained model
trainer.save_model('./models/')

***** Running Prediction *****
  Num examples = 4469
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./models/
Configuration saved in ./models/config.json
Model weights saved in ./models/pytorch_model.bin
