In [None]:
# Mount Google Drive into the Colab runtime; the data and model paths used
# later in this notebook live under /content/drive/.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
import torch
from torch import nn
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset

class ProteinSequenceDataset(Dataset):
    """Map-style dataset that tokenizes one protein sequence per lookup.

    Args:
        sequence: indexable collection of protein sequences (each item is
            converted with ``str``).
        targets: indexable collection, aligned with ``sequence``, holding one
            target per sequence. Each target is converted to a ``torch.long``
            tensor, so it must be integer-convertible.
        tokenizer: object exposing ``encode_plus`` with the keyword arguments
            used below.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, sequence, targets, tokenizer, max_len):
        self.sequence, self.targets = sequence, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of sequences held by the dataset."""
        return len(self.sequence)

    def __getitem__(self, item):
        """Return the raw sequence, its encoding and its target for ``item``.

        Returns:
            dict with keys ``'protein_sequence'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (tokenizer outputs flattened to 1-D) and
            ``'targets'`` (``torch.long`` tensor).
        """
        raw_sequence = str(self.sequence[item])
        label = self.targets[item]

        # Ask the tokenizer for fixed-length output returned as tensors.
        tokenizer_options = dict(
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_sequence, **tokenizer_options)

        sample = {'protein_sequence': raw_sequence}
        # The tokenizer output is flattened so each item is a 1-D tensor.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn.functional as F
import torch.nn as nn


# Checkpoint name read by ProteinClassifier when it is instantiated. A later
# cell in this notebook rebinds this global to 'Rostlab/prot_bert'; because the
# name is looked up at construction time, the most recent binding wins.
PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd_localization'
class ProteinClassifier(nn.Module):
    """BERT encoder followed by a small classification head.

    The head is Dropout(p=0.2) -> Linear(hidden_size, n_classes) -> Tanh and is
    applied to the encoder's ``pooler_output``.

    NOTE(review): the head ends in Tanh, so its outputs lie in [-1, 1], while
    the training loop in this notebook feeds them to ``nn.CrossEntropyLoss``.
    Confirm that squashing the scores before the loss is intended.

    Args:
        n_classes: number of output classes produced by the head.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        hidden_size = self.bert.config.hidden_size
        head_layers = [
            nn.Dropout(p=0.2),
            nn.Linear(hidden_size, n_classes),
            nn.Tanh(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return the head's output for its pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded.pooler_output
        return self.classifier(pooled)

In [None]:
pip install torch_optimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_optimizer
  Downloading torch_optimizer-0.3.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-ranger>=0.1.1
  Downloading pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)
Installing collected packages: pytorch-ranger, torch_optimizer
Successfully installed pytorch-ranger-0.1.1 torch_optimizer-0.3.0


In [None]:
%tb
from __future__ import print_function

import argparse
import json
import logging
import os
import sys

import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset
from transformers import BertTokenizer, get_linear_schedule_with_warmup
import torch_optimizer as optim

# Network definition
# from model_def import ProteinClassifier
# from data_prep import ProteinSequenceDataset
 
## SageMaker Distributed code.
# from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
# import smdistributed.dataparallel.torch.distributed as dist

# dist.init_process_group()

# Module logger that writes every record (DEBUG and above) to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Attach the stdout handler only once: re-running this cell would otherwise
# stack another handler and repeat every message.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))
# Stop records from also reaching the root logger's handler; with propagation
# on, each message is printed twice (once plain, once as "INFO:__main__:...").
logger.propagate = False

# Passed to ProteinSequenceDataset as ``max_len``, i.e. the ``max_length`` the
# tokenizer pads/truncates every sequence to.
MAX_LEN = 512
# NOTE: this rebinds the global that an earlier cell set to
# 'Rostlab/prot_bert_bfd_localization'. ProteinClassifier reads the global when
# it is constructed, so after this cell runs it loads 'Rostlab/prot_bert' too.
PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert'
# Tokenizer shared by the train and test data loaders defined below.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=False)

def _read_labelled_split(training_dir):
    """Read the sequences and labels files and split them by label presence.

    ``sequences.txt`` holds one sequence per line. ``graph_labels.txt`` holds
    one ``<protein>,<label>`` record per line, aligned by line number with
    ``sequences.txt``; a record whose label field is empty belongs to the
    unlabelled (test) split.

    Args:
        training_dir: directory containing both files (with or without a
            trailing path separator).

    Returns:
        Tuple ``(sequences_train, y_train, sequences_test, proteins_test)`` of
        plain Python lists.

    Raises:
        IndexError: if a labels record has no comma, or if the labels file has
            more lines than the sequences file.
        ValueError: if a non-empty label is not an integer.
    """
    sequences_path = os.path.join(training_dir, 'sequences.txt')
    labels_path = os.path.join(training_dir, 'graph_labels.txt')

    # Strip only the line terminator: slicing off the last character would
    # corrupt the final record when the file does not end with a newline.
    with open(sequences_path, 'r') as f:
        sequences = [line.rstrip('\n') for line in f]

    sequences_train, y_train = [], []
    sequences_test, proteins_test = [], []
    with open(labels_path, 'r') as f:
        for i, line in enumerate(f):
            fields = line.rstrip('\n').split(',')
            label = fields[1].strip()
            if label:
                sequences_train.append(sequences[i])
                y_train.append(int(label))
            else:
                proteins_test.append(fields[0])
                sequences_test.append(sequences[i])
    return sequences_train, y_train, sequences_test, proteins_test


def _get_train_data_loader(batch_size, training_dir):
    """Build the DataLoader over the labelled records found in ``training_dir``.

    Args:
        batch_size: number of samples per batch.
        training_dir: directory holding ``sequences.txt`` and
            ``graph_labels.txt``.

    Returns:
        DataLoader over a ProteinSequenceDataset of the labelled sequences.
    """
    print(os.path.join(training_dir, 'sequences.txt'))
    sequences_train, y_train, _, _ = _read_labelled_split(training_dir)

    train_data = ProteinSequenceDataset(
        sequence=np.array(sequences_train),
        targets=np.array(y_train),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )
    # NOTE(review): batches are served in file order (shuffle=False); confirm
    # this is intended for training.
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    return train_dataloader


def _get_test_data_loader(batch_size, training_dir):
    """Build the DataLoader over the unlabelled records found in ``training_dir``.

    NOTE(review): the dataset's ``targets`` are the protein identifiers (first
    field of ``graph_labels.txt``), not class labels. ProteinSequenceDataset
    converts targets with ``torch.tensor(..., dtype=torch.long)``, so iterating
    this loader presumably only works when the identifiers are numeric;
    confirm.

    Args:
        batch_size: number of samples per batch.
        training_dir: directory holding ``sequences.txt`` and
            ``graph_labels.txt``.

    Returns:
        DataLoader over a ProteinSequenceDataset of the unlabelled sequences.
    """
    _, _, sequences_test, proteins_test = _read_labelled_split(training_dir)

    test_data = ProteinSequenceDataset(
        sequence=np.array(sequences_test),
        targets=np.array(proteins_test),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    return test_dataloader

def freeze(model, frozen_layers):
    """Disable gradients for the first ``frozen_layers`` encoder layers.

    The previous implementation sliced ``layer[:]`` and therefore froze every
    encoder layer regardless of ``frozen_layers``.

    Args:
        model: object exposing ``model.bert.encoder.layer`` as a sliceable
            sequence of modules.
        frozen_layers: how many leading encoder layers to freeze. Standard
            slice semantics apply: a value at least as large as the number of
            layers (or ``None``) freezes them all.
    """
    for layer in model.bert.encoder.layer[:frozen_layers]:
        for param in layer.parameters():
            param.requires_grad = False
            
def train(batch_size,data_dir,test_batch_size,test,frozen_layers,num_labels, lr, epsilon, weight_decay, epochs, log_interval, verbose, model_dir):
    #use_cuda = args.num_gpus > 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # world_size = dist.get_world_size()
    # rank = dist.get_rank()
    # local_rank = dist.get_local_rank()
    
    # set the seed for generating random numbers
    # torch.manual_seed(args.seed)
    # if use_cuda:
    #     torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(batch_size, data_dir)
    #if rank == 0:
    test_loader = _get_test_data_loader(test_batch_size, test)
    print("Max length of sequence: ", MAX_LEN)
    print("Freezing {} layers".format(frozen_layers))
    print("Model used: ", PRE_TRAINED_MODEL_NAME)

    # logger.debug(
    #     "Processes {}/{} ({:.0f}%) of train data".format(
    #         len(train_loader.sampler),
    #         len(train_loader.dataset),
    #         100.0 * len(train_loader.sampler) / len(train_loader.dataset),
    #     ))

    model = ProteinClassifier(
        num_labels  # The number of output labels.
    )
    freeze(model, frozen_layers)
    model = model.to(device) #DDP(model.to(device), broadcast_buffers=False)
    # torch.cuda.set_device('local_rank')
    # model.cuda(local_rank)
    
    optimizer = optim.Lamb(
            model.parameters(), 
            lr = lr ,  #* dist.get_world_size()
            betas=(0.9, 0.999), 
            eps=epsilon, 
            weight_decay=weight_decay)
    
    total_steps = len(train_loader.dataset)
    
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
    
    loss_fn = nn.CrossEntropyLoss().to(device)
    
    for epoch in range(1, epochs + 1):
        model.train()
        for step, batch in enumerate(train_loader):
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['targets'].to(device)

            outputs = model(b_input_ids,attention_mask=b_input_mask)
            loss = loss_fn(outputs, b_labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            optimizer.zero_grad()
            
            if step % log_interval == 0 : #and rank == 0
                logger.info(
                    "Collecting data from Master Node: \n Train Epoch: {} [{}/{} ({:.0f}%)] Training Loss: {:.6f}".format(
                        epoch,
                        step * len(batch['input_ids']),  #*world_size
                        len(train_loader.dataset),
                        100.0 * step / len(train_loader),
                        loss.item(),
                    ))
            if verbose:
                print('Batch', step)
        test(model, test_loader, device)
        scheduler.step()
    model_save = model.module if hasattr(model, "module") else model
    save_model(model_save, model_dir)

def save_model(model, model_dir):
    path = os.path.join(model_dir, 'model.pth')
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model.state_dict(), path)
    logger.info(f"Saving model: {path} \n")

def test(model, test_loader, device):
    model.eval()
    losses = []
    correct_predictions = 0
    loss_fn = nn.CrossEntropyLoss().to(device)
    tmp_eval_accuracy, eval_accuracy = 0, 0
    
    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['targets'].to(device)

            outputs = model(b_input_ids,attention_mask=b_input_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, b_labels)
            correct_predictions += torch.sum(preds == b_labels)
            losses.append(loss.item())
            
    print('\nTest set: Validation loss: {:.4f}, Validation Accuracy: {:.0f}%\n'.format(
        np.mean(losses),
        100. * correct_predictions.double() / len(test_loader.dataset)))

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()

#     # Data and model checkpoints directories
#     parser.add_argument("--num_labels", type=int, default=18, metavar="N", help="number of output labels (default: 18)")

#     parser.add_argument("--batch-size", type=int, default=8, metavar="N", help="input batch size for training (default: 8)")
#     parser.add_argument("--test-batch-size", type=int, default=8, metavar="N", help="input batch size for testing (default: 8)")
#     parser.add_argument("--epochs", type=int, default=10, metavar="N", help="number of epochs to train (default: 10)")
#     parser.add_argument("--lr", type=float, default=0.3e-5, metavar="LR", help="learning rate (default: 0.3e-5)")
#     parser.add_argument("--weight_decay", type=float, default=0.01, metavar="M", help="weight_decay (default: 0.01)")
#     parser.add_argument("--seed", type=int, default=43, metavar="S", help="random seed (default: 43)")
#     parser.add_argument("--epsilon", type=float, default=1e-8, metavar="EP", help="optimizer epsilon (default: 1e-8)")
#     parser.add_argument("--frozen_layers", type=int, default=10, metavar="NL", help="number of frozen layers(default: 10)")
#     parser.add_argument('--verbose', action='store_true', default=False,help='For displaying SMDataParallel-specific logs')
#     parser.add_argument("--log-interval",type=int,default=10,metavar="N",help="how many batches to wait before logging training status",)
   
#     # Container environment
#     # parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
#     # parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
#     parser.add_argument("--model-dir", type=str, default="/Models")
#     parser.add_argument("--data-dir", type=str, default="/content/drive/MyDrive/Challenge")
#     parser.add_argument("--test", type=str, default="/content/drive/MyDrive/Challenge")
#     # parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

#     train(parser.parse_args())

# Run configuration for the training call below.
batch_size = 8
# The data loaders build file paths from this directory; the trailing slash is
# kept because the original loaders concatenate it with the file name.
data_dir = "/content/drive/MyDrive/Challenge/"
test_batch_size = 8
# Deliberately not named ``test``: binding a string to that name at module
# level would overwrite the ``test`` evaluation function defined above.
test_dir = "/content/drive/MyDrive/Challenge/"
frozen_layers = 30
num_labels = 18
lr = 0.3e-5
epsilon = 1e-8
weight_decay = 0.01
epochs = 10
log_interval = 10
verbose = False
model_dir ="/content/drive/MyDrive/Challenge/Models/"
# Keyword arguments keep the thirteen parameters from being mixed up.
train(
    batch_size=batch_size,
    data_dir=data_dir,
    test_batch_size=test_batch_size,
    test=test_dir,
    frozen_layers=frozen_layers,
    num_labels=num_labels,
    lr=lr,
    epsilon=epsilon,
    weight_decay=weight_decay,
    epochs=epochs,
    log_interval=log_interval,
    verbose=verbose,
    model_dir=model_dir,
)

No traceback available to show.


Downloading:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/361 [00:00<?, ?B/s]

/content/drive/MyDrive/Challenge/sequences.txt
Max length of sequence:  512
Freezing 30 layers
Model used:  Rostlab/prot_bert


Downloading:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 


Collecting data from Master Node: 


INFO:__main__:Collecting data from Master Node: 
