In [1]:
# Mount Google Drive at /content/drive; the input TSV is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q SentencePiece transformers[sentencepiece]

In [3]:
!pip install datasets
!pip install evaluate



In [2]:
import os.path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader

import re
import numpy as np
import pandas as pd
import copy

import transformers, datasets
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from transformers import T5EncoderModel, T5Tokenizer
from transformers import TrainingArguments, Trainer, set_seed
from transformers import DataCollatorForTokenClassification

from evaluate import load
from datasets import Dataset

from tqdm import tqdm
import random

from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [3]:
# Environment variables describing a single-process "distributed" setup
# (one rank, world size 1). Presumably read by the distributed backend that
# DeepSpeed initialises -- confirm against the launcher in use.
os.environ.update(
    {
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "9994",  # modify if RuntimeError: Address already in use
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
    }
)

# Input Data

In [4]:
# Load the raw tab-separated table and keep one row per unique
# (AASeq, Vregion, Jregion) combination.
df = pd.read_csv('/content/drive/MyDrive/DataTest_5M_11_17.tsv', sep='\t')
df = df.drop_duplicates(subset=['AASeq', 'Vregion', 'Jregion']) # drop duplicate datapoints

# Space out AAs ("CASS" -> "C A S S"). The training function does not add
# spaces itself, so this must happen here.
# Element-wise map instead of a row-wise DataFrame.apply(axis=1): same result,
# but no per-row Series is built for every one of the millions of rows.
df['AASeq'] = df['AASeq'].map(" ".join)

In [None]:
import time

# Masking configuration: each residue is independently replaced by MASK_TOKEN
# with probability MASK_PROB; the separating spaces are never masked.
MASK_TOKEN = "<extra_id_0>"
MASK_PROB = 0.15
MASK_SEED = 42          # fixed seed so the masked dataset is reproducible
PROGRESS_EVERY = 50000  # print a progress line every this many sequences

# Dedicated generator: does not disturb (and is not disturbed by) the global
# NumPy random state that is seeded later for training.
mask_rng = np.random.default_rng(MASK_SEED)

masked_aas = []
s = time.time()
for index, seq in enumerate(df['AASeq']):
  if index % PROGRESS_EVERY == 0 and index > 0:
    print(f"{index}/{len(df)}")
    print(masked_aas[-1])
    print(time.time() - s)
    s = time.time()
  # One vectorised draw per sequence instead of one np.random.choice call per
  # character; draws that land on a space are simply ignored.
  hide = (mask_rng.random(len(seq)) < MASK_PROB).tolist()
  masked_aas.append([MASK_TOKEN if (flag and aa != " ") else aa for aa, flag in zip(seq, hide)])

50000/3853380
['C', ' ', 'A', ' ', '<extra_id_0>', ' ', 'S', ' ', 'L', ' ', 'E', ' ', 'G', ' ', 'N', ' ', 'T', ' ', 'I', ' ', 'Y', ' ', 'F']
15.90232539176941
100000/3853380
['C', ' ', 'A', ' ', 'S', ' ', 'R', ' ', 'P', ' ', 'N', ' ', 'G', ' ', 'Q', ' ', 'D', ' ', 'S', ' ', 'L', ' ', 'R', ' ', 'Y', ' ', 'T', ' ', '<extra_id_0>']
14.633614301681519
150000/3853380
['C', ' ', 'A', ' ', 'I', ' ', '<extra_id_0>', ' ', 'P', ' ', 'S', ' ', 'P', ' ', '<extra_id_0>', ' ', 'T', ' ', 'A', ' ', 'T', ' ', 'Y', ' ', 'G', ' ', 'Y', ' ', 'T', ' ', 'F']
14.624762535095215
200000/3853380
['C', ' ', 'A', ' ', '<extra_id_0>', ' ', '<extra_id_0>', ' ', 'Q', ' ', '<extra_id_0>', ' ', 'P', ' ', 'V', ' ', 'S', ' ', 'V', ' ', 'E', ' ', 'Q', ' ', 'Y', ' ', 'F']
14.391298532485962
250000/3853380
['C', ' ', 'A', ' ', 'S', ' ', 'S', ' ', '<extra_id_0>', ' ', '<extra_id_0>', ' ', 'F', ' ', 'D', ' ', 'D', ' ', '<extra_id_0>', ' ', 'N', ' ', 'T', ' ', 'E', ' ', '<extra_id_0>', ' ', '<extra_id_0>', ' ', 'F']
15.398710

In [46]:
# Tokenize Labels
# Each space-separated sequence is encoded with the ProtT5 tokenizer and the
# resulting ids (minus the trailing end-of-sentence id) become the label list.
# NOTE(review): these are raw tokenizer ids. The tokenizer demo output in this
# notebook shows ids such as 24 and 127, while training is launched with
# num_labels=20 -- confirm the ids are remapped into [0, num_labels) first.
temp_tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")
label_ids = temp_tokenizer(list(df["AASeq"]))['input_ids']
# Take away end sentence token
df["label"] = [ids[:-1] for ids in label_ids]

KeyboardInterrupt: ignored

In [None]:
# Seed for the three splits below; without it every run of this cell
# produced different train/validation/test sets.
SPLIT_SEED = 42

# Everything is "unmasked" i.e. want to predict every AA (one flag per row).
df["mask"] = 1
df = df.rename(columns={"AASeq":"sequence"})

# Keep 40% of the data, then carve 10% of that off for validation and 10% of
# the remainder off for testing.
_, use_40 = train_test_split(df, test_size=.4, random_state=SPLIT_SEED)
train, my_valid = train_test_split(use_40, test_size=.1, random_state=SPLIT_SEED)
my_train, my_test = train_test_split(train, test_size=.1, random_state=SPLIT_SEED)

In [7]:
# Restrict every split to the three columns kept for training.
keep_cols = ["sequence", "label", "mask"]
my_train, my_valid, my_test = (
    split[keep_cols] for split in (my_train, my_valid, my_test)
)

In [8]:
# Preview the first five rows of the training split (columns: sequence, label, mask).
my_train.head(5)

Unnamed: 0_level_0,sequence,label,mask
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
183121,CASSEFGSGYTF,"[1, 0, 15, 15, 3, 4, 5, 15, 5, 19, 16, 4]","[1, 1, 1, 1, 1, -100, 1, -100, 1, 1, 1, 1]"
463685,CASSVHGGATPDTQYF,"[1, 0, 15, 15, 17, 6, 5, 5, 0, 16, 12, 2, 16, ...","[-100, 1, -100, 1, 1, 1, -100, 1, 1, 1, 1, 1, ..."
71347,CASRAWEVPIYEQYF,"[1, 0, 15, 14, 0, 18, 3, 17, 12, 7, 19, 3, 13,...","[-100, 1, -100, 1, 1, 1, 1, 1, -100, -100, 1, ..."
28048,CSVLVLGEQYF,"[1, 15, 17, 9, 17, 9, 5, 3, 13, 19, 4]","[1, 1, 1, 1, -100, 1, 1, 1, -100, 1, 1]"
541873,CASRTGHMNTEAFF,"[1, 0, 15, 14, 16, 5, 6, 10, 11, 16, 3, 0, 4, 4]","[1, 1, 1, 1, 1, 1, 1, -100, 1, 1, 1, 1, 1, 1]"


# Classification Model Definition

In [9]:
class ClassConfig:
    """Settings for the classification head placed on top of the encoder.

    Attributes:
        dropout_rate: the ``dropout`` argument (default 0.2).
        num_labels: the ``num_labels`` argument (default 3).
    """

    def __init__(self, dropout=0.2, num_labels=3):
        self.num_labels, self.dropout_rate = num_labels, dropout

class T5EncoderForTokenClassification(T5PreTrainedModel):
    """T5 encoder stack followed by dropout and a linear per-token classifier.

    ``forward`` produces one vector of ``num_labels`` logits per input position.
    When ``labels`` are given it also returns a cross-entropy loss computed only
    over positions whose label is not ``-100`` and (if an attention mask is
    supplied) whose attention-mask value is 1.
    """

    def __init__(self, config: T5Config, class_config):
        """Create the shared embedding, the encoder stack and the classifier head.

        Args:
            config: T5 configuration; ``vocab_size``, ``d_model`` and
                ``hidden_size`` are read from it.
            class_config: object exposing ``num_labels`` and ``dropout_rate``.
        """
        super().__init__(config)
        self.num_labels = class_config.num_labels
        self.config = config

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # The stack gets its own config copy so the encoder-only flags do not
        # leak into the config object passed by the caller.
        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        self.dropout = nn.Dropout(class_config.dropout_rate)
        self.classifier = nn.Linear(config.hidden_size, class_config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    def parallelize(self, device_map=None):
        """Spread the encoder blocks over several GPUs.

        Args:
            device_map: mapping handed to the encoder; when ``None`` one is
                built from the number of encoder blocks and the visible CUDA
                devices.
        """
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.classifier = self.classifier.to(self.encoder.first_device)
        self.model_parallel = True

    def deparallelize(self):
        """Undo ``parallelize``: move encoder and classifier back to the CPU."""
        self.encoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        # parallelize() moved the head to a GPU; bring it back with the encoder
        # so the whole model ends up on one device again.
        self.classifier = self.classifier.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        """Return the shared token-embedding module."""
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        """Replace the shared embedding here and inside the encoder."""
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        """Return the encoder stack."""
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # The stack keeps its layers in `block` (see parallelize); the
            # self-attention module is the first sub-layer of each block,
            # matching the path used by T5EncoderModel in transformers.
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the input and classify every position.

        Args:
            input_ids, attention_mask, head_mask, inputs_embeds,
            output_attentions, output_hidden_states: forwarded unchanged to the
                encoder stack.
            labels: optional per-position class indices; ``-100`` marks
                positions excluded from the loss.
            return_dict: when falsy a plain tuple is returned; defaults to
                ``config.use_return_dict``.

        Returns:
            ``TokenClassifierOutput`` (loss, logits, hidden_states, attentions),
            or the tuple ``(loss?, logits, *extra_encoder_outputs)`` when
            ``return_dict`` is falsy.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # No-op when everything is on one device. After parallelize() the head
        # is on encoder.first_device, and the encoder output may be elsewhere
        # -- TODO confirm on a multi-GPU setup.
        sequence_output = sequence_output.to(self.classifier.weight.device)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            flat_logits = logits.view(-1, self.num_labels)
            # Follow the logits' device instead of a hard-coded 'cuda:0' so the
            # model also works on CPU or on a different GPU.
            flat_labels = labels.view(-1).to(flat_logits.device)

            # Positions outside the attention mask never contribute to the
            # loss; without a mask only the -100 labels are skipped.
            if attention_mask is not None:
                active = attention_mask.view(-1).to(flat_logits.device) == 1
                flat_labels = torch.where(
                    active, flat_labels, torch.full_like(flat_labels, -100)
                )

            keep = flat_labels != -100
            loss = loss_fct(flat_logits[keep], flat_labels[keep].long())

        if not return_dict:
            # Everything after the last hidden state (index 0) is an optional
            # extra; the encoder-only stack has no cache entry to skip.
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# ProtT5 Model

In [10]:
def PT5_classification_model(num_labels):
    """Build a token classifier on top of the pretrained ProtT5 encoder.

    The checkpoint's embedding and encoder are plugged into a fresh
    ``T5EncoderForTokenClassification`` and then frozen, so only the remaining
    parameters stay trainable. The trainable-parameter count is printed before
    and after freezing. Note that the second printout is labelled "LoRA"
    although no LoRA adapters are added in this function.

    Args:
        num_labels: number of output classes of the classification head.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    checkpoint = "Rostlab/prot_t5_xl_uniref50"

    def count_trainable(net):
        # Total number of elements over all parameters that require gradients.
        trainable = filter(lambda p: p.requires_grad, net.parameters())
        return sum([np.prod(p.size()) for p in trainable])

    # Pretrained encoder and its tokenizer
    pretrained = T5EncoderModel.from_pretrained(checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    # Classifier skeleton with the checkpoint's dimensions, then swap in the
    # pretrained embedding and encoder modules
    model = T5EncoderForTokenClassification(
        pretrained.config, ClassConfig(num_labels=num_labels)
    )
    model.shared = pretrained.shared
    model.encoder = pretrained.encoder

    print("ProtT5_Classfier\nTrainable Parameter: "+ str(count_trainable(model)))

    # Freeze embedding and encoder weights
    for frozen_part in (model.shared, model.encoder):
        for _, weight in frozen_part.named_parameters():
            weight.requires_grad = False

    print("ProtT5_LoRA_Classfier\nTrainable Parameter: "+ str(count_trainable(model)) + "\n")

    return model, tokenizer

# Training

## Deepspeed config

In [11]:
# DeepSpeed config for optimizer CPU offload (ZeRO stage 2).
# Entries set to "auto" are not fixed here; presumably they are filled in by
# the Hugging Face Trainer integration from TrainingArguments -- confirm.

# Mixed-precision / loss-scaling settings
_fp16_settings = {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1,
}

# AdamW optimizer
_optimizer_settings = {
    "type": "AdamW",
    "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto",
    },
}

# Warm-up learning-rate schedule
_scheduler_settings = {
    "type": "WarmupLR",
    "params": {
        "warmup_min_lr": "auto",
        "warmup_max_lr": "auto",
        "warmup_num_steps": "auto",
    },
}

# ZeRO stage 2 with the optimizer offloaded to pinned CPU memory
_zero_settings = {
    "stage": 2,
    "offload_optimizer": {"device": "cpu", "pin_memory": True},
    "allgather_partitions": True,
    "allgather_bucket_size": 2e8,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": True,
}

ds_config = dict(
    fp16=_fp16_settings,
    optimizer=_optimizer_settings,
    scheduler=_scheduler_settings,
    zero_optimization=_zero_settings,
    gradient_accumulation_steps="auto",
    gradient_clipping="auto",
    steps_per_print=2000,
    train_batch_size="auto",
    train_micro_batch_size_per_gpu="auto",
    wall_clock_breakdown=False,
)

## Training Functions

In [12]:
# Seed every random number generator used by a training run
def set_seeds(s):
    """Apply seed ``s`` to torch, NumPy, ``random`` and the transformers helper, in that order."""
    for seeder in (torch.manual_seed, np.random.seed, random.seed, set_seed):
        seeder(s)

# Dataset creation
def create_dataset(tokenizer, seqs, labels, max_length=1024):
    """Tokenize sequences and attach their per-token label lists.

    Args:
        tokenizer: callable tokenizer; invoked with ``max_length``,
            ``padding=True`` and ``truncation=True``.
        seqs: list of input sequences.
        labels: list of label sequences, one per input sequence.
        max_length: maximum tokenized length, special token included
            (default 1024, the previously hard-coded value).

    Returns:
        A ``Dataset`` holding the tokenizer outputs plus a ``labels`` column.
    """
    tokenized = tokenizer(seqs, max_length=max_length, padding=True, truncation=True)
    dataset = Dataset.from_dict(tokenized)
    # Labels are cut one position short of max_length so the data collator can
    # add the correct padding (max_length - 1 residues + 1 special token).
    labels = [l[:max_length - 1] for l in labels]
    dataset = dataset.add_column("labels", labels)

    return dataset

# Main training function
def train_per_residue(
        train_df,         #training data
        valid_df,         #validation data
        num_labels= 20,    #number of classes

        # effective training batch size is batch * accum
        # we recommend an effective batch size of 8
        batch= 4,         #for training
        accum= 2,         #gradient accumulation

        val_batch = 16,   #batch size for evaluation
        epochs= 10,       #training epochs
        lr= 3e-4,         #recommended learning rate
        seed= 42,         #random seed
        deepspeed= True,  #if gpu is large enough disable deepspeed for training speedup
        gpu= 1 ):         #gpu selection (1 for first gpu)
    """Train the per-residue classifier and return it with its training log.

    Args:
        train_df: training frame with a ``sequence`` column (residues already
            separated by spaces) and a ``label`` column of per-residue lists.
        valid_df: validation frame with the same columns.
        num_labels: number of classes of the classification head.
        batch: per-device batch size; currently used for training AND evaluation.
        accum: gradient accumulation steps.
        val_batch: accepted for compatibility but not used at the moment --
            evaluation runs with ``batch``.
        epochs: number of training epochs.
        lr: learning rate.
        seed: seed applied to all random number generators and the Trainer.
        deepspeed: when true, the module-level ``ds_config`` is handed to the Trainer.
        gpu: 1-based GPU index; ``CUDA_VISIBLE_DEVICES`` is set to ``gpu - 1``.

    Returns:
        Tuple ``(tokenizer, model, log_history)``.

    Raises:
        ValueError: if a label is neither ``-100`` nor inside ``[0, num_labels)``.
    """
    # Set gpu device
    # NOTE(review): presumably only effective if CUDA has not been initialised
    # in this process yet -- confirm.
    os.environ["CUDA_VISIBLE_DEVICES"]=str(gpu-1)

    # Set all random seeds
    set_seeds(seed)

    # Fail fast, before the large checkpoint is loaded: an out-of-range label
    # would otherwise only surface as an opaque error inside the loss.
    _check_label_range(train_df['label'], num_labels)
    _check_label_range(valid_df['label'], num_labels)

    # load model
    model, tokenizer = PT5_classification_model(num_labels=num_labels)

    # Preprocess inputs
    # Replace uncommon AAs with "X". `assign` builds new frames, so the
    # caller's DataFrames (often slices of a larger frame) are left untouched.
    rare_aa_pattern = '|'.join(["O","B","U","Z"])
    train_df = train_df.assign(
        sequence=train_df["sequence"].str.replace(rare_aa_pattern,"X",regex=True)
    )
    valid_df = valid_df.assign(
        sequence=valid_df["sequence"].str.replace(rare_aa_pattern,"X",regex=True)
    )
    # No spacing step here: sequences are expected to arrive with a space
    # between each amino acid.

    # Create Datasets
    train_set=create_dataset(tokenizer,list(train_df['sequence']),list(train_df['label']))
    valid_set=create_dataset(tokenizer,list(valid_df['sequence']),list(valid_df['label']))

    # Huggingface Trainer arguments
    args = TrainingArguments(
        "./",
        evaluation_strategy = "steps",
        eval_steps = 2000,
        logging_strategy = "epoch",
        save_strategy = "steps",
        save_steps=5000,
        learning_rate=lr,
        per_device_train_batch_size=batch,
        # evaluation deliberately uses `batch`, not `val_batch`
        per_device_eval_batch_size=batch,
        gradient_accumulation_steps=accum,
        num_train_epochs=epochs,
        seed = seed,
        deepspeed = ds_config if deepspeed else None,
    )

    # Load the metric once instead of on every evaluation round
    accuracy_metric = load("accuracy")

    # Metric definition for validation data
    def compute_metrics(eval_pred):
        """Accuracy over all positions whose label is not -100."""
        predictions, labels = eval_pred

        labels = labels.reshape((-1,))

        predictions = np.argmax(predictions, axis=2)
        predictions = predictions.reshape((-1,))

        scored = labels != -100
        return accuracy_metric.compute(predictions=predictions[scored], references=labels[scored])

    # For token classification we need a data collator here to pad correctly
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Trainer
    trainer = Trainer(
        model,
        args,
        train_dataset=train_set,
        eval_dataset=valid_set,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train model
    trainer.train()

    return tokenizer, model, trainer.state.log_history


def _check_label_range(label_rows, num_labels, ignore_index=-100):
    """Raise ValueError unless every label is ``ignore_index`` or in ``[0, num_labels)``."""
    for row_idx, row in enumerate(label_rows):
        for value in row:
            if value != ignore_index and not 0 <= value < num_labels:
                raise ValueError(
                    f"label {value} in row {row_idx} is outside [0, {num_labels}) "
                    f"and is not the ignore index {ignore_index}"
                )

In [35]:
# Sanity check: encode a short spaced sequence (including a sentinel token)
# and display its ids without the last one.
temp = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")
encoded_demo = temp("A A B S E F <extra_id_0>", padding=False)
encoded_demo['input_ids'][:-1]

[3, 3, 24, 7, 9, 15, 127]

# Train

In [13]:
! pip install -U accelerate
! pip install -U transformers



In [14]:
!pip install deepspeed



In [15]:
# Launch one epoch of training without DeepSpeed on the first GPU.
run_options = dict(
    num_labels=20,
    batch=2,
    accum=2,
    epochs=1,
    seed=42,
    gpu=1,
    deepspeed=False,
)
tokenizer, model, history = train_per_residue(my_train, my_valid, **run_options)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


ProtT5_Classfier
Trainable Parameter: 1208162324
ProtT5_LoRA_Classfier
Trainable Parameter: 20500



Step,Training Loss,Validation Loss,Accuracy
2000,No log,0.023736,0.0
4000,No log,0.005527,0.0
6000,No log,0.001849,0.0
8000,No log,0.00073,0.0
10000,No log,0.000319,0.0
12000,No log,0.00016,0.0
14000,No log,8.8e-05,0.0
16000,No log,5.3e-05,0.0
18000,No log,3.5e-05,0.0
20000,No log,2.4e-05,0.0


PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]




PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]
PREDICTIONS [12  6  6 ...  1  1  1]
LABELS [-100 -100 -100 ... -100 -100 -100]


KeyboardInterrupt: ignored