In [6]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import io
import os
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib 
# matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd

%pip install tqdm

from tqdm import tqdm

import tensorflow as tf


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
if torch.cuda.is_available():
    device = torch.device(0)
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [9]:
import zipfile
with zipfile.ZipFile("./ChemProt.zip","r") as zip_ref:
    zip_ref.extractall("./Data")

In [10]:
model_name = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
print(tokenizer)

BertTokenizerFast(name_or_path='dmis-lab/biobert-base-cased-v1.2', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [12]:
task_name = "chemprot"
data_dir = "./Data/ChemProt/Contrastive/"
model_type = "cl_pretraining"
max_seq_le = 128
batch_size = 64

In [13]:
class DataContrastive(object):
    def _read_tsv(clm, input_file):
        data = []
        with open(input_file, 'r') as f:
            for line in f:
                fields = line.strip().split('\t')
                data.append([fields[0], fields[1]])
                data.append([fields[0], fields[2]])
        return data

In [14]:
class BioBERTChemprotProcessor(DataContrastive):
    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")))
    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
    def get_labels(self):
        return ["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9", "false"]
    def _create_examples(self, lines):
        examples = []
        for (i, line) in tqdm(enumerate(lines)):
            inputs = tokenizer(line[1], padding = 'max_length', truncation=True, max_length = max_seq_le, return_tensors = 'pt')
            label = torch.tensor(0)
            examples.append([inputs['input_ids'], inputs['attention_mask'], label])
        return examples
    

In [15]:
class ChemProtContrastivePreTraining(nn.Module):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        
        self.projection_head = nn.Sequential(
            nn.Linear(model.config.hidden_size, model.config.hidden_size),
            nn.ReLU(),
            nn.Linear(model.config.hidden_size, model.config.hidden_size)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :]
        projections = self.projection_head(embeddings)
        return projections
    

    

In [16]:
processors = {
      "chemprot": BioBERTChemprotProcessor,
}
models = {
    "cl_pretraining": ChemProtContrastivePreTraining
}
processor = processors[task_name]()
label_list = processor.get_labels()
train_contrastive = processor.get_train_examples(data_dir)
model_fn = models[model_type]

print("Task: " + str(task_name))
print("List label: ")
print(label_list)
print(train_contrastive[0])

71000it [00:41, 1724.46it/s]


Task: chemprot
List label: 
['CPR:3', 'CPR:4', 'CPR:5', 'CPR:6', 'CPR:9', 'false']
[tensor([[  101,  1106,  1233,  2497, 21919,  1179,  1110,   170, 14930,   137,
          5297,   109,   170,  1964,  1643,   114,   191,   113,   123,   114,
         10814,  3510,  1200,  1215,  1106, 21497,  1714,  1447,  4267, 10374,
          1548,  1107,  1103,  3252,  1104,   174,  1358,  6005, 14183,  1596,
          1137,   177, 24312,  6005, 14183,  1596,   177,  1183,  5674, 24226,
         16996,  1465,   119,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,    

### 

In [17]:
from torch.nn.utils.rnn import pad_sequence

max_seq_len_pp = 128

def collate_fn(batch):
    

    inputs = [item[0] for item in batch]
    attn_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    
    max_seq_len = max(max_seq_len_pp, max([int(seq.size(1)) for seq in inputs]))
    inputs = [torch.nn.functional.pad(seq, (0, max_seq_len - seq.size(1)), value=0) for seq in inputs]
    attn_masks = [torch.nn.functional.pad(seq, (0, max_seq_len - seq.size(1)), value=0) for seq in attn_masks]
    
    inputs = pad_sequence(inputs, batch_first=True)
    attn_masks = pad_sequence(attn_masks, batch_first=True)
    labels = torch.stack(labels)
    
    return inputs, attn_masks, labels

In [18]:
train_contrastive_dataloader = DataLoader(train_contrastive, batch_size=batch_size, shuffle=True, collate_fn = collate_fn)

In [19]:
for inputs, atention_mask, labels in train_contrastive_dataloader: 
    print(inputs)
    print(atention_mask)
    print(labels)
    break

tensor([[[  101,  1113,  1103,  ...,     0,     0,     0]],

        [[  101,  3336, 17030,  ...,     0,     0,     0]],

        [[  101,  2456,   117,  ...,     0,     0,     0]],

        ...,

        [[  101,  1195,  1276,  ...,     0,     0,     0]],

        [[  101,  1103,  2281,  ...,     0,     0,     0]],

        [[  101,  1165,  1821,  ...,   117,   174,   102]]])
tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1]]])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [20]:
model = model_fn(model, tokenizer)
model_run = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)



In [21]:
def cal_loss(output_layer): 

        cosine_sim_1d = tf.keras.losses.CosineSimilarity(axis=1,)
        cosine_sim_2d = tf.keras.losses.CosineSimilarity(axis=2,)


        def _cosine_simililarity_dim1(x, y):
            v = cosine_sim_1d(x, y)
            return v


        def _cosine_simililarity_dim2(x, y):
            v = cosine_sim_2d(tf.expand_dims(x, 1), tf.expand_dims(y, 0))
            return v


        def _dot_simililarity_dim1(x, y):
            v = tf.matmul(tf.expand_dims(x, 1), tf.expand_dims(y, 2))
            return v


        def _dot_simililarity_dim2(x, y):
            v = tf.tensordot(tf.expand_dims(x, 1), tf.expand_dims(tf.transpose(y), 0), axes=2)
            return v

        def get_negative_mask(batch_size):
            negative_mask = np.ones((batch_size, 2 * batch_size), dtype=bool)
            for i in range(batch_size):
                negative_mask[i, i] = 0
                negative_mask[i, i + batch_size] = 0
            return tf.constant(negative_mask)
        
        criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,)
        negative_mask = get_negative_mask(int(batch_size/2))
        output_layer = output_layer.cpu()
        output_layer = output_layer.detach().numpy()
        zis = output_layer[::2] #z0 z2 z4
        zjs = output_layer[1::2] # z1 z3 z5

        zis = tf.math.l2_normalize(zis, axis=1)
        zjs = tf.math.l2_normalize(zjs, axis=1)
        l_pos = _dot_simililarity_dim1(zis, zjs)
        l_pos = tf.reshape(l_pos, (int(batch_size/2), 1))
        l_pos /= 0.1

        negatives = tf.concat([zjs, zis], axis=0)
        loss = 0

        for positives in [zis, zjs]:
          l_neg = _dot_simililarity_dim2(positives, negatives)

          labels = tf.zeros(int(batch_size/2), dtype=tf.int32)

          l_neg = tf.boolean_mask(l_neg, negative_mask)
          l_neg = tf.reshape(l_neg, (int(batch_size/2), -1))
          l_neg /= 0.1
          # assert l_neg.shape == (
          #     config['batch_size'], 2 * (config['batch_size'] - 1)), "Shape of negatives not expected." + str(
          #     l_neg.shape)
          logits = tf.concat([l_pos, l_neg], axis=1)  # [N,K+1]
          loss += criterion(y_pred=logits, y_true=labels)
          #print(loss)
        
        loss_tf = loss / (batch_size)
        #print("Done")
        #print(loss_tf)
        loss_np = loss_tf.numpy()
        loss_torch = torch.tensor(loss_np, dtype = torch.float32,requires_grad=True)

        return loss_torch


In [22]:
def ContrastiveTrain(model, optimizer, train_loader, num_epochs):

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        model.train()
        running_loss = 0.0
        print('Training Contrastive ...')
        for inputs, attention_mask, labels in tqdm(train_loader):
            if len(inputs) != batch_size:
              break
            optimizer.zero_grad()
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            inputs = inputs.squeeze()
            attention_mask = attention_mask.squeeze()
            labels = labels.squeeze()
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            loss = cal_loss(outputs)
            
            loss.backward()
            optimizer.step()
    return model
            

In [23]:
model_extra_pretrain = ContrastiveTrain(model_run, optimizer, train_contrastive_dataloader, 5)

Epoch 1/5
Training Contrastive ...


100%|█████████▉| 1109/1110 [08:48<00:00,  2.10it/s]


Epoch 2/5
Training Contrastive ...


100%|█████████▉| 1109/1110 [08:49<00:00,  2.10it/s]


Epoch 3/5
Training Contrastive ...


100%|█████████▉| 1109/1110 [08:48<00:00,  2.10it/s]


Epoch 4/5
Training Contrastive ...


100%|█████████▉| 1109/1110 [08:48<00:00,  2.10it/s]


Epoch 5/5
Training Contrastive ...


100%|█████████▉| 1109/1110 [08:48<00:00,  2.10it/s]


# Get datasets

In [24]:
torch.save({
            'epoch': 10,
            'model_state_dict': model_extra_pretrain.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, 'Model.pt')


In [25]:
train_data = pd.read_csv('./Data/ChemProt/Main/train.tsv', sep="\t")

dev_data = pd.read_csv('./Data/ChemProt/Main/dev.tsv', sep="\t")

test_data = pd.read_csv('./Data/ChemProt/Main/test.tsv', sep="\t")


train_data_aug = pd.read_csv('./Data/ChemProt/Aug/train.tsv', sep="\t")

dev_data_aug = pd.read_csv('./Data/ChemProt/Aug/dev.tsv', sep="\t")

test_data_aug = pd.read_csv('./Data/ChemProt/Aug/test.tsv', sep="\t")

In [26]:
train_data = train_data.dropna(subset=["index", "sentence", "label"])
dev_data = dev_data.dropna(subset=["index", "sentence", "label"])
test_data = test_data.dropna(subset=["index", "sentence", "label"])

train_data_aug = train_data_aug.dropna(subset=["index", "sentence", "label"])
dev_data_aug = dev_data_aug.dropna(subset=["index", "sentence", "label"])
test_data_aug = test_data_aug.dropna(subset=["index", "sentence", "label"])


In [27]:
labels = ['false', 'CPR:3', 'CPR:4', 'CPR:5', 'CPR:6', 'CPR:9']
label_map = {label: i for i, label in enumerate(labels)}

In [28]:
class ChemProtDataset(Dataset):
    def __init__(self, data, tokenizer, label_map):
        self.data = data
        self.tokenizer = tokenizer
        self.label_map = label_map
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, index):
        sentence = self.data.iloc[index]['sentence']
        label = self.data.iloc[index]['label']
        label_id = self.label_map[label]
        inputs = self.tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=256)
        return inputs['input_ids'], inputs['attention_mask'], torch.tensor(label_id, dtype=torch.long)

In [29]:
train_dataset = ChemProtDataset(train_data, tokenizer, label_map)
dev_dataset = ChemProtDataset(dev_data, tokenizer, label_map)
test_dataset = ChemProtDataset(test_data, tokenizer, label_map)

train_dataset_aug = ChemProtDataset(train_data_aug, tokenizer, label_map)
dev_dataset_aug = ChemProtDataset(dev_data_aug, tokenizer, label_map)
test_dataset_aug = ChemProtDataset(test_data_aug, tokenizer, label_map)

print("Origin --")
print("Length of Train Dataset: " + str(len(train_dataset)))
print("Length of Dev-set: "+ str(len(dev_dataset)))
print("Length of Tests: " + str(len(test_dataset)))
print('\n')
print("Aug --")
print("Length of Train Dataset: " + str(len(train_dataset_aug)))
print("Length of Dev-set: "+ str(len(dev_dataset_aug)))
print("Length of Tests: " + str(len(test_dataset_aug)))

Origin --
Length of Train Dataset: 18035
Length of Dev-set: 11268
Length of Tests: 15745


Aug --
Length of Train Dataset: 18035
Length of Dev-set: 11268
Length of Tests: 31490


In [30]:
from torch.nn.utils.rnn import pad_sequence

max_seq_len_pp = 128

def collate_fn(batch):
    inputs = [item[0] for item in batch]
    attn_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    
    # Find maximum length of sequences along the second dimension
    max_seq_len = max(max_seq_len_pp, max([seq.size(1) for seq in inputs]))
    
    # Pad sequences with zeros to maximum length
    inputs = [torch.nn.functional.pad(seq, (0, max_seq_len - seq.size(1)), value=0) for seq in inputs]
    attn_masks = [torch.nn.functional.pad(seq, (0, max_seq_len - seq.size(1)), value=0) for seq in attn_masks]
    
    # Stack padded sequences
    inputs = pad_sequence(inputs, batch_first=True)
    attn_masks = pad_sequence(attn_masks, batch_first=True)
    labels = torch.stack(labels)
    
    return inputs, attn_masks, labels

In [31]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=32, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

train_dataloader_aug = DataLoader(train_dataset_aug, batch_size=32, shuffle=True, collate_fn=collate_fn)
dev_dataloader_aug = DataLoader(dev_dataset_aug, batch_size=32, collate_fn=collate_fn)
test_dataloader_aug = DataLoader(test_dataset_aug, batch_size=32, collate_fn=collate_fn)

# Augmented evaluation datasets

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
def compute_metrics(labels, preds):
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="micro", labels=[0, 1, 2, 3, 4]
    )
    return {"micro_precision": precision, "micro_recall": recall, "micro_f1": f1}

In [33]:
optimizer = AdamW(model_run.parameters(), lr=2e-5, eps=1e-8)
criterion = nn.CrossEntropyLoss()


def train(model, optimizer, criterion, train_loader, val_loader, test_loader, num_epochs):
    print("HMM")
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        model.train()
        train_loss = 0.0
        print('Traininggg ...')
        for inputs, attention_mask, labels in tqdm(train_loader):
            optimizer.zero_grad()
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            inputs = inputs.squeeze()
            attention_mask = attention_mask.squeeze()
            labels = labels.squeeze()
            
            outputs = model(input_ids=inputs, attention_mask=attention_mask)

            loss = criterion(outputs,labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * inputs.size(0)
        epoch_loss = train_loss / len(train_loader.dataset)
        print("Train loss: " + str(epoch_loss))
            
            
    
    model.eval()
    running_loss = 0.0
    val_true = []
    val_pred = []
    with torch.no_grad():
        for inputs, attention_mask, labels in tqdm(val_loader):
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            inputs = inputs.squeeze()
            attention_mask = attention_mask.squeeze()
            labels = labels.squeeze()

            outputs = model(input_ids=inputs, attention_mask=attention_mask)

            loss = criterion(outputs,labels)

            running_loss += loss.item() * inputs.size(0)
            pred = torch.argmax(outputs, dim=1)
            val_true.extend(labels.cpu().numpy().tolist())
            val_pred.extend(pred.cpu().numpy().tolist())
    eval_loss = running_loss / len(val_loader.dataset)
    precision, recall, f1, _ = precision_recall_fscore_support(val_true, val_pred, average='micro', labels=[0, 1, 2, 3, 4])
    print("Eval loss: ", eval_loss) 
    print("Micro Precision:", precision)
    print("Micro Recall:", recall)
    print("Micro F1 Score:", f1)
    print('\nTest ...')

    test_loss = 0.0
    test_true = []
    test_pred = []
    with torch.no_grad():
        for inputs, attention_mask, labels in tqdm(test_loader):
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            inputs = inputs.squeeze(1)
            attention_mask = attention_mask.squeeze(1)

            outputs = model(input_ids=inputs, attention_mask=attention_mask)

            loss = criterion(outputs,labels)

            running_loss += loss.item() * inputs.size(0)
            pred = torch.argmax(outputs, dim=1)
            test_true.extend(labels.cpu().numpy().tolist())
            test_pred.extend(pred.cpu().numpy().tolist())
    test_loss = running_loss / len(val_loader.dataset)
    precision, recall, f1, _ = precision_recall_fscore_support(test_true, test_pred, average='micro', labels=[0, 1, 2, 3, 4])
    print("Eval loss: ", test_loss) 
    print("Micro Precision:", precision)
    print("Micro Recall:", recall)
    print("Micro F1 Score:", f1)
    print('\nTest ...')
    
    
    return model





In [34]:
model = train(model_run, optimizer, criterion, train_dataloader_aug, dev_dataloader_aug, test_dataloader_aug, num_epochs=8)


HMM
Epoch 1/8
Traininggg ...


100%|██████████| 564/564 [08:42<00:00,  1.08it/s]


Train loss: 0.9115354691412365
Epoch 2/8
Traininggg ...


100%|██████████| 564/564 [08:42<00:00,  1.08it/s]


Train loss: 0.2248274740962596
Epoch 3/8
Traininggg ...


100%|██████████| 564/564 [08:44<00:00,  1.08it/s]


Train loss: 0.1393579354453424
Epoch 4/8
Traininggg ...


100%|██████████| 564/564 [08:44<00:00,  1.08it/s]


Train loss: 0.0959032511856538
Epoch 5/8
Traininggg ...


100%|██████████| 564/564 [08:39<00:00,  1.09it/s]


Train loss: 0.07115332623755201
Epoch 6/8
Traininggg ...


100%|██████████| 564/564 [08:41<00:00,  1.08it/s]


Train loss: 0.05295751909424567
Epoch 7/8
Traininggg ...


100%|██████████| 564/564 [08:40<00:00,  1.08it/s]


Train loss: 0.04708058533333872
Epoch 8/8
Traininggg ...


100%|██████████| 564/564 [08:42<00:00,  1.08it/s]


Train loss: 0.040905041344688975


100%|██████████| 353/353 [01:25<00:00,  4.14it/s]


Eval loss:  0.4479942017119645
Micro Precision: 0.9073366272023865
Micro Recall: 0.9002867449819628
Micro F1 Score: 0.9037979385272541

Test ...


100%|██████████| 985/985 [04:09<00:00,  3.95it/s]

Eval loss:  1.9122408981748396
Micro Precision: 0.8985901476198375
Micro Recall: 0.8967754750711779
Micro F1 Score: 0.8976818942519593

Test ...





IndentationError: ignored