<a href="https://colab.research.google.com/github/AshandPeach/NLP/blob/main/nlp_AAPD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# point below this mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [74]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig

In [75]:
def apply_to_dict_values(dict, f):
    """Overwrite every value of a mapping, in place, with ``f(value)``.

    Args:
        dict: mutable mapping exposing ``items()`` and item assignment.
            (The parameter name shadows the builtin; it is kept so existing
            callers are unaffected.)
        f: callable applied to each value; its result replaces that value.

    Returns:
        None. The mapping passed in is modified; keys are left unchanged.
    """
    # Only existing keys are reassigned, so the mapping's size never changes
    # while it is being iterated.
    for name, current in dict.items():
        replacement = f(current)
        dict[name] = replacement

In [76]:
class AAPDDataset(Dataset):
    """AAPD dataset read from a headerless, tab-separated file.

    Column 0 of each row is treated as the label string (one digit per class,
    presumably ``0``/``1``) and column 1 as the document text.  Indexing yields
    ``(encoding, target)``: the tokenizer output with every tensor flattened,
    and a float tensor holding one value per label.
    """

    def __init__(self, path):
        """Load the table at ``path`` and the tokenizer named by ``BERT_TYPE``.

        ``BERT_TYPE`` is a notebook-level constant, so it must be defined
        before the first dataset is instantiated.
        """
        self.path = path
        self.data = AAPDDataset._read_table(self.path)
        self.tokenizer = BertTokenizer.from_pretrained(BERT_TYPE)

    @staticmethod
    def _read_table(path):
        """Read the TSV into a DataFrame, keeping the label column as text."""
        # Without an explicit dtype pandas infers the column type, and a label
        # such as "0010" can be parsed as the integer 10: the leading zeros are
        # lost and the value is no longer iterable digit by digit.
        return pd.read_csv(path, sep='\t', header=None, dtype={0: str})

    def __len__(self):
        """Number of rows (documents) in the table."""
        return self.data.shape[0]

    @staticmethod
    def target_to_tensor(target):
        """Turn a label string into a float tensor with one entry per character."""
        return torch.tensor([float(label) for label in target])

    def __getitem__(self, idx):
        """Return ``(encoding, target)`` for the row at position ``idx``."""
        text = self.data.iloc[idx, 1]
        labels = self.data.iloc[idx, 0]
        # Every text is padded or truncated to exactly 512 tokens.
        encoding = self.tokenizer(text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
        # Flatten each returned tensor to 1-D (presumably dropping a leading
        # axis of size 1 added by return_tensors="pt") so that a DataLoader can
        # stack the samples of a batch.
        apply_to_dict_values(encoding, lambda x: x.flatten())
        return encoding, AAPDDataset.target_to_tensor(labels)

In [77]:
# Name of the pretrained checkpoint; read by AAPDDataset for its tokenizer and
# by the cell that loads the BERT configuration and encoder.
BERT_TYPE = 'bert-base-uncased'

In [125]:
import pandas as pd

In [None]:
# percentage=0.1

In [136]:
# Build one dataset per split from the TSV files stored on the mounted Drive.
# Each constructor call reads its file and loads the tokenizer.
train_dataset = AAPDDataset('/content/drive/MyDrive/nlp/AAPD/train.tsv')
val_dataset = AAPDDataset('/content/drive/MyDrive/nlp/AAPD/validation.tsv')
test_dataset = AAPDDataset('/content/drive/MyDrive/nlp/AAPD/test.tsv')

In [None]:
# Inspect one item: a (tokenizer encoding, label tensor) pair.
test_dataset[0]

In [141]:
# Mini-batch size used by the data loaders.
BATCH_SIZE = 256
# Number of labels per document, taken from the length of one sample's target tensor.
N_CLASSES = test_dataset[0][1].shape[0]
N_CLASSES

54

In [81]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [146]:
# train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [148]:
# Smoke-test switch: while True, the "training" loader is fed from the test
# split (the setup used for the recorded run, 4 batches per epoch) so an epoch
# finishes quickly. Set it to False to train on the real training split.
USE_TEST_SPLIT_FOR_SMOKE_RUN = True

train_source = test_dataset if USE_TEST_SPLIT_FOR_SMOKE_RUN else train_dataset
train_dataloader = DataLoader(train_source, batch_size=BATCH_SIZE, shuffle=True)
# Validation metrics are aggregated over the whole split, so shuffling only
# adds run-to-run noise; keep the order fixed.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [83]:
# Load the configuration and weights of the pretrained encoder named by BERT_TYPE.
config = BertConfig.from_pretrained(BERT_TYPE)
# The classifier's forward pass reads `.pooler_output` from the encoder's
# result, which relies on the attribute-style output requested here.
config.return_dict = True
bert = BertModel.from_pretrained(BERT_TYPE, config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
import torch.nn as nn

class MultiLabelBert(nn.Module):
    """Frozen encoder followed by a single trainable linear layer.

    The encoder's pooled output is mapped to ``n_classes`` raw logits (no
    sigmoid is applied here).

    Args:
        bert_model: encoder module exposing ``pooler.dense.out_features`` and
            returning an object with a ``pooler_output`` attribute when called.
            Its parameters are frozen in place, which also affects every other
            holder of the same encoder object.
        n_classes: number of output logits.

    NOTE(review): freezing only disables gradients; the encoder still follows
    this module's train()/eval() mode, so any dropout inside it is presumably
    active during training -- confirm that this is intended.
    """

    def __init__(self, bert_model, n_classes):
        super().__init__()
        self.bert_model = bert_model
        # Freeze the encoder so that only the linear head receives gradients.
        for weight in self.bert_model.parameters():
            weight.requires_grad = False
        pooled_width = bert_model.pooler.dense.out_features
        self.n_bert_features = pooled_width
        self.n_classes = n_classes
        self.dense = nn.Linear(pooled_width, n_classes)

    def forward(self, inputs):
        """Return logits for a mapping of keyword arguments accepted by the encoder."""
        encoder_out = self.bert_model(**inputs)
        pooled = encoder_out.pooler_output
        return self.dense(pooled)

In [85]:
# Wrap the pretrained encoder with an N_CLASSES-way linear head and move it to the selected device.
model = MultiLabelBert(bert, N_CLASSES).to(device)

In [86]:
import torch.optim as optim

# Step size passed to the Adam optimizer that trains the classifier head.
LEARNING_RATE = 1e-4

In [91]:
# Sanity check on a single batch: `a` collects the true label rows and `b` the
# thresholded predictions, both as plain Python lists.
# NOTE(review): `a` and `b` are read by the metric cell that follows, so
# renaming them requires updating that cell as well.
a = []
b = []
batch, target = next(iter(train_dataloader))
a.extend(target.to('cpu').tolist())
# Move every tensor of the batch to the model's device (mutates `batch` in place).
apply_to_dict_values(batch, lambda x: x.to(device))
# Sigmoid followed by a 0.5 threshold turns each logit into a 0/1 prediction.
b.extend((torch.sigmoid(model(batch)) > 0.5).type(torch.DoubleTensor).to('cpu').tolist())
# Shape of the prediction list: (samples in the batch, labels per sample).
len(b), len(b[0])

(64, 54)

In [92]:
from sklearn.metrics import hamming_loss

# The value computed here is the Hamming loss (fraction of labels predicted
# incorrectly), not an F1 score, so label it accordingly.
print("Hamming loss:", hamming_loss(a, b))

F1 micro averaging: 0.6145833333333334


In [93]:
# from torch.utils.tensorboard import SummaryWriter

In [94]:
# writer = SummaryWriter('/content/drive/MyDrive/nlp/logs')

In [149]:
import tqdm
from sklearn.metrics import f1_score, hamming_loss

def _run_epoch(model, dataloader, criterion, optimizer, description, keep_bar):
    """Make one pass over ``dataloader`` and return ``(mean batch loss, micro F1, Hamming loss)``.

    Weights are updated after every batch when ``optimizer`` is not None;
    with ``optimizer=None`` the pass only measures.  The caller is responsible
    for the train/eval mode and for disabling gradients during evaluation.
    """
    # Follow the model's own device instead of relying on a notebook global.
    run_device = next(model.parameters()).device

    loss_sum = 0.
    all_targets = []
    all_preds = []
    for batch, targets in tqdm.tqdm(dataloader, description, leave=keep_bar):
        apply_to_dict_values(batch, lambda x: x.to(run_device))
        targets = targets.to(run_device)
        logits = model(batch)
        loss = criterion(logits, targets)

        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        loss_sum += loss.item()
        all_targets.extend(targets.to('cpu').tolist())
        # Sigmoid followed by a 0.5 threshold gives a 0/1 decision per label.
        all_preds.extend((torch.sigmoid(logits) > 0.5).type(torch.DoubleTensor).to('cpu').tolist())

    # Average of the per-batch losses (batches are weighted equally).
    mean_loss = loss_sum / len(dataloader)
    micro_f1 = f1_score(all_targets, all_preds, average='micro')
    hamming = hamming_loss(all_targets, all_preds)
    return mean_loss, micro_f1, hamming


def train_model(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        criterion,
        n_epochs):
    """Train ``model`` for ``n_epochs`` epochs, validating after each one.

    Each epoch prints the mean batch loss, micro-averaged F1 and Hamming loss
    for the training pass and then for the validation pass.

    Args:
        model: module called with a batch mapping; returns logits.
        train_dataloader: yields ``(batch mapping, targets)`` pairs for training.
        val_dataloader: yields ``(batch mapping, targets)`` pairs for validation.
        optimizer: optimizer that updates the trainable parameters.
        criterion: loss called as ``criterion(logits, targets)``.
        n_epochs: number of epochs to run.

    Returns:
        A list with one dict per epoch holding the printed metrics.  When this
        call is the last expression of a notebook cell the list is displayed
        as the cell's output.
    """
    history = []
    for epoch in range(1, n_epochs+1):
        model.train()
        train_loss, train_f1, train_hamming = _run_epoch(
            model, train_dataloader, criterion, optimizer, f"Train epoch#{epoch}", True)
        print("ave_Train_loss", train_loss)
        print("Train F1 (micro)", train_f1)
        print("Train Hamming loss", train_hamming)

        model.eval()
        with torch.no_grad():
            val_loss, val_f1, val_hamming = _run_epoch(
                model, val_dataloader, criterion, None, f"Val epoch#{epoch}", False)
        print("Validation loss", val_loss)
        print("Validation F1 (micro)", val_f1)
        print("Validation Hamming loss", val_hamming)

        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "train_f1_micro": train_f1,
            "train_hamming_loss": train_hamming,
            "val_loss": val_loss,
            "val_f1_micro": val_f1,
            "val_hamming_loss": val_hamming,
        })
    return history

In [150]:
# Start from a fresh classifier head; the `bert` encoder object itself is reused.
model = MultiLabelBert(bert, N_CLASSES).to(device)
# Hand the optimizer only the parameters that are still trainable.
optimizer = optim.Adam(params=[p for p in model.parameters() if p.requires_grad], lr=LEARNING_RATE)
# Train for 10 epochs with BCEWithLogitsLoss as the criterion.
train_model(model, train_dataloader, val_dataloader, optimizer, torch.nn.BCEWithLogitsLoss(), 10)

Train epoch#1: 100%|██████████| 4/4 [00:40<00:00, 10.23s/it]


ave_Train_loss 0.6688603013753891
Train F1 (micro) 0.08407587119541243
Train Hamming loss 0.38451851851851854




Validation loss 0.6231765449047089
Validation F1 (micro) 0.08529210050501901
Validation Hamming loss 0.2716851851851852


Train epoch#2: 100%|██████████| 4/4 [00:41<00:00, 10.38s/it]


ave_Train_loss 0.6083783507347107
Train F1 (micro) 0.0726465109614558
Train Hamming loss 0.2397037037037037




Validation loss 0.563126266002655
Validation F1 (micro) 0.06115965051628276
Validation Hamming loss 0.1751111111111111


Train epoch#3: 100%|██████████| 4/4 [00:41<00:00, 10.34s/it]


ave_Train_loss 0.5525629073381424
Train F1 (micro) 0.06389010030527693
Train Hamming loss 0.159




Validation loss 0.509456068277359
Validation F1 (micro) 0.06361323155216284
Validation Hamming loss 0.12266666666666666


Train epoch#4: 100%|██████████| 4/4 [00:41<00:00, 10.38s/it]


ave_Train_loss 0.5031819641590118
Train F1 (micro) 0.06530127634312852
Train Hamming loss 0.11662962962962962




Validation loss 0.46235212683677673
Validation F1 (micro) 0.0562968378724182
Validation Hamming loss 0.0956111111111111


Train epoch#5: 100%|██████████| 4/4 [00:41<00:00, 10.38s/it]


ave_Train_loss 0.46050839871168137
Train F1 (micro) 0.05758454106280193
Train Hamming loss 0.09031481481481482




Validation loss 0.4211605563759804
Validation F1 (micro) 0.036806883365200764
Validation Hamming loss 0.07462962962962963


Train epoch#6: 100%|██████████| 4/4 [00:42<00:00, 10.55s/it]


ave_Train_loss 0.4225640222430229
Train F1 (micro) 0.02666666666666667
Train Hamming loss 0.06894444444444445




Validation loss 0.38549066334962845
Validation F1 (micro) 0.013268998793727381
Validation Hamming loss 0.060592592592592594


Train epoch#7: 100%|██████████| 4/4 [00:42<00:00, 10.66s/it]


ave_Train_loss 0.3889523297548294
Train F1 (micro) 0.007389989922741014
Train Hamming loss 0.05472222222222222




Validation loss 0.35490595549345016
Validation F1 (micro) 0.00939306358381503
Validation Hamming loss 0.050777777777777776


Train epoch#8: 100%|██████████| 4/4 [00:41<00:00, 10.39s/it]


ave_Train_loss 0.3597874864935875
Train F1 (micro) 0.007527286413248024
Train Hamming loss 0.04883333333333333




Validation loss 0.3288890868425369
Validation F1 (micro) 0.006235385814497273
Validation Hamming loss 0.04722222222222222


Train epoch#9: 100%|██████████| 4/4 [00:41<00:00, 10.39s/it]


ave_Train_loss 0.33521369099617004
Train F1 (micro) 0.0031733439111463714
Train Hamming loss 0.046537037037037036




Validation loss 0.3063978999853134
Validation F1 (micro) 0.006274509803921569
Validation Hamming loss 0.046925925925925926


Train epoch#10: 100%|██████████| 4/4 [00:41<00:00, 10.38s/it]


ave_Train_loss 0.31433022022247314
Train F1 (micro) 0.005443234836702954
Train Hamming loss 0.04737037037037037


                                                           

Validation loss 0.28740374743938446
Validation F1 (micro) 0.005505308690523005
Validation Hamming loss 0.04683333333333333


