In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import model_selection, metrics
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AdamW, get_linear_schedule_with_warmup, BertTokenizerFast, BertPreTrainedModel, BertModel, BertConfig
from tqdm.autonotebook import tqdm
# from torch.utils.data.sampler import WeightedRandomSampler

In [2]:
# from ipywidgets import IntProgress

In [3]:
# Load the raw training data.
df = pd.read_csv('../data/can_train_ns.csv')

In [4]:
# Show the column names exactly as they were read from the CSV.
df.columns

Index(['TEXT', 'LABEL'], dtype='object')

In [5]:
# Switch to lower-case column names and store the labels as integers.
df.columns = ['text', 'label']
df['label'] = df['label'].astype(int)

In [6]:
# Shuffle the rows once with a fixed seed, then record for every row the index
# of the stratified fold in which that row serves as validation data.
df["kfold"] = -1
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
kf = model_selection.StratifiedKFold(n_splits=5)

fold_splits = kf.split(X=df, y=df.label.values)
for fold, (_trn, val_) in enumerate(fold_splits):
    # Only the validation indices are needed; the training rows of a fold are
    # simply all rows whose kfold value differs.
    df.loc[val_, 'kfold'] = fold

# df.to_csv('train.csv', index=False)

In [7]:
class AverageMeter:
    """
    Tracks the most recent value and the running weighted average of a metric.

    Attributes:
        val: the value passed to the latest ``update`` call.
        sum: weighted sum of all values seen since the last reset.
        count: total weight (e.g. number of samples) seen since the last reset.
        avg: ``sum / count``; stays ``0`` while ``count`` is zero.
    """

    def __init__(self):
        # Delegate to reset() so the initial state is defined in one place.
        self.reset()

    def reset(self):
        """Clear every accumulated statistic."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """
        Record a new observation.

        Args:
            val: the observed value (for example a per-batch mean).
            n: weight of the observation (for example the batch size).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty first batch) used to raise
        # ZeroDivisionError; keep the average at 0 until something is counted.
        self.avg = self.sum / self.count if self.count else 0

In [8]:
# Hyper-parameters and paths shared by the cells below.
MAX_LEN = 512  # max_length every text is truncated/padded to in CBDDataset
TRAIN_BATCH_SIZE = 32  # batch size of the training DataLoader built in run()
VALID_BATCH_SIZE = 32  # batch size of the validation and test DataLoaders
EPOCHS = 2  # number of train/evaluate rounds per fold in run()
LR = 3e-5  # learning rate handed to AdamW in run()
# WARMUP_STEPS = 30
BERT_PATH = '../src/load_models/pubmedbert'  # path given to from_pretrained for both tokenizer and model

In [9]:
# Tokenizer shared by every CBDDataset instance. Truncation is an option of the
# encoding call (CBDDataset.__getitem__ passes truncation=True there), not of
# loading, so it is no longer passed to from_pretrained.
TOKENIZER = BertTokenizer.from_pretrained(BERT_PATH)

In [10]:
class CBDDataset:
    """
    Map-style dataset that tokenizes one text per item for BERT classification.

    Args:
        text: indexable sequence of strings.
        label: indexable sequence of integer class labels, aligned with ``text``.
        tokenizer: callable used to encode a text; defaults to the module-level
            ``TOKENIZER`` when omitted.
        max_len: length every encoding is truncated/padded to; defaults to the
            module-level ``MAX_LEN`` when omitted.
    """

    def __init__(self, text, label, tokenizer=None, max_len=None):
        self.text = text
        self.label = label
        # Fall back to the notebook-wide settings only when nothing was injected,
        # so existing two-argument callers behave exactly as before.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, item):
        """
        Encode example ``item``.

        Returns:
            dict with the keys ``ids``, ``mask``, ``token_type_ids`` (the first
            row of the tokenizer's ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` outputs) and ``targets`` (the label as a
            ``torch.long`` tensor).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        text = ' '.join(self.text[item].split())
        label = self.label[item]
        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # The tokenizer output has a leading batch axis of size 1; drop it with
        # [0] so the DataLoader can add its own batch axis when collating.
        return {
            'ids': enc.input_ids[0],
            'mask': enc.attention_mask[0],
            'token_type_ids': enc.token_type_ids[0],
            'targets': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# def loss_fn(outputs, targets):
#     return nn.CrossEntropyLoss()(outputs, targets)

In [12]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """
    Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: iterable of batches; each batch is a dict with the keys
            "ids", "mask", "token_type_ids" and "targets".
        model: called as ``model(ids, attention_mask=..., token_type_ids=...,
            labels=..., return_dict=True)``; its output must expose ``loss``
            and ``logits``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler. When given it is stepped
            once per batch, right after the optimizer. ``None`` (the default)
            leaves the learning rate untouched.

    Returns:
        None. The running loss and F1 are shown on the tqdm progress bar.
    """
    model.train()
    losses = AverageMeter()
    f1s = AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))

    for d in tk0:
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.long)

        model.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids, labels=targets, return_dict=True)

        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        # The scheduler argument used to be accepted but silently ignored.
        if scheduler is not None:
            scheduler.step()

        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        truth = targets.cpu().detach().numpy().astype(int)
        # Progress indicator only: a batch-size-weighted mean of per-batch F1
        # scores is not the F1 over the whole epoch.
        f1 = metrics.f1_score(truth, preds)
        batch_size = ids.size(0)
        f1s.update(f1, batch_size)
        losses.update(loss.item(), batch_size)
        tk0.set_postfix(loss=losses.avg, f1=f1s.avg)

In [13]:
def eval_fn(data_loader, model, device):
    """
    Score ``model`` on ``data_loader`` and return the F1 over all examples.

    Args:
        data_loader: iterable of batches; each batch is a dict with the keys
            "ids", "mask", "token_type_ids" and "targets".
        model: called as ``model(ids, attention_mask=..., token_type_ids=...,
            return_dict=True)``; its output must expose ``logits``.
        device: device the input tensors are moved to.

    Returns:
        The value of ``metrics.f1_score`` computed once over the concatenated
        targets and argmax predictions of every batch.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)

            # The labels are deliberately not passed to the model. They were
            # never moved to `device`, so supplying them broke on any non-CPU
            # device, and the loss they triggered was never read.
            outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=True)
            logits = outputs.logits

            fin_outputs.extend(torch.argmax(logits, dim=1).cpu().detach().numpy().tolist())
            fin_targets.extend(d["targets"].cpu().detach().numpy().tolist())

    f1 = metrics.f1_score(fin_targets, fin_outputs)
    return f1

In [14]:
def run(fold):
    """
    Fine-tune a fresh classifier with ``fold`` held out for validation.

    Rows of the global ``df`` whose ``kfold`` differs from ``fold`` are used
    for training, the remaining rows for validation. After every epoch the
    validation F1 is printed; after the last epoch the model weights are
    written to ``model_{fold}.bin`` in the working directory.

    Args:
        fold: index of the validation fold.

    Returns:
        None.
    """

    def build_loader(frame, batch_size, shuffle, num_workers):
        # Wrap one split of the data in a dataset and a DataLoader.
        dataset = CBDDataset(
            text=frame.text.values,
            label=frame.label.values,
        )
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    train_frame = df[df.kfold != fold].reset_index(drop=True)
    valid_frame = df[df.kfold == fold].reset_index(drop=True)

    train_data_loader = build_loader(train_frame, TRAIN_BATCH_SIZE, shuffle=True, num_workers=1)
    valid_data_loader = build_loader(valid_frame, VALID_BATCH_SIZE, shuffle=False, num_workers=2)

    device = torch.device("cpu")
    model = BertForSequenceClassification.from_pretrained(
        BERT_PATH,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    # One optimizer over all parameters; no learning-rate scheduler is built,
    # so train_fn is called with scheduler=None.
    optimizer = AdamW(model.parameters(), lr=LR, eps=1e-8)

    print(f"Training is starting for fold: {fold}")

    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=None)
        f1 = eval_fn(valid_data_loader, model, device)
        print(f"Epoch: {epoch}, F1 score = {f1}")

    # The weights saved are those after the final epoch.
    model_path = f"model_{fold}.bin"
    torch.save(model.state_dict(), model_path)

In [15]:
# Train with fold 0 held out for validation; writes model_0.bin.
run(0)

Some weights of the model checkpoint at ../src/load_models/pubmedbert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Training is starting for fold: 0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, F1 score = 0.0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, F1 score = 0.0


In [16]:
# Train with fold 1 held out for validation; writes model_1.bin.
run(1)

Some weights of the model checkpoint at ../src/load_models/pubmedbert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Training is starting for fold: 1


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, F1 score = 0.0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, F1 score = 0.64


In [17]:
# Train with fold 2 held out for validation; writes model_2.bin.
run(2)

Some weights of the model checkpoint at ../src/load_models/pubmedbert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Training is starting for fold: 2


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, F1 score = 0.0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, F1 score = 0.7027027027027026


In [18]:
# Train with fold 3 held out for validation; writes model_3.bin.
run(3)

Some weights of the model checkpoint at ../src/load_models/pubmedbert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Training is starting for fold: 3


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, F1 score = 0.0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, F1 score = 0.7272727272727272


In [19]:
# Train with fold 4 held out for validation; writes model_4.bin.
run(4)

Some weights of the model checkpoint at ../src/load_models/pubmedbert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

Training is starting for fold: 4


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)
  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 0, F1 score = 0.0


  0%|          | 0/43 [00:00<?, ?it/s]

  average, "true nor predicted", 'F-score is', len(true_sum)


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 1, F1 score = 0.6666666666666667


In [20]:
# Load the held-out test set and store its labels as integers.
# Note that the label column is called 'labels' here, unlike 'label' in df.
df_test = pd.read_csv('../data/can_test_ns.csv')
df_test.columns = ['text', 'labels']
df_test['labels'] = df_test['labels'].astype(int)

In [21]:
# Ground-truth test labels, compared against the ensemble predictions below.
labels = df_test.labels.values

In [22]:
# Device used for test-time inference (run() also trains on the CPU).
device = torch.device("cpu")

In [1]:
def load_fold_model(fold):
    """
    Rebuild the classifier and load the weights saved for ``fold``.

    Args:
        fold: fold index; the weights are read from ``model_{fold}.bin``.

    Returns:
        The model, moved to the global ``device`` and switched to eval mode.
    """
    model = BertForSequenceClassification.from_pretrained(
        BERT_PATH,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # written from a different device still loads here.
    model.load_state_dict(torch.load(f"model_{fold}.bin", map_location=device))
    model.eval()
    return model


# One model per fold. The names model1..model5 are kept because the inference
# cell below refers to them.
model1, model2, model3, model4, model5 = (load_fold_model(fold) for fold in range(5))

In [24]:
# Predict the test set with the five fold models: average their logits per
# example and take the arg-max class.
final_output = []

test_dataset = CBDDataset(text=df_test.text.values, label=df_test.labels.values)

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=1,
)

ensemble = (model1, model2, model3, model4, model5)

with torch.no_grad():
    for batch in tqdm(data_loader, total=len(data_loader)):
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)

        fold_logits = [
            member(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=True).logits
            for member in ensemble
        ]

        # Add the logits left to right, starting from the first model's, then
        # divide by the number of models.
        mean_logits = sum(fold_logits[1:], fold_logits[0]) / 5
        batch_preds = torch.argmax(mean_logits, dim=1).cpu().detach().numpy().tolist()
        final_output.extend(batch_preds)

  0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
# Score the ensemble predictions against the test labels and print a summary.
acc = accuracy_score(labels, final_output)
precision, recall, f1, _ = precision_recall_fscore_support(labels, final_output, average='binary')
print(dict(accuracy=acc, f1=f1, precision=precision, recall=recall))

{'accuracy': 0.9521276595744681, 'f1': 0.7272727272727272, 'precision': 0.8, 'recall': 0.6666666666666666}
