In [145]:
!pip install --quiet transformers

In [146]:
!python --version

Python 3.8.10


In [147]:
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

[{'label': 'POSITIVE', 'score': 0.9998704791069031}]


In [148]:
!which python3

/home/nlpab/anaconda3/envs/aeop/bin/python3


In [149]:
!which jupyter

/home/nlpab/anaconda3/envs/aeop/bin/jupyter


In [32]:
import numpy

In [33]:
import pickle # reading in our data

import torch # pytorch
from torch.utils.data import DataLoader # this helps us iterate over our data efficiently
from tqdm import tqdm

In [34]:
import transformers

## Load Data

### Load training data

In [35]:
with open('./datasets/covidqa/covid_train_contexts.pkl', 'rb') as f1:
  train_contexts = pickle.load(f1)

In [36]:
with open('./datasets/covidqa/covid_train_questions.pkl', 'rb') as f2:
  train_questions = pickle.load(f2)

In [37]:
with open('./datasets/covidqa/covid_train_answers.pkl', 'rb') as f3:
  train_answers = pickle.load(f3)

### Load validation data

In [38]:
with open('./datasets/covidqa/covid_val_contexts.pkl', 'rb') as f1:
  val_contexts = pickle.load(f1)

In [39]:
with open('./datasets/covidqa/covid_val_questions.pkl', 'rb') as f2:
  val_questions = pickle.load(f2)

In [40]:
with open('./datasets/covidqa/covid_val_answers.pkl', 'rb') as f3:
  val_answers = pickle.load(f3)

In [41]:
def add_end_index(answers, contexts):

  # loop over each context-answer pair
  for answer, context in zip(answers, contexts):

    # the actual answer
    expected_answer = answer['text']

    # start index of the answer
    start_index = answer['answer_start']

    # end index
    end_index = start_index + len(expected_answer)

    # take into account if answer is off by a couple characters
    if context[start_index:end_index] == expected_answer:
      answer['answer_end'] = end_index
    else:
      # if answer is off by 1 or 2 tokens
      for i in [1, 2]:
        if context[start_index-i:end_index-i] == expected_answer:
          answer['answer_start'] = start_index - i
          answer['answer_end'] = end_index - i

In [42]:
# apply our function above to dig up all the answer_start and end for each context-answer pair
add_end_index(train_answers, train_contexts)
add_end_index(val_answers, val_contexts)

In [43]:
train_answers[:3]

[{'text': 'Mother-to-child transmission (MTCT) is the main cause of HIV-1 infection in children worldwide. ',
  'answer_start': 370,
  'answer_end': 466},
 {'text': 'DC-SIGNR plays a crucial role in MTCT of HIV-1 and that impaired placental DC-SIGNR expression increases risk of transmission.',
  'answer_start': 2003,
  'answer_end': 2129},
 {'text': 'more than 400,000 children were infected worldwide, mostly through MTCT and 90% of them lived in sub-Saharan Africa. ',
  'answer_start': 2291,
  'answer_end': 2408}]

In [44]:
# from transformers import DistilBertTokenizerFast
from transformers import AlbertTokenizerFast
# defining the tokenizer we are going to use
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [45]:
# apply tokenizer to our training and validation datasets
train_tokenized = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_tokenized = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [46]:
tokenizer.decode(train_tokenized['input_ids'][0])

'[CLS] functional genetic variants in dc-signr are associated with mother-to-child transmission of hiv-1 https://www.ncbi.nlm.nih.gov/pmc/articles/pmc2752805/ boily-larouche, genevieve; iscache, anne-laure; zijenah, lynn s.; humphrey, jean h.; mouland, andrew j.; ward, brian j.; roger, michel 2009-10-07 doi:10.1371/journal.pone.0007211 license:cc-by abstract: background: mother-to-child transmission (mtct) is the main cause of hiv-1 infection in children worldwide. given that the c-type lectin receptor, dendritic cell-specific icam-grabbing non-integrin-related (dc-signr, also known as cd209l or liver/lymph node–specific icam-grabbing non-integrin (l-sign)), can interact with pathogens including hiv-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence mtct of hiv-1. methods and findings: to investigate the potential role of dc-signr in mtct of hiv-1, we carried out a genetic association study of dc-signr in a well-characterized cohort of 197 hiv-i

In [47]:
def add_token_positions(tokenized_data, answers):
  # let's define a couple lists to keep track of our start/end tokens
  start_positions = [] # aggregate all start positions for each sample
  end_positions = [] # aggregate all end positions for each sample

  # loop over the answers list
  for i in range(len(answers)):
    # add the start and end encodings to our lists
    start_positions.append(tokenized_data.char_to_token(i, answers[i]['answer_start']))
    end_positions.append(tokenized_data.char_to_token(i, answers[i]['answer_end']))

    # if no start position, the answer passage was truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    
    # if we can't find the end positions, then we want to shift characters left until we do find it
    shift = 1
    while end_positions[-1] is None:
      end_positions[-1] = tokenized_data.char_to_token(i, answers[i]['answer_end'] - shift)
      shift += 1
  
  tokenized_data.update({'start_positions': start_positions, 'end_positions': end_positions})

In [48]:
# apply function to train and validations data
add_token_positions(train_tokenized, train_answers)
add_token_positions(val_tokenized, val_answers)

In [49]:
train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [50]:
class CovidDataset(torch.utils.data.Dataset):
  # constructor
  def __init__(self, encodings):
    self.encodings = encodings
  
  # Query to get a specific item at an index in our dataset
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} # comprehension (shortcut)

  # returns the length of our dataset
  def __len__(self):
    return len(self.encodings.input_ids)

In [51]:
# Datasets for the training and validations sets
train_data = CovidDataset(train_tokenized)
val_data = CovidDataset(val_tokenized)

In [52]:
# Used to help us iterate over our dataset conveniently, especially when we have batches
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=8, shuffle=True)

In [53]:
from transformers import AlbertForQuestionAnswering # This is the BERT model used for question answering

# BERT model already finetuned for question answering
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN t

In [54]:
from transformers import AdamW

optim = AdamW(model.parameters(), lr=5e-5)

In [55]:
if torch.cuda.is_available():
  device = torch.device("cuda") # this is the device our model will use for computing
  print(f'There are {torch.cuda.device_count()} GPUs available')
  print(f'Device name:', torch.cuda.get_device_name(0))
else:
  print('No GPU available, using CPU instead')
  device = torch.device('cpu')

There are 2 GPUs available
Device name: Quadro RTX 8000


In [56]:
model = model.to(device)

In [57]:
def train(model, train_dataloader):

  # run loop 3 times
  for epoch in range(3):
    # first, set the mode of the model to train
    model.train()

    # progress bar
    loop = tqdm(train_dataloader, leave=True)

    # loop over each batch 
    for batch in loop:
      # zero out the optimizer
      optim.zero_grad()

      # grab the batch attributes and attach them to the GPU device
      input_ids, attention_mask, start_positions, end_positions = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['start_positions'].to(device), batch['end_positions'].to(device)

      # Feedforward!!!
      outputs = model(input_ids, attention_mask, start_positions=start_positions, end_positions=end_positions)

      # Compute the Loss
      loss = outputs[0]

      # BACKPROPAGATION!!!!
      loss.backward()

      # update parameters for the next run
      optim.step()


      # display some stuff here
      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss=loss.item())


In [58]:
train(model, train_dataloader)

Epoch 0: 100%|██████████| 202/202 [01:03<00:00,  3.18it/s, loss=4.29]  
Epoch 1: 100%|██████████| 202/202 [01:04<00:00,  3.14it/s, loss=0.166] 
Epoch 2: 100%|██████████| 202/202 [01:04<00:00,  3.14it/s, loss=6.34]  


In [59]:
def evaluate(model, val_dataloader):
    # set to evaluate mode
    model.eval()
    
    # keep track of accuracies
    accuracies = []
    
    # loop through batches
    for batch in val_dataloader:
        # grab the batch attributes and attach them to the GPU device
        input_ids, attention_mask, start_positions, end_positions = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['start_positions'].to(device), batch['end_positions'].to(device)
        
        # make prediction
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # get the prediction for the start and end token for answer
        start_probs = outputs['start_logits']
        end_probs = outputs['end_logits']
        
        start_pred = torch.argmax(start_probs, dim=1)
        end_pred = torch.argmax(end_probs, dim=1)
        
        # append the accuracy
        accuracies.append(((start_pred == start_positions).sum() / len(start_pred)).item())
        accuracies.append(((end_pred == end_positions).sum() / len(end_pred)).item())
    
    avg_acc = sum(accuracies) / len(accuracies)
    
    return avg_acc
    

In [None]:
avg_acc = evaluate(model, val_dataloader)

In [31]:
avg_acc

0.4497549019607843