Fine-tuned model example: computing answers and EM/F1 scores


In [39]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the model files passed to torch.load()
# in this notebook are read from /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [40]:
!pip install transformers
from transformers import BertTokenizerFast
from transformers import AlbertTokenizerFast
from transformers import ElectraTokenizerFast
import torch
path = '/content/gdrive/MyDrive/BERT_tuned_for_SQUAD_2.0_complete'
bert_tune = torch.load(path, map_location=torch.device('cpu'))
albert_tune = torch.load('/content/gdrive/MyDrive/ALBERT_tuned_for_SQUAD_2.0_complete', map_location = torch.device('cpu'))
electra_tune = torch.load('/content/gdrive/MyDrive/ELECTRA_tuned_for_SQUAD_2.0_complete', map_location = torch.device('cpu'))
#Tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer_albert = AlbertTokenizerFast.from_pretrained('albert-base-v2')
tokenizer_electra = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')



In [41]:
def normalize_text(s):
    """Canonicalize an answer string before comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the
    stand-alone words "a", "an" and "the" with a space, then collapse every
    run of whitespace into a single space (trimming both ends).
    """
    import re
    import string

    lowered = s.lower()
    # Deleting via a translation table drops exactly the characters listed
    # in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after normalize_text(), else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and one reference answer.

    Both strings are normalized with normalize_text() and split on
    whitespace. The overlap is a multiset intersection, so a token that is
    repeated in both answers is counted once per matched occurrence. (A plain
    set intersection under-counts repeats: "cat cat" vs "cat cat" would score
    0.5 instead of 1.0.)

    Args:
        prediction: predicted answer text.
        truth: reference answer text.

    Returns:
        int 1 or 0 when either side has no tokens after normalization
        (1 only if both are empty), int 0 when no token is shared, otherwise
        the float harmonic mean of token precision and recall.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [42]:
# Sample passage and question that are passed to every model in this notebook.
# Adjacent string literals are concatenated by Python, so the values are
# single strings.
context = (
    'Following the Cretaceous–Paleogene extinction event, the extinction of the dinosaurs '
    'and the wetter climate may have allowed the tropical rainforest to spread out across the continent. '
    'From 66–34 Mya, the rainforest extended as far south as 45°. '
    'Climate fluctuations during the last 34 million years have allowed savanna regions to expand into the tropics. '
    'During the Oligocene, for example, the rainforest spanned a relatively narrow band. '
    'It expanded again during the Middle Miocene, then retracted to a mostly inland formation at the last glacial maximum. '
    'However, the rainforest still managed to thrive during these glacial periods, '
    'allowing for the survival and evolution of a broad diversity of species.'
)
question = (
    'Which type of climate may have allowed the rainforest '
    'to spread across the continent?'
)

BERT


In [43]:
def bert_fine_tune(context, question):
  """Extract an answer span from `context` with the fine-tuned BERT model.

  The (context, question) pair is encoded with the module-level `tokenizer`
  and fed to `bert_tune`; the tokens between the highest-scoring start and
  end positions are then merged back into words.

  Args:
    context: passage text, encoded as the first segment.
    question: question text, encoded as the second segment.

  Returns:
    A one-element list holding the answer string. Every token that is not a
    "##" continuation piece is preceded by a space, so the answer usually
    starts with a space. The string is empty when the predicted end index
    lies before the start index.
  """
  encoding = tokenizer.encode_plus(text=context, text_pair=question)
  input_ids = encoding['input_ids']  #Token embeddings
  segment_ids = encoding['token_type_ids']  #Segment embeddings
  tokens = tokenizer.convert_ids_to_tokens(input_ids)  #input tokens

  # Inference only: no_grad() skips autograd bookkeeping, so no computation
  # graph is built or kept alive for the forward pass.
  with torch.no_grad():
    outputs = bert_tune(input_ids=torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]))
  start_index = int(torch.argmax(outputs.start_logits))
  end_index = int(torch.argmax(outputs.end_logits))

  answer = ' '.join(tokens[start_index:end_index + 1])
  corrected_answer = ''
  for word in answer.split():
    # WordPiece marks a continuation of the previous word with a "##" prefix.
    if word[0:2] == '##':
      corrected_answer += word[2:]
    else:
      corrected_answer += ' ' + word
  return [corrected_answer]

In [44]:
# Run the fine-tuned BERT model on the sample passage/question and display
# the returned one-element answer list.
bert_tune_answer = bert_fine_tune(context, question)
bert_tune_answer

[' wetter']

In [45]:
# Accepted reference answers; the scoring cells take the best score over them.
true_answer = [
    'the wetter climate may have allowed the tropical rainforest to spread out across the continent',
    'wetter',
]

In [46]:
# Score the BERT answer against every reference answer and keep the best
# exact-match and F1 values.
em_score = max([compute_exact_match(bert_tune_answer[0], reference) for reference in true_answer])
f1_score = max([compute_f1(bert_tune_answer[0], reference) for reference in true_answer])
print(em_score, f1_score, sep='\n')

1
1.0


ALBERT


In [47]:
def albert_fine_tune(context, question):
  """Extract an answer span from `context` with the fine-tuned ALBERT model.

  The (context, question) pair is encoded with `tokenizer_albert` and fed to
  `albert_tune`; the tokens between the highest-scoring start and end
  positions are joined into one string.

  NOTE: the "##" merge below is the WordPiece convention. The output recorded
  in this notebook (' ▁we tter') shows that ALBERT pieces carry a "▁"
  word-start marker instead, so the returned string is the raw pieces
  separated by spaces and the caller has to clean it up.

  Args:
    context: passage text, encoded as the first segment.
    question: question text, encoded as the second segment.

  Returns:
    A one-element list holding the raw answer string (empty when the
    predicted end index lies before the start index).
  """
  encoding = tokenizer_albert.encode_plus(text=context, text_pair=question)
  input_ids = encoding['input_ids']  #Token embeddings
  segment_ids = encoding['token_type_ids']  #Segment embeddings
  tokens = tokenizer_albert.convert_ids_to_tokens(input_ids)  #input tokens

  # Inference only: no_grad() skips autograd bookkeeping, so no computation
  # graph is built or kept alive for the forward pass.
  with torch.no_grad():
    outputs = albert_tune(input_ids=torch.tensor([input_ids]),
                          token_type_ids=torch.tensor([segment_ids]))
  start_index = int(torch.argmax(outputs.start_logits))
  end_index = int(torch.argmax(outputs.end_logits))

  answer = ' '.join(tokens[start_index:end_index + 1])
  corrected_answer = ''
  for word in answer.split():
    # Pieces with a "##" prefix are glued to the previous one; all others
    # are appended after a space.
    if word[0:2] == '##':
      corrected_answer += word[2:]
    else:
      corrected_answer += ' ' + word
  return [corrected_answer]

In [48]:
# Run the fine-tuned ALBERT model on the sample passage/question and display
# the raw answer string (first element of the returned list).
albert_tune_answer = albert_fine_tune(context, question)
albert_tune_answer[0]

' ▁we tter'

In [49]:
import re
import string
# The raw ALBERT answer is a run of SentencePiece pieces separated by spaces,
# where "▁" marks the start of a word. Drop the separators first and only
# then turn each marker into a real space, so multi-word answers keep their
# word boundaries (deleting every space and marker would glue words together).
# The "▁" guard keeps this cell safe to re-run on an already cleaned answer.
if "▁" in albert_tune_answer[0]:
    albert_tune_answer[0] = albert_tune_answer[0].replace(" ", "").replace("▁", " ")
albert_tune_answer[0] = albert_tune_answer[0].strip()
albert_tune_answer

['wetter']

In [50]:
# Score the ALBERT answer against every reference answer and keep the best
# exact-match and F1 values.
em_score = max([compute_exact_match(albert_tune_answer[0], reference) for reference in true_answer])
f1_score = max([compute_f1(albert_tune_answer[0], reference) for reference in true_answer])
print(em_score, f1_score, sep='\n')

1
1.0


ELECTRA

In [51]:
def electra_fine_tune(context, question):
  """Extract an answer span from `context` with the fine-tuned ELECTRA model.

  The (context, question) pair is encoded with `tokenizer_electra` and fed to
  `electra_tune`; the tokens between the highest-scoring start and end
  positions are then merged back into words.

  Args:
    context: passage text, encoded as the first segment.
    question: question text, encoded as the second segment.

  Returns:
    A one-element list holding the answer string. Every token that is not a
    "##" continuation piece is preceded by a space, so the answer usually
    starts with a space. The string is empty when the predicted end index
    lies before the start index.
  """
  encoding = tokenizer_electra.encode_plus(text=context, text_pair=question)
  input_ids = encoding['input_ids']  #Token embeddings
  segment_ids = encoding['token_type_ids']  #Segment embeddings
  tokens = tokenizer_electra.convert_ids_to_tokens(input_ids)  #input tokens

  # Inference only: no_grad() skips autograd bookkeeping, so no computation
  # graph is built or kept alive for the forward pass.
  with torch.no_grad():
    outputs = electra_tune(input_ids=torch.tensor([input_ids]),
                           token_type_ids=torch.tensor([segment_ids]))
  start_index = int(torch.argmax(outputs.start_logits))
  end_index = int(torch.argmax(outputs.end_logits))

  answer = ' '.join(tokens[start_index:end_index + 1])
  corrected_answer = ''
  for word in answer.split():
    # A "##" prefix marks a continuation of the previous word.
    if word[0:2] == '##':
      corrected_answer += word[2:]
    else:
      corrected_answer += ' ' + word
  return [corrected_answer]

In [52]:
# Run the fine-tuned ELECTRA model on the sample passage/question and display
# the returned one-element answer list.
electra_tune_answer = electra_fine_tune(context, question)
electra_tune_answer

[' wetter climate']

In [53]:
# Score the ELECTRA answer against every reference answer and keep the best
# exact-match and F1 values.
em_score = max([compute_exact_match(electra_tune_answer[0], reference) for reference in true_answer])
f1_score = max([compute_f1(electra_tune_answer[0], reference) for reference in true_answer])
print(em_score, f1_score, sep='\n')

0
0.6666666666666666
