In [None]:
!nvidia-smi


Thu Jan 19 11:21:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    31W /  70W |  15102MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
pip install farm==0.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import logging
import torch
import os
import pprint
from pathlib import Path

from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.utils import write_squad_predictions
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer
from farm.modeling.prediction_head import QuestionAnsweringHead
from farm.modeling.language_model import LanguageModel
from farm.data_handler.processor import SquadProcessor
from farm.data_handler.data_silo import DataSilo, DataSiloForCrossVal
from farm.modeling.adaptive_model import AdaptiveModel
from farm.infer import QAInferencer
from farm.eval import Evaluator
from farm.evaluation.metrics import metrics_per_bin

In [None]:
##########################
########## Settings
##########################


set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)

# Setting the path to the desired transformer model
lang_model = "deepset/roberta-base-squad2"

# Setting do_lower_case = False, because used model is cased
do_lower_case = False
batch_size = 24 # 80
n_epochs = 5 

# Setting path for input files
data_dir = Path("data")

# Due to hardware limitations crossvalidation could not be used

#save_per_fold_results = False # unsupported for now crossvalidation
# n_epochs = 2  crossvalidation
# learning_rate = 3e-5 crossvalidation
#xval_folds = 5 #for k-fold
#dev_split = 0 #for k-fold
#evaluate_every = 0 #for k-fold
#no_ans_boost = 0 # use large negative values to disable giving "no answer" option for k-fold
#use_amp = None # for k-fold

# Create variables for the training and validation dataset file names
train_filename = "train_squad_format_new.json"  
dev_filename = "val_squad_format_new.json"

# Setting the amount of expected answers by the system

accuracy_at = 10 # accuracy at n is useful for answers inside long documents

In [None]:
# 1.Create a tokenizer
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=lang_model,
do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=384, #  Samples are truncated after this many tokens.
label_list=["start_token", "end_token"],
metric="squad", # name of metric that shall be used for evaluation, can be “squad” or “top_n_accuracy”
train_filename=train_filename,
dev_filename=dev_filename,
data_dir=data_dir,
doc_stride=192, # When the document containing the answer is too long it gets split into part, strided by doc_stride
)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
processor=processor,
batch_size=batch_size, distributed=False) 

# Parameters and their descriptions from the official farm documentation

#batch_size (int) – The size of batch that should be returned by the DataLoader for the training set.
#eval_batch_size (int) – The size of batch that should be returned by the DataLoaders for the dev and test set.
#distributed (bool) – Set to True if you are running in a distributed evn, e.g. using DistributedDataParallel. The DataSilo will init the DataLoader with a DistributedSampler() to distribute batches. 
#automatic_loading (bool) – Set to False, if you don’t want to automatically load data at initialization
#max_multiprocessing_chunksize (int) – max possible value for chunksize as calculated by calc_chunksize() in farm.utils. For certain cases like lm_finetuning, a smaller value can be set, as the default chunksize values are rather large that might cause memory issues.
#max_processes (int) – the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. It can be set to 1 to disable the use of multiprocessing or make debugging easier.
#caching (bool) – save the processed datasets on disk to save time/compute if the same train data is used to run multiple experiments. Each cache has a checksum based on the train_filename of the Processor and the batch size.
#cache_path (Path) – root dir for storing the datasets’ cache.



Example will not be converted for training/evaluation.
Example will not be converted for training/evaluation.
Preprocessing Dataset data/train_squad_format_new.json: 100%|██████████| 2962/2962 [00:11<00:00, 268.15 Dicts/s]
ERROR:farm.data_handler.processor:Unable to convert 2 samples to features. Their ids are : 649-0-0, 505-0-0
Example will not be converted for training/evaluation.
Preprocessing Dataset data/val_squad_format_new.json: 100%|██████████| 671/671 [00:02<00:00, 279.28 Dicts/s]
ERROR:farm.data_handler.processor:Unable to convert 1 samples to features. Their ids are : 284-0-5
Example will not be converted for training/evaluation.
Preprocessing Dataset data/val_squad_format_new.json: 100%|██████████| 671/671 [00:02<00:00, 277.67 Dicts/s]
ERROR:farm.data_handler.processor:Unable to convert 1 samples to features. Their ids are : 284-0-5


In [None]:

language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Question Answering
prediction_head = QuestionAnsweringHead()
    
model = AdaptiveModel(
        language_model=language_model,   
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,   #The probability that a value in the embeddings returned by the language model will be zeroed. 
        lm_output_types=["per_token"], #How to extract the embeddings from the final layer of the language model. When set to “per_token”, one embedding will be extracted per input token.
                                       # If set to “per_sequence”, a single embedding will be extracted to represent the full input sequence. Can either be a single string, or a list of strings, one for each prediction head.
        device=device,
    )

# Parameters and their descriptions from the official farm documentation

#loss_aggregation_fn (function) – Function to aggregate the loss of multiple prediction heads. 
#Input: loss_per_head (list of tensors), global_step (int), batch (dict) Output: aggregated loss (tensor) 
#Default is a simple sum: lambda loss_per_head, global_step=None, batch=None: sum(tensors) However, you can pass more complex functions
# that depend on the current step (e.g. for round-robin style multitask learning) or the actual content of the batch (e.g. certain labels) 
#Note: The loss at this stage is per sample, i.e one tensor of shape (batchsize) per prediction head.

Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
 # 5. Create an optimizer
model, optimizer, lr_schedule = initialize_optimizer(
  model=model,
  learning_rate=3e-5, 
  schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
  n_batches=len(data_silo.loaders["train"]), #number of batches for training
  n_epochs=n_epochs, # number of epochs for training
  device=device
)

# Parameters and their descriptions from the official farm documentation

# optimizer_opts – Dict to customize the optimizer. Choose any optimizer available from torch.optim, apex.optimizers or transformers.optimization by supplying the class name and the parameters for the constructor. 
#Examples: 1) AdamW from Transformers (Default): {“name”: “TransformersAdamW”, “correct_bias”: False, “weight_decay”: 0.01} 2) SGD from pytorch: {“name”: “SGD”, “momentum”: 0.0} 
#3) FusedLAMB from apex: {“name”: “FusedLAMB”, “bias_correction”: True}

#schedule_opts – Dict to customize the learning rate schedule. Choose any Schedule from Pytorch or Huggingface’s Transformers by supplying the class name and the parameters needed by the constructor. 
#If the dict does not contain num_training_steps it will be set by calculating it from n_batches, grad_acc_steps and n_epochs. 
#Examples: 1) Linear Warmup (Default): {“name”: “LinearWarmup”, “num_warmup_steps”: 0.1 * num_training_steps, “num_training_steps”: num_training_steps} 
#2) CosineWarmup: {“name”: “CosineWarmup”, “num_warmup_steps”: 0.1 * num_training_steps, “num_training_steps”: num_training_steps} 
#3) CyclicLR from pytorch: {“name”: “CyclicLR”, “base_lr”: 1e-5, “max_lr”:1e-4, “step_size_up”: 100}

NameError: ignored

In [None]:
# Add the training parameters to the function

trainer = Trainer(
      model=model,
      optimizer=optimizer,
      data_silo=data_silo,
      epochs=n_epochs,
      n_gpu=n_gpu,
      lr_schedule=lr_schedule,
      device=device,
  )

In [None]:
# 7. Starting the training with the choosen parameters
trainer.train()

Train epoch 0/4 (Cur. train loss: 0.3636):  26%|██▌       | 100/390 [03:13<09:19,  1.93s/it]
Evaluating:   0%|          | 0/95 [00:00<?, ?it/s][A
Evaluating:  17%|█▋        | 16/95 [00:10<00:51,  1.54it/s][A
Evaluating:  34%|███▎      | 32/95 [00:20<00:41,  1.53it/s][A
Evaluating:  51%|█████     | 48/95 [00:31<00:30,  1.53it/s][A
Evaluating:  67%|██████▋   | 64/95 [00:41<00:20,  1.53it/s][A
Evaluating: 100%|██████████| 95/95 [01:01<00:00,  1.55it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Train epoch 0/4 (Cur. train loss: 1.1717):  51%|█████▏    | 200/390 [07:22<05:56,  1.88s/it]
Evaluating:   0%|          | 0/95 [00:00<?, ?it/s][A
Evaluating:  17%|█▋        | 16/95 [00:10<00:51,  1.54it/s][A
Evaluating:  34%|███▎      | 32/95 [00:20<00:40,  1.54it/s][A
Evaluating:  51%|█████     | 48/95 [00:31<00:30,  1.55it/s][A
Evaluating:  51%|█████     | 48/95 [00:41<00:30,  1.55it/s][A
Evaluating:  67%|██████▋   | 64/95 [00:41<00:20,  1.54

AdaptiveModel(
  (language_model): Roberta(
    (model): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_features=768, ou

In [None]:
 # 8. Load pre-trained question-answering model
model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
# Number of predictions the model will make per Question.
# The multiple predictions are used for evaluating top n recall.
#model.prediction_heads[0].n_best = accuracy_at
model.prediction_heads[0].n_best = 10

# 5. The calibration of model confidence scores sets one parameter, which is called temperature and can be accessed through the prediction_head.
# This temperature is applied to each logit in the forward pass, where each logit is divided by the temperature.
# A softmax function is applied to the logits afterward to get confidence scores in the range [0,1].
# A temperature larger than 1 decreases the model’s confidence scores.


In [None]:
# 9a. We can either manually set the temperature (default value is 1.0)...
model.prediction_heads[0].temperature_for_confidence = torch.nn.Parameter((torch.ones(1) * 1.0).to(device=device))

# 9b. ...or we can run the evaluator on the dev set and use it to calibrate confidence scores with a technique called temperature scaling.
# It will align the confidence scores with the model's accuracy based on the dev set data by tuning the temperature parameter.
# During the calibration, this parameter is automatically set internally as an attribute of the prediction head.
evaluator_dev = Evaluator(
    data_loader=data_silo.get_data_loader("dev"),
    tasks=data_silo.processor.tasks,
    device=device
)
result_dev = evaluator_dev.eval(model, return_preds_and_labels=True, calibrate_conf_scores=True)
# evaluator_dev.log_results(result_dev, "Dev", logging=False, steps=len(data_silo.get_data_loader("dev"))

Evaluating: 100%|██████████| 95/95 [01:02<00:00,  1.52it/s]


In [None]:
# 10. Optionally, run the evaluator on the test set to see how well the confidence scores are aligned with the model's accuracy
evaluator_test = Evaluator(
    data_loader=data_silo.get_data_loader("test"),
    tasks=data_silo.processor.tasks,
    device=device
)
result_test = evaluator_test.eval(model, return_preds_and_labels=True)[0]
em_per_bin, confidence_per_bin, count_per_bin = metrics_per_bin(result_test["preds"], result_test["labels"], num_bins=10)
for bin_number in range(10):
    print(f"Bin {bin_number} - exact match: {em_per_bin[bin_number]}, average confidence score: {confidence_per_bin[bin_number]}")


Evaluating: 100%|██████████| 95/95 [01:02<00:00,  1.53it/s]


Bin 0 - exact match: 0.12359550561797752, average confidence score: 0.056291134986147455
Bin 1 - exact match: 0.23469387755102042, average confidence score: 0.1450109962868143
Bin 2 - exact match: 0.25, average confidence score: 0.2470808042164304
Bin 3 - exact match: 0.2545454545454545, average confidence score: 0.3480545675890012
Bin 4 - exact match: 0.28888888888888886, average confidence score: 0.4553984383535054
Bin 5 - exact match: 0.23636363636363636, average confidence score: 0.555782878974622
Bin 6 - exact match: 0.3230769230769231, average confidence score: 0.6563104570413438
Bin 7 - exact match: 0.5609756097560976, average confidence score: 0.7448070222262021
Bin 8 - exact match: 0.6428571428571429, average confidence score: 0.8573968206133161
Bin 9 - exact match: 0.881578947368421, average confidence score: 0.9323726782673284


In [None]:
# 11. Hooray! You have a model with calibrated confidence scores.
# Store the model and the temperature parameter will be stored automatically as an attribute of the prediction head.
save_dir = Path("saved_models/qa-model-task2")
model.save(save_dir)
processor.save(save_dir)


In [None]:
!zip -r content/file.zip saved_models/qa-model-task2


  adding: saved_models/qa-model-task2/ (stored 0%)
  adding: saved_models/qa-model-task2/prediction_head_0_config.json (deflated 43%)
  adding: saved_models/qa-model-task2/tokenizer_config.json (deflated 71%)
  adding: saved_models/qa-model-task2/prediction_head_0.bin (deflated 17%)
  adding: saved_models/qa-model-task2/processor_config.json (deflated 56%)
  adding: saved_models/qa-model-task2/special_tokens_map.json (deflated 83%)
  adding: saved_models/qa-model-task2/vocab.json (deflated 59%)
  adding: saved_models/qa-model-task2/language_model_config.json (deflated 50%)
  adding: saved_models/qa-model-task2/language_model.bin (deflated 7%)
  adding: saved_models/qa-model-task2/merges.txt (deflated 53%)


In [None]:
from google.colab import files
files.download("content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 12. When making a prediction with the calibrated model, we could filter out predictions where the model is not confident enough
# To this end, load the stored model, which will automatically load the stored temperature parameter.
# The confidence scores are automatically adjusted based on this temperature parameter.
# For each prediction, we can check the model's confidence and decide whether to output the prediction or not.
inferencer = QAInferencer.load(save_dir, batch_size=40, gpu=True, task_type= "question_answering")

# Parameters and their descriptions from the official farm documentation

#max_seq_len (int) – maximum length of one text sample
#doc_stride (int) – Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
#extraction_strategy (str) – Strategy to extract vectors. Choices: ‘cls_token’ (sentence vector), ‘reduce_mean’ (sentence vector), reduce_max (sentence vector), ‘per_token’ (individual token vectors)
#extraction_layer (int) – number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).

# Testing the trained model
QA_input = [
    {
        "questions": ["Who counted the game among the best ever made?"],
        "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)[0]
#print(result)

for i in result.prediction:
  print(i.answer, i.confidence)
if result.prediction[0].confidence > 0.9:
    print(result.prediction[0].answer)
else:
    print("The confidence is not high enough to give an answer.")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 24.97 Batches/s]

Twilight Princess 0.341556578874588
GameTrailers 0.37477704882621765
Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers 0.341556578874588
Princess 0.013279247097671032
ilight Princess 0.004485561978071928
no_answer 0.036037511425092816
The confidence is not high enough to give an answer.



