In [1]:
import os
import pathlib
import json
    
import pandas as pd
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [12]:
# NOTE(review): this import sits outside the notebook's main import cell, and the
# cell's execution count is out of sequence with its neighbours; consider moving the
# import into the first cell so a fresh "Restart & Run All" declares every dependency up front.
from transformers import logging

# Set the transformers logger to WARNING verbosity for the rest of the notebook.
logging.set_verbosity_warning()

### Data processing

With simpletransformers, the input data for a `QuestionAnsweringModel` must be either a list of dictionaries or the path to a JSON file containing such a list.\
Read more about the input data format here: https://simpletransformers.ai/docs/qa-data-formats/

In [2]:
# Locations of the Excel workbooks that hold the train / test question-answer pairs.
train_data_path = pathlib.Path("data/questions_answers_train.xlsx")
test_data_path = pathlib.Path("data/questions_answers_test.xlsx")

# Load each workbook into its own DataFrame.
train = pd.read_excel(train_data_path)
test = pd.read_excel(test_data_path)

# Preview the first rows of the training split.
display(train.head())

Unnamed: 0,id,question,answer,context
0,1,A total of what percentage for members of supe...,40%,ntary achievement of the goal of sexual divers...
1,2,Are ESG risks considered as a new risk category?,ESG risks do not represent a new risk category...,ESG risks do not represent a new risk category...
2,3,At NLB Komercijalna banka Beograd the way the ...,CollectiveAgreement of the Bank,he way in which thebank and trade union cooper...
3,4,"At the end of 2022, what was conducted on the ...",the internal ESG audit review,"At the end of 2022, the internal ESG audit rev..."
4,5,"By 2022, how much of new corporate financing h...",166.9 million,Target achivements in 2022By year 2022 the NLB...


In [3]:
# function to process the data in tabular format row by row
def process_row(row):
    """Convert one tabular question/answer record into a SQuAD-style entry.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"context"``, ``"id"``, ``"question"`` and
        ``"answer"``.

    Returns
    -------
    dict
        ``{"context": <str>, "qas": [<question dict>]}`` where the single
        question dict has the keys ``"id"`` (int), ``"question"`` (str),
        ``"is_impossible"`` (bool) and ``"answers"`` (list).  ``"answers"``
        holds one ``{"text", "answer_start"}`` dict when the answer occurs
        verbatim in the context, and is empty otherwise.

    Notes
    -----
    A question is flagged ``is_impossible`` when the answer is missing
    (``None``/NaN), empty, or not a literal substring of the context.
    """
    context = str(row["context"])
    qa_id = int(row["id"])
    question = str(row["question"])

    # A blank spreadsheet cell arrives as NaN/None.  Stringifying it would give
    # "nan"/"None", which can accidentally match inside ordinary words of the
    # context (e.g. "fi-nan-cing") and produce a bogus answer span.
    raw_answer = row["answer"]
    answer = "" if pd.isna(raw_answer) else str(raw_answer)

    # str.find("") returns 0, so an empty answer must be rejected explicitly
    # instead of being recorded as a zero-length span at the start of the context.
    answer_start = context.find(answer) if answer else -1
    is_impossible = answer_start == -1
    answers = [] if is_impossible else [{"text": answer, "answer_start": answer_start}]

    # A list of questions and answers: in principle there could be multiple
    # questions per context and multiple answers per question; here it is one of each.
    qas = [
        {
            "id": qa_id,
            "question": question,
            "is_impossible": is_impossible,
            "answers": answers,
        }
    ]

    return {"context": context, "qas": qas}

In [4]:
# Build the SQuAD-style record for every row and store it in a "json" column.
for frame in (train, test):
    frame["json"] = frame.apply(process_row, axis=1)

# Write each split's records next to its source workbook (same file name, .json suffix).
for frame, xlsx_path in ((train, train_data_path), (test, test_data_path)):
    json_path = xlsx_path.with_suffix(".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(frame["json"].tolist(), f, indent=2, ensure_ascii=False)

### Train and evaluate

In [5]:
# Model family and the pretrained checkpoint to fine-tune.
model_type = "bert"
model_name = "bert-base-cased"

# Output layout: models/<model_type>/ for the run, with best_model/ nested inside it.
models_output_dir = pathlib.Path("models")
model_output_dir = models_output_dir.joinpath(model_type)
best_model_output_dir = model_output_dir.joinpath("best_model")

# Make sure both output folders exist (no error if they are already there).
for directory in (model_output_dir, best_model_output_dir):
    directory.mkdir(parents=True, exist_ok=True)

In [9]:
# Training configuration passed to simpletransformers as a plain dict.
train_args = {
    # Reproducibility: fix the random seed so repeated runs of this notebook are comparable.
    "manual_seed": 42,
    # Input features are rebuilt on each run, while evaluation features are cached.
    # NOTE(review): a cached evaluation set may be reused after the test file changes —
    # confirm the caching rules in the simpletransformers docs.
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    # Where checkpoints and the best-scoring model are written.
    "output_dir": str(model_output_dir),
    "best_model_dir": str(best_model_output_dir),
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 5,
    # NOTE(review): the recorded run has only 170 optimisation steps in total, so this
    # step interval is presumably never reached and evaluation happens per epoch — confirm.
    "evaluate_during_training_steps": 1000,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size": 8,
    "train_batch_size": 16,
    "eval_batch_size": 16,
}

# Build the question-answering model from the pretrained checkpoint; use_cuda=False
# requests a CPU-only run.
model = QuestionAnsweringModel(model_type, model_name, args=train_args, use_cuda=False)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Reload both splits from the JSON files that sit next to the source workbooks.
# NOTE: `train` and `test` are rebound here from DataFrames to the parsed JSON content.
train = json.loads(train_data_path.with_suffix(".json").read_text(encoding="utf-8"))
test = json.loads(test_data_path.with_suffix(".json").read_text(encoding="utf-8"))

In [11]:
# Fine-tune the model on the training records, passing the test records as the
# evaluation set.  The call's return value is displayed as the cell output; in the
# recorded run it is a tuple of the final global step and a dict of per-evaluation metrics.
model.train_model(train, eval_data=test)

convert squad examples to features: 100%|███████████████████████████████████████████| 544/544 [00:01<00:00, 460.18it/s]
add example index and unique id: 100%|███████████████████████████████████████████| 544/544 [00:00<00:00, 543650.55it/s]


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/34 [00:00<?, ?it/s]


convert squad examples to features: 100%|█████████████████████████████████████████████| 52/52 [00:00<00:00, 547.19it/s][A

add example index and unique id: 100%|█████████████████████████████████████████████████████████| 52/52 [00:00<?, ?it/s][A


Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/34 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/34 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/34 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/34 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

(170,
 {'global_step': [34, 68, 102, 136, 170],
  'correct': [31, 36, 36, 38, 39],
  'similar': [17, 12, 13, 11, 10],
  'incorrect': [4, 4, 3, 3, 3],
  'train_loss': [2.478241443634033,
   0.8000431060791016,
   0.6468679904937744,
   0.5595228672027588,
   0.1823987364768982],
  'eval_loss': [-3.2782684564590454,
   -4.601487159729004,
   -5.46842348575592,
   -6.073521852493286,
   -6.340406179428101]})

In [13]:
# Evaluate the fine-tuned model on the test records.
# `result` is the summary dict displayed by this cell (correct / similar / incorrect
# counts and eval_loss in the recorded run); `texts` is the second returned value
# (presumably the per-category predictions — confirm against the simpletransformers docs).
result, texts = model.eval_model(test)
result

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

{'correct': 39, 'similar': 10, 'incorrect': 3, 'eval_loss': -6.340406179428101}