### Question Answer Application
The goal of question answering is to find the answer to a question, given the question and an accompanying context passage. The predicted answer is either a span of text from the context or an empty string (indicating that the question cannot be answered from the context).

In [None]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 37.2 MB/s 
Collecting streamlit
  Downloading streamlit-1.12.0-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 51.8 MB/s 
Collecting wandb>=0.10.32
  Downloading wandb-0.13.1-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 48.5 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.8 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 62.1 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████

In [None]:
import json
import random

def split_data(data, train_fraction=0.85, seed=None):
    """
    Shuffle the records and split them into a train part and a test part.

    With the default ``train_fraction`` the split is 85:15.

    Args:
        data: Sequence of records to split. It is copied before shuffling,
            so the caller's object is left untouched.
        train_fraction: Share of the records placed in the first (train)
            part; must lie in the range [0, 1].
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` the module-level ``random``
            state is used, exactly as before.

    Returns:
        A ``(train, test)`` tuple of lists.

    Raises:
        ValueError: If ``train_fraction`` is outside [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )

    # Work on a copy so repeated calls (e.g. re-running the cell) do not keep
    # re-shuffling the caller's list.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    split_index = int(len(shuffled) * train_fraction)
    return shuffled[:split_index], shuffled[split_index:]

def main():
    """
    Read the annotated dataset, split it, and write the two parts to disk.

    The records under the 'data' key of /content/answers_v2.json are split
    with split_data(); each part is written back as a JSON object with a
    single 'data' key, to /content/train.json and /content/test.json.
    """
    with open('/content/answers_v2.json') as source:
        payload = json.load(source)

    train_data, test_data = split_data(payload['data'])

    # Same order as before: the train file is written first, then the test file.
    targets = (
        ('/content/train.json', train_data),
        ('/content/test.json', test_data),
    )
    for path, records in targets:
        with open(path, 'w') as sink:
            json.dump({'data': records}, sink)

main()

In [None]:
import json
# Load the training split and keep only the list stored under its 'data' key.
# NOTE(review): the path is relative, while main() wrote '/content/train.json';
# this presumably relies on the working directory being /content — confirm.
with open(r"train.json", "r") as read_file:
    train = json.load(read_file)['data']

In [None]:
# Flatten the nested dataset (items -> paragraphs -> qas -> answers) into a
# list with one {'context', 'qas'} entry per paragraph. Only the fields copied
# below are kept. A comprehension is used so that no loop temporaries (in
# particular a variable named `id`, which would hide the builtin) are left
# behind in the kernel namespace.
train_squad = [
    {
        'context': paragraph['context'],
        'qas': [
            {
                'question': qa['question'],
                'id': qa['id'],
                'answers': [
                    {'text': answer['text'], 'answer_start': answer['answer_start']}
                    for answer in qa['answers']
                ],
                'is_impossible': qa['is_impossible'],
            }
            for qa in paragraph['qas']
        ],
    }
    for item in train
    for paragraph in item['paragraphs']
]

In [None]:
# Preview the first two converted paragraphs instead of dumping the whole list.
train_squad[:2]

[{'context': 'Target focused on manufacturing and R&D operations.  2017  Emissions - 66, 609 Metric tonnes.  23% reduction compared to 2020 goals of 20% reduction.   ',
  'qas': [{'question': 'What is the emission reduction mechanism or technology used here? ',
    'id': 430862,
    'answers': [],
    'is_impossible': True},
   {'question': 'What emission reduction target is supposedly aimed?',
    'id': 430864,
    'answers': [{'text': '23% reduction', 'answer_start': 95}],
    'is_impossible': False}]},
 {'context': 'We made efforts to reduce emissions by introducing highly-efficient facilities, etc. However, due to a large increase in production volume, the emission amount was lower than our target. ',
  'qas': [{'question': 'What is the emission reduction mechanism or technology used here? ',
    'id': 430862,
    'answers': [{'text': 'introducing highly-efficient facilities',
      'answer_start': 39}],
    'is_impossible': False},
   {'question': 'What emission reduction target i

In [None]:
# Load the held-out split and keep only the list stored under its 'data' key.
# NOTE(review): the path is relative, while main() wrote '/content/test.json';
# this presumably relies on the working directory being /content — confirm.
with open(r"test.json", "r") as read_file:
    test = json.load(read_file)['data']

In [None]:
# Flatten the held-out split (items -> paragraphs -> qas -> answers) into a
# list with one {'context', 'qas'} entry per paragraph.
# The source must be `test`, not `train`: this list is what the model is
# evaluated on, so it has to contain the examples that were held out.
# A comprehension is used so that no loop temporaries (in particular a
# variable named `id`, which would hide the builtin) leak into the kernel.
test_squad = [
    {
        'context': paragraph['context'],
        'qas': [
            {
                'question': qa['question'],
                'id': qa['id'],
                'answers': [
                    {'text': answer['text'], 'answer_start': answer['answer_start']}
                    for answer in qa['answers']
                ],
                'is_impossible': qa['is_impossible'],
            }
            for qa in paragraph['qas']
        ],
    }
    for item in test
    for paragraph in item['paragraphs']
]

In [None]:
import logging

from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [None]:
# Pick the architecture here; the table below maps each supported choice to
# the (model_type, model_name) pair that is actually used downstream.
model_type = "roberta"

_MODEL_PRESETS = {
    "bert": ("bert", "bert-base-cased"),
    "roberta": ("roberta", "roberta-base"),
    "distilbert": ("distilbert", "distilbert-base-cased"),
    # The remaining choices are aliases: they resolve to a different
    # model_type than the key the user selected.
    "distilroberta": ("roberta", "distilroberta-base"),
    "electra-base": ("electra", "google/electra-base-discriminator"),
    "electra-small": ("electra", "google/electra-small-discriminator"),
}

# An unrecognised choice keeps its model_type and falls back to "roberta-base".
model_type, model_name = _MODEL_PRESETS.get(model_type, (model_type, "roberta-base"))

In [None]:
# Configure the model
# NOTE(review): in the cells visible here the model is constructed with
# `args=train_args`, so this `model_args` object is never handed to it. The
# recorded training log ("Epoch 0/5") matches the 5 epochs in train_args, not
# the 10 set below — confirm which configuration is intended.
model_args = QuestionAnsweringArgs()
model_args.train_batch_size = 32
model_args.evaluate_during_training = True
model_args.n_best_size=2
model_args.num_train_epochs=10


In [None]:
### Advanced Methodology
# Options dictionary passed to QuestionAnsweringModel through `args=`.
# NOTE(review): the meaning of each option is defined by simpletransformers,
# which is not visible here — check its documentation before changing values.
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    # Output folders are derived from model_type (e.g. outputs/roberta).
    "output_dir": f"outputs/{model_type}",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "evaluate_during_training": True,
    "max_query_length": 64,
    "num_train_epochs": 5,
    # "evaluate_during_training_steps": 1000,
    # The wandb run name is set to the checkpoint name (model_name).
    "wandb_project": "Question Answer Application",
    "wandb_kwargs": {"name": model_name},
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size":2
    # Disabled options, kept for reference:
    # "use_early_stopping": True,
    # "early_stopping_metric": "mcc",
    # "n_gpu": 2,
    # "manual_seed": 4,
    # "use_multiprocessing": False,
    # "train_batch_size": 128,
    # "eval_batch_size": 64,
    # "config": {
    #     "output_hidden_states": True
    # }
}

In [None]:
# Build the question-answering model for the selected architecture and
# checkpoint, configured by the train_args dictionary.
model = QuestionAnsweringModel(model_type, model_name, args=train_args)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [None]:
### Remove output folder
!rm -rf outputs

In [None]:
# Train the model
# Fine-tunes on train_squad; test_squad is supplied as eval_data
# (train_args sets "evaluate_during_training": True).
model.train_model(train_squad, eval_data=test_squad)

convert squad examples to features: 100%|██████████| 123/123 [00:00<00:00, 155.91it/s]
add example index and unique id: 100%|██████████| 123/123 [00:00<00:00, 116798.59it/s]


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,▁
correct,▁▁▁▁▁
eval_loss,█▃▂▁▁
global_step,▁▃▅▅▆█
incorrect,█▁▁▁▁
lr,▁
similar,▁████
train_loss,█▅▃▁▃

0,1
Training loss,2.19543
correct,0.0
eval_loss,-4.0513
global_step,75.0
incorrect,1.0
lr,1e-05
similar,1.0
train_loss,2.07861


Running Epoch 0 of 5:   0%|          | 0/16 [00:00<?, ?it/s]


convert squad examples to features:   0%|          | 0/123 [00:00<?, ?it/s][A
convert squad examples to features: 100%|██████████| 123/123 [00:00<00:00, 158.64it/s]

add example index and unique id: 100%|██████████| 123/123 [00:00<00:00, 279771.90it/s]


Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/16 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/16 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/16 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/16 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

(80,
 {'global_step': [16, 32, 48, 64, 80],
  'correct': [0, 0, 0, 0, 0],
  'similar': [2, 2, 2, 2, 2],
  'incorrect': [0, 0, 0, 0, 0],
  'train_loss': [4.119726657867432,
   2.411181688308716,
   3.156445264816284,
   1.832128882408142,
   1.697509765625],
  'eval_loss': [-0.5979461669921875,
   -2.6019287109375,
   -2.9969482421875,
   -3.3046875,
   -3.64013671875]})

In [None]:
# Evaluate the model
# eval_model returns two values, unpacked here as `result` and `texts`.
# The recorded output shows `result` as a dict with the keys 'correct',
# 'similar', 'incorrect' and 'eval_loss'.
# NOTE(review): `texts` presumably holds the per-question details — confirm.
result, texts = model.eval_model(test_squad)

Running Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
result

{'correct': 0, 'similar': 2, 'incorrect': 0, 'eval_loss': -3.64013671875}

In [None]:
# Make predictions with the model
# Input for model.predict: a list of {context, qas} entries, here one context
# with one question (no answers are supplied).
# NOTE(review): the context contains the sequence 'ï¼Œ', which looks like a
# mis-decoded full-width comma — confirm whether the text should be cleaned
# before it is sent to the model.
to_predict = [
    {
        "context": "1.We make office energy efficiency policy to reduce power consumption  2. low-carbon transportationï¼Œ slowly cut back on car usage.",
        "qas": [
            {
                "question": "What is the emission reduction mechanism or technology used here?",
                "id": "0",
            }
        ],
    }
]

In [None]:
# Run inference; predict() returns two values, unpacked as answers and
# probabilities.
answers, probabilities = model.predict(to_predict)

# Show the candidate answers for the first (and only) question.
print(answers[0]['answer'])

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 127.30it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 624.43it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

['reduce power consumption 2. low-carbon transportationï¼Œ slowly cut back on car usage.', 'low-carbon transportationï¼Œ slowly cut back on car usage.']


In [None]:
# Make predictions with the model
# Second example: same question as the first example, asked against a
# different context (one context, one question, no answers supplied).
to_predict_ex2 = [
    {
        "context": "Our base year is 2018/19. We have used science-based target setting tools to calculate a target in line with less than 1.5 degrees of global warming (i.e. .more ambitious than the SBTi tool required for 1.5 degrees) to reduce our scope 1 + 2 emissions by 2030 by 80% using the market-based Scope 2 accounting approach. ",
        "qas": [
            {
                "question": "What is the emission reduction mechanism or technology used here?",
                "id": "1",
            }
        ],
    }
]

In [None]:
# Run inference on the second example. This rebinds `answers` and
# `probabilities`, replacing the values from the previous prediction cell.
answers, probabilities = model.predict(to_predict_ex2)

# Show the candidate answers for the first (and only) question.
print(answers[0]['answer'])

convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 79.62it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 424.40it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

['science-based target setting tools to calculate a target in line with less than 1.5 degrees of global warming (i.e. .more ambitious than the SBTi tool required for 1.5 degrees) to reduce our scope 1 + 2 emissions by 2030 by 80% using the market-based Scope 2 accounting approach.']


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell copies the model outputs
# to a folder under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

def copytree(src, dst, symlinks=False, ignore=None):
    """
    Copy every entry of directory ``src`` into directory ``dst``.

    ``dst`` is created if it does not exist. An entry that already exists in
    ``dst`` is removed first (directories recursively), so each copied entry
    fully replaces the previous one. Entries of ``dst`` that have no
    counterpart in ``src`` are left alone.

    Args:
        src: Path of the directory whose entries are copied.
        dst: Path of the destination directory.
        symlinks: Forwarded to ``shutil.copytree`` for sub-directories only;
            top-level files are copied with ``shutil.copy2``.
        ignore: Forwarded to ``shutil.copytree`` for sub-directories only;
            it is not applied to the top-level entries of ``src``.

    Raises:
        OSError: If listing, removing or copying an entry fails.
    """
    # Plain files are copied with copy2, which does not create missing parent
    # directories, so make sure the destination exists up front.
    os.makedirs(dst, exist_ok=True)

    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)

        # lexists also catches dangling symlinks left in the destination.
        if os.path.lexists(d):
            # Decide by type instead of calling rmtree and catching the error
            # it raises for files and symlinks.
            if os.path.isdir(d) and not os.path.islink(d):
                shutil.rmtree(d)
            else:
                os.unlink(d)

        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)

# Copy the training outputs to Google Drive. The source folder is derived
# from model_type so that it follows the per-model "outputs/<model_type>"
# layout instead of being fixed to one architecture.
copytree(src=f'/content/outputs/{model_type}/', dst='/content/drive/MyDrive/QNA_Run3/')