In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [3]:
from google.colab import drive
#attach Google Drive to the Colab runtime under a named mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
#location of the CoQA v1.0 training JSON
COQA_TRAIN_URL = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
coqa = pd.read_json(COQA_TRAIN_URL)
#peek at the first rows of the raw frame
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [5]:
#drop the "version" column; errors="ignore" keeps this cell re-runnable,
#whereas `del coqa["version"]` raises KeyError on a second execution
coqa = coqa.drop(columns=["version"], errors="ignore")

In [6]:
#column names of the flattened question/answer table
cols = ["text","question","answer"]
#one [story, question, answer] row for every question of every entry in coqa["data"]
comp_list = [
    [entry["story"], entry["questions"][k]["input_text"], entry["answers"][k]["input_text"]]
    for entry in coqa["data"]
    for k in range(len(entry["questions"]))
]
new_df = pd.DataFrame(comp_list, columns=cols)
#persist the table as CSV so later cells can reload it directly
new_df.to_csv("CoQA_data.csv", index=False)

In [7]:
#keep_default_na=False: all three columns are free text, so literal cells such as
#"None", "NA" or "null" must stay strings instead of being parsed as NaN
data = pd.read_csv("CoQA_data.csv", keep_default_na=False)
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [8]:
#number of rows (one per question/answer pair) in the reloaded table
num_pairs = len(data)
print("Number of question and answers: ", num_pairs)

Number of question and answers:  108647


In [9]:
#model and tokenizer are loaded from the same checkpoint name
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [10]:
#draw one random row label and take that row's question and passage
random_num = np.random.randint(0,len(data))
question, text = data.loc[random_num, "question"], data.loc[random_num, "text"]

In [11]:
#encode question and passage together as one pair of segments
#only the passage (second segment) is truncated, so the encoded pair never has more
#tokens than the model has position embeddings for; longer inputs would make the
#forward pass fail
input_ids = tokenizer.encode(
    question,
    text,
    truncation="only_second",
    max_length=model.config.max_position_embeddings,
)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 289 tokens.


In [12]:
#map every id back to its token string and print the two side by side
tokens = tokenizer.convert_ids_to_tokens(input_ids)
#the loop variable is `token_id` rather than `id`, so the builtin id() is not
#shadowed for the rest of the notebook
for token, token_id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token,token_id))

[CLS]        101
for        2,005
what       2,054
?          1,029
[SEP]        102
brazil     4,380
has        2,038
both       2,119
modern     2,715
technologies   6,786
in         1,999
the        1,996
center     2,415
-          1,011
south      2,148
portion    4,664
,          1,010
counting  10,320
with       2,007
lt         8,318
##e        2,063
,          1,010
3          1,017
##g        2,290
hs        26,236
##pa       4,502
,          1,010
ds        16,233
##l        2,140
is         2,003
##db      18,939
based      2,241
digital    3,617
tv         2,694
.          1,012
other      2,060
areas      2,752
of         1,997
the        1,996
country    2,406
,          1,010
particularly   3,391
the        1,996
north      2,167
and        1,998
northeast   4,794
regions    4,655
,          1,010
lack       3,768
even       2,130
basic      3,937
analog    11,698
ps         8,827
##t        2,102
##n        2,078
telephone   7,026
lines      3,210
.          1,012
this

In [13]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  4
Number of tokens in segment A:  5
Number of tokens in segment B:  284


In [14]:
#token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model(torch.tensor([input_ids]),  token_type_ids=torch.tensor([segment_ids]))

In [15]:
#tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
For what?

Answer:
[cls] for what ? [sep].


In [16]:
#rebuild the answer text: the first token is taken as-is, a following token that
#starts with "##" is glued on without the marker, any other token gets a leading space
pieces = [tokens[answer_start]]
for tok in tokens[answer_start+1:answer_end+1]:
    pieces.append(tok[2:] if tok.startswith("##") else " " + tok)
answer = "".join(pieces)

In [17]:
def question_answer(question, text):
    """Print the answer span the QA model predicts for `question` within `text`.

    Relies on the notebook-level `tokenizer` and `model` objects.

    Args:
        question: the question string (encoded as the first segment).
        text: the passage string to search (encoded as the second segment).

    Returns:
        None. The outcome is printed after a "Predicted answer:" header; a fixed
        "unable to find" message is printed when the predicted end position
        precedes the start position or when the span begins at the [CLS] token.
    """

    #tokenize question and text as a pair; only the text is truncated, so the pair
    #never has more tokens than the model has position embeddings for
    input_ids = tokenizer.encode(
        question,
        text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
    )

    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    #segment IDs: 0 up to and including the first [SEP] (question), 1 afterwards (text)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx+1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)

    #model output using input_ids and segment_ids; inference only, so skip autograd
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    #highest-scoring start and end positions, converted to plain ints for indexing
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    #start from the fallback so `answer` is always bound, including the case
    #where the predicted end comes before the predicted start
    no_answer = "Unable to find the answer to your question."
    answer = no_answer
    if answer_end >= answer_start:
        #reconstruct the text: "##" continuation pieces are glued to the previous
        #token, every other token is separated by a space
        answer = tokens[answer_start]
        for tok in tokens[answer_start+1:answer_end+1]:
            if tok[0:2] == "##":
                answer += tok[2:]
            else:
                answer += " " + tok

    #a span that begins at [CLS] is treated as "no answer"
    if answer.startswith("[CLS]"):
        answer = no_answer

    print("\nPredicted answer:\n{}".format(answer.capitalize()))

In [24]:
#demo inputs for question_answer: `q` is the question, `page` is the passage to search
#`page` is a triple-quoted literal, so its line breaks are part of the string
q='where the knowledge is opened?'
page="""The One Source of Truth nourishes the Tree of Knowledge, stretching its roots
into Infinite Eternity. In each cycle of human evolution it bears good fruit in the
form of a Great Book, which serves as the source of all knowledge and wisdom
for humanity for a specific period. And every Book of this kind, like a tree,
grows and yields its own fruit for the good of the whole world.
At the dawn of the conscious evolution of humanity, approximately eighteen
million years ago, the Sons of Light brought the Fiery Teaching of Kalachakra
from the Distant Worlds as a gift for the people of Lemuria, sowing the first
seeds of all the existing religions of the Earth. The new pages of knowledge
opened in the one universal language of the Sun — Senzar — inviting them to
further delve into the books of enlightened knowledge. And thus ever after, each
epoch aimed to reveal the secret pages of universal knowledge, thereby
facilitating the progress of human consciousness round by round.
Many of the innermost pages of the Great Book were revealed to the human
population of Atlantis. They knew more than anyone else, for many secrets of
surrounding Nature were confided to them. And their best representatives had
gained the right to look beyond the field of vision, unveiling the invisible Life.
But the Atlanteans had not accepted the heart as the organ to which one should
subordinate the intellect — they honoured the mind above all. And they
disappeared from the stage of Life without learning this simple truth:
“Knowledge without Love is dead.” Thus, because of their pride, people
deprived themselves of the Secret Wisdom, gradually forgetting their pristine
universal language. Nevertheless, the Great Initiates remained in every land and
nation, guarding the Ancient Wisdom as a sacred trust."""

In [25]:
#run the QA helper on the demo question and passage; the helper prints its result
question_answer(question=q, text=page)


Predicted answer:
In the one universal language of the sun — senzar
