In [None]:
# pip install torch==1.7.1
# pip install transformer

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [2]:
coqa = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [3]:
del coqa['version']

In [4]:
#required columns in our dataframe
cols = ["text","question","answer"]
#list of lists to create our dataframe
comp_list = []
for index, row in coqa.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        temp_list.append(row["data"]["story"])
        temp_list.append(row["data"]["questions"][i]["input_text"])
        temp_list.append(row["data"]["answers"][i]["input_text"])
        comp_list.append(temp_list)
new_df = pd.DataFrame(comp_list, columns=cols) 
#saving the dataframe to csv file for further loading
new_df.to_csv("CoQA_data.csv", index=False)

In [9]:
new_df['text'][0]

'The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatican Secret Archives were separated from the library at the beginning of t

In [10]:
data = pd.read_csv("CoQA_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [11]:
print("Number of question and answers: ", len(data))

Number of question and answers:  108647


In [27]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', truncation=True, max_length=512)

In [28]:
random_num = np.random.randint(0,len(data))
question = data["question"][random_num]
text = data["text"][random_num]

In [29]:
input_ids = tokenizer.encode(question, text)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 370 tokens.


In [30]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)
for token, id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token,id))

[CLS]        101
was        2,001
he         2,002
alive      4,142
300        3,998
years      2,086
ago        3,283
?          1,029
[SEP]        102
charles    2,798
dickens   19,675
is         2,003
often      2,411
thought    2,245
of         1,997
as         2,004
one        2,028
of         1,997
the        1,996
greatest   4,602
british    2,329
writers    4,898
.          1,012
february   2,337
7          1,021
marked     4,417
the        1,996
200        3,263
##th       2,705
anniversary   5,315
of         1,997
his        2,010
birthday   5,798
.          1,012
yet        2,664
for        2,005
many       2,116
,          1,010
his        2,010
language   2,653
is         2,003
old        2,214
-          1,011
fashioned  13,405
and        1,998
his        2,010
stories    3,441
often      2,411
imp       17,727
##ro       3,217
##ba       3,676
##ble      3,468
.          1,012
so         2,061
why        2,339
do         2,079
so         2,061
many       2,116
people    

In [31]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  361


In [32]:
len(segment_ids)

370

In [35]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [34]:
#token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model(torch.tensor([input_ids]), 
token_type_ids=torch.tensor([segment_ids]))

In [36]:
#tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
print(answer_start)
answer_end = torch.argmax(output.end_logits)
print(answer_end)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))

I am unable to find the answer to this question. Can you please ask another question?

Question:
Was he alive 300 years ago?


NameError: name 'answer' is not defined

In [46]:
random_num = np.random.randint(0,len(data))
question = data["question"][random_num]
text = data["text"][random_num]

<class 'str'>
(CNN) -- The biological mother of a missing 7-year-old Oregon boy has written an open letter to her son saying, "I am sorry that I was not there to protect you." 

"I am so soooooo sorry that this has happened," Desiree Young wrote to Kyron Horman in a letter that was released Monday to NBC News. 

"I will never be able to forgive myself for being so many hours away when you needed me the most. I pray that you come back to me because I am afraid that I can't live without you." 

The letter comes more than two weeks after the boy disappeared on June 4. 

The boy's stepmother, Terri Horman, said she last saw Kyron walking down the hallway towards his classroom at Skyline Elementary School, police said. Cell phone records indicate she may not have been at the school at that time, according to a report in People Magazine. Authorities refused to comment on the report. 

In the emotional letter, Young speaks directly to her young son. "When you come home I will show you all of 

In [62]:
#random_num = np.random.randint(0,len(data))
#question = data["question"][random_num]
#text = data["text"][random_num]

question='Where does Han study at?'
text='Beloit College is a nice school. There are a lot of students like Ethan and Jason. Jason Study at Beloit College. Han study at Missouri College'

input_ids = tokenizer.encode(question, text)
print(input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

output = model(torch.tensor([input_ids]), 
token_type_ids=torch.tensor([segment_ids]))

#tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
print(answer_start)
answer_end = torch.argmax(output.end_logits)
print(answer_end)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))

[101, 2073, 2515, 7658, 2817, 2012, 1029, 102, 19337, 28100, 2267, 2003, 1037, 3835, 2082, 1012, 2045, 2024, 1037, 2843, 1997, 2493, 2066, 6066, 1998, 4463, 1012, 4463, 2817, 2012, 19337, 28100, 2267, 1012, 7658, 2817, 2012, 5284, 2267, 102]
['[CLS]', 'where', 'does', 'han', 'study', 'at', '?', '[SEP]', 'bel', '##oit', 'college', 'is', 'a', 'nice', 'school', '.', 'there', 'are', 'a', 'lot', 'of', 'students', 'like', 'ethan', 'and', 'jason', '.', 'jason', 'study', 'at', 'bel', '##oit', 'college', '.', 'han', 'study', 'at', 'missouri', 'college', '[SEP]']
tensor(37)
tensor(38)

Question:
Where does han study at?

Answer:
Missouri college.


In [57]:
print("\nText:\n{}".format(text.capitalize()))
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))


Text:
Beloit college is a nice school. there are a lot of students. ethan study at beloit college. han study at missouri college

Question:
Where was he from?

Answer:
Missouri.
