In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download the CoQA training JSON from the Stanford download server and preview it.
coqa_train_url = 'http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json'
coqa = pd.read_json(coqa_train_url)
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [4]:
# Drop the constant "version" column. Using drop(..., errors="ignore") instead of
# `del` keeps this cell re-runnable: a second run no longer fails with KeyError.
coqa = coqa.drop(columns=["version"], errors="ignore")

# Flatten the nested records into one row per (story, question, answer) triple.
# Questions and answers are paired by their position inside each story record.
cols = ["text", "question", "answer"]
comp_list = [
    [story["story"], asked["input_text"], story["answers"][i]["input_text"]]
    for story in coqa["data"]
    for i, asked in enumerate(story["questions"])
]

# Build the frame in one go and persist it so later cells can reload it from disk.
new_df = pd.DataFrame(comp_list, columns=cols)
new_df.to_csv("CoQA_data.csv", index=False)

In [5]:
# Reload the flattened question/answer table from the CSV file and preview it.
qa_csv_path = "CoQA_data.csv"
data = pd.read_csv(qa_csv_path)
data.head(n=5)

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [6]:
# Report how many question/answer rows the table holds.
row_count = len(data)
print("Number of question and answers: ", row_count)

Number of question and answers:  108647


In [7]:
# Load the question-answering model and its tokenizer from the same checkpoint;
# naming the checkpoint once keeps the two in sync.
checkpoint_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(checkpoint_name)
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQue

In [12]:
# Pick one random question/passage pair from the dataset to inspect.
random_num = np.random.randint(0, len(data))
question = data.loc[random_num, "question"]
text = data.loc[random_num, "text"]

# Encode the question and the passage as a single pair (question first).
# Only the passage is truncated, so a long story cannot push the input past
# 512 tokens -- the position limit of this BERT checkpoint (NOTE: confirm the
# limit if the checkpoint is swapped).
input_ids = tokenizer.encode(question, text, truncation="only_second", max_length=512)
print("The input has a total of {} tokens.".format(len(input_ids)))
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Show every token next to its vocabulary id. The loop variable is `token_id`
# rather than `id` so the builtin id() is not overwritten in the kernel namespace.
for token, token_id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token, token_id))

The input has a total of 335 tokens.
[CLS]        101
can        2,064
you        2,017
find       2,424
it         2,009
in         1,999
the        1,996
dictionary   9,206
?          1,029
[SEP]        102
a          1,037
new        2,047
word       2,773
is         2,003
becoming   3,352
more       2,062
and        1,998
more       2,062
popular    2,759
on         2,006
the        1,996
internet   4,274
in         1,999
china      2,859
-          1,011
but        2,021
no         2,053
one        2,028
knows      4,282
quite      3,243
what       2,054
it         2,009
means      2,965
.          1,012
the        1,996
word       2,773
"          1,000
du         4,241
##ang      5,654
"          1,000
is         2,003
so         2,061
new        2,047
that       2,008
you        2,017
can        2,064
'          1,005
t          1,056
even       2,130
find       2,424
it         2,009
in         1,999
the        1,996
chinese    2,822
dictionary   9,206
.          1,012
but    

In [14]:
# The first [SEP] token marks the end of the question segment.
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)

# Segment A is everything up to and including that [SEP] (the question side).
num_seg_a = sep_idx + 1
print("Number of tokens in segment A: ", num_seg_a)

# Segment B is the remainder of the input (the passage side).
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)

# 0 for question tokens, 1 for passage tokens.
segment_ids = [0] * num_seg_a + [1] * num_seg_b
assert len(segment_ids) == len(input_ids)

# input_ids represent the input; segment_ids differentiate question from text.
# Inference only, so skip gradient tracking.
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

# Most likely start/end token positions, as plain ints so they can be used for
# list slicing and range() without relying on tensor-to-index conversion.
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits))

print("\nQuestion:\n{}".format(question.capitalize()))
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end + 1])
    print("\nAnswer:\n{}.".format(answer.capitalize()))
else:
    # An end before the start is not a usable span. Reset `answer` so a stale
    # value from an earlier run is never shown (previously this path hit an
    # undefined or stale `answer` in the final print).
    answer = ""
    print("I am unable to find the answer to this question. Can you please ask another question?")

SEP token index:  9
Number of tokens in segment A:  10
Number of tokens in segment B:  325
nQuestion:nCan you find it in the dictionary?
nAnswer:nYou can ' t even find it in the chinese dictionary.


In [15]:
# Rebuild a readable answer from the predicted token span: a token that starts
# with "##" is a continuation piece and is glued onto the previous token without
# its marker; every other token is appended after a single space.
answer = tokens[answer_start]

for i in range(answer_start+1, answer_end+1):
    piece = tokens[i]
    answer += piece[2:] if piece.startswith("##") else " " + piece

In [29]:
def question_answer(question, text, max_length=512):
    """Answer ``question`` from ``text`` using the notebook's global QA model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        question: The question to ask, as a string.
        text: The passage that should contain the answer.
        max_length: Upper bound on the number of tokens (question, passage and
            special tokens together) fed to the model. Only the passage is
            truncated. The default of 512 is the usual BERT position limit --
            confirm it if a different checkpoint is loaded.

    Returns:
        The predicted answer, capitalized; or the (capitalized) sentence
        "Unable to find the answer to your question." when the model points at
        ``[CLS]`` or predicts an end position before the start position.

    Side effects:
        Prints the predicted answer.
    """
    # Tokenize question and text as a pair; truncate only the passage so a long
    # text cannot exceed the model's input limit.
    input_ids = tokenizer.encode(
        question, text, truncation="only_second", max_length=max_length
    )

    # String version of the token ids, used to rebuild the answer text.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: 0 up to and including the first [SEP] (question side),
    # 1 for everything after it (passage side).
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Plain ints so they can index/slice the token list directly.
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    no_answer = "Unable to find the answer to your question."
    if answer_end >= answer_start:
        # Glue "##" continuation pieces onto the previous token; separate
        # every other token with a space.
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith("##"):
                answer += token[2:]
            else:
                answer += " " + token
    else:
        # End before start is not a valid span; previously `answer` was left
        # unbound here and the function crashed with UnboundLocalError.
        answer = no_answer

    # A span starting at [CLS] is treated as "no answer".
    if answer.startswith("[CLS]"):
        answer = no_answer

    answer = answer.capitalize()
    print("\nPredicted answer:\n{}".format(answer))
    return answer

In [18]:
# Passage used for a manual check of question_answer.
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during "Motown 25," an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter "Clyde" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me," Orange said. "I hope that through that glove people can see what he was trying to say in his music and what he said in his music." Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""

question = "Where was the Auction held?"

question_answer(question, text)

# Look up the reference answer for the same question in the dataset. The lookup
# is guarded because a question that is not in the table would otherwise raise
# IndexError on the first-element access.
reference_answers = data.loc[data["question"] == question, "answer"]
if reference_answers.empty:
    print("Original answer:\nNo reference answer found for this question.")
else:
    print("Original answer:\n{}".format(reference_answers.iloc[0]))


nPredicted answer:nHard rock cafe in new york ' s times square
Original answer:n Hard Rock Cafe


In [26]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [27]:
def translate_to_romanian(text):
    """Translate ``text`` to Romanian, falling back to the input on failure.

    The source language is auto-detected by the translator. If the translation
    call raises, the error is printed and the untranslated ``text`` is
    returned instead.
    """
    from deep_translator import GoogleTranslator

    try:
        return GoogleTranslator(source='auto', target='ro').translate(text)
    except Exception as err:
        # Best effort: report the problem and hand back the original text.
        print(f"Translation error: {err}")
    return text

In [33]:
# Interactive session: one passage, then as many questions about it as the
# user wants. Each answer is printed by question_answer and again in Romanian.
text = input("Please enter your text:")
question = input("Please enter your question:")

keep_asking = True
while keep_asking:
    answer = question_answer(question, text)
    print(f"Answer in romanian:\n{translate_to_romanian(answer)}")

    # Re-prompt until the reply starts with Y or N. Stripping and slicing
    # (instead of response[0]) means an empty reply just re-prompts rather than
    # raising IndexError; upper() also accepts lowercase "y"/"n".
    while True:
        response = input("Do you want to ask another question based on this text (Y/N)? ")
        choice = response.strip()[:1].upper()
        if choice == "Y":
            question = input("Please enter your question:")
            break
        if choice == "N":
            print("Bye!")
            keep_asking = False
            break

nPredicted answer:nVideo games
Answer in romanian:
Jocuri video
nPredicted answer:nProgrammer
Answer in romanian:
Programator


KeyboardInterrupt: Interrupted by user