# BERT-based online question answerer

In [1]:
import tensorflow as tf

In [2]:
from transformers import *
import transformers

In [3]:
# Display the installed transformers version (useful when reproducing this notebook).
transformers.__version__

'2.8.0'

In [4]:
# Display the installed TensorFlow version (useful when reproducing this notebook).
tf.__version__

'2.1.0'

In [5]:
from googlesearch import search

In [6]:
from lxml import html
import re
import requests

In [7]:
import progressbar

In [8]:
# Pretrained checkpoint shared by the tokenizer and the model; going by its
# name, a BERT-large uncased model fine-tuned on SQuAD.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

In [9]:
# Sample context passage the model extracts its answer from.
text = '''South Park is an American animated sitcom created by Trey Parker and Matt Stone 
and developed by Brian Graden for the Comedy Central television network. The series revolves 
around four boys—Stan Marsh, Kyle Broflovski, Eric Cartman, and Kenny McCormick—and their exploits
in and around the titular Colorado town. The show became infamous for its profanity and dark, 
surreal humor that satirizes a wide range of topics towards a mature audience.'''
# source: https://en.wikipedia.org/wiki/South_Park

In [10]:
# Sample question to be answered from the `text` passage.
question = "What is South Park?"

In [11]:
# Adapted from the Hugging Face usage guide:
# https://huggingface.co/transformers/usage.html?highlight=question%20answering

# Encode the question/passage pair as a single sequence of TensorFlow tensors.
inputs = tokenizer.encode_plus(
    question,
    text,
    add_special_tokens=True,
    return_tensors="tf",
)
input_ids = inputs["input_ids"].numpy()[0]
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)

# The model yields start scores and end scores over the encoded sequence.
answer_start_scores, answer_end_scores = model(inputs)

# Position with the highest start score.
answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
# Position with the highest end score, plus one so it works as an exclusive slice bound.
answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy()[0]

# Turn the selected span of token ids back into readable text.
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
)

print(f"Question: {question}")
print(f"Answer: {answer}\n")

Question: What is South Park?
Answer: an american animated sitcom



In [12]:
def get_score(question, text, max_length=512):
    """Extract the most likely answer to `question` from `text` and score it.

    The question/passage pair is encoded with the notebook-level `tokenizer`
    and run through the notebook-level `model`. The answer is the token span
    between the highest-scoring start position and the highest-scoring end
    position.

    Args:
        question: The question to answer.
        text: The passage to search for the answer.
        max_length: Upper bound on the encoded sequence length. Longer inputs
            are truncated so that long scraped paragraphs cannot exceed the
            model's input limit (512 positions for BERT checkpoints).

    Returns:
        A tuple `(start_score, end_score, answer)`: the maximum start score,
        the maximum end score, and the decoded answer string. The answer is
        empty when the best end position precedes the best start position.
    """
    inputs = tokenizer.encode_plus(
        question,
        text,
        add_special_tokens=True,
        max_length=max_length,  # truncate instead of overflowing the model
        return_tensors="tf",
    )
    input_ids = inputs["input_ids"].numpy()[0]

    start_scores, end_scores = model(inputs)

    start = tf.argmax(start_scores, axis=1).numpy()[0]
    # +1 turns the inclusive end position into an exclusive slice bound.
    end = (tf.argmax(end_scores, axis=1) + 1).numpy()[0]

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[start:end])
    )
    start_score = tf.math.reduce_max(start_scores, axis=1).numpy()[0]
    end_score = tf.math.reduce_max(end_scores, axis=1).numpy()[0]

    return start_score, end_score, answer

In [18]:
# Matches any token that contains a character outside the whitelist
# (ASCII letters, digits, ". , ; '" and space). The earlier pattern ended in a
# JavaScript-style "/g" flag, which Python reads as literal text, so it only
# matched tokens that happened to be followed by "/g".
_NOISE_TOKEN = re.compile(r"\w*[^0-9a-zA-Z.,;' ]\w*")


def _fetch_paragraphs(url, max_paragraphs=5, min_chars=100, timeout=10):
    """Download `url` and return up to `max_paragraphs` cleaned paragraph texts.

    Only <p> elements whose text is longer than `min_chars` characters are
    kept. Returns an empty list when the page cannot be fetched or parsed, so
    a single bad search hit does not abort the whole lookup.
    """
    from lxml import etree  # local import: only needed for the ParserError type

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # skip HTTP error pages
        tree = html.fromstring(response.text)
    except (requests.RequestException, ValueError, etree.ParserError):
        # ValueError: lxml rejects str input that carries an encoding declaration.
        # ParserError: the document is empty.
        return []

    cleaned = []
    for paragraph in tree.findall('.//p'):
        raw = paragraph.text_content()
        if len(raw) > min_chars:
            cleaned.append(_NOISE_TOKEN.sub("", raw))
            if len(cleaned) == max_paragraphs:
                break
    return cleaned


def ask(question, results=1):
    """Search the web for `question` and return the best extracted answers.

    The question is sent to Google search, the first hits are downloaded, and
    a handful of paragraphs from each page are scored with `get_score`.

    Args:
        question: Natural-language question; used both as the search query
            and as the question passed to the model.
        results: Maximum number of answers to return.

    Returns:
        A list of answer strings ordered by descending end score. It may hold
        fewer than `results` items and is empty when no usable paragraph
        could be retrieved.
    """
    print("Searching the internet...")

    urls = list(search(question, tld='com', lang='en', start=0, stop=5))
    text = []

    if urls:
        # Size the bar by the real number of pages so it reflects actual progress.
        with progressbar.ProgressBar(max_value=len(urls)) as bar:
            for done, url in enumerate(urls, start=1):
                text.extend(_fetch_paragraphs(url))
                bar.update(done)
    print("Looking for an answer...")

    scores = [get_score(question, p) for p in text]

    # NOTE(review): candidates are ranked by the end score alone (index 1);
    # the start score is ignored, as in the original implementation.
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)[:results]
    return [x[2] for x in ranked]

In [19]:
# End-to-end run: search the web for the question and show the top-ranked answer.
ask("What is South Park?")

Searching the internet...


100% (10 of 10) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


Looking for an answer...


['an american animated television sitcom']