# Synthetic SQuAD dataset based on MKQA (Google)

In [1]:
import json
import html2text
import nltk
import os
import re
import requests
import threading
from time import sleep

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [2]:
# Language-model name and the MKQA dump; the dump is read line by line as JSON.
LM_NAME = 'bert-base-multilingual-cased'
INPUT_FILE = '../data/mkqa/mkqa.jsonl'

# Alternative (Spanish) configuration, kept for reference.
#LANG_NAME = 'spanish'
#LANG_CODE = 'ES-es'
#REGION_CODE = 'es'

# Active (Japanese) configuration.
# LANG_NAME selects the NLTK stopword corpus. LANG_CODE and REGION_CODE are the
# defaults sent as the `hl` and `gl` parameters of google_search(). REGION_CODE is
# also passed to main(), where it indexes the MKQA `queries` / `answers` dicts.
# NOTE(review): confirm that one REGION_CODE value is valid for all of those uses.
LANG_NAME = 'japanese'
LANG_CODE = 'ja-JP'
REGION_CODE = 'jp'

# Per-region output directory for the generated SQuAD-style files.
OUTPUT_PATH = '../artifacts/synthetic_google/%s/' % REGION_CODE

In [3]:
# Read the MKQA dump: one JSON document per line.
with open(INPUT_FILE, 'r', encoding='utf-8') as fp:
    mkqa_dataset = [json.loads(jline) for jline in fp]

In [4]:
# Stopwords of the target language, or an empty set when NLTK has no list for it.
STOPWORDS = (set(nltk.corpus.stopwords.words(LANG_NAME))
             if LANG_NAME in nltk.corpus.stopwords.fileids()
             else set())

STOPWORDS

set()

In [5]:
def str_to_num(text):
    """Convert a numeric string to ``int`` when possible, otherwise to ``float``.

    Args:
        text: Value to convert (typically a string such as ``'8'`` or ``'16.0'``).

    Returns:
        An ``int`` if ``int(text)`` succeeds, else the result of ``float(text)``.

    Raises:
        ValueError / TypeError: Propagated from ``float(text)`` when the value
            is not numeric at all.
    """
    try:
        return int(text)
    except (ValueError, TypeError):
        # Only conversion failures fall through to float(); a bare ``except``
        # would also trap KeyboardInterrupt and SystemExit.
        return float(text)

## Google Search functions

Useful resources:
- API usage: https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list
- Free API Key: https://developers.google.com/custom-search/v1/introduction
- Setup Search Engine and get ID: https://cse.google.com/

A paid API key can be obtained from the Google Cloud console.

In [6]:
# Credentials for the Google Custom Search JSON API.
# They are read from the environment so real keys never have to be written into
# the notebook; the 'XXXX' placeholders remain as defaults.
API_KEY = os.environ.get('GOOGLE_API_KEY', 'XXXX')
SEARCH_ENGINE_ID = os.environ.get('GOOGLE_CSE_ID', 'XXXX')

In [7]:
def google_search(query, top_results=5, lang_code=LANG_CODE, region_code=REGION_CODE, recall=False):
    """Query the Google Custom Search JSON API and return the top result links.

    Args:
        query: Search text; double quotes are stripped before sending.
        top_results: Maximum number of results to return.
        lang_code: Value sent as the ``hl`` parameter.
        region_code: Value sent as the ``gl`` parameter.
        recall: Internal flag set when the function re-invokes itself with
            Google's spelling correction; prevents a second retry.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts (possibly empty). Results
        lacking a title or link, and links containing ``.pdf``, are skipped.

    Raises:
        Exception: When the API answers with a non-200 HTTP status.
    """
    query = re.sub(r'"', '', query)
    params = {
        'key': API_KEY,
        'cx': SEARCH_ENGINE_ID,
        'q': query,
        'hl': lang_code,
        'gl': region_code,
    }
    url = 'https://www.googleapis.com/customsearch/v1'
    # A timeout keeps one stalled connection from blocking the whole run.
    r = requests.get(url=url, params=params, timeout=20)
    if r.status_code != 200:
        raise Exception('GSearch - HTTP Status: %d - Query: %s' % (r.status_code, query))

    raw_data = r.json()

    try:
        if 'items' not in raw_data:
            if recall:
                # Already retried once with the corrected spelling: give up.
                return []
            if 'spelling' in raw_data and 'correctedQuery' in raw_data['spelling']:
                # Retry exactly once with Google's suggested spelling.
                return google_search(query=raw_data['spelling']['correctedQuery'],
                                     top_results=top_results,
                                     lang_code=lang_code,
                                     region_code=region_code,
                                     recall=True)
            print('GSearch - No items - Query: %s' % (query))
            return []

        parsed_data = []
        for item in raw_data['items']:
            if 'title' not in item or 'link' not in item:
                # Bad links
                continue
            if re.search(r'\.pdf', item['link']):
                # Skip pdfs
                continue
            parsed_data.append({
                'title': item['title'],
                'url': item['link'],
            })
        return parsed_data[:top_results]
    except Exception:
        # Show which query broke the parsing, then re-raise with the original traceback.
        print()
        print('Query: %s' % query)
        raise

In [8]:
# Smoke test: Spanish question using the default language/region parameters.
query = 'Quien es Fernando Alonso?'
google_search(query)

[{'title': 'Fernando Alonso - Wikipedia, la enciclopedia libre',
  'url': 'https://es.wikipedia.org/wiki/Fernando_Alonso'},
 {'title': 'Home - Fernando Alonso Official Site',
  'url': 'https://www.fernandoalonso.com/'},
 {'title': 'Fernando Alonso | Wiki La Isla, El Reality | Fandom',
  'url': 'https://laislaelreality.fandom.com/es/wiki/Fernando_Alonso'},
 {'title': 'Fernando Alonso. Biografía',
  'url': 'https://www.biografiasyvidas.com/reportaje/fernando_alonso/'},
 {'title': '¿Quién es la novia de Fernando Alonso?',
  'url': 'https://fansided.com/es/posts/quien-es-novia-fernando-alonso-01egrtssax58'}]

In [9]:
# Smoke test: the same question in Japanese.
query = 'フェルナンドアロンソは誰ですか？'
google_search(query)

[{'title': 'フェルナンド・アロンソ：インタビュー 【 F1-Gate.com 】',
  'url': 'https://f1-gate.com/alonso/f1_6697.html'},
 {'title': 'フェルナンド・アロンソは「インディ500」で勝てるか？100年の ...',
  'url': 'https://news.yahoo.co.jp/byline/tsujinohiroshi/20170526-00071370/'},
 {'title': '送料無料 中古 東大なんか入らなきゃよかった 誰も教えてくれなかっ ...',
  'url': 'http://cranebrewing.com/9-79-750bb14/Y_jJm_ZjZh_Mj/bec_fccde_/14-51-15/12731290619292/becfcc/113'},
 {'title': 'フェルナンド・アロンソ - Wikipedia',
  'url': 'https://ja.wikipedia.org/wiki/%E3%83%95%E3%82%A7%E3%83%AB%E3%83%8A%E3%83%B3%E3%83%89%E3%83%BB%E3%82%A2%E3%83%AD%E3%83%B3%E3%82%BD'},
 {'title': '【動画】アロンソがルノーF1のウェアでメッセージ「『お願い ...',
  'url': 'https://www.as-web.jp/f1/598441'}]

## HTML-to-Text functions

`!pip install html2text`

In [10]:
def get_page_html(url, max_tries=2):
    """Download a page and return its decoded text, or ``''`` when it cannot be fetched.

    Args:
        url: Page address.
        max_tries: Number of attempts before giving up on transient failures.

    Returns:
        The decoded body on HTTP 200. ``''`` on any 4xx status, on
        non-retryable request errors, or once ``max_tries`` is exhausted.

    Raises:
        Any exception other than the request errors handled below.
    """
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }

    n_try = 0
    last_code = -1
    while n_try < max_tries:
        try:
            r = requests.get(url=url, headers=headers, verify=False, timeout=20)
        except (requests.exceptions.ChunkedEncodingError,
                requests.exceptions.Timeout,
                requests.exceptions.ContentDecodingError):
            # Transient failure: back off, then retry. This clause must stay ahead
            # of ConnectionError because ConnectTimeout derives from both.
            n_try += 1
            sleep(5)
            continue
        except (requests.exceptions.TooManyRedirects,
                requests.exceptions.InvalidSchema,
                requests.exceptions.ConnectionError,
                UnicodeDecodeError):
            # Not worth retrying: treat the page as empty.
            return ''

        last_code = r.status_code
        if 400 <= last_code < 500:
            # Client errors will not improve on retry.
            return ''
        if last_code == 200:
            # Trust a charset declared by the server. Only when none is declared,
            # detect it from the body and fall back to UTF-8. Forcing UTF-8
            # unconditionally garbles pages served in e.g. Shift_JIS or EUC-JP.
            if 'charset' not in r.headers.get('Content-Type', '').lower():
                r.encoding = r.apparent_encoding or 'utf-8'
            return r.text

        # Any other status (e.g. 5xx): short pause, then retry.
        n_try += 1
        sleep(2)

    # All attempts used up; -1 means no HTTP response was ever received.
    print('')
    print('PageContent - HTTP Status: %d - URL: %s' % (last_code, url))
    return ''

In [11]:
def html_to_paragraphs(title, html, html_parser, min_length=50, max_length=1300, group_max_length=1000):
    """Convert HTML into a list of title-prefixed plain-text paragraphs.

    Args:
        title: Text prepended (with a space) to every returned paragraph.
        html: Raw HTML string.
        html_parser: Object exposing ``handle(html) -> str``.
        min_length: Minimum length (title included) for a paragraph to be kept.
        max_length: Paragraphs longer than this are split into sentence groups.
        group_max_length: A sentence group is closed once it exceeds this length.

    Returns:
        List of paragraphs; ``[]`` when the parser fails.
    """
    try:
        text = html_parser.handle(html)
    except Exception:
        # Unparseable page: nothing to extract.
        return []

    # Join titles to paragraph
    text = re.sub(r'\#+([^\n]+)\n+', r'\1 ', text)

    # Remove rubbish of Wikipedia (if it is wiki)
    text = re.sub(r'\[editar[^\]]*\]', '', text)
    text = re.sub(r'\[\d+\]', '', text)

    paragraphs = re.split(r'\n\n+', text)
    paragraphs = [re.sub(r'\#+', ' ', x) for x in paragraphs]
    paragraphs = [x.strip() for x in paragraphs]
    paragraphs = [re.sub(r'\s+', ' ', x) for x in paragraphs]
    paragraphs = [x for x in paragraphs if len(x) > 5]

    measured_paragraphs = []
    for p in paragraphs:
        if len(p) > max_length:
            # Big paragraph: accumulate sentences and close a group once it has
            # grown past group_max_length.
            grouped_sentences = ''
            for sentence in nltk.tokenize.sent_tokenize(p):
                if len(grouped_sentences) > group_max_length:
                    measured_paragraphs.append('%s %s' % (title, grouped_sentences))
                    grouped_sentences = ''
                grouped_sentences = ('%s %s' % (grouped_sentences, sentence)).strip()

            # Flush the trailing group. Previously it was dropped, losing the end
            # of every long paragraph (or all of it when only one group was built).
            if grouped_sentences:
                tail = '%s %s' % (title, grouped_sentences)
                if len(tail) >= min_length:
                    measured_paragraphs.append(tail)
        else:
            # The paragraph is not too big
            p = '%s %s' % (title, p)
            if len(p) >= min_length:
                measured_paragraphs.append(p)

    return measured_paragraphs

In [12]:
def get_html_parser():
    """Create an ``html2text.HTML2Text`` converter with links, emphasis, images
    and anchors ignored and ``unicode_snob`` enabled."""
    converter = html2text.HTML2Text()
    enabled_options = (
        'ignore_links',
        'ignore_emphasis',
        'ignore_images',
        'ignore_anchors',
        'unicode_snob',
    )
    for option_name in enabled_options:
        setattr(converter, option_name, True)
    return converter

In [13]:
# Smoke test: download one Wikipedia page (single attempt) and split it into
# title-prefixed paragraphs. The names assigned here are module-level globals.
html_parser = get_html_parser()
title = 'SAMPLE TITLE'
url = 'https://es.wikipedia.org/wiki/Frozen_II'
html = get_page_html(url, max_tries=1)
html_to_paragraphs(title, html, html_parser)

['SAMPLE TITLE Frozen II De Wikipedia, la enciclopedia libre',
 'SAMPLE TITLE Frozen II --- Ficha técnica Dirección|',
 'SAMPLE TITLE Producción| Peter Del Vecho Guion| Jennifer Lee Historia| Chris Buck Jennifer Lee Marc E. Smith Kristen Anderson-Lopez Robert Lopez Música| Kristen Anderson-Lopez Fotografía| Tracy Scott Beattie Mohit Kallianpur Montaje| Jeff Draheim Protagonistas| Idina Menzel Kristen Bell Jonathan Groff Josh Gad Aurora Aksnes Ver todos los créditos (IMDb) Datos y cifras País| Estados Unidos Año| 2019 Estreno| 21 de noviembre de 2019 21 de noviembre de 2019 22 de noviembre de 2019 22 de noviembre de 2019 22 de noviembre de 2019 28 de noviembre de 2019\u200b 29 de noviembre de 2019 5 de diciembre de 2019 5 de diciembre de 2019 2 de enero de 2020 2 de enero de 2020 Género| musical fantasía animada por computadora Duración| 103 minutos\u200b Clasificación| México: AA: Apto para todo público y comprensible para menores de 7 años TE T A A Idioma(s)| Inglés Compañías Producto

## MKQA functions

In [14]:
def get_es_number_unit(raw_unit_name, is_plural=False):
    """Return the Spanish surface forms of an MKQA unit name.

    Each table entry holds two lists: singular forms first, plural forms second.
    An unknown unit name yields an empty list.
    """
    es_conversion = {
        'Antes de la era vulgar': [['a.C.'], ['a.C.']],
        'Galones': [['galón'], ['galones']],
        'Millas por hora': [['mph', 'milla por hora'], ['mph', 'millas por hora']],
        'acre': [['acre'], ['acres']],
        'antes del Mediodia': [['AM', 'A.M.'], ['AM', 'A.M.']],
        'año terrestre': [['año'], ['años']],
        'caballo de potencia metrico': [['caballo de potencia', 'caballo', 'hp', 'cv'],
                                        ['caballos de potencia', 'caballos', 'hp', 'cv']],
        'centímetro': [['cm', 'centímetro'], ['cm', 'centímetros']],
        'día': [['día'], ['días']],
        'dólar estadounidense': [['dólar', '$'], ['dólares', '$']],
        'episodio': [['episodio'], ['episodios']],
        'escala Fahrenheit': [['grado Fahrenheit', 'Fahrenheit', 'ºF'],
                              ['grados Fahrenheit', 'Fahrenheits', 'ºF']],
        'estaciones del año': [['temporada'], ['temporadas']],
        'grados centigrados': [['grado centigrado', 'grado', 'ºC'],
                               ['grados centigrados', 'grados', 'ºC']],
        'gramo': [['gramo', 'gr', 'g'], ['gramos', 'gr', 'g']],
        'hora': [['hora', 'h'], ['horas', 'h']],
        'kilometraje': [['kilómetro', 'km'], ['kilómetros', 'km']],
        'libra avoirdupois': [['libra avoirdupois', 'libra', 'lb'],
                              ['libras avoirdupois', 'libras', 'lb']],
        'light año terrestre': [['año luz'], ['años luz']],
        'mes sinódico': [['mes sinódico', 'mes'], ['meses sinódicos', 'meses']],
        'metros': [['metro', 'm'], ['metros', 'm']],
        'metros por segundo': [['metro por segundo', 'mps', 'm/s'],
                               ['metros por segundo', 'mps', 'm/s']],
        'mililitro': [['mililitro', 'ml'], ['mililitros', 'ml']],
        'milimetro': [['milímetro', 'mm'], ['milímetros', 'mm']],
        'milla': [['milla', 'mi'], ['millas', 'mi']],
        'millas cuadradas': [['milla cuadrada'], ['millas cuadradas']],
        'minuto': [['minuto'], ['minutos']],
        'onza': [['onza'], ['onzas']],
        'other currency': [[], []],  # Nothing to do
        'other unit': [[], []],  # Nothing to do
        'palabra': [['palabra'], ['palabras']],
        'pie': [['pie'], ['pies']],
        'pies cuadrados': [['pie cuadrado'], ['pies cuadrados']],
        'post meridiem (time)': [['PM', 'P.M.'], ['PM', 'P.M.']],
        'pulgada': [['pulgada'], ['pulgadas']],
        'segundos': [['segundo'], ['segundos']],
        'septenario': [['septenario'], ['septenarios']],
        'tanto por ciento': [['porcentaje', '%'], ['porcentaje', '%']],
    }

    forms = es_conversion.get(raw_unit_name)
    if not forms:
        return []
    # Index 0 holds the singular forms, index 1 the plural ones; return a copy.
    return list(forms[int(is_plural)])

def parse_nwu_answer_es(raw_answer):
    """
    Parses Spanish answers of type number_with_unit (nwu).

    Accepts ``'<number> <unit>'`` (e.g. ``'16.0 año terrestre'``) or the range
    form ``'<number> <number> <unit>'``.

    Returns:
        Candidate answer strings; ``[]`` when the input is not in either format
        or its numeric part cannot be converted. A range whose unit has no known
        surface form also yields ``[]``.
    """
    def _fmt(value):
        # Render whole floats without the trailing '.0' (16.0 -> '16'), since
        # running text writes '16 años', not '16.0 años'.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    x_interval = re.search(r'^([\d.]+) ([\d.]+) (.+)$', raw_answer)
    if x_interval:
        try:
            unit_value_1 = str_to_num(x_interval[1])
            unit_value_2 = str_to_num(x_interval[2])
        except ValueError:
            # e.g. '1.2.3': matches the pattern but is not a number.
            return []
        answers = []
        # Ranges always take the plural unit; no unit name means no answers.
        for unit_name in get_es_number_unit(x_interval[3], is_plural=True):
            answers.append('entre %s y %s %s' % (_fmt(unit_value_1), _fmt(unit_value_2), unit_name))
            answers.append('desde %s hasta %s %s' % (_fmt(unit_value_1), _fmt(unit_value_2), unit_name))
        return answers

    x_single = re.search(r'^([\d.]+) (.+)$', raw_answer)
    if x_single is None:
        return []
    try:
        unit_value = str_to_num(x_single[1])
    except ValueError:
        return []

    # Spanish uses the singular only for exactly one unit ('1 año', '0 años', '2 años').
    unit_names = get_es_number_unit(x_single[2], is_plural=(unit_value != 1))
    if len(unit_names) == 0:
        return [_fmt(unit_value)]
    return ['%s %s' % (_fmt(unit_value), unit_name) for unit_name in unit_names]

In [15]:
def get_ja_number_unit(raw_unit_name):
    """Return the Japanese surface forms of a unit name.

    The unit text is used as-is. A missing or empty unit yields ``[]`` instead
    of ``[None]``, so callers never interpolate ``None`` into an answer string.
    """
    if not raw_unit_name:
        return []
    return [raw_unit_name]

def parse_nwu_answer_ja(raw_answer):
    """
    Parses Japanese answers of type number_with_unit (nwu).

    Accepts ``'<number>[ <unit>]'`` (e.g. ``'18.0 年'``) or the range form
    ``'<number> <number>[ <unit>]'``; the unit is optional in both.

    Returns:
        Candidate answer strings; ``[]`` when the input is not in either format
        or its numeric part cannot be converted.
    """
    def _fmt(value):
        # Render whole floats without the trailing '.0' (18.0 -> '18').
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def _units(raw_unit):
        # The unit group is optional in the patterns below and may be None;
        # drop missing/empty entries so 'None' never leaks into an answer.
        return [u for u in get_ja_number_unit(raw_unit) if u]

    answers = []

    x_interval = re.search(r'^([\d.]+) ([\d.]+)(?: (.+))?$', raw_answer)
    if x_interval:
        try:
            unit_value_1 = _fmt(str_to_num(x_interval[1]))
            unit_value_2 = _fmt(str_to_num(x_interval[2]))
        except ValueError:
            return []
        # No unit names provided: still emit the bare numeric range.
        unit_names = _units(x_interval[3]) or ['']

        for unit_name in unit_names:
            answers.append('%s%sから%s%sまで' % (unit_value_1, unit_name, unit_value_2, unit_name))
            answers.append('%s%sから%s%s' % (unit_value_1, unit_name, unit_value_2, unit_name))
            answers.append('%s%s乃至%s%s' % (unit_value_1, unit_name, unit_value_2, unit_name))
        return answers

    x_single = re.search(r'^([\d.]+)(?: (.+))?$', raw_answer)
    if x_single is None:
        return []
    try:
        unit_value = _fmt(str_to_num(x_single[1]))
    except ValueError:
        return []

    unit_names = _units(x_single[2])
    if len(unit_names) == 0:
        answers.append(unit_value)
    else:
        for unit_name in unit_names:
            answers.append('%s%s' % (unit_value, unit_name))

    return answers

In [16]:
def parse_date_answer_es(raw_answer):
    """Expand an ISO-style ``YYYY-MM-DD`` answer into Spanish date spellings.

    Returns:
        Numeric orderings plus textual forms using the month name
        (e.g. ``'29 de agosto de 2001'``). Input that is not a valid
        ``Y-M-D`` date is returned unchanged as ``[raw_answer]``.
    """
    date_parts = re.search(r'^([\d.]+)-([\d.]+)-([\d.]+)$', raw_answer)
    if date_parts is None:
        # No ISO date format
        return [raw_answer]

    month_conversion = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio',
                        'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre']

    year_number = date_parts[1]
    month_number = date_parts[2]
    day_number = date_parts[3]

    try:
        month_value = int(month_number)
        day_value = int(day_number)
    except ValueError:
        # Parts such as '8.5' match the pattern but are not calendar values.
        return [raw_answer]
    if not (1 <= month_value <= 12 and 1 <= day_value <= 31):
        # Without this check month 0 would silently map to 'diciembre' (index -1).
        return [raw_answer]

    month_str = month_conversion[month_value - 1]

    return [
        '%s-%s-%s' % (year_number, month_number, day_number),
        '%s-%s-%s' % (day_number, month_number, year_number),
        '%s-%s-%s' % (month_number, day_number, year_number),
        '%s/%s/%s' % (month_number, day_number, year_number),
        # Textual forms use the month NAME and an un-padded day.
        '%d de %s del %s' % (day_value, month_str, year_number),
        '%d de %s, %s' % (day_value, month_str, year_number),
        '%d de %s de %s' % (day_value, month_str, year_number),
    ]

In [17]:
def parse_date_answer_ja(raw_answer):
    """Expand an ISO-style ``YYYY-MM-DD`` answer into Japanese date spellings.

    Returns:
        ``[kanji month/day, 'Y年 M月 D日', Y-M-D, D-M-Y, Y/M/D, D/M/Y]``. Input
        that is not a valid ``Y-M-D`` date is returned unchanged as
        ``[raw_answer]``.
    """
    date_parts = re.search(r'^([\d.]+)-([\d.]+)-([\d.]+)$', raw_answer)
    if date_parts is None:
        # No ISO date format
        return [raw_answer]

    # Kanji numerals for 1..31 (index = value - 1).
    number_conversion = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
                         '十一', '十二', '十三', '十四', '十五', '十六', '十七', '十八', '十九', '二十',
                         '二十一', '二十二', '二十三', '二十四', '二十五', '二十六', '二十七', '二十八', '二十九', '三十',
                         '三十一']

    year_number = date_parts[1]
    month_number = date_parts[2]
    day_number = date_parts[3]

    try:
        month_value = int(month_number)
        day_value = int(day_number)
    except ValueError:
        return [raw_answer]
    if not (1 <= month_value <= 12 and 1 <= day_value <= 31):
        # Without this check a 0 would wrap to index -1 ('三十一') and larger
        # values would raise IndexError.
        return [raw_answer]

    # Month and day spelled with kanji numerals.
    kanji_month_day = '%s月 %s日' % (number_conversion[month_value - 1],
                                   number_conversion[day_value - 1])
    # Full date with Arabic digits, without zero padding.
    kanji_date = '%s年 %d月 %d日' % (year_number, month_value, day_value)

    return [
        kanji_month_day,
        kanji_date,
        '%s-%s-%s' % (year_number, month_number, day_number),
        '%s-%s-%s' % (day_number, month_number, year_number),
        '%s/%s/%s' % (year_number, month_number, day_number),
        '%s/%s/%s' % (day_number, month_number, year_number),
    ]

In [18]:
def parse_raw_answer_es(type, main_answer, aliases):
    """
    Parses MKQA Answers of Spanish language.

    Returns the candidate answer strings for one MKQA answer record. Empty
    strings and purely numeric candidates are discarded.
    """
    candidates = []
    if type == 'number_with_unit':
        # Example: 16.0 año terrestre
        for raw_text in [main_answer, *aliases]:
            candidates.extend(parse_nwu_answer_es(raw_text))
    elif type == 'date':
        # Example: 2001-08-29 (aliases are not expanded for dates)
        candidates.extend(parse_date_answer_es(main_answer))
    elif type in ('entity', 'short_phrase'):
        # Examples: "Pokémon Ranger: Sombras de Almia" / "rosemary almond"
        candidates.extend(raw_text.strip() for raw_text in [main_answer, *aliases])
    # Every other type (number, unanswerable, long_answer, binary) is ignored.

    # Filter low-quality answers (e.g. only numbers) and blanks.
    return [text for text in candidates
            if text != '' and not re.search(r'^[\d.]+$', text)]

In [19]:
def parse_raw_answer_ja(type, main_answer, aliases):
    """
    Parses MKQA Answers of Japanese language.

    Returns the candidate answer strings for one MKQA answer record. Empty
    strings and purely numeric candidates are discarded.
    """
    texts = [main_answer, *aliases]

    if type == 'number_with_unit':
        # Example: 18.0 年
        candidates = [parsed for raw_text in texts for parsed in parse_nwu_answer_ja(raw_text)]
    elif type == 'date':
        # Example: 2001-08-29 (aliases are not expanded for dates)
        candidates = list(parse_date_answer_ja(main_answer))
    elif type == 'entity' or type == 'short_phrase':
        # Free-text answers are only trimmed.
        candidates = [raw_text.strip() for raw_text in texts]
    else:
        # Ignored types: number, unanswerable, long_answer, binary
        candidates = []

    # Filter low-quality answers (e.g. only numbers) and blanks.
    numeric_only = r'^[\d.]+$'
    return [text for text in candidates
            if not re.search(numeric_only, text) and text != '']

In [20]:
def find_answers(context, answers, word_tokenizer, min_bleu, min_answer_length=3, max_answer_length=30):
    """Locate (fuzzy) occurrences of the candidate answers inside ``context``.

    A window with as many tokens as the answer slides over the context tokens;
    a window is accepted when ``bleu_score(answer_tokens, window)`` reaches
    ``min_bleu`` and the covered text length lies within
    ``[min_answer_length, max_answer_length]``.

    Args:
        context: Paragraph text.
        answers: Candidate answer strings; duplicates are ignored.
        word_tokenizer: Callable mapping text to a list of token strings that
            appear in order in the text.
        min_bleu: Minimum score for a window to count as a match.
        min_answer_length: Minimum length (chars) of the matched context span.
        max_answer_length: Maximum length (chars) of the matched context span.

    Returns:
        List of ``{'answer_start', 'answer_end', 'text'}`` dicts.

    Raises:
        Exception: When a token cannot be located in ``context``.
    """
    context_tokens = word_tokenizer(context)
    found_answers = []

    # Character offsets of every context token, recovered by scanning forward.
    context_offsets = []
    last_span = 0
    for token in context_tokens:
        span_start = context.find(token, last_span)
        if span_start == -1:
            raise Exception('Error to tokenize context: %s' % context)
        span_end = span_start + len(token)
        last_span = span_end
        context_offsets.append([span_start, span_end])

    # dict.fromkeys de-duplicates while keeping the caller's order (set() did not).
    for answer in dict.fromkeys(answers):
        answer_tokens = word_tokenizer(answer)
        window_size = len(answer_tokens)
        if window_size == 0:
            # Nothing to match; also avoids a division by zero in bleu_score.
            continue

        i = 0
        # Inclusive bound: the last window starts at len - window_size. The
        # previous '<' comparison never inspected an answer at the very end.
        last_start = len(context_tokens) - window_size
        while i <= last_start:
            score = bleu_score(answer_tokens, context_tokens[i:i+window_size])
            span_start = context_offsets[i][0]
            span_end = context_offsets[i+window_size-1][1]
            context_answer = context[span_start:span_end]
            if (score >= min_bleu and len(context_answer) <= max_answer_length and
                len(context_answer) >= min_answer_length and context_answer.strip() != ''):
                found_answers.append({
                    'answer_start': span_start,
                    'answer_end': span_end,
                    'text': context_answer,
                })
                i += window_size # Skips answer position
            else:
                i += 1

    return found_answers

## Tokenization and analysis

In [21]:
def get_word_tokenizer(artifacts_path='../artifacts/', lm_name='bert-base-multilingual-cased', lowercase=False):
    """Build a function that splits text into the surface strings of its tokens.

    The tokenizer is a ``BertTokenizerFast`` loaded from
    ``<artifacts_path><lm_name>/vocab.txt``. Each token is returned as the slice
    of the original text covered by its offsets; the first and last offset
    entries are dropped.
    """
    vocab_file = '%s%s/vocab.txt' % (artifacts_path, lm_name)
    bert_tokenizer = BertTokenizerFast(vocab_file, do_lower_case=lowercase)

    def tokenize(text):
        encoded = bert_tokenizer(text, return_offsets_mapping=True,
                                 return_special_tokens_mask=False)
        inner_spans = encoded['offset_mapping'][1:-1]
        return [text[begin:finish] for (begin, finish) in inner_spans]

    return tokenize

In [22]:
def bleu_score(reference, hypothesis):
    """Fraction of ``reference`` tokens matched position-by-position in ``hypothesis``.

    Despite its name this is a plain positional token accuracy, not the BLEU
    metric: tokens are compared pairwise (``zip``), so surplus hypothesis tokens
    are ignored and missing ones count as mismatches.

    Returns:
        A float in ``[0, 1]``; ``0.0`` for an empty reference.
    """
    if len(reference) == 0:
        # Avoid dividing by zero: an empty reference matches nothing.
        return 0.0
    matches = sum(int(x == y) for x, y in zip(reference, hypothesis))
    return matches / len(reference)

In [23]:
# Smoke test: two 4-token sequences that differ only in the last token.
a = ['esta', 'es', 'prueba','1']
b = ['esta', 'es', 'prueba','2']

bleu_score(a, b)

0.75

In [24]:
# Smoke test: tokenize a Spanish sentence with the default tokenizer settings.
text = '¡Hola mundo! ¡Adiós mundo!'

tokenizer = get_word_tokenizer()
tokenizer(text)

['¡', 'Ho', 'la', 'mundo', '!', '¡', 'Adi', 'ós', 'mundo', '!']

In [25]:
# Smoke test: tokenize a Japanese sentence with the default tokenizer settings.
text = 'こんにちは世界！さようなら世界！'

tokenizer = get_word_tokenizer()
tokenizer(text)

['こ', 'ん', 'に', 'ち', 'は', '世', '界', '！', 'さ', 'ような', 'ら', '世', '界', '！']

## App

In [26]:
def get_best_squad_items(squad_items, top_items):
    """Return the ``top_items`` highest-scoring entries, best first.

    Entries are ranked by their ``'score'`` key; entries with equal scores keep
    their original relative order. The input list is not modified.
    """
    def score_of(candidate):
        return candidate['score']

    ranked = list(squad_items)
    ranked.sort(key=score_of, reverse=True)
    return ranked[:top_items]

In [27]:
def get_static_entities(text, default_score=5.):
    all_matches = []
    matches = re.findall(r'"(.+?)"', text)
    if len(matches) > 0:
        all_matches += [x.strip() for x in matches]
    matches = re.findall(r'\'(.+?)\'', text)
    if len(matches) > 0:
        all_matches += [x.strip() for x in matches]
    matches = re.findall(r'((?:(?:[A-Z][A-Za-z]+|de)\s?)+)', text)
    if len(matches) > 0:
        all_matches += [x.strip() for x in matches]
    
    # Format static entities
    static_entities = []
    for x in all_matches:
        static_entities.append({
            'entity_text': x,
            'entity_score': default_score,
            'is_mandatory': True,
        })
    
    return static_entities

In [28]:
def has_entities(context, entities, n=1):
    """Check which entities occur in ``context`` and score the coverage.

    Args:
        context: Text to search.
        entities: List of dicts with ``'entity_text'``, ``'entity_score'`` and,
            optionally, ``'is_mandatory'``. ``None`` or empty means "no constraint".
        n: Minimum number of entities that must be found.

    Returns:
        ``(ok, score)``. ``ok`` is ``True`` when at least ``n`` entities are
        found and no mandatory entity is missing. ``score`` is the found score
        divided by the total score of all entities, i.e. a value in ``[0, 1]``.
        ``(True, 0.)`` is returned when there are no entities, ``(False, 0.)``
        as soon as a mandatory entity is missing.
    """
    if entities is None or len(entities) == 0:
        return True, 0.

    counter = 0
    max_score = 0.
    current_score = 0.
    for entity in entities:
        # Accumulate the attainable total. Previously this was overwritten on
        # every iteration, so the ratio was relative to the last entity only.
        max_score += entity['entity_score']
        if context.find(entity['entity_text']) != -1:
            # Found entity
            counter += 1
            current_score += entity['entity_score']
        elif entity.get('is_mandatory'):
            # Mandatory entity not found
            return False, 0.

    coverage = current_score / max_score if max_score else 0.
    return (counter >= n), coverage

In [29]:
def has_tokens(context, tokens, token_score=1.):
    """Score how many non-stopword ``tokens`` occur in ``context``.

    Args:
        context: Text to search.
        tokens: Token strings; members of the global ``STOPWORDS`` are skipped.
        token_score: Weight of each token.

    Returns:
        Found weight divided by total weight of the non-stopword tokens;
        ``0.`` when every token is a stopword.

    Raises:
        Exception: When ``tokens`` is ``None`` or empty.
    """
    if tokens is None or len(tokens) == 0:
        raise Exception('No tokens provided for context: %s' % context)

    max_score = 0
    current_score = 0.
    for token in tokens:
        if token in STOPWORDS:
            continue
        max_score += token_score
        if context.find(token) != -1:
            # Found token
            current_score += token_score

    if max_score == 0:
        # Only stopwords were supplied: nothing to score (avoids dividing by zero).
        return 0.
    return current_score / max_score

In [30]:
def find_squad_item(idx, question, answers, answer_types, word_tokenizers, html_parser,
                    query_top_results, min_bleu=0.8, max_tokens_length=512,
                    max_chars_length=1500, verbose=False):
    """Search the web for ``question`` and build SQuAD items from the result pages.

    One thread per search result runs ``find_squad_item_in_url``; each thread
    gets its own tokenizer from ``word_tokenizers``.

    Args:
        idx: Identifier prefix for the generated question ids.
        question: Question text; also used as the search query.
        answers: Candidate answer strings.
        answer_types: Type of each answer, parallel to ``answers``.
        word_tokenizers: Exactly ``query_top_results`` tokenizer callables.
        html_parser: Accepted for interface compatibility; not forwarded to the
            worker threads.
        query_top_results: Number of search results to inspect.
        min_bleu, max_tokens_length, max_chars_length: Forwarded to the workers.
        verbose: ``False`` for no progress output, or a dict with keys
            ``'i'``, ``'n_items'`` and ``'n_found'`` used in the progress line.

    Returns:
        The two best-scoring SQuAD items (possibly fewer, or ``[]``).

    Raises:
        Exception: When the tokenizer count is wrong or a worker thread did not
            finish normally.
    """
    if len(word_tokenizers) != query_top_results:
        raise Exception('Need more tokenizers (expected: %d).' % query_top_results)

    if len(answers) == 0:
        # Nothing to look for (and unpacking an empty zip below would fail).
        return []

    question_tokens = word_tokenizers[0](question)

    # Sort answers by length
    a_zip = sorted(zip(answers, answer_types), key=lambda x: len(x[0]), reverse=True)
    answers, answer_types = list(zip(*a_zip))

    # Get top results of Google
    url_items = google_search(question, top_results=query_top_results)
    static_entities = get_static_entities(question)

    def report_progress(n_done):
        # `verbose` defaults to False; only a dict carries the progress fields.
        if verbose:
            print('- Item: %d / %d | URL: %d / %d | Found: %d' % (verbose['i'], verbose['n_items'], n_done,
                    len(url_items), verbose['n_found']), ' '*20, end='\r')

    # Get page content of each URL, one thread per result.
    squad_items = []
    end_status = []
    all_threads = []
    for i, url_item in enumerate(url_items):
        x = threading.Thread(
            target=find_squad_item_in_url,
            args=(end_status, squad_items, idx, url_item, word_tokenizers[i], question, question_tokens, answers,
                  answer_types, static_entities, min_bleu, max_tokens_length, max_chars_length))
        x.start()
        all_threads.append(x)

    # Wait for threads
    report_progress(0)
    for i, x in enumerate(all_threads):
        x.join()
        report_progress(i + 1)

    # Every worker appends 1 to end_status as its last step; a shortfall means
    # that some worker died with an exception.
    if sum(end_status) != len(all_threads):
        raise Exception('Some thread failed!')

    return get_best_squad_items(squad_items, top_items=2)

In [31]:
def find_squad_item_in_url(end_status, squad_items, idx, url_item, word_tokenizer, question, question_tokens, answers,
                           answer_types, static_entities, min_bleu, max_tokens_length, max_chars_length):
    """Thread worker: mine one result page for paragraphs containing an answer.

    For every paragraph that fits the length limits and contains the required
    entities, the answers found in it are stored as one SQuAD item appended to
    ``squad_items``.

    Args:
        end_status: Shared list; ``1`` is appended when the worker completes.
        squad_items: Shared output list of SQuAD items.
        idx: Identifier prefix; question ids are ``'<idx>_<paragraph index>'``.
        url_item: Dict with ``'url'`` and ``'title'``.
        word_tokenizer: Tokenizer callable used by this worker.
        question: Question text.
        question_tokens: Tokens of the question (counted against the token limit).
        answers: Candidate answer strings.
        answer_types: Types parallel to ``answers`` (not used here).
        static_entities: Entities a paragraph must contain (see ``has_entities``).
        min_bleu: Minimum score for an answer match.
        max_tokens_length: Limit for paragraph + question tokens (3 are reserved).
        max_chars_length: Limit for the summed token lengths of the paragraph.
    """
    page_url = url_item['url']
    page_title = url_item['title']

    # Use a parser private to this call: workers run concurrently, and feeding
    # one shared parser object from several threads would interleave their
    # input. It also removes the dependency on a module-level `html_parser`.
    local_parser = get_html_parser()

    html = get_page_html(page_url)
    paragraphs = html_to_paragraphs(page_title, html, local_parser)

    for paragraph_i, paragraph in enumerate(paragraphs):
        paragraph_tokens = word_tokenizer(paragraph)

        tokens_length = len(paragraph_tokens) + len(question_tokens) # Context + question
        chars_length = sum(len(token) for token in paragraph_tokens) # Only context
        if tokens_length > (max_tokens_length - 3) or chars_length > max_chars_length:
            continue

        p_has_entities, entities_score = has_entities(paragraph, static_entities, n=1)
        if not p_has_entities:
            continue

        found_answers = find_answers(paragraph, answers, word_tokenizer, min_bleu=min_bleu)
        if len(found_answers) == 0:
            continue

        # Longer matched answers and better entity coverage rank higher.
        squad_score = entities_score + sum([len(x['text']) for x in found_answers])
        squad_items.append({
            'score': squad_score,
            'title': page_title,
            'paragraphs': [{
                'context': paragraph,
                'qas': [{
                    'id': '%s_%d' % (idx, paragraph_i),
                    'question': question,
                    'answers': [
                        {
                            'answer_start': found_answer['answer_start'],
                            'text': found_answer['text'],
                        }
                        for found_answer in found_answers
                    ],
                }],
            }],
        })

    # Reached only when no exception occurred; the caller counts these marks.
    end_status.append(1)

In [32]:
def main(mkqa_dataset, save_path, region_code=REGION_CODE, max_aliases=5, query_top_results=7):
    """Build one SQuAD-style JSON file per MKQA item whose answer is found on the web.

    Progress is recorded in ``<save_path>/config.json`` (lists ``'found'`` and
    ``'skipped'`` of example ids), so an interrupted run resumes where it stopped.

    Args:
        mkqa_dataset: List of MKQA records (dicts with ``'example_id'``,
            ``'queries'`` and ``'answers'``).
        save_path: Output directory (created if missing).
        region_code: Selects the answer parser and the entry of ``queries`` /
            ``answers`` to use. ``'jp'`` is accepted as an alias of ``'ja'``.
        max_aliases: Maximum number of aliases considered per answer.
        query_top_results: Number of search results inspected per question.

    Raises:
        Exception: For a region code without an answer parser.
    """
    # NOTE(review): the alias assumes MKQA keys Japanese entries by 'ja' while the
    # notebook configures 'jp'; confirm against the dataset.
    language_key = {'jp': 'ja'}.get(region_code, region_code)
    answer_parsers = {'es': parse_raw_answer_es, 'ja': parse_raw_answer_ja}
    if language_key not in answer_parsers:
        raise Exception('Unknown region code: %s' % region_code)
    parse_raw_answer = answer_parsers[language_key]

    # Need one tokenizer as many query_top_results (since we need one per thread)
    word_tokenizers = [get_word_tokenizer() for _ in range(query_top_results)]
    html_parser = get_html_parser()

    os.makedirs(save_path, exist_ok=True)

    # Load (or create) the resume bookkeeping.
    config_file = os.path.join(save_path, 'config.json')
    if os.path.exists(config_file):
        with open(config_file, 'r') as fp:
            config = json.load(fp)
    else:
        config = {'skipped': [], 'found': []}
        with open(config_file, 'w') as fp:
            json.dump(config, fp)

    # Set view of the processed ids for O(1) membership tests.
    processed = set(config['found']) | set(config['skipped'])

    def mark(bucket, example_id):
        # Record the outcome and persist it at once, so a crash loses nothing.
        config[bucket].append(example_id)
        processed.add(example_id)
        with open(config_file, 'w') as fp:
            json.dump(config, fp)

    n_items = len(mkqa_dataset)

    print('Process items...')
    for i, item in enumerate(mkqa_dataset):
        example_id = item['example_id']

        # Skip if already parsed
        if example_id in processed:
            continue

        idx = 'mkqa_' + str(example_id)
        # Prefer the key exactly as given; fall back to the language alias.
        data_key = region_code if region_code in item['queries'] else language_key
        query = item['queries'][data_key]
        output_file = os.path.join(save_path, '%s.json' % idx)

        print('- Item %d of %d' % (i + 1, n_items), ' '*20, end='\r')

        parsed_answers = []
        answer_types = []
        for raw_answer_data in item['answers'][data_key]:
            main_answer = raw_answer_data['text']
            aliases = raw_answer_data['aliases'][:max_aliases] if 'aliases' in raw_answer_data else []

            for iter_parsed_answer in parse_raw_answer(raw_answer_data['type'], main_answer, aliases):
                if iter_parsed_answer not in parsed_answers:
                    parsed_answers.append(iter_parsed_answer)
                    answer_types.append(raw_answer_data['type'])

        if len(parsed_answers) == 0:
            # No answers for this query
            mark('skipped', example_id)
            continue

        print('- Item: %d / %d | Found: %d' % (i + 1, n_items, len(config['found'])), ' '*20, end='\r')

        squad_dataset = {}
        squad_dataset['data'] = find_squad_item(
                idx, query, parsed_answers, answer_types, word_tokenizers, html_parser, query_top_results=query_top_results,
                verbose={'i': i+1, 'n_items': n_items, 'n_found': len(config['found'])})

        if len(squad_dataset['data']) == 0:
            # No page contained any of the answers.
            mark('skipped', example_id)
            continue

        with open(output_file, 'w', encoding='utf8') as fp:
            json.dump(squad_dataset, fp, ensure_ascii=False)
        mark('found', example_id)

In [33]:
# The string literal below is a disabled one-item sample dataset kept for
# manual testing; as a bare string expression it has no effect.
"""
mkqa_dataset = [
    {
        "query": "who won america's got talent in 2015",
        "answers": {"es": [{"type": "entity", "entity": "Q7154545", "text": "Paul Zerdin", "aliases": []}]},
        "queries": {"es": "quien ganó got talent america en 2015"},
        "example_id": 3309140698422618645
    }
]
"""

# Connection errors are tolerated max_errors - 1 times before being re-raised.
max_errors = 2
n_errors = 0

# Re-run main() until it completes. Each re-run resumes from config.json, because
# main() skips example ids already listed there.
# NOTE(review): any other exception is printed and retried after 5 seconds with no
# upper bound, so a failure that always recurs on the same item would loop forever.
while True:
    try:
        main(mkqa_dataset, save_path=OUTPUT_PATH, region_code=REGION_CODE)
        break
    except requests.exceptions.ConnectionError as e:
        n_errors += 1
        if n_errors >= max_errors:
            raise e
    except Exception as e:
        #raise e
        print('-' * 10)
        print('Exception')
        print(e)
        sleep(5)

Process items...
- Item: 9768 / 10000 | URL: 1 / 5 | Found: 1452                                                               
PageContent - HTTP Status: 500 - URL: https://www.sfbrandnewcondos.com/properties/401-harrison-condos/
GSearch - No items - Query: クイーン・オブ・ザ・サウス ～女王への階段～ のcamilla vargas 役はだれ                  
- Item: 9820 / 10000 | URL: 3 / 7 | Found: 1458                     
PageContent - HTTP Status: -1 - URL: http://cranebrewing.com/8d3fd-9-7cfa/ZWY2M2M_2Nj/aceb_bcaf_/11-30-69/12731516003915/acebbc/157
- Item: 9820 / 10000 | URL: 5 / 7 | Found: 1458                     
PageContent - HTTP Status: -1 - URL: http://cranebrewing.com/773d29-0a-a9/N2M0_Yj_g4MT/dfbaafd_a_/188-1-72/11241747107928/dfbaaf/107
- Item: 9821 / 10000 | URL: 5 / 7 | Found: 1458                     
PageContent - HTTP Status: -1 - URL: http://sugita.us/usq.htm
- Item: 9823 / 10000 | URL: 0 / 5 | Found: 1458                     
PageContent - HTTP Status: -1 - URL: http://sugita.us/usq.htm
GSearch - No i