# Synthetic SQuAD dataset based on MKQA (Google)

In [36]:
import json
import requests

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

In [18]:
API_GOOGLE = 'AIzaSyBjpMvWhsaXuDPs481CsIvLaBfPyFpzNkE'

CLIENT_ID = '496348482958-duaot1657g5f1f6cfcsod3thth1ejmo5.apps.googleusercontent.com'
CLIENT_SECRET = '2YPlfd9-qOoYnFqPVgfzlzkR'

In [25]:
from googleapiclient.discovery import build

service = build('cloudsearch', 'v1', developerKey=API_GOOGLE)

In [41]:
output = service.query().search(body={
    'query': 'Fernando Alonso'
})
json.dumps(output.execute().results)

HttpError: <HttpError 401 when requesting https://cloudsearch.googleapis.com/v1/query/search?key=AIzaSyBjpMvWhsaXuDPs481CsIvLaBfPyFpzNkE&alt=json returned "Request is missing required authentication credential. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project.". Details: "Request is missing required authentication credential. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project.">

## MKQA functions

In [9]:
def get_es_number_unit(raw_unit_name, is_plural=False):
    # First list correspond to singular and second list to plural
    es_conversion = {
        'Antes de la era vulgar': [['a.C.'], ['a.C.']],
        'Galones': [['galón'], ['galones']],
        'Millas por hora': [['mph', 'milla por hora'], ['mph', 'millas por hora']],
        'acre': [['acre'], ['acres']],
        'antes del Mediodia': [['AM', 'A.M.'], ['AM', 'A.M.']],
        'año terrestre': [['año'], ['años']],
        'caballo de potencia metrico': [['caballo de potencia', 'caballo', 'hp', 'cv'], ['caballos de potencia', 'caballos', 'hp', 'cv']],
        'centímetro': [['cm', 'centímetro'], ['cm', 'centímetros']],
        'día': [['día'], ['días']],
        'dólar estadounidense': [['dólar', '$'], ['dólares', '$']],
        'episodio': [['episodio'], ['episodios']],
        'escala Fahrenheit': [['grado Fahrenheit', 'Fahrenheit', 'ºF'], ['grados Fahrenheit', 'Fahrenheits', 'ºF']],
        'estaciones del año': [['temporada'], ['temporadas']],
        'grados centigrados': [['grado centigrado', 'grado', 'ºC'], ['grados centigrados', 'grados', 'ºC']],
        'gramo': [['gramo', 'gr', 'g'], ['gramos', 'gr', 'g']],
        'hora': [['hora', 'h'], ['horas', 'h']],
        'kilometraje': [['kilómetro', 'km'], ['kilómetros', 'km']],
        'libra avoirdupois': [['libra avoirdupois', 'libra', 'lb'], ['libras avoirdupois', 'libras', 'lb']],
        'light año terrestre': [['año luz'], ['años luz']],
        'mes sinódico': [['mes sinódico', 'mes'], ['meses sinódicos', 'meses']],
        'metros': [['metro', 'm'], ['metros', 'm']],
        'metros por segundo': [['metro por segundo', 'mps', 'm/s'], ['metros por segundo', 'mps', 'm/s']],
        'mililitro': [['mililitro', 'ml'], ['mililitros', 'ml']],
        'milimetro': [['milímetro', 'mm'], ['milímetros', 'mm']],
        'milla': [['milla', 'mi'], ['millas', 'mi']],
        'millas cuadradas': [['milla cuadrada'], ['millas cuadradas']],
        'minuto': [['minuto'], ['minutos']],
        'onza': [['onza'], ['onzas']],
        'other currency': [[], []], # Nothing to do
        'other unit': [[], []], # Nothing to do
        'palabra': [['palabra'], ['palabras']],
        'pie': [['pie'], ['pies']],
        'pies cuadrados': [['pie cuadrado'], ['pies cuadrados']],
        'post meridiem (time)': [['PM', 'P.M.'], ['PM', 'P.M.']],
        'pulgada': [['pulgada'], ['pulgadas']],
        'segundos': [['segundo'], ['segundos']],
        'septenario': [['septenario'], ['septenarios']],
        'tanto por ciento': [['porcentaje', '%'], ['porcentaje', '%']],
    }
    
    if raw_unit_name not in es_conversion or len(es_conversion[raw_unit_name]) == 0:
        return []
    else:
        return [x for x in es_conversion[raw_unit_name][int(is_plural)]]

def parse_nwu_answer_es(raw_answer):
    """
    Parses Spanish answers of type number_with_unit (nwu).
    """
    answers = []

    x_interval = re.search(r'^([\d.]+) ([\d.]+) (.+)$', raw_answer)
    if x_interval:
        unit_value_1 = str_to_num(x_interval[1])
        unit_value_2 = str_to_num(x_interval[2])
        unit_names = get_es_number_unit(x_interval[3], is_plural=True)
        if len(unit_names) == 0:
            # Skip if not unit name is provided in range of values
            pass
        else:
            for unit_name in unit_names:
                answers.append('entre %s y %s %s' % (str(unit_value_1), str(unit_value_2), unit_name))
                answers.append('desde %s hasta %s %s' % (str(unit_value_1), str(unit_value_2), unit_name))
    else:
        x_single = re.search(r'^([\d.]+) (.+)$', raw_answer)
        if x_single is None:
            return []
        unit_value = str_to_num(x_single[1])
        unit_names = get_es_number_unit(x_single[2], is_plural=(unit_value > 0))
        if len(unit_names) == 0:
            answers.append(str(unit_value))
        else:
            for unit_name in unit_names:
                answers.append('%s %s' % (str(unit_value), unit_name))
    
    return answers

In [10]:
def parse_date_answer_es(raw_answer):
    date_parts = re.search('^([\d.]+)-([\d.]+)-([\d.]+)$', raw_answer)
    if date_parts is None:
        # No ISO date format
        return [raw_answer]
    
    month_conversion = ['enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 'julio',
                        'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre']
    
    year_number = date_parts[1]
    month_number = date_parts[2]
    month_str = month_conversion[str_to_num(date_parts[2]) - 1]
    day_number = date_parts[3]
    
    return [
        '%s-%s-%s' % (year_number, month_number, day_number),
        '%s-%s-%s' % (day_number, month_number, year_number),
        '%s-%s-%s' % (month_number, day_number, year_number),
        '%s/%s/%s' % (month_number, day_number, year_number),
        '%s de %s del %s' % (day_number, month_number, year_number),
        '%s de %s, %s' % (day_number, month_number, year_number),
    ]

In [11]:
def parse_raw_answer_es(type, main_answer, aliases):
    """
    Parses MKQA Answers of Spanish language.
    """
    parsed_answers = []
    if type == 'number_with_unit':
        # Example: 16.0 año terrestre
        parsed_answers += parse_nwu_answer_es(main_answer)
    #    for alias in aliases:
    #        parsed_answers += parse_nwu_answer_es(alias)
    #elif type == 'number':
    #    # Example: 104.0
    #    parsed_answers.append(main_answer)
    #    for alias in aliases:
    #        parsed_answers.append(alias)
    elif type == 'date':
        # Example: 2001-08-29
        parsed_answers += parse_date_answer_es(main_answer)
    elif type == 'entity':
        # Example: Pokémon Ranger: Sombras de Almia
        parsed_answers.append(main_answer.strip())
        for alias in aliases:
            parsed_answers.append(alias.strip())
    elif type == 'short_phrase':
        # Example: rosemary almond
        parsed_answers.append(main_answer.strip())
        for alias in aliases:
            parsed_answers.append(alias.strip())
    else:
        # Ignored types: unanswerable, long_answer, binary
        pass
    
    return [x for x in parsed_answers if x != '']

In [12]:
def find_answers(context, answers, word_tokenizer, min_bleu, min_answer_length=3, max_answer_length=30):
    context_tokens = word_tokenizer(context)
    found_answers = []
    
    context_offsets = []
    last_span = 0
    for token in context_tokens:
        span_start = context.find(token, last_span)
        if span_start == -1:
            raise Exception('Error to tokenize context: %s' % context)
        span_end = span_start + len(token)
        context_offsets.append([span_start, span_end])
    
    for answer in answers:
        answer_tokens = word_tokenizer(answer)
        window_size = len(answer_tokens)
        i = 0
        while i < len(context_tokens) - window_size:
            score = bleu_score(answer_tokens, context_tokens[i:i+window_size])
            span_start = context_offsets[i][0]
            span_end = context_offsets[i+window_size-1][1]
            context_answer = context[span_start:span_end]
            if (score >= min_bleu and len(context_answer) <= max_answer_length and
                len(context_answer) >= min_answer_length and context_answer.strip() != ''):
                found_answers.append({
                    'answer_start': span_start,
                    'answer_end': span_end,
                    'text': context[span_start:span_end],
                })
                i += window_size # Skips answer position
            else:
                i += 1
    
    return found_answers

## Tokenization and analysis

In [13]:
def get_word_tokenizer(artifacts_path='../artifacts/', lm_name='bert-base-multilingual-cased', lowercase=False):
    save_path = '%s%s/' % (artifacts_path, lm_name)
    tokenizer = BertTokenizerFast('%svocab.txt' % save_path, do_lower_case=lowercase)
    return lambda text : [text[start:end] \
                          for (start, end) in tokenizer(text, return_offsets_mapping=True,
                                                        return_special_tokens_mask=False)['offset_mapping'][1:-1]]

In [14]:
def bleu_score(reference, hypothesis):
    matches = [int(x == y) for x, y in zip(reference, hypothesis)]
    return sum(matches) / len(reference)

In [15]:
a = ['esta', 'es', 'prueba','1']
b = ['esta', 'es', 'prueba','2']

bleu_score(a, b)

0.75

In [16]:
text = '¡Hola mundo! ¡Adiós mundo!'

tokenizer = get_word_tokenizer()
tokenizer(text)

['¡', 'Ho', 'la', 'mundo', '!', '¡', 'Adi', 'ós', 'mundo', '!']

## Google Search API

In [19]:
url = 'https://cloudsearch.googleapis.com/v1/query/search?key='
data = {
    'query': 'Quien es Fernando Alonso'
}

r = requests.get(url, headers={'Accept': 'application/json'}, data=data)

print(r)

<Response [400]>
