## Requirements

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

## Obtaining a spreadsheet of text which needs to be translated

In [None]:
squad = load_dataset("squad_v2")

In [None]:
text = set()

In [None]:
def add_to_set(row):
  """Add the row's title, question, context and every answer text to the module-level `text` set.

  Returns nothing; the only effect is growing `text`.
  """
  text.update((row["title"], row["question"], row["context"]))
  text.update(row["answers"]["text"])

In [None]:
squad["train"].map(add_to_set)

In [None]:
df = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Build the sheet in one step instead of appending through df.loc[len(df)]:
# per-row enlargement is needlessly slow for a large set of strings, and
# re-running the cell would append the same rows a second time.
# "en" holds each collected string, "mt" starts out as an empty string.
df = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# index=False keeps the sheet to the "en"/"mt" columns, consistent with the other
# to_excel calls in this notebook; otherwise the row index is written as an extra
# column and comes back as "Unnamed: 0" when the file is read again.
df.to_excel("translated_train.xlsx", index=False)

In [None]:
f"Train contains {len(df)} pieces of text to translate"

In [None]:
text = set()

In [None]:
squad["validation"].map(add_to_set)

In [None]:
df = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Build the sheet in one step instead of appending through df.loc[len(df)]:
# per-row enlargement is needlessly slow for a large set of strings, and
# re-running the cell would append the same rows a second time.
# "en" holds each collected string, "mt" starts out as an empty string.
df = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# index=False keeps the sheet to the "en"/"mt" columns, consistent with the other
# to_excel calls in this notebook; otherwise the row index is written as an extra
# column and comes back as "Unnamed: 0" when the file is read again.
df.to_excel("translated_valid.xlsx", index=False)

In [None]:
f"Validation contains {len(df)} pieces of text to translate"

Upload the output spreadsheets to Google Sheets and use the ```GOOGLETRANSLATE(text, source language, target language)``` function to fill in the `mt` column.

Import the resulting spreadsheet

## Translating train

In [None]:
squad = load_dataset("squad_v2", download_mode="force_redownload")

In [None]:
def is_impossible(row):
  """Flag a row as impossible when it carries no answers.

  Sets row["is_impossible"] to True only when both row["answers"]["text"] and
  row["answers"]["answer_start"] are empty, otherwise to False, and returns
  the same row object.
  """
  answers = row["answers"]
  row["is_impossible"] = len(answers["text"]) == 0 and len(answers["answer_start"]) == 0
  return row

In [None]:
squad["train"] = squad["train"].map(is_impossible)

In [None]:
# Combine the base translation sheet with each follow-up error sheet, in this order.
train_translation_files = [
    "translated_train.xlsx",
    "squad_v2_train_errors.xlsx",
    "squad_v2_train_errors (1).xlsx",
    "squad_v2_train_errors (2).xlsx",
]
translations = pd.concat(
    [pd.read_excel(sheet_path) for sheet_path in train_translation_files],
    ignore_index=True,
)

In [None]:
en_mt_dict = dict()

In [None]:
# Fill the en -> mt lookup; for a repeated "en" key the last translated row wins.
# An empty "mt" cell is read back by pandas as NaN, and str(NaN) would store the
# literal text "nan" as if it were a translation. Rows without a translation are
# skipped so that looking them up yields None instead.
for index, row in translations.iterrows():
    if pd.isna(row["mt"]):
        continue
    en_mt_dict[str(row["en"])] = str(row["mt"])

In [None]:
def translate_row(row):
    """Swap the row's text fields for their `en_mt_dict` lookups.

    title, context and question are each replaced by
    en_mt_dict.get(str(value)), i.e. None when there is no entry. When
    row['is_impossible'] equals False, the first answer text is replaced the
    same way. The row is modified in place and returned.
    """
    for field in ('title', 'context', 'question'):
        row[field] = en_mt_dict.get(str(row[field]))
    if row['is_impossible'] == False:
        answer_texts = row['answers']['text']
        answer_texts[0] = en_mt_dict.get(str(answer_texts[0]))
    return row

In [None]:
squad["train"] = squad["train"].map(translate_row)

In [None]:
nones = squad["train"].filter(
    # rows not flagged impossible where context, title, question or the first
    # answer text is falsy (checked in that order)
    lambda row: not row["is_impossible"] and not (
        row["context"]
        and row["title"]
        and row["question"]
        and row["answers"]["text"][0]
    )
)

In [None]:
def update_answer_start(row):
    """Recompute the first answer's start offset inside the row's context.

    Rows flagged impossible, rows whose context is None and rows whose first
    answer text is None are returned untouched. Otherwise answer_start[0]
    becomes the index of the first case-insensitive occurrence of the answer
    in the context (-1 when it does not occur). Returns the same row object.
    """
    if row['is_impossible'] == True:  # question had no answers
        return row
    if row['context'] is None or row['answers']['text'][0] is None:
        return row
    answers = row['answers']
    answers['answer_start'][0] = row['context'].lower().find(answers['text'][0].lower())
    return row

In [None]:
squad["train"] = squad["train"].map(update_answer_start)

In [None]:
def answer_match(row):
    """Check that the first answer text sits at its recorded start offset.

    Returns True without checking when the row is flagged impossible, when
    its context is falsy, or when any answer text is None. Otherwise compares,
    case-insensitively, the context slice starting at answer_start[0] (as long
    as the first answer) with the first answer text.
    """
    nothing_to_check = (
        row['is_impossible'] == True
        or not row["context"]
        or any(entry is None for entry in row['answers']['text'])
    )
    if nothing_to_check:
        return True
    start = row['answers']['answer_start'][0]
    expected = row['answers']['text'][0]
    return row['context'][start:start + len(expected)].lower() == expected.lower()

In [None]:
squad["train"] = squad["train"].filter(
    lambda row:
    # id, title, context and question must all be truthy
    (row["id"] and row["title"] and row["context"] and row["question"])
    # the question has to end with a question mark
    and row['question'].endswith("?")
    # the first answer (if any) must be found at its recorded offset
    and answer_match(row)
    # no answer text may be None
    and all(entry is not None for entry in row['answers']['text'])
    # every recorded start offset must be greater than -1
    and all(-1 < entry for entry in row["answers"]["answer_start"])
)

In [None]:
squad["train"] = squad["train"].filter(
    # keep a row only when its flag agrees with its answer list: the flag must be
    # exactly True with no answer texts, or exactly False with at least one
    lambda row: row["is_impossible"] is (len(row["answers"]["text"]) == 0)
)

In [None]:
len(squad["train"].filter(lambda row: len(row['answers']['text']) > 0))

In [None]:
len(squad["train"].filter(lambda row: len(row['answers']['text']) == 0))

In [None]:
squad["train"] = squad["train"].remove_columns("is_impossible")

In [None]:
len(squad["train"])

## Errors in train

In [None]:
translated = squad["train"]

In [None]:
original = load_dataset("squad_v2", split="train", download_mode='force_redownload')

In [None]:
all_ids = set()

In [None]:
def add_to_all(row):
  """Record the row's "id" in the module-level `all_ids` set; returns nothing."""
  row_id = row["id"]
  all_ids.add(row_id)

In [None]:
original.map(add_to_all)

In [None]:
translated_ids = set()

In [None]:
def add_to_translated(row):
  """Record the row's "id" in the module-level `translated_ids` set; returns nothing."""
  row_id = row["id"]
  translated_ids.add(row_id)

In [None]:
translated.map(add_to_translated)

In [None]:
# ids present in the original split but absent from the translated one
errors = all_ids.difference(translated_ids)

In [None]:
error_entries = original.filter(lambda row: row["id"] in errors)

In [None]:
len(error_entries)

In [None]:
error_table = pd.DataFrame(columns=["en", "mt"])

In [None]:
text = set()

In [None]:
def add_to_set(row):
  """Add the row's title, question, context and every answer text to the module-level `text` set.

  Returns nothing; the only effect is growing `text`.
  """
  text.update((row["title"], row["question"], row["context"]))
  text.update(row["answers"]["text"])

In [None]:
error_entries.map(add_to_set)

In [None]:
# Build the error sheet in one step instead of appending through
# error_table.loc[len(error_table)]: per-row enlargement is needlessly slow, and
# re-running the cell would append the same rows a second time.
# "en" holds each collected string, "mt" starts out as an empty string.
error_table = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
error_table.to_excel("squad_v2_train_errors.xlsx", index=False)

Translate the errors and concatenate to the translations DataFrame. Then run "Translating Train" again to check if it yields improvement

In [None]:
error_table = pd.read_excel("squad_v2_train_errors.xlsx")

In [None]:
assert len((error_table['en'].where(error_table['mt'] == '#VALUE!').dropna().values)) == 0

In [None]:
translations = pd.concat([translations, error_table], ignore_index=True)

In [None]:
translations.to_excel("translated_train.xlsx", index=False)

## Translating validation

In [None]:
valid = load_dataset("squad_v2", split="validation", download_mode="force_redownload")

In [None]:
valid = valid.map(is_impossible)

In [None]:
# Combine the base translation sheet with each follow-up error sheet, in this order.
valid_translation_files = [
    "translated_valid.xlsx",
    "squad_v2_valid_errors.xlsx",
    "squad_v2_valid_errors (1).xlsx",
]
translations = pd.concat(
    [pd.read_excel(sheet_path) for sheet_path in valid_translation_files],
    ignore_index=True,
)

In [None]:
en_mt_dict = dict()

In [None]:
# Fill the en -> mt lookup; for a repeated "en" key the last translated row wins.
# An empty "mt" cell is read back by pandas as NaN, and str(NaN) would store the
# literal text "nan" as if it were a translation. Rows without a translation are
# skipped so that looking them up yields None instead.
for index, row in translations.iterrows():
    if pd.isna(row["mt"]):
        continue
    en_mt_dict[str(row["en"])] = str(row["mt"])

In [None]:
def translate_row(row):
    """Swap the row's text fields for their `en_mt_dict` lookups.

    title, context and question are each replaced by
    en_mt_dict.get(str(value)), i.e. None when there is no entry. When
    row["is_impossible"] equals False, the answer text list is replaced by a
    new list holding the lookup of every entry. The row is modified in place
    and returned.
    """
    for field in ('title', 'context', 'question'):
        row[field] = en_mt_dict.get(str(row[field]))
    if row["is_impossible"] == False:
        translated_answers = []
        for entry in row['answers']['text']:
            translated_answers.append(en_mt_dict.get(str(entry)))
        row['answers']['text'] = translated_answers
    return row

In [None]:
valid = valid.map(translate_row)

In [None]:
def update_answer_start(row):
    """Recompute start offsets for all answers, dropping answers not found in the context.

    Rows flagged impossible and rows whose context is None are returned
    untouched. Otherwise each non-None answer text is searched for
    case-insensitively in the context; answers that occur are kept together
    with the index of their first occurrence, all others are discarded.
    Both row['answers']['text'] and row['answers']['answer_start'] are
    replaced with the surviving entries. Returns the same row object.
    """
    if row['is_impossible'] == True or row['context'] is None:
        return row
    haystack = row['context'].lower()
    located = [
        (candidate, haystack.find(candidate.lower()))
        for candidate in row['answers']['text']
        if candidate is not None
    ]
    found = [(candidate, offset) for candidate, offset in located if offset != -1]
    row['answers']['answer_start'] = [offset for _, offset in found]
    row['answers']['text'] = [candidate for candidate, _ in found]
    return row

In [None]:
valid = valid.map(update_answer_start)

In [None]:
def answers_start_match(row):
    """Report, per recorded start offset, whether the matching answer sits there.

    Returns [True] for rows flagged impossible. Otherwise returns one bool per
    entry of answer_start: True when the context slice beginning at that
    offset (as long as the answer text at the same position) equals the answer
    text, compared case-insensitively.
    """
    if row['is_impossible'] == True:
        return [True]
    answers = row['answers']
    return [
        row['context'][start:start + len(answers['text'][i])].lower() == answers['text'][i].lower()
        for i, start in enumerate(answers['answer_start'])
    ]

In [None]:
valid = valid.filter(
    lambda row:
    # id, title, context and question must all be truthy
    (row["id"] and row["title"] and row["context"] and row["question"])
    # the question has to end with a question mark
    and row['question'].endswith("?")
    # every answer must be found at its recorded offset
    and all(entry is True for entry in answers_start_match(row))
    # no answer text may be None
    and all(entry is not None for entry in row['answers']['text'])
)

In [None]:
valid = valid.filter(
    # keep a row only when its flag agrees with its answer list: the flag must be
    # exactly True with no answer texts, or exactly False with at least one
    lambda row: row["is_impossible"] is (len(row["answers"]["text"]) == 0)
)

In [None]:
len(valid.filter(lambda row: row["is_impossible"] is False and len(row['answers']['text']) == 0))

In [None]:
len(valid.filter(lambda row: len(row['answers']['text']) > 0))

In [None]:
len(valid.filter(lambda row: len(row['answers']['text']) == 0))

In [None]:
valid = valid.remove_columns("is_impossible")

In [None]:
valid[0]

In [None]:
squad["validation"] = valid

In [None]:
len(squad["validation"])

## Errors in validation

In [None]:
original = load_dataset("squad_v2", split="validation", download_mode='force_redownload')

In [None]:
all_ids = set()

In [None]:
def add_to_all(row):
  """Record the row's "id" in the module-level `all_ids` set; returns nothing."""
  row_id = row["id"]
  all_ids.add(row_id)

In [None]:
original.map(add_to_all)

In [None]:
translated_ids = set()

In [None]:
def add_to_translated(row):
  """Record the row's "id" in the module-level `translated_ids` set; returns nothing."""
  row_id = row["id"]
  translated_ids.add(row_id)

In [None]:
valid.map(add_to_translated)

In [None]:
# ids present in the original split but absent from the translated one
errors = all_ids.difference(translated_ids)

In [None]:
error_entries = original.filter(lambda row: row["id"] in errors)

In [None]:
len(error_entries)

In [None]:
error_table = pd.DataFrame(columns=["en", "mt"])

In [None]:
text = set()

In [None]:
def add_to_set(row):
  """Add the row's title, question, context and every answer text to the module-level `text` set.

  Returns nothing; the only effect is growing `text`.
  """
  text.update((row["title"], row["question"], row["context"]))
  text.update(row["answers"]["text"])

In [None]:
error_entries.map(add_to_set)

In [None]:
# Build the error sheet in one step instead of appending through
# error_table.loc[len(error_table)]: per-row enlargement is needlessly slow, and
# re-running the cell would append the same rows a second time.
# "en" holds each collected string, "mt" starts out as an empty string.
error_table = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
error_table.to_excel("squad_v2_valid_errors.xlsx", index=False)

Translate the errors and concatenate them to the translations DataFrame. Then run "Translating validation" again to check whether it yields an improvement.

In [None]:
error_table = pd.read_excel("squad_v2_valid_errors.xlsx")

In [None]:
assert len((error_table['en'].where(error_table['mt'] == '#VALUE!').dropna().values)) == 0

In [None]:
translations = pd.concat([translations, error_table], ignore_index=True)

In [None]:
translations.to_excel("translated_valid.xlsx", index=False)

## Export dataset

In [None]:
# Download in official HF format
squad.save_to_disk("SQuAD_V2")

In [None]:
# Download as JSON
for split, dataset in squad.items():
    dataset.to_json(f"squad-v2-{split}.json")