## Requirements

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

## Obtaining a spreadsheet of text which needs to be translated

In [None]:
# Load SQuAD; the returned object is indexed by split name ("train", "validation") below.
squad = load_dataset("squad")

In [None]:
# Peek at one record to see the fields used below (title, context, question, answers).
squad["train"][0]

In [None]:
# Accumulator for every string that needs translating; a set collapses duplicates.
text = set()

In [None]:
def add_to_set(row):
  """Record every translatable string of one row in the global ``text`` set.

  The row's title, question and context are added, followed by each of its
  answer texts. Nothing is returned; the function exists for this side effect.
  """
  text.update((row["title"], row["question"], row["context"]))
  text.update(row["answers"]["text"])

In [None]:
# map() is used only to visit every train row; add_to_set returns None and the result is not kept.
squad["train"].map(add_to_set)

In [None]:
# Empty two-column sheet: "en" holds the source text, "mt" will hold its translation.
df = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Build the sheet in one shot: one row per collected string, with an empty "mt"
# cell to be filled in by the translation step. Appending through
# df.loc[len(df.index)] re-allocates the frame on every insert (very slow for a
# large set) and would append duplicates if this cell were run twice.
df = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# Write only the "en"/"mt" columns. Without index=False pandas also writes the
# row index, which comes back as a stray "Unnamed: 0" column when the file is
# re-read; the later export of this same file already uses index=False.
df.to_excel("translated_train.xlsx", index=False)

In [None]:
# Size of the translation job for the train split.
f"Train contains {len(df)} pieces of text to translate"

In [None]:
# Start from an empty set again so the validation sheet does not include the train strings.
text = set()

In [None]:
# Visit every validation row to collect its strings; map() is used only for the side effect.
squad["validation"].map(add_to_set)

In [None]:
# Fresh empty sheet for the validation strings ("en" = source text, "mt" = translation).
df = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Build the sheet in one shot: one row per collected string, with an empty "mt"
# cell to be filled in by the translation step. Appending through
# df.loc[len(df.index)] re-allocates the frame on every insert (very slow for a
# large set) and would append duplicates if this cell were run twice.
df = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# Write only the "en"/"mt" columns. Without index=False pandas also writes the
# row index, which comes back as a stray "Unnamed: 0" column when the file is
# re-read; the later export of this same file already uses index=False.
df.to_excel("translated_valid.xlsx", index=False)

In [None]:
# Size of the translation job for the validation split.
f"Validation contains {len(df)} pieces of text to translate"

Upload the output spreadsheets to Google Sheets and use the ```GOOGLETRANSLATE(text, source language, target language)``` function to fill the `mt` column with the translation of the `en` column.

Import the resulting spreadsheets back here (the cells below read them as `translated_train.xlsx` and `translated_valid.xlsx`).

## Translating train

In [None]:
# Reload a pristine English copy to translate; force_redownload asks the library to
# download the data again instead of reusing what is cached locally.
squad = load_dataset("squad", download_mode='force_redownload')

In [None]:
# Merge the main translation sheet with the re-translated error sheets into one table.
# "squad_v1_train_errors.xlsx" is written by the "Errors in train" section further down,
# so all three files must already exist on disk before this cell can run.
# NOTE(review): "squad_v1_train_errors (1).xlsx" is presumably a second downloaded copy
# of the error sheet — confirm where it comes from.
# First pass (no error sheets yet) used only the main sheet:
# translations = pd.read_excel("translated_train.xlsx")
translations = pd.concat([pd.read_excel("translated_train.xlsx"), pd.read_excel("squad_v1_train_errors.xlsx"), pd.read_excel("squad_v1_train_errors (1).xlsx")], ignore_index=True)

In [None]:
# '#VALUE!' is a spreadsheet error marker (presumably a failed GOOGLETRANSLATE call):
# stop if any row with a source text still carries it instead of a translation.
assert translations.loc[translations['mt'] == '#VALUE!', 'en'].count() == 0

In [None]:
# Lookup table from English text to its translation, filled from `translations` below.
en_mt_dict = dict()

In [None]:
# Fill the English -> translation lookup from the spreadsheet rows.
# An empty "mt" cell is read back by pandas as NaN, and str(NaN) is the literal
# string "nan"; storing that would make an untranslated text look translated.
# Such rows are skipped so the lookup misses, the dataset row is filtered out,
# and it is picked up again by the "Errors in train" section.
for _, row in translations.iterrows():
    if pd.isna(row["mt"]):
        continue
    en_mt_dict[str(row["en"])] = str(row["mt"])

In [None]:
def translate_row(row):
    """Swap the English fields of a train row for their looked-up translations.

    Title, context, question and the first answer text are each looked up in
    ``en_mt_dict`` (keys are the ``str()`` of the English value); a field with
    no entry becomes ``None``. The row is modified in place and returned.
    """
    lookup = en_mt_dict.get
    for field in ('title', 'context', 'question'):
        row[field] = lookup(str(row[field]))
    first_answer = row['answers']['text'][0]
    row['answers']['text'][0] = lookup(str(first_answer))
    return row

In [None]:
# Translate every train row; fields without a dictionary entry come back as None.
squad["train"] = squad["train"].map(translate_row)

In [None]:
def update_answer_start(row):
    """Recompute the offset of the first answer inside the translated context.

    The search is case-insensitive and stores the position of the first
    occurrence, or -1 when the answer does not occur. If the context or the
    first answer is ``None`` the offset is set to -1 as well. The row is
    modified in place and returned.
    """
    context = row['context']
    # Only look at the answer when there is a context to search in.
    answer = row['answers']['text'][0] if context is not None else None
    if answer is None:
        position = -1
    else:
        position = context.lower().find(answer.lower())
    row['answers']['answer_start'][0] = position
    return row

In [None]:
# Recompute each answer offset inside the translated context (-1 when it cannot be located).
squad["train"] = squad["train"].map(update_answer_start)

In [None]:
# Keep only rows that were fully translated and whose answer can be located in
# the translated context.
# NOTE: answer_start is intentionally not part of the truthiness test below.
# An offset of 0 (answer at the very start of the context) is valid but falsy,
# so testing it for truth silently discarded those rows; a failed lookup is
# already signalled by -1 and rejected explicitly.
squad["train"] = squad["train"].filter(
    lambda row:
    # every text field must have a non-empty translation
    (row["id"] and row["title"] and row["context"] and row["question"] and row['answers']['text'][0])
    and
    # the translated question must still end with a question mark
    (row['question'][-1] == "?")
    and
    # -1 means the answer was not found in the context
    (row['answers']['answer_start'][0] != -1)
    and
    # the stored offset must really point at the answer (ignoring case)
    (
        row['context'][row['answers']['answer_start'][0] : row['answers']['answer_start'][0] + len(row['answers']['text'][0])].lower()
        ==
        row['answers']['text'][0].lower()
    )
)

In [None]:
# Sanity check: number of remaining train rows that have at least one answer text.
len(squad["train"].filter(lambda row: len(row['answers']['text']) > 0))

In [None]:
# Sanity check: number of remaining train rows with no answer text at all.
len(squad["train"].filter(lambda row: len(row['answers']['text']) == 0))

In [None]:
# Number of train rows that survived translation and filtering.
len(squad["train"])

## Errors in train

In [None]:
# Alias for the translated and filtered train split produced in the previous section.
translated = squad["train"]

In [None]:
# Untouched English train split, used as the reference for which ids should exist.
original = load_dataset("squad", split="train", download_mode='force_redownload')

In [None]:
# Ids of every row in the original split (filled by add_to_all).
all_ids = set()

In [None]:
def add_to_all(row):
  """Remember the id of one original-dataset row in the global ``all_ids`` set."""
  row_id = row["id"]
  all_ids.add(row_id)

In [None]:
# Visit every original row to collect its id; map() is used only for the side effect.
original.map(add_to_all)

In [None]:
# Ids of the rows that made it through translation and filtering (filled by add_to_translated).
translated_ids = set()

In [None]:
def add_to_translated(row):
  """Remember the id of one translated row in the global ``translated_ids`` set."""
  row_id = row["id"]
  translated_ids.add(row_id)

In [None]:
# Visit every surviving row to collect its id; map() is used only for the side effect.
translated.map(add_to_translated)

In [None]:
# Ids present in the original split but missing from the translated one.
errors = all_ids - translated_ids

In [None]:
# Original (English) rows whose ids are in `errors`, i.e. the ones to translate again.
error_entries = original.filter(lambda row: row["id"] in errors)

In [None]:
# Number of original rows that did not survive translation and filtering.
len(error_entries)

In [None]:
# Empty sheet for the strings of the failed rows ("en" = source text, "mt" = translation).
error_table = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Reset the accumulator so it only receives strings from the failed rows.
text = set()

In [None]:
def add_to_set(row):
  """Add the title, question, context and all answer texts of a row to ``text``.

  ``text`` is the module-level set of strings to translate; the function
  returns nothing and is called only for this side effect.
  """
  for field in ("title", "question", "context"):
    text.add(row[field])
  text.update(row["answers"]["text"])

In [None]:
# Collect the strings of the failed rows; map() is used only for the side effect.
error_entries.map(add_to_set)

In [None]:
# Build the error sheet in one shot: one row per collected string, with an
# empty "mt" cell to be filled in by the translation step. Appending through
# error_table.loc[len(...)] re-allocates the frame on every insert and would
# append duplicates if this cell were run twice.
error_table = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# Export the strings to re-translate; index=False keeps only the "en"/"mt" columns.
error_table.to_excel("squad_v1_train_errors.xlsx", index=False)

Translate the errors and concatenate them to the `translations` DataFrame. Then run the "Translating train" section again.

In [None]:
# Read the error sheet back; its "mt" column is expected to have been filled in by now.
error_table = pd.read_excel("squad_v1_train_errors.xlsx")

In [None]:
# '#VALUE!' is a spreadsheet error marker (presumably a failed GOOGLETRANSLATE call):
# stop if any row with a source text still carries it instead of a translation.
assert error_table.loc[error_table['mt'] == '#VALUE!', 'en'].count() == 0

In [None]:
# Append the newly translated error rows to the main translation table.
# NOTE(review): the cell that builds `translations` in "Translating train" already reads
# squad_v1_train_errors.xlsx, so these rows may end up in the table twice — confirm this
# is intended. Re-running this cell also appends the rows again.
translations = pd.concat([translations, error_table], ignore_index=True)

In [None]:
# Overwrite the main train sheet with the merged translations.
translations.to_excel("translated_train.xlsx", index=False)

## Translating validation

In [None]:
# Load a pristine English copy of the validation split to translate.
valid = load_dataset("squad", split="validation", download_mode='force_redownload')

In [None]:
# Translated validation sheet; replaces the train `translations` table from here on.
translations = pd.read_excel("translated_valid.xlsx")

In [None]:
# Start a fresh English -> translation lookup for the validation strings.
en_mt_dict = dict()

In [None]:
# Fill the English -> translation lookup from the spreadsheet rows.
# An empty "mt" cell is read back by pandas as NaN, and str(NaN) is the literal
# string "nan"; storing that would make an untranslated text look translated.
# Such rows are skipped so the lookup simply has no entry for them.
for _, row in translations.iterrows():
    if pd.isna(row["mt"]):
        continue
    en_mt_dict[str(row["en"])] = str(row["mt"])

In [None]:
def translate_row(row):
    """Replace the English fields of a validation row with their translations.

    Title, context and question are looked up in ``en_mt_dict`` and become
    ``None`` when no translation exists. Every answer text is translated too;
    an answer without a translation is left unchanged, so later string
    operations on the answers never meet ``None``.

    The row is modified in place and returned.
    """
    row['title'] = en_mt_dict.get(str(row['title']))
    row['context'] = en_mt_dict.get(str(row['context']))
    row['question'] = en_mt_dict.get(str(row['question']))
    # Assign a new list: rebinding the loop variable (`entry = ...`) never
    # touched the row, so the answers used to stay in English.
    row['answers']['text'] = [
        en_mt_dict.get(str(entry), entry) for entry in row['answers']['text']
    ]
    return row

In [None]:
# Apply the dictionary lookup to every validation row.
valid = valid.map(translate_row)

In [None]:
def update_answer_start(row):
    """Recompute the answer offsets of a validation row against its context.

    Each answer text is searched case-insensitively in the context. Answers
    that are found are kept, paired with the offset of their first occurrence;
    answers that are ``None``, empty, or absent from the context are dropped.
    A row whose context is ``None`` is returned untouched.
    """
    if row['context'] is None:
        return row
    corpus = row['context'].lower()
    starts = []
    answers = []
    for entry in row['answers']['text']:
        # Skip missing or empty answers: None has no .lower(), and
        # str.find("") always "succeeds" at offset 0, which would keep a
        # meaningless empty answer.
        if not entry:
            continue
        search = corpus.find(entry.lower())
        if search != -1:
            answers.append(entry)
            starts.append(search)
    row['answers']['answer_start'] = starts
    row['answers']['text'] = answers
    return row

In [None]:
# Recompute the answer offsets of every validation row, dropping answers that cannot be located.
valid = valid.map(update_answer_start)

In [None]:
def answers_start_match(row):
    """Check, answer by answer, that each stored offset points at its answer text.

    Returns a list with one boolean per entry of ``answer_start``: ``True``
    when the context slice that begins at that offset and is as long as the
    answer equals the answer, ignoring case.
    """
    def offset_matches(index, start):
        answer = row['answers']['text'][index]
        window = row['context'][start : start + len(answer)]
        return window.lower() == answer.lower()

    return [
        offset_matches(index, start)
        for index, start in enumerate(row['answers']['answer_start'])
    ]

In [None]:
def _is_clean_validation_row(row):
    """Tell whether a translated validation row is complete and consistent.

    A row is kept when id, title, context and question are all non-empty, it
    still has at least one answer with an offset, the question ends with "?",
    and every offset points at its answer text.
    """
    if not (row["id"] and row["title"] and row["context"] and row["question"]):
        return False
    answers = row['answers']
    if len(answers['text']) == 0 or len(answers['answer_start']) == 0:
        return False
    if row['question'][-1] != "?":
        return False
    return all(answers_start_match(row))


valid = valid.filter(_is_clean_validation_row)

In [None]:
# Sanity check: number of remaining validation rows that have at least one answer text.
len(valid.filter(lambda row: len(row['answers']['text']) > 0))

In [None]:
# Sanity check: number of remaining validation rows with no answer text at all.
len(valid.filter(lambda row: len(row['answers']['text']) == 0))

In [None]:
# Store the cleaned validation split next to the translated train split.
# `squad` is the object created in the "Translating train" section, which must have run first.
squad["validation"] = valid

In [None]:
# Number of validation rows that survived translation and filtering.
len(squad["validation"])

## Errors in validation

In [None]:
# Untouched English validation split, used as the reference for which ids should exist.
original = load_dataset("squad", split="validation", download_mode='force_redownload')

In [None]:
# Ids of every row in the original validation split (filled by add_to_all).
all_ids = set()

In [None]:
def add_to_all(row):
  """Store the ``id`` of an original-dataset row in the global ``all_ids`` set."""
  all_ids.update((row["id"],))

In [None]:
# Visit every original row to collect its id; map() is used only for the side effect.
original.map(add_to_all)

In [None]:
# Ids of the validation rows that made it through translation and filtering.
translated_ids = set()

In [None]:
def add_to_translated(row):
  """Store the ``id`` of a translated row in the global ``translated_ids`` set."""
  translated_ids.update((row["id"],))

In [None]:
# Visit every surviving validation row to collect its id; map() is used only for the side effect.
valid.map(add_to_translated)

In [None]:
# Ids present in the original validation split but missing from the translated one.
errors = all_ids.difference(translated_ids)

In [None]:
# Original (English) rows whose ids are in `errors`, i.e. the ones to translate again.
error_entries = original.filter(lambda row: row["id"] in errors)

In [None]:
# Number of original validation rows that did not survive translation and filtering.
len(error_entries)

In [None]:
# Empty sheet for the strings of the failed rows ("en" = source text, "mt" = translation).
error_table = pd.DataFrame(columns=["en", "mt"])

In [None]:
# Reset the accumulator so it only receives strings from the failed validation rows.
text = set()

In [None]:
def add_to_set(row):
  """Collect a row's title, question, context and answer texts into ``text``.

  ``text`` is the module-level set of strings to translate. The function has
  no return value; it is called for its side effect only.
  """
  text.update([row["title"], row["question"], row["context"]])
  answer_texts = row["answers"]["text"]
  text.update(answer_texts)

In [None]:
# Collect the strings of the failed rows; map() is used only for the side effect.
error_entries.map(add_to_set)

In [None]:
# Build the error sheet in one shot: one row per collected string, with an
# empty "mt" cell to be filled in by the translation step. Appending through
# error_table.loc[len(...)] re-allocates the frame on every insert and would
# append duplicates if this cell were run twice.
error_table = pd.DataFrame({"en": list(text)}).assign(mt="")

In [None]:
# Export the strings to re-translate; index=False keeps only the "en"/"mt" columns.
error_table.to_excel("squad_v1_valid_errors.xlsx", index=False)

Translate the errors and concatenate them to the `translations` DataFrame. Then run the "Translating validation" section again.

In [None]:
# Read the error sheet back; its "mt" column is expected to have been filled in by now.
error_table = pd.read_excel("squad_v1_valid_errors.xlsx")

In [None]:
# '#VALUE!' is a spreadsheet error marker (presumably a failed GOOGLETRANSLATE call):
# stop if any row with a source text still carries it instead of a translation.
assert error_table.loc[error_table['mt'] == '#VALUE!', 'en'].count() == 0

In [None]:
# Append the newly translated error rows to the validation translation table.
# Re-running this cell appends the same rows again.
translations = pd.concat([translations, error_table], ignore_index=True)

In [None]:
# Overwrite the main validation sheet with the merged translations.
translations.to_excel("translated_valid.xlsx", index=False)

## Export dataset

In [None]:
# Display the final dataset object with its splits.
squad

In [None]:
# Save the whole `squad` object to the "SQuAD_V1" directory.
squad.save_to_disk("SQuAD_V1")

In [None]:
# Write one JSON file per split, named after the split.
for split in squad.keys():
    squad[split].to_json(f"squad-v1-{split}.json")