## Requirements

In [2]:
import pandas as pd
from datasets import Dataset, load_dataset, concatenate_datasets
from uuid import uuid4

## Load dataset

In [3]:
# Pick which SQuAD variant this notebook enriches; `version` is used to build
# the input and output file names.
is_v1 = True
if is_v1:
  version = "squad_v1"
else:
  version = "squad_v2"

In [4]:
# Load the `<version>_train.arrow` file from the working directory as a Dataset.
squad = Dataset.from_file(f"{version}_train.arrow")

## Structural analysis

In [5]:
# Number of examples in the dataset before enrichment.
len(squad)

53607

In [6]:
# Field names of one example, taken from the keys of the first row.
columns = squad[0].keys()
columns

dict_keys(['id', 'title', 'context', 'question', 'answers'])

In [7]:
# Sub-fields stored under 'answers' in the first row.
squad[0]['answers'].keys()

dict_keys(['text', 'answer_start'])

## Load data to be used to add entries

In [8]:
# Read the enrichment rows; the following cells use the 'context', 'question'
# and 'answer' columns.
# NOTE(review): with pandas' default NA handling, empty cells (and tokens such
# as "NA") are parsed as NaN floats, on which the `.strip()` calls in the next
# cells would raise — consider dtype=str / keep_default_na=False if that can occur.
df = pd.read_csv("enrichment_working_data.csv", encoding='utf-8')

In [9]:
# Whitespace-trimmed 'context' values, dropping any that are empty after trimming.
contexts = [text for text in (c.strip() for c in df['context']) if text]

In [10]:
# Whitespace-trimmed 'question' values, dropping any that are empty after trimming.
questions = [text for text in (q.strip() for q in df['question']) if text]

In [11]:
# Whitespace-trimmed 'answer' values, dropping any that are empty after trimming.
answers = [text for text in (a.strip() for a in df["answer"]) if text]

In [12]:
# Each column was filtered independently, so equal lengths alone do not prove
# that the i-th context, question and answer still come from the same CSV row
# (e.g. one blank context and one blank question on different rows).
# Requiring every list to keep all len(df) rows guarantees that nothing was
# dropped and the three lists are still row-aligned.
assert len(contexts) == len(questions) == len(answers) == len(df), (
    "blank cells in enrichment data: "
    f"{len(df)} rows, {len(contexts)} contexts, "
    f"{len(questions)} questions, {len(answers)} answers"
)

In [13]:
# Report how many entries will actually be added: the new records are built
# from the filtered lists (already asserted to have equal length), not from
# the raw CSV row count.
f"Enriching with {len(contexts)} entries"

'Enriching with 1000 entries'

## Create new entries

In [14]:
# Start with an empty list that will collect the new SQuAD-style records.
new = []

In [15]:
# Build one SQuAD-style record per (question, context, answer) triple.
# The records are collected in a cell-local list so this cell can be re-run on
# its own: appending to `new` and then rebinding `new` to a Dataset made a
# second run fail on `new.append`.
new_records = [
  {
      'id': str(uuid4()),  # fresh random identifier for every entry
      'title': 'Artikli',
      'context': context,
      'question': question,
      'answers': {
          "text": [answer],
          # str.find returns -1 when the answer is not a substring of the context
          "answer_start": [context.find(answer)]
      }
  }
  for question, context, answer in zip(questions, contexts, answers)
]

new = Dataset.from_list(new_records)

In [16]:
# Count the generated entries whose recorded start offset does not point at
# the answer text inside the context, printing the position and id of each.
errors = 0

for position, record in enumerate(new):
  passage = record["context"]
  expected = record["answers"]["text"][0]
  offset = record["answers"]["answer_start"][0]
  # The slice starting at the recorded offset must reproduce the answer exactly.
  if passage[offset:offset + len(expected)] == expected:
    continue
  errors += 1
  print(f"Answer not found at {position} --> {record['id']}")

In [17]:
# Stop here if any generated entry failed the span check.
assert errors == 0

## Append to SQuAD

In [18]:
# Remember the dataset size so its growth can be checked after appending.
before = len(squad)

In [19]:
# Display the size recorded before appending.
before

53607

In [20]:
# Rebuild the dataset from its rows before concatenating.
# NOTE(review): presumably done so its features are inferred the same way as
# for `new` (also created with from_list) and the two schemas match — confirm.
squad = Dataset.from_list(squad)

In [21]:
# Append the new entries after the existing rows.
# NOTE: `squad` is rebound here, so re-running this cell appends `new` again.
squad = concatenate_datasets([squad, new])

In [22]:
# Dataset size once the new entries have been appended.
after = len(squad)

In [27]:
# Display the size recorded after appending.
after

54607

In [23]:
# The enriched dataset must contain exactly the original rows plus the new
# entries. A plain `before < after` would also pass if the concatenation cell
# had been run more than once or if only part of `new` had been appended.
assert after == before + len(new), (
    f"expected {before + len(new)} rows after enrichment, got {after}"
)

## Export

In [24]:
# Persist the enriched dataset to the `enriched_<version>_train` directory.
squad.save_to_disk(f"enriched_{version}_train")

Saving the dataset (0/1 shards):   0%|          | 0/54607 [00:00<?, ? examples/s]

In [26]:
# Also export the enriched dataset as a JSON file.
# NOTE(review): this file name uses "-train" while the Arrow input and the
# save_to_disk directory use "_train" — confirm the hyphen is intended.
squad.to_json(f"enriched_{version}-train.json")

Creating json from Arrow format:   0%|          | 0/55 [00:00<?, ?ba/s]

62829350