In [12]:
from datasets import load_dataset, Dataset
import json

## SQuAD V1

In [13]:
# Load the original "squad" dataset and keep a handle on each of its two splits.
original_squad_v1 = load_dataset("squad")
original_v1_train, original_v1_valid = (
    original_squad_v1[split_name] for split_name in ("train", "validation")
)

In [14]:
# Read the translated V1 train / validation splits from their Arrow files.
translated_train_v1, translated_valid_v1 = map(
    Dataset.from_file,
    (
        "./SQuAD V1 Translated/squad_v1_train.arrow",
        "./SQuAD V1 Translated/squad_v1_valid.arrow",
    ),
)

In [15]:
# Row counts of each split, original first and translated second.
original_v1_train_len, original_v1_valid_len = map(
    len, (original_v1_train, original_v1_valid)
)
translated_train_v1_len, translated_valid_v1_len = map(
    len, (translated_train_v1, translated_valid_v1)
)

# Retention: translated row count expressed as a percentage of the original row count.
train_retention_v1 = translated_train_v1_len / original_v1_train_len * 100
valid_retention_v1 = translated_valid_v1_len / original_v1_valid_len * 100

In [16]:
# Build the V1 report one line at a time. The empty first and last entries
# reproduce the blank lines that frame the printed summary.
print(
    "\n".join(
        [
            "",
            "    Original SQuAD V1 contains:",
            f"      - Train = {original_v1_train_len}",
            f"      - Valid = {original_v1_valid_len}",
            "    Translated SQuAD V1 contains:",
            f"      - Train = {translated_train_v1_len}",
            f"      - Valid = {translated_valid_v1_len}",
            "    Retention when translating SQuAD V1:",
            f"      - Train = {train_retention_v1:.2f}",
            f"      - Valid = {valid_retention_v1:.2f}",
            "",
        ]
    )
)


    Original SQuAD V1 contains:
      - Train = 87599
      - Valid = 10570
    Translated SQuAD V1 contains:
      - Train = 58542
      - Valid = 3669
    Retention when translating SQuAD V1:
      - Train = 66.83
      - Valid = 34.71



In [17]:
# Gather the V1 counts and retention percentages into one JSON-serialisable mapping.
data = {
    "Original SQuAD V1": dict(
        Train=original_v1_train_len,
        Valid=original_v1_valid_len,
    ),
    "Translated SQuAD V1": dict(
        Train=translated_train_v1_len,
        Valid=translated_valid_v1_len,
    ),
    # Retention values are stored as strings formatted to two decimals, not as numbers.
    "Retention when translating SQuAD V1": {
        split: f"{pct:.2f}"
        for split, pct in (
            ("Train", train_retention_v1),
            ("Valid", valid_retention_v1),
        )
    },
}

# Persist the statistics next to the notebook (overwrites any existing file).
with open('squad_v1_stats.json', 'w') as f:
    f.write(json.dumps(data, indent=4))

## SQuAD V2

In [18]:
# Load the original "squad_v2" dataset.
original_squad_v2 = load_dataset("squad_v2")

# For each split, separate the rows whose answer-text list is non-empty
# ("ans") from the rows whose answer-text list is empty ("impossible").
original_v2_train = original_squad_v2['train']
original_v2_train_ans = original_v2_train.filter(
    lambda example: len(example['answers']['text']) > 0
)
original_v2_train_impossible = original_v2_train.filter(
    lambda example: len(example['answers']['text']) < 1
)

original_v2_valid = original_squad_v2['validation']
original_v2_valid_ans = original_v2_valid.filter(
    lambda example: len(example['answers']['text']) > 0
)
original_v2_valid_impossible = original_v2_valid.filter(
    lambda example: len(example['answers']['text']) < 1
)

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11873 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [19]:
# Translated V2 training split: read from its Arrow file, then partition the
# rows by whether the answer-text list is non-empty ("ans") or empty ("impossible").
translated_train_v2 = Dataset.from_file("./SQuAD V2 Translated/squad_v2_train.arrow")
translated_v2_train_ans = translated_train_v2.filter(
    lambda example: len(example['answers']['text']) > 0
)
translated_v2_train_impossible = translated_train_v2.filter(
    lambda example: len(example['answers']['text']) < 1
)

# Same partitioning for the translated V2 validation split.
translated_valid_v2 = Dataset.from_file("./SQuAD V2 Translated/squad_v2_valid.arrow")
translated_v2_valid_ans = translated_valid_v2.filter(
    lambda example: len(example['answers']['text']) > 0
)
translated_v2_valid_impossible = translated_valid_v2.filter(
    lambda example: len(example['answers']['text']) < 1
)

Filter:   0%|          | 0/102107 [00:00<?, ? examples/s]

Filter:   0%|          | 0/102107 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10452 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10452 [00:00<?, ? examples/s]

In [20]:
# Overall row counts per split, original first and translated second.
original_v2_train_len, original_v2_valid_len = map(
    len, (original_v2_train, original_v2_valid)
)
translated_train_v2_len, translated_valid_v2_len = map(
    len, (translated_train_v2, translated_valid_v2)
)

# Row counts of the has-answer / impossible subsets of the original data.
(
    original_v2_train_ans_len,
    original_v2_train_impossible_len,
    original_v2_valid_ans_len,
    original_v2_valid_impossible_len,
) = map(
    len,
    (
        original_v2_train_ans,
        original_v2_train_impossible,
        original_v2_valid_ans,
        original_v2_valid_impossible,
    ),
)

# Row counts of the same subsets in the translated data.
(
    translated_v2_train_ans_len,
    translated_v2_train_impossible_len,
    translated_v2_valid_ans_len,
    translated_v2_valid_impossible_len,
) = map(
    len,
    (
        translated_v2_train_ans,
        translated_v2_train_impossible,
        translated_v2_valid_ans,
        translated_v2_valid_impossible,
    ),
)

# Retention: translated row count expressed as a percentage of the matching
# original row count, for the whole splits and for each subset.
train_retention_v2 = translated_train_v2_len / original_v2_train_len * 100
valid_retention_v2 = translated_valid_v2_len / original_v2_valid_len * 100
train_ans_retention_v2 = translated_v2_train_ans_len / original_v2_train_ans_len * 100
train_impossible_retention_v2 = translated_v2_train_impossible_len / original_v2_train_impossible_len * 100
valid_ans_retention_v2 = translated_v2_valid_ans_len / original_v2_valid_ans_len * 100
valid_impossible_retention_v2 = translated_v2_valid_impossible_len / original_v2_valid_impossible_len * 100


In [21]:
# Build the V2 report one line at a time. The empty first and last entries
# reproduce the blank lines that frame the printed summary.
print(
    "\n".join(
        [
            "",
            "    Original SQuAD V2 contains:",
            f"      - Train = {original_v2_train_len}",
            f"      - Valid = {original_v2_valid_len}",
            f"      - Train (Has Ans) = {original_v2_train_ans_len}",
            f"      - Train (Impossible) = {original_v2_train_impossible_len}",
            f"      - Valid (Has Ans) = {original_v2_valid_ans_len}",
            f"      - Valid (Impossible) = {original_v2_valid_impossible_len}",
            "    Translated SQuAD V2 contains:",
            f"      - Train = {translated_train_v2_len}",
            f"      - Valid = {translated_valid_v2_len}",
            f"      - Train (Has Ans) = {translated_v2_train_ans_len}",
            f"      - Train (Impossible) = {translated_v2_train_impossible_len}",
            f"      - Valid (Has Ans) = {translated_v2_valid_ans_len}",
            f"      - Valid (Impossible) = {translated_v2_valid_impossible_len}",
            "    Retention when translating SQuAD V2:",
            f"      - Train = {train_retention_v2:.2f}",
            f"      - Valid = {valid_retention_v2:.2f}",
            f"      - Train (Has Ans) = {train_ans_retention_v2:.2f}",
            f"      - Train (Impossible) = {train_impossible_retention_v2:.2f}",
            f"      - Valid (Has Ans) = {valid_ans_retention_v2:.2f}",
            f"      - Valid (Impossible) = {valid_impossible_retention_v2:.2f}",
            "",
        ]
    )
)


    Original SQuAD V2 contains:
      - Train = 130319
      - Valid = 11873
      - Train (Has Ans) = 86821
      - Train (Impossible) = 43498
      - Valid (Has Ans) = 5928
      - Valid (Impossible) = 5945
    Translated SQuAD V2 contains:
      - Train = 102107
      - Valid = 10452
      - Train (Has Ans) = 59006
      - Train (Impossible) = 43101
      - Valid (Has Ans) = 4550
      - Valid (Impossible) = 5902
    Retention when translating SQuAD V2:
      - Train = 78.35
      - Valid = 88.03
      - Train (Has Ans) = 67.96
      - Train (Impossible) = 99.09
      - Valid (Has Ans) = 76.75
      - Valid (Impossible) = 99.28



In [22]:
# Gather the V2 counts and retention percentages into one JSON-serialisable mapping.
data = {
    "Original SQuAD V2": {
        "Train": original_v2_train_len,
        "Valid": original_v2_valid_len,
        "Train (Has Ans)": original_v2_train_ans_len,
        "Train (Impossible)": original_v2_train_impossible_len,
        "Valid (Has Ans)": original_v2_valid_ans_len,
        "Valid (Impossible)": original_v2_valid_impossible_len,
    },
    "Translated SQuAD V2": {
        "Train": translated_train_v2_len,
        "Valid": translated_valid_v2_len,
        "Train (Has Ans)": translated_v2_train_ans_len,
        "Train (Impossible)": translated_v2_train_impossible_len,
        "Valid (Has Ans)": translated_v2_valid_ans_len,
        "Valid (Impossible)": translated_v2_valid_impossible_len,
    },
    # Retention values are stored as strings formatted to two decimals, not as numbers.
    "Retention when translating SQuAD V2": {
        label: f"{pct:.2f}"
        for label, pct in (
            ("Train", train_retention_v2),
            ("Valid", valid_retention_v2),
            ("Train (Has Ans)", train_ans_retention_v2),
            ("Train (Impossible)", train_impossible_retention_v2),
            ("Valid (Has Ans)", valid_ans_retention_v2),
            ("Valid (Impossible)", valid_impossible_retention_v2),
        )
    },
}

# Persist the statistics next to the notebook (overwrites any existing file).
with open('squad_v2_stats.json', 'w') as f:
    f.write(json.dumps(data, indent=4))
