# Download SQuAD Dataset and preprocess
- Download Train + eval
- tokenize data and write to separate files (context, question, answers)

## Download

In [17]:
from preprocess import download_squad_dataset

train, eval = download_squad_dataset()

In [18]:
print(len(train["data"]))
print(len(eval["data"]))

442
48


## Tokenization

- tokenization function (Stanford coreNLP tokenizer python only alternative)
- mapping function: (context, context_tokens) -> dictionary mapping char indices to tokens: <br>
example ("this is a test", [this, is, a, test]) ---> 0,1,2,3 -> ("this",0), 5,6 -> ("is",1), ... etc. 

In [138]:
import stanza

nlp = stanza.Pipeline(lang = "en", processors="tokenize", tokenize_pretokenized = False)

def tokenize(text):
    text = text.lower()
    doc = nlp(text)
    
    tokens = []
    for sen in doc.sentences:
        for token in sen.tokens:
            tokens.append(token.text)
    return tokens



2025-06-11 16:02:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 14.1MB/s]                    
2025-06-11 16:02:56 INFO: Downloaded file to /home/luca/stanza_resources/resources.json
2025-06-11 16:02:56 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-06-11 16:02:56 INFO: Using device: cpu
2025-06-11 16:02:56 INFO: Loading: tokenize
2025-06-11 16:02:56 INFO: Loading: mwt
2025-06-11 16:02:56 INFO: Done loading processors!


In [135]:
def mapCharToToken(context, tokens):

    concat = ""
    curr = 0
    mapping = {}

    for i, char in enumerate(context):
        if char != ' ' and char != '\n':
            concat += char
            ctoken = tokens[curr]
            if concat == ctoken:
                start = i - len(concat) + 1
                for loc in range(start, i+1):
                    mapping[loc] = (concat, curr)
                concat = ""
                curr += 1
    if curr != len(tokens):
        return None
    else:
        return mapping




In [None]:
import os

i = 0
mappingissues = 0
spanissues = 0
tokenissues = 0
dataset = []

for article in train["data"]:
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        context.replace("''",'" ') 
        context.replace("``",'" ') 
        context_tokens = tokenize(context)
        context = context.lower()

        mapping = mapCharToToken(context, context_tokens)

        if mapping is None:
            mappingissues += 1
            print(article["title"])
            continue
        
        for qa in paragraph["qas"]:
            question_tokens = tokenize(qa["question"])

            answer_text = qa["answers"][0]["text"].lower()
            answer_start = qa["answers"][0]["answer_start"]
            answer_end = answer_start + len(answer_text)

            if context[answer_start:answer_end] != answer_text:
                spanissues += 1
                continue

            answer_start_wordloc = mapping[answer_start][1]
            answer_end_wordloc = mapping[answer_end-1][1]

            answer_tokens = context_tokens[answer_start_wordloc:answer_end_wordloc+1]

            if "".join(answer_tokens) != "".join(answer_text.split()):
                tokenissues += 1
                continue
            dataset.append((' '.join(context_tokens), ' '.join(question_tokens), ' '.join(answer_tokens), ' '.join([str(answer_start_wordloc), str(answer_end_wordloc)])))

2008_Summer_Olympics_torch_relay
MP3
ASCII
ASCII
ASCII
Franco-Prussian_War
Franco-Prussian_War
Franco-Prussian_War
Eritrea
Hellenistic_period
Hellenistic_period
Presbyterianism
Presbyterianism
Pope_Paul_VI
Avicenna
Sahara
Paris
Muammar_Gaddafi
The_Bronx
mappingissues: 19
spanissues: 23
tokenissues: 19


NameError: name '__file__' is not defined

In [144]:
print(f"mappingissues: {mappingissues}")
print(f"spanissues: {spanissues}")
print(f"tokenissues: {mappingissues}")

current_dir = os.path.dirname(os.path.abspath(os.getcwd()))

context_file_path = os.path.join(current_dir, "data.context")
question_file_path = os.path.join(current_dir, "data.question")
answer_file_path = os.path.join(current_dir, "data.answer")
span_file_path = os.path.join(current_dir, "data.span")

with open(context_file_path,"w") as context_f, \
     open(question_file_path,"w") as question_f, \
     open(answer_file_path,"w") as answer_f, \
     open(span_file_path,"w") as span_f:
    
    for data in dataset: 
        (context, question, answer, span) = data

        context_f.write(context + "\n") 
        question_f.write(question + "\n") 
        answer_f.write(answer + "\n") 
        span_f.write(span + "\n") 

mappingissues: 19
spanissues: 23
tokenissues: 19
