# Download SQuAD Dataset and preprocess
- Download Train + eval
- tokenize data and write to separate files (context, question, answers)

## Download

In [3]:
from preprocess import download_squad_dataset

train, eval = download_squad_dataset()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
print(len(train["data"]))
print(len(eval["data"]))

442
48


## Tokenization

- tokenization function (Stanford coreNLP tokenizer python only alternative)
- mapping function: (context, context_tokens) -> dictionary mapping char indices to tokens: <br>
example ("this is a test", [this, is, a, test]) ---> 0,1,2,3 -> ("this",0), 5,6 -> ("is",1), ... etc. 

In [5]:
import stanza

nlp = stanza.Pipeline(lang = "en", processors="tokenize", tokenize_pretokenized = False)

def tokenize(text):
    text = text.lower()
    doc = nlp(text)
    
    tokens = []
    for sen in doc.sentences:
        for token in sen.tokens:
            tokens.append(token.text)
    return tokens



2025-06-11 19:41:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 26.7MB/s]                    
2025-06-11 19:41:14 INFO: Downloaded file to /home/luca/stanza_resources/resources.json
2025-06-11 19:41:14 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-06-11 19:41:14 INFO: Using device: cpu
2025-06-11 19:41:14 INFO: Loading: tokenize
2025-06-11 19:41:16 INFO: Loading: mwt
2025-06-11 19:41:16 INFO: Done loading processors!


In [6]:
def mapCharToToken(context, tokens):

    concat = ""
    curr = 0
    mapping = {}

    for i, char in enumerate(context):
        if char != ' ' and char != '\n':
            concat += char
            ctoken = tokens[curr]
            if concat == ctoken:
                start = i - len(concat) + 1
                for loc in range(start, i+1):
                    mapping[loc] = (concat, curr)
                concat = ""
                curr += 1
    if curr != len(tokens):
        return None
    else:
        return mapping




In [None]:
import os

i = 0
mappingissues = 0
spanissues = 0
tokenissues = 0
dataset = []

for article in eval["data"]:
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        context.replace("''",'" ') 
        context.replace("``",'" ') 
        context_tokens = tokenize(context)
        context = context.lower()

        mapping = mapCharToToken(context, context_tokens)

        if mapping is None:
            mappingissues += 1
            print(article["title"])
            continue
        
        for qa in paragraph["qas"]:
            question_tokens = tokenize(qa["question"])

            answer_text = qa["answers"][0]["text"].lower()
            answer_start = qa["answers"][0]["answer_start"]
            answer_end = answer_start + len(answer_text)

            if context[answer_start:answer_end] != answer_text:
                spanissues += 1
                continue

            answer_start_wordloc = mapping[answer_start][1]
            answer_end_wordloc = mapping[answer_end-1][1]

            answer_tokens = context_tokens[answer_start_wordloc:answer_end_wordloc+1]

            if "".join(answer_tokens) != "".join(answer_text.split()):
                tokenissues += 1
                continue
            dataset.append((' '.join(context_tokens), ' '.join(question_tokens), ' '.join(answer_tokens), ' '.join([str(answer_start_wordloc), str(answer_end_wordloc)])))

print(f"mappingissues: {mappingissues}")
print(f"spanissues: {spanissues}")
print(f"tokenissues: {mappingissues}")

mappingissues: 0
spanissues: 0
tokenissues: 0


In [124]:
import os

current_dir = os.path.dirname(os.path.abspath(os.getcwd()))

context_file_path = os.path.join(current_dir, "data.context")
question_file_path = os.path.join(current_dir, "data.question")
answer_file_path = os.path.join(current_dir, "data.answer")
span_file_path = os.path.join(current_dir, "data.span")

context_tokens = []
question_tokens = []
answer_tokens = []
span_tokens = []

with open(context_file_path,"w") as context_f, \
     open(question_file_path,"w") as question_f, \
     open(answer_file_path,"w") as answer_f, \
     open(span_file_path,"w") as span_f:
    
    for data in dataset: 
        (context, question, answer, span) = data

        context_f.write(context + "\n") 
        question_f.write(question + "\n") 
        answer_f.write(answer + "\n") 
        span_f.write(span + "\n") 

        context_tokens.append(context)
        question_tokens.append(question)
        answer_tokens.append(answer)
        span_tokens.append(span)
        


## Map tokens to embedding indices

- load GloVe embeddings
- map vocabulary to embedding indices


In [50]:
import numpy as np
import os 

glove_path = os.path.abspath(os.path.dirname(os.getcwd())) + "/glove_embeddings/glove.840B.300d.txt"
print(glove_path) 
assert os.path.exists(glove_path), ("glove embeddings missing!")
embedding_index = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        vals = line.split(' ')
        word = vals[0]
        coefs = np.asarray(vals[1:], dtype='float32')
        embedding_index[word] = coefs

print("Done! ", len(embedding_index),"words loaded")

/home/luca/workspace/dnlp2025/glove_embeddings/glove.840B.300d.txt
Done!  2196016 words loaded


In [136]:
word2idx = {}
idx2word = []
embedding_dim = 300
embedding_matrix = []

word2idx["[UNK]"] = 0
idx2word.append("[UNK]")
word2idx["[PAD]"] = 1
idx2word.append("[PAD]")
embedding_matrix.append(np.zeros(embedding_dim, dtype='float32'))
embedding_matrix.append(np.zeros(embedding_dim, dtype='float32'))

def get_or_create_index(token):
    token_lower = token.lower()
    if(token_lower) in word2idx:
        return word2idx[token_lower]
    else:
        idx = len(word2idx)
        word2idx[token_lower] = idx
        idx2word.append(token_lower)
        if token_lower in embedding_index:
            embedding_matrix.append(embedding_index[token_lower])
        else:
            embedding_matrix.append(np.random.normal(scale=0.01, size=embedding_dim))
        return idx

In [137]:
idxs = []
#do this for every token in contexts,question and answers
all_tokens = []
all_tokens.extend(context_tokens)
all_tokens.extend(question_tokens)
all_tokens.extend(answer_tokens)
i = 0
for tokens in all_tokens:
    if tokens is None:
        continue
    idx = [get_or_create_index(t) for t in tokens.split()]
    idxs.append(idx)
print(len(idxs))
print(len(word2idx))
print(len(idx2word))

31338
25764
25764


In [138]:
embedding_matrix = np.array(embedding_matrix, dtype='float32')
embedding_matrix.shape

(25764, 300)