# Download SQuAD Dataset and preprocess
- Download Train + eval
- tokenize data and write to separate files (context, question, answers)

## Download

In [2]:
import importlib
import preprocess
importlib.reload(preprocess)
from preprocess import download_squad_dataset, process_split, write_to_files

train, eval = download_squad_dataset()

In [3]:
print(len(train["data"]))
print(len(eval["data"]))

442
48


## Tokenization

- tokenization function (Stanford coreNLP tokenizer python only alternative)
- mapping function: (context, context_tokens) -> dictionary mapping char indices to tokens: <br>
example ("this is a test", [this, is, a, test]) ---> 0,1,2,3 -> ("this",0), 5,6 -> ("is",1), ... etc. 

In [4]:
import stanza

nlp = stanza.Pipeline(lang = "en", processors="tokenize", tokenize_pretokenized = False)
eval_dataset = process_split(eval, nlp)
e_context_tokens, e_question_tokens, e_answer_tokens, e_span_tokens = write_to_files(eval_dataset, "eval")

  from .autonotebook import tqdm as notebook_tqdm
2025-06-14 18:28:05 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 26.0MB/s]                    
2025-06-14 18:28:05 INFO: Downloaded file to /home/luca/stanza_resources/resources.json
2025-06-14 18:28:05 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-06-14 18:28:05 INFO: Using device: cpu
2025-06-14 18:28:05 INFO: Loading: tokenize
2025-06-14 18:28:07 INFO: Loading: mwt
2025-06-14 18:28:07 INFO: Done loading processors!
Processing articles: 100%|██████████| 48/48 [02:15<00:00,  2.83s/it]

mappingissues: 0
spanissues: 0
tokenissues: 0





In [5]:
#train_dataset = process_split(train)
e_context_tokens, e_question_tokens, e_answer_tokens, e_span_tokens = write_to_files(eval_dataset, "eval")
#t_context_tokens, t_question_tokens, t_answer_tokens, t_span_tokens = write_to_files(train_dataset, "train")

## Map tokens to embedding indices

- load GloVe embeddings
- map vocabulary to embedding indices


In [6]:
import numpy as np
import os 

glove_path = os.path.abspath(os.path.dirname(os.getcwd())) + "/glove_embeddings/glove.840B.300d.txt"
print(glove_path) 
assert os.path.exists(glove_path), ("glove embeddings file missing! Please download the correct embeddings and place them into the glove_embeddings directory")
embedding_index = {}
with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        vals = line.split(' ')
        word = vals[0]
        coefs = np.asarray(vals[1:], dtype='float32')
        embedding_index[word] = coefs

print("Done! ", len(embedding_index),"words loaded")

/home/luca/workspace/dnlp2025/glove_embeddings/glove.840B.300d.txt
Done!  2196016 words loaded


In [7]:
word2idx = {}
idx2word = []
embedding_dim = 300
embedding_matrix = []

word2idx["[PAD]"] = 0
word2idx["[UNK]"] = 1
idx2word.append("[PAD]")
idx2word.append("[UNK]")
embedding_matrix.append(np.zeros(embedding_dim, dtype='float32'))
embedding_matrix.append(np.zeros(embedding_dim, dtype='float32'))

def get_or_create_index(token):
    token_lower = token.lower()
    if(token_lower) in word2idx:
        return word2idx[token_lower]
    else:
        idx = len(word2idx)
        word2idx[token_lower] = idx
        idx2word.append(token_lower)
        if token_lower in embedding_index:
            embedding_matrix.append(embedding_index[token_lower])
        else:
            embedding_matrix.append(np.random.normal(scale=0.01, size=embedding_dim))
        return idx

In [8]:
sen_idxs = []
#do this for every token in contexts,question and answers
all_tokens = []
all_tokens.extend(e_context_tokens)
all_tokens.extend(e_question_tokens)
all_tokens.extend(e_answer_tokens)
print(len(all_tokens))
i = 0
for tokens in all_tokens:
    if tokens is None:
        continue
    idx = [get_or_create_index(t) for t in tokens.split()]
    sen_idxs.append(idx)
print(len(sen_idxs))
print(len(word2idx))
print(len(idx2word))
print(sen_idxs[0])

31338


31338
25764
25764
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 12, 15, 8, 16, 17, 18, 19, 20, 12, 21, 22, 23, 12, 7, 8, 24, 17, 25, 19, 13, 26, 27, 28, 12, 15, 8, 24, 17, 29, 19, 13, 30, 31, 32, 33, 34, 10, 35, 36, 37, 2, 3, 38, 23, 12, 9, 5, 39, 40, 41, 42, 43, 44, 43, 45, 46, 47, 48, 12, 49, 50, 51, 52, 45, 53, 54, 43, 55, 23, 56, 57, 5, 12, 58, 2, 3, 43, 12, 16, 59, 12, 60, 61, 62, 60, 63, 64, 65, 66, 67, 68, 43, 56, 69, 56, 70, 71, 12, 72, 14, 73, 74, 2, 3, 9, 63, 75, 76, 17, 77, 78, 12, 9, 79, 80, 81, 82, 56, 60, 2, 3, 83, 60, 19, 43, 84, 85, 12, 86, 87, 88, 89, 12, 90, 76, 4, 23]


In [9]:
embedding_matrix = np.array(embedding_matrix, dtype='float32')
embedding_matrix.shape

(25764, 300)

In [59]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio):
        super().__init__()
        self.hidden_dim = hidden_dim
        vocab_size, embedding_dim = embedding_matrix.shape
        embedding_tensor = torch.tensor(embedding_matrix, dtype=torch.float)
        
        self.embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=True, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 1, batch_first=True, dropout=dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

        self.w = nn.Linear(hidden_dim, hidden_dim)
        #self.b = nn.Parameter(torch.zeros(hidden_dim))

        self.sentinel = nn.Parameter(torch.randn(1,hidden_dim))

    def encode_sequence(self, idxs, mask):
        lengths = mask.sum(dim=1)  # [batch]
        sorted_lens, sorted_idx = lengths.sort(descending=True)
        _, orig_idx = sorted_idx.sort()

        # Sort sequences for packing
        idxs_sorted = idxs[sorted_idx]
        emb = self.embedding(idxs_sorted)
        packed = pack_padded_sequence(emb, sorted_lens.cpu(), batch_first=True, enforce_sorted=True)

        # LSTM encoding
        packed_out, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)  # [batch, max_len, hidden]
        out = self.dropout(out)
        out = out[orig_idx]  # restore original order

        # Insert sentinel at end-of-sequence index for each example
        batch_size = out.size(0)
        sentinel_expanded = self.sentinel.expand(batch_size, 1, self.hidden_dim)  # [batch, 1, hidden]

        out_with_sentinel = torch.cat([out, torch.zeros_like(sentinel_expanded)], dim=1)  # [batch, max_len+1, hidden]
        lens = lengths.long().unsqueeze(1).unsqueeze(2).expand(-1, 1, self.hidden_dim)  # [batch, 1, hidden]
        out_with_sentinel = out_with_sentinel.scatter(1, lens, sentinel_expanded)

        return out_with_sentinel  # [batch, seq_len + 1, hidden]

    def forward(self, doc_idxs, doc_mask, q_idxs, q_mask):

        D = self.encode_sequence(doc_idxs, doc_mask)  # [batch, m+1, hidden]
        Q_prime = self.encode_sequence(q_idxs, q_mask)  # [batch, n+1, hidden]

        # Nonlinear projection: Q = tanh(W * Q′ + b)
        Q = torch.tanh(self.w(Q_prime))  # [batch, n+1, hidden]

        return D, Q       

## Small Encoder test

In [61]:
import torch
import numpy as np

# Dummy vocab
word2idx_test = {
    "[PAD]": 0, "[UNK]": 1,
    "the": 2, "quick": 3, "brown": 4, "fox": 5, "jumps": 6, "over": 7, "lazy": 8, "dog": 9
}

# Random embedding matrix for vocab (vocab_size x emb_dim)
vocab_size = len(word2idx_test)
embedding_dim = 50
embedding_matrix_test = np.random.uniform(-0.1, 0.1, (vocab_size, embedding_dim))

# Dummy inputs
# Document: "the quick brown fox jumps"
# Question: "the fox"
doc_tokens = [2, 3, 4, 5, 6]
q_tokens = [2, 5]

# Padding to max length
doc_max_len = 6
q_max_len = 4
doc_input = [doc_tokens + [0] * (doc_max_len - len(doc_tokens))]  # batch size 1
q_input = [q_tokens + [0] * (q_max_len - len(q_tokens))]

# Masks (1 for real tokens, 0 for padding)
doc_mask = [[1]*len(doc_tokens) + [0]*(doc_max_len - len(doc_tokens))]
q_mask = [[1]*len(q_tokens) + [0]*(q_max_len - len(q_tokens))]

# Convert to tensors
doc_idxs = torch.tensor(doc_input)      # [1, 6]
doc_mask = torch.tensor(doc_mask)       # [1, 6]
q_idxs = torch.tensor(q_input)          # [1, 4]
q_mask = torch.tensor(q_mask)           # [1, 4]

hidden_size = 64
encoder = Encoder(hidden_size, embedding_matrix_test, 0)

# Run encoder
D, Q = encoder(doc_idxs, doc_mask, q_idxs, q_mask)

# Outputs
print("Document encoding shape:", D.shape)  # [1, m+1, 64]
print("Question encoding shape:", Q.shape)      # [1, n+1, 64]

print("\nSentinel vector (document):", D[0, -1])
print("Sentinel vector (question):", Q[0, -1])


Document encoding shape: torch.Size([1, 6, 64])
Question encoding shape: torch.Size([1, 3, 64])

Sentinel vector (document): tensor([-0.4656, -0.0565, -0.5351, -0.9820, -0.0663,  1.0698, -1.5193, -0.6821,
         0.0422,  0.7030,  0.7814, -0.9856,  0.0167, -0.8512,  0.1405, -1.1459,
         0.6573,  0.2178,  0.4201, -0.6437, -1.3230, -1.3122, -1.6510, -1.2029,
        -0.5488,  1.3250, -0.3112, -0.7705, -0.2678,  0.1188, -1.4871,  1.2600,
         0.4149, -0.1634,  0.2989, -0.0226,  0.8531, -0.1789, -0.3517,  0.2735,
        -0.8494,  0.8108, -0.2446, -0.1861,  1.5437, -0.4878,  0.9340, -0.7628,
        -1.0553,  1.0785, -1.8988,  0.3790,  0.7689, -0.8682, -0.3757,  1.5796,
         0.8995, -0.6768, -0.0731, -0.5980, -0.7956,  1.7212,  0.0497, -1.1951],
       grad_fn=<SelectBackward0>)
Sentinel vector (question): tensor([ 0.2639, -0.3731,  0.2077,  0.6458, -0.5799, -0.4256,  0.3102,  0.5333,
        -0.5420,  0.1103, -0.4383,  0.4475, -0.4170,  0.4575,  0.2951,  0.1116,
         0.0

In [12]:
import torch.nn.functional as functional

class BRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        return out

class CoattentionEncoder(nn.Module):
    def __init__(self, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size

        self.brnn = BRNN(input_size=3 * hidden_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers)

    #2.2 Coattention Encoder
    def forward(self, D, Q):
        #Affinity matrix
        L = torch.bmm(Q, torch.transpose(D, 1, 2)) 

        #Attention weights
        AQ = functional.softmax(L, dim=1)         
        AD = functional.softmax(torch.transpose(L, 1, 2), dim=1)  

        #Context Summaries
        CQ = torch.bmm(AQ, D) 
        Q_combined = torch.cat([Q, CQ], dim=2)   
        CD = torch.bmm(AD, Q_combined)

        #BRNN
        return self.brnn(torch.cat([D, CD], dim=2)) 

In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
coattention_enc = CoattentionEncoder(hidden_size).to(device)

U = coattention_enc(D, Q)  
print(U.shape)

b,m,l = list(U.size())
print(b,m,l)

torch.Size([1, 6, 128])
1 6 128


In [87]:
x = torch.tensor([[1,2,8,4],[1,2,3,7]])
y = torch.randn(4,4)
x_u = x.max(1)
print(x_u)



torch.return_types.max(
values=tensor([8, 7]),
indices=tensor([2, 3]))


## Decoder

In [None]:
class DynamicDecoder(nn.Module):

    def __init__(self, input_size ,hidden_dim, maxout_pool_size, max_steps, dropout_ratio):
        super().__init__()
        self.max_steps = max_steps
        self.lstm = nn.LSTM(input_size, hidden_dim, 1, batch_first=True, dropout=dropout_ratio)

        self.maxout_start = MaxOutHighWay(hidden_dim, maxout_pool_size, dropout_ratio)
        self.maxout_end = MaxOutHighWay(hidden_dim, maxout_pool_size, dropout_ratio)

    def forward(self, U, doc_pad_mask, target):
        b,m,_ = list(U.size())

        curr_change_mask_s, curr_change_mask_e = None, None

        masks_s, masks_e, results_s, results_e, losses = [], [], [], [], []

        # invert the document pad mask -> multiply padded values with smalles possible value -> no influence on loss computation
        pad_mask = (1.0-doc_pad_mask.float()) * torch.finfo(torch.float32).min

        idxs = torch.arange(0,b,out=torch.LongTensor(b))

        #init start and end index to 0 and last word in document
        s_idx_prev = torch.zeros(b,).long()
        # sum evaluates to all words in document, since pad tokens == 0 and rest == 1 
        e_idx_prev = torch.sum(doc_pad_mask,1) - 1

        decoder_state = None
        s_target = None
        e_target = None
        
        #extract idx from given answer span
        if target is not None:
            s_target = target[:,0]
            e_target = target[:,1]

        #get previously computed start index coattention representation
        u_s_idx_prev = U[idxs, s_idx_prev,:]

        #decoder iterations (recommmended: 16)

        for i in range(self.max_steps):
            #get previously computed end index coattention represenation
            u_e_idx_prev = U[idxs, e_idx_prev, :]
            u_s_e = torch.cat((u_s_idx_prev, u_e_idx_prev), 1)

            lstm_out, decoder_state = self.lstm(u_s_e.unsqueeze(1), decoder_state)
            #extract final hidden state h_i
            c_i, h_i = decoder_state

            #compute new start index
            s_idx_prev, curr_change_mask_s, loss_s = self.maxout_start(h_i, U, u_s_e, pad_mask, s_idx_prev, curr_change_mask_s, s_target) 

            #update start index with index computed above
            u_s_idx_prev = U[idxs, s_idx_prev, :]
            u_s_e = torch.cat((u_s_idx_prev, u_e_idx_prev), 1)

            #compute new end index
            e_idx_prev, curr_change_mask_e, loss_e = self.maxout_start(h_i, U, u_s_e, pad_mask, e_idx_prev, curr_change_mask_e, e_target) 

            if target is not None:
                loss = loss_s + loss_e
                losses.append(loss)

            masks_s.append(curr_change_mask_s)
            masks_e.append(curr_change_mask_e)
            results_s.append(s_idx_prev)
            results_e.append(e_idx_prev)

        #retrieve last index predictions where updates halted
        #idx should have shape (b,)
        result_idx_s = torch.sum(torch.stack(masks_s,1),1).long() - 1
        idx_s = torch.gather(torch.stack(results_s,1),1,result_idx_s.unsqueeze(1)).squeeze()
        result_idx_e = torch.sum(torch.stack(masks_e,1),1).long() - 1
        idx_e = torch.gather(torch.stack(results_e,1),1,result_idx_e.unsqueeze(1)).squeeze()

        loss = None

        #compute loss while training and evaluating
        if target is not None:
            sum_losses = torch.sum(torch.stack(losses,1),1)
            avg_loss = sum_losses/self.max_steps
            loss = torch.mean(avg_loss)
        print(f"DEBUG: Before return - type(loss): {type(loss)}, value: {loss}")
        print(f"DEBUG: Before return - type(idx_s): {type(idx_s)}, value: {idx_s}")
        print(f"DEBUG: Before return - type(idx_e): {type(idx_e)}, value: {idx_e}")
        return loss, idx_s, idx_e

In [120]:
class MaxOutHighWay(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, dropout_ratio=0.0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.maxout_pool_size = maxout_pool_size
        self.w_d = nn.Linear(5 * hidden_dim, hidden_dim, bias=False)
        self.w_1 = nn.Linear(3 * hidden_dim, hidden_dim*maxout_pool_size)
        self.w_2 = nn.Linear(hidden_dim, hidden_dim*maxout_pool_size)
        self.w_3 = nn.Linear(2 * hidden_dim, hidden_dim*maxout_pool_size)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, h_i, U, u_s_e, pad_mask, idx_prev, change_mask, target=None):
        b,m,_ = list(U.size())
        #use view if dimensions dont match for cat
        r_in = self.w_d(torch.cat((h_i.view(-1,self.hidden_dim), u_s_e),1))
        r = functional.tanh(r_in)
        print("r.shape after tanh: ",r.shape)
        r = r.unsqueeze(1).expand(b,m,self.hidden_dim).contiguous()

        m_t_1_in = torch.cat((U,r),2).view(-1, self.hidden_dim*3)
        m_t_1, _ = self.w_1(m_t_1_in).view(-1, self.hidden_dim, self.maxout_pool_size).max(2)
        print("m_t_1 shape: ", m_t_1.shape)

        m_t_2, _ = self.w_2(m_t_1).view(-1, self.hidden_dim, self.maxout_pool_size).max(2)

        score, _ = self.w_3(torch.cat((m_t_1,m_t_2),1)).max(1)
        score = functional.softmax((score.view(-1,m) + pad_mask), dim=1)
        _, idx = torch.max(score, dim=1)

        if change_mask is None:
            change_mask = (idx == idx)
        else:
            idx = idx * change_mask.long()
            idx_prev = idx * change_mask.long()
            change_mask = (idx!=idx_prev)

        if target is not None:
            loss = self.loss(score, target)
            loss = loss * change_mask.float()
        
        return idx, change_mask, loss
    
        



                         

# Decoder Test


In [122]:
# --- Test Setup ---
# Define small placeholder values for your dimensions
BATCH_SIZE = 4
DOCUMENT_LENGTH_M = 50 # Example max document length
HIDDEN_DIM = 64     # Example hidden dimension (make it small for speed)
MAX_DEC_STEPS = 3   # Example decoding steps (run at least 2-3 to see dynamic behavior)
MAXOUT_POOL_SIZE = 4 # Example maxout pool size (p)

print("--- Initializing Decoder with Dummy Inputs ---")
decoder = DynamicDecoder(
    input_size=HIDDEN_DIM*4,
    hidden_dim=HIDDEN_DIM,
    maxout_pool_size=MAXOUT_POOL_SIZE,
    max_steps=MAX_DEC_STEPS,
    dropout_ratio=0.0 # Dropout isn't active in current code, but parameter is needed
)

# Create dummy inputs that mimic the expected shapes and types
# U: (batch_size, document_length_m, 2 * hidden_dim) as u_t is 2l
dummy_U = torch.randn(BATCH_SIZE, DOCUMENT_LENGTH_M, 2 * HIDDEN_DIM)
print(f"Dummy U shape: {dummy_U.shape}")

# d_mask: (batch_size, document_length_m) - True for valid, False for padding
dummy_d_mask = torch.ones(BATCH_SIZE, DOCUMENT_LENGTH_M, dtype=torch.bool)
# Simulate padding for one example
if DOCUMENT_LENGTH_M > 10:
    dummy_d_mask[0, 40:] = False # Pad 10 tokens for first sample
    dummy_d_mask[1, 30:] = False # Pad 20 tokens for second sample
print(f"Dummy d_mask shape: {dummy_d_mask.shape}")

# span: (batch_size, 2) - true start and end indices
# Make dummy span within valid range
dummy_span = torch.randint(0, DOCUMENT_LENGTH_M - 1, (BATCH_SIZE, 2), dtype=torch.long)
# Ensure start < end
dummy_span[:, 1] = torch.max(dummy_span[:, 0] + 1, dummy_span[:, 1]) # ensure end > start
print(f"Dummy span shape: {dummy_span.shape}")

print(f"DEBUG: Type of decoder object before call: {type(decoder)}")
print(f"DEBUG: Is decoder None? {decoder is None}")
# --- Run the Forward Pass ---
print("\n--- Running DynamicDecoder Forward Pass ---")
try:
    loss, pred_s, pred_e = decoder(dummy_U, dummy_d_mask, dummy_span)

    print("\n--- Forward Pass Completed Successfully! ---")
    print(f"Final Loss: {loss.item()}")
    print(f"Final Predicted Start Indices: {pred_s}")
    print(f"Final Predicted End Indices: {pred_e}")
    print(f"Final Predicted Start Indices Shape: {pred_s.shape}")
    print(f"Final Predicted End Indices Shape: {pred_e.shape}")

except Exception as e:
    print(f"\n--- An Error Occurred During Forward Pass ---")
    print(e)
    import traceback
    traceback.print_exc()

--- Initializing Decoder with Dummy Inputs ---
Dummy U shape: torch.Size([4, 50, 128])
Dummy d_mask shape: torch.Size([4, 50])
Dummy span shape: torch.Size([4, 2])
DEBUG: Type of decoder object before call: <class '__main__.DynamicDecoder'>
DEBUG: Is decoder None? False

--- Running DynamicDecoder Forward Pass ---
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
r.shape after tanh:  torch.Size([4, 64])
m_t_1 shape:  torch.Size([200, 64])
DEBUG: Before return - type(loss): <class 'torch.Tensor'>, value: 2.6054880619049072
DEBUG: Before return - type(idx_s): <class 'torch.Tensor'>, value: tensor([13, 12,  4, 15])
DEBUG: Before return - type(idx_e): <class 't

Traceback (most recent call last):
  File "/tmp/ipykernel_26951/4138332594.py", line 43, in <module>
    loss, pred_s, pred_e = decoder(dummy_U, dummy_d_mask, dummy_span)
TypeError: cannot unpack non-iterable NoneType object
