In [1]:
!pip install -q transformers
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizerFast

!pip install datasets
from datasets import load_dataset

# Load SQuAD dataset
dataset = load_dataset("squad", split="train")

# Step 1: Add a new column for context length
dataset = dataset.map(lambda x: {"context_length": len(x["context"].split())})
dataset = dataset.map(lambda x: {"question_length": len(x["question"].split())})


# Step 2: Sort by the new column
sorted_dataset = dataset.sort("context_length")

# Step 3: Select the shortest 10k rows (or whatever range you need)
subset = sorted_dataset.select(range(11000))  # You can change this to 5000, 20000, etc.
df = subset.to_pandas()

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

2025-04-22 16:56:40.889136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745341001.074450      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745341001.135360      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torc

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [2]:
def prepare_qa_data_encoder_decoder(df, tokenizer, max_len=384):
    context_ids = []
    context_masks = []
    question_ids = []
    question_masks = []
    start_positions = []
    end_positions = []

    for i, row in df.iterrows():
        question = row["question"]
        context = row["context"]
        answer_text = row["answers"]["text"][0]
        answer_start = row["answers"]["answer_start"][0]
        answer_end = answer_start + len(answer_text)

        # Tokenize context and question separately
        context_encoding = tokenizer(
            context,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors=None
        )
        
        question_encoding = tokenizer(
            question,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors=None
        )

        offsets = context_encoding["offset_mapping"]
        start_pos, end_pos = 0, 0

        for idx, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                start_pos = idx
            if start < answer_end <= end:
                end_pos = idx
                break

        context_ids.append(context_encoding["input_ids"])
        context_masks.append(context_encoding["attention_mask"])
        question_ids.append(question_encoding["input_ids"])
        question_masks.append(question_encoding["attention_mask"])
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    return {
        "context_input_ids": np.array(context_ids),
        "context_attention_mask": np.array(context_masks),
        "question_input_ids": np.array(question_ids),
        "question_attention_mask": np.array(question_masks),
        "start_positions": np.array(start_positions),
        "end_positions": np.array(end_positions),
    }

train_data = prepare_qa_data_encoder_decoder(df, tokenizer)

print("Context input_ids shape:", train_data["context_input_ids"].shape)
print("Question input_ids shape:", train_data["question_input_ids"].shape)

Context input_ids shape: (11000, 384)
Question input_ids shape: (11000, 384)


In [3]:
batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "context_input_ids": train_data["context_input_ids"],
        "context_attention_mask": train_data["context_attention_mask"],
        "question_input_ids": train_data["question_input_ids"],
        "question_attention_mask": train_data["question_attention_mask"]
    },
    {
        "start_positions": train_data["start_positions"],
        "end_positions": train_data["end_positions"]
    }
))

train_dataset = train_dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

I0000 00:00:1745341053.204670      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745341053.205421      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [4]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embedding = tf.keras.layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length

    def call(self, inputs):
        positions = tf.range(start=0, limit=self.sequence_length, delta=1)
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.position_embedding(positions)
        return embedded_tokens + embedded_positions

class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dense_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None, training=False):
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        out1 = self.layernorm1(inputs + self.dropout(attention_output, training=training))
        dense_output = self.dense_proj(out1)
        return self.layernorm2(out1 + self.dropout(dense_output, training=training))

class CrossAttentionDecoder(tf.keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, dropout_rate=0.1):
        super().__init__()
        self.cross_attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dense_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization()
        self.layernorm2 = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, encoder_outputs, mask=None, training=False):
        # Cross attention: question attends to context
        attention_output = self.cross_attention(
            query=inputs,
            key=encoder_outputs,
            value=encoder_outputs,
            attention_mask=mask
        )
        out1 = self.layernorm1(inputs + self.dropout(attention_output, training=training))
        dense_output = self.dense_proj(out1)
        return self.layernorm2(out1 + self.dropout(dense_output, training=training))

def build_encoder_decoder_qa_model(
    vocab_size,
    sequence_length,
    embed_dim=128,
    dense_dim=512,
    num_heads=4,
    num_encoder_layers=2,
    num_decoder_layers=1
):
    # Context encoder inputs
    context_input_ids = tf.keras.Input(shape=(sequence_length,), dtype="int32", name="context_input_ids")
    context_attention_mask = tf.keras.Input(shape=(sequence_length,), dtype="int32", name="context_attention_mask")
    
    # Question encoder inputs
    question_input_ids = tf.keras.Input(shape=(sequence_length,), dtype="int32", name="question_input_ids")
    question_attention_mask = tf.keras.Input(shape=(sequence_length,), dtype="int32", name="question_attention_mask")

    # Context encoder
    context_embeddings = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(context_input_ids)
    context_mask = tf.keras.layers.Lambda(
        lambda x: tf.cast(x[:, tf.newaxis, tf.newaxis, :], dtype=tf.float32)
    )(context_attention_mask)
    
    context_encoded = context_embeddings
    for _ in range(num_encoder_layers):
        context_encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(context_encoded, context_mask)

    # Question encoder
    question_embeddings = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(question_input_ids)
    question_mask = tf.keras.layers.Lambda(
        lambda x: tf.cast(x[:, tf.newaxis, tf.newaxis, :], dtype=tf.float32)
    )(question_attention_mask)
    
    question_encoded = question_embeddings
    for _ in range(num_encoder_layers):
        question_encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(question_encoded, question_mask)

    # Cross attention decoder
    cross_attention_mask = tf.keras.layers.Lambda(
        lambda x: tf.cast(x[0][:, tf.newaxis, tf.newaxis, :], dtype=tf.float32)
    )([context_attention_mask, question_attention_mask])
    
    decoder_output = question_encoded
    for _ in range(num_decoder_layers):
        decoder_output = CrossAttentionDecoder(embed_dim, dense_dim, num_heads)(
            decoder_output, context_encoded, cross_attention_mask
        )

    # Output layers
    start_logits = tf.keras.layers.Dense(1)(decoder_output)
    start_logits = tf.keras.layers.Reshape((sequence_length,), name="start_positions")(start_logits)

    end_logits = tf.keras.layers.Dense(1)(decoder_output)
    end_logits = tf.keras.layers.Reshape((sequence_length,), name="end_positions")(end_logits)

    model = tf.keras.Model(
        inputs={
            "context_input_ids": context_input_ids,
            "context_attention_mask": context_attention_mask,
            "question_input_ids": question_input_ids,
            "question_attention_mask": question_attention_mask
        },
        outputs={"start_positions": start_logits, "end_positions": end_logits}
    )
    return model

In [5]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model = build_encoder_decoder_qa_model(
    vocab_size=len(tokenizer.vocab),
    sequence_length=384,
    embed_dim=256,
    dense_dim=256,
    num_heads=4,
    num_encoder_layers=2,
    num_decoder_layers=1
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss={"start_positions": loss_fn, "end_positions": loss_fn},
    metrics={"start_positions": "accuracy", "end_positions": "accuracy"}
)

model.summary()

history = model.fit(
    train_dataset,
    epochs=15,
    verbose=1
)

Epoch 1/15


I0000 00:00:1745341076.759931     118 service.cc:148] XLA service 0x2ffaa7a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745341076.760762     118 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1745341076.760803     118 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1745341078.501909     118 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1745341089.369046     118 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 282ms/step - end_positions_accuracy: 0.0187 - end_positions_loss: 4.5470 - loss: 8.9662 - start_positions_accuracy: 0.0442 - start_positions_loss: 4.4193
Epoch 2/15
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 283ms/step - end_positions_accuracy: 0.0362 - end_positions_loss: 4.1801 - loss: 8.2715 - start_positions_accuracy: 0.0619 - start_positions_loss: 4.0914
Epoch 3/15
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 283ms/step - end_positions_accuracy: 0.0581 - end_positions_loss: 4.0499 - loss: 8.0328 - start_positions_accuracy: 0.0723 - start_positions_loss: 3.9828
Epoch 4/15
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 282ms/step - end_positions_accuracy: 0.0854 - end_positions_loss: 3.8766 - loss: 7.6688 - start_positions_accuracy: 0.1010 - start_positions_loss: 3.7922
Epoch 5/15
[1m344/344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 284m

In [6]:
def predict_answer_encoder_decoder(question, context, tokenizer, model, max_len=384):
    # Tokenize context and question separately
    context_encoding = tokenizer(
        context,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_offsets_mapping=True
    )
    
    question_encoding = tokenizer(
        question,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=max_len
    )

    input_dict = {
        "context_input_ids": context_encoding["input_ids"],
        "context_attention_mask": context_encoding["attention_mask"],
        "question_input_ids": question_encoding["input_ids"],
        "question_attention_mask": question_encoding["attention_mask"]
    }
    
    # Run the model
    outputs = model.predict(input_dict)
    
    # Get predicted positions
    start_logits = outputs['start_positions'][0]
    end_logits = outputs['end_positions'][0]
    
    start_idx = np.argmax(start_logits)
    end_idx = np.argmax(end_logits)
    
    # Handle edge case
    if end_idx < start_idx:
        end_idx = start_idx
    
    # Convert to character positions
    offset_mapping = context_encoding["offset_mapping"][0].numpy()
    start_char = offset_mapping[start_idx][0]
    end_char = offset_mapping[end_idx][1]
    
    return context[start_char:end_char]

# Test prediction
question = "Which year did the USSR cancel the N1 rocket program?"
context = "Meanwhile, 1976..."

predicted_answer = predict_answer_encoder_decoder(question, context, tokenizer, model)
print("Predicted Answer:", predicted_answer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Answer: ,


TRY AGAIN TEST ON  THE WHOLE TEST SET WITHOUT CHANGING THE MODEL ITSELF


In [9]:
test_dataset = subset.select(range(10000, 11000))  # rows 10000 to 10999 (inclusive)
test_df = test_dataset.to_pandas()


In [11]:

# Prepare the test data similarly to the train data
test_data = prepare_qa_data_encoder_decoder(test_df, tokenizer)

# Create the test dataset
test_dataset_tf = tf.data.Dataset.from_tensor_slices((
    {
        "context_input_ids": test_data["context_input_ids"],
        "context_attention_mask": test_data["context_attention_mask"],
        "question_input_ids": test_data["question_input_ids"],
        "question_attention_mask": test_data["question_attention_mask"]
    },
    {
        "start_positions": test_data["start_positions"],
        "end_positions": test_data["end_positions"]
    }
))

test_dataset_tf = test_dataset_tf.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Now let's test prediction on a few examples from the test set
def evaluate_on_test_set(test_dataset, tokenizer, model, max_len=384, num_samples=10):
    for idx, (batch_data, labels) in enumerate(test_dataset.take(num_samples)):
        context_input_ids = batch_data["context_input_ids"]
        context_attention_mask = batch_data["context_attention_mask"]
        question_input_ids = batch_data["question_input_ids"]
        question_attention_mask = batch_data["question_attention_mask"]
        
        # We will predict for the first sample in the batch
        context = tokenizer.decode(context_input_ids[0], skip_special_tokens=True)
        question = tokenizer.decode(question_input_ids[0], skip_special_tokens=True)
        
        print(f"\nTest Sample {idx + 1}:")
        print(f"Question: {question}")
        print(f"Context: {context}")
        
        # Get the predicted answer using the model
        predicted_answer = predict_answer_encoder_decoder(question, context, tokenizer, model)
        print(f"Predicted Answer: {predicted_answer}")

        # Actual answer (if available)
        actual_start = labels["start_positions"][0].numpy()
        actual_end = labels["end_positions"][0].numpy()
        actual_answer = context[actual_start:actual_end]
        print(f"Actual Answer: {actual_answer}")

evaluate_on_test_set(test_dataset_tf, tokenizer, model, max_len=384, num_samples=10)



Test Sample 1:
Question: the instruments used to point out the different corrupt forms looked to see if they were rigidly domestic or what?
Context: the purpose of these instruments was to address the various forms of corruption ( involving the public sector, the private sector, the financing of political activities, etc. ) whether they had a strictly domestic or also a transnational dimension. to monitor the implementation at national level of the requirements and principles provided in those texts, a monitoring mechanism – the group of states against corruption ( also known as greco ) ( french : groupe d ' etats contre la corruption ) was created.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Predicted Answer: transnational
Actual Answer: d

Test Sample 2:
Question: when were the last of the six great north faces of the alps climbed?
Context: the first british mont blanc ascent was in 1788 ; the first female ascent in 1819. by the mid - 1850s swiss mountain