In [11]:
from datasets import load_dataset

# Load SQuAD dataset
dataset = load_dataset("squad", split="train")

# Step 1: Add a new column for context length
dataset = dataset.map(lambda x: {"context_length": len(x["context"])})

# Step 2: Sort by the new column
sorted_dataset = dataset.sort("context_length")

# Step 3: Select the shortest 10k rows (or whatever range you need)
subset = sorted_dataset.select(range(11000))  # You can change this to 5000, 20000, etc.

# Preview
print(subset[0])


{'id': '56e10a3be3433e1400422b22', 'title': 'Space_Race', 'context': 'Meanwhile, the USSR continued briefly trying to perfect their N1 rocket, finally canceling it in 1976, after two more launch failures in 1971 and 1972.', 'question': "Which year did the USSR cancel the N1 rocket program after two failures that didn't launch?", 'answers': {'text': ['1976'], 'answer_start': [97]}, 'context_length': 151}


In [12]:
# Preview
print(subset[20])

{'id': '56e3c2db39bdeb14003478f6', 'title': 'Estonian_language', 'context': 'From 1525 to 1917 14,503 titles were published in Estonian, as opposed to the 23,868 titles which were published between 1918 and 1940.[citation needed]', 'question': 'In what language were 14,503 books published prior to 1918?', 'answers': {'text': ['Estonian'], 'answer_start': [50]}, 'context_length': 152}


In [13]:
import pandas as pd

# Convert to pandas DataFrame
df = subset.to_pandas()

# Show the first 20 rows
print(df.head(1))

                         id       title  \
0  56e10a3be3433e1400422b22  Space_Race   

                                             context  \
0  Meanwhile, the USSR continued briefly trying t...   

                                            question  \
0  Which year did the USSR cancel the N1 rocket p...   

                                    answers  context_length  
0  {'text': ['1976'], 'answer_start': [97]}             151  


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import ast

# df['answers'] = df['answers'].apply(ast.literal_eval)


# Extract the first answer text from the nested dictionary
df['answer_text'] = df['answers'].apply(lambda x: f"<SOS> {x['text'][0]} <EOS>")

# Combine all text fields
all_text = df['context'].tolist() + df['question'].tolist() + df['answer_text'].tolist()

# Initialize tokenizer
tokenizer = Tokenizer(
    oov_token="<unk>",
    lower=False,
    filters='!"#$%&()*+,-./:;=@[\\]^_`{|}~\t\n'  # removed < and >


)

tokenizer.fit_on_texts(all_text)

# Convert text to sequences
context_seq = tokenizer.texts_to_sequences(df['context'].tolist())
question_seq = tokenizer.texts_to_sequences(df['question'].tolist())
answer_seq = tokenizer.texts_to_sequences(df['answer_text'].tolist())

# Pad the sequences
max_context_len = max(len(seq) for seq in context_seq)
max_question_len =max(len(seq) for seq in question_seq)
max_answer_len = max(len(seq) for seq in answer_seq)

context_seq = pad_sequences(context_seq, maxlen=max_context_len, padding='post', truncating='post')
question_seq = pad_sequences(question_seq, maxlen=max_question_len, padding='post', truncating='post')
answer_seq = pad_sequences(answer_seq, maxlen=max_answer_len, padding='post', truncating='post')


In [15]:
import numpy as np

# Top 10 tokenized and padded sequences for context, question, and answer
top_10_context = np.array(context_seq)[:10]
top_10_question = np.array(question_seq)[:10]
top_10_answer = np.array(answer_seq)[:50]

print("Top 10 Tokenized and Padded Context Sequences:")
print(top_10_context)

print("\nTop 10 Tokenized and Padded Question Sequences:")
print(top_10_question)

print("\nTop 10 Tokenized and Padded Answer Sequences:")
print(top_10_answer)


Top 10 Tokenized and Padded Context Sequences:
[[4093    2 1774 ...    0    0    0]
 [  22    2  932 ...    0    0    0]
 [  22    2  932 ...    0    0    0]
 ...
 [  44  214   29 ...    0    0    0]
 [  22    2  632 ...    0    0    0]
 [  22    2  632 ...    0    0    0]]

Top 10 Tokenized and Padded Question Sequences:
[[  132    58    35     2  1774 12913     2 10492  4254   755    68    51
  19271    21  8997 20979     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   16   109    35     2 19272    83    14     2   568 23479     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [   16    86     2   109  1192     6 13782     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0]
 [  158    10    11   421   422     3  8948     4 21310   941

In [16]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip

In [17]:
import numpy as np

def load_glove_embeddings(glove_path, word_index, embedding_dim=100):
    embeddings_index = {}

    # Load GloVe file line by line
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

    # Initialize embedding matrix with zeros
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    # Fill embedding matrix
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


In [18]:
glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'  # or your full path
embedding_dim = 100

embedding_matrix = load_glove_embeddings(glove_path, tokenizer.word_index, embedding_dim)


Loaded 400000 word vectors from GloVe.


In [19]:
print("Embedding matrix shape:", embedding_matrix.shape)
vocab_size = embedding_matrix.shape[0]  # <- safest way


Embedding matrix shape: (26965, 100)


In [21]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# === SHARED EMBEDDING ===
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_question_len,
    trainable=False,
    name='shared_embedding'
)

# === ENCODER ===
encoder_inputs = Input(shape=(max_question_len,), name='encoder_input')
encoder_embed = embedding_layer(encoder_inputs)

encoder_lstm = LSTM(256, return_state=True, name='encoder_lstm')
_, state_h, state_c = encoder_lstm(encoder_embed)

# === DECODER ===
decoder_inputs = Input(shape=(max_answer_len,), name='decoder_input')
decoder_embed = embedding_layer(decoder_inputs)  # Share same embedding

decoder_lstm = LSTM(256, return_sequences=True, name='decoder_lstm')
decoder_outputs = decoder_lstm(decoder_embed, initial_state=[state_h, state_c])

decoder_dense = Dense(vocab_size, activation='softmax', name='output_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# === FULL MODEL ===
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


In [22]:
import numpy as np

decoder_target_data = np.zeros_like(answer_seq)
decoder_target_data[:, :-1] = answer_seq[:, 1:]
decoder_target_data[:, -1] = 0  # optional: pad the last token with 0


In [23]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Create a checkpoint callback
checkpoint = ModelCheckpoint(
    filepath="seq2seq_checkpoint.keras",  # required .keras format
    save_best_only=False,              # or True if you're using validation loss to pick the best one
    save_weights_only=False,           # save full model, not just weights
    verbose=1
)

# Now pass the callback to model.fit
history = model.fit(
    [question_seq, answer_seq],
    np.expand_dims(decoder_target_data, -1),
    batch_size=64,
    epochs=15,
    validation_split=0.1,
    callbacks=[checkpoint]  # <-- this is the key line
)


Epoch 1/15


I0000 00:00:1745344921.564305     105 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.8453 - loss: 3.9930
Epoch 1: saving model to seq2seq_checkpoint.keras
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 110ms/step - accuracy: 0.8455 - loss: 3.9797 - val_accuracy: 0.8850 - val_loss: 1.0115
Epoch 2/15
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.8987 - loss: 0.8386 
Epoch 2: saving model to seq2seq_checkpoint.keras
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 109ms/step - accuracy: 0.8987 - loss: 0.8386 - val_accuracy: 0.8916 - val_loss: 0.9884
Epoch 3/15
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.9050 - loss: 0.7888
Epoch 3: saving model to seq2seq_checkpoint.keras
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 110ms/step - accuracy: 0.9050 - loss: 0.7889 - val_accuracy: 0.8940 - val_loss: 0.9805
Epoch 4/15
[1m155/155[0m [32m━━

In [24]:
encoder_model = Model(encoder_inputs, [state_h, state_c])
latent_dim = 256  # hidden state size


decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))


# Set up the embedding, LSTM and Dense layers again for inference
decoder_lstm_infer = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_output, state_h_out, state_c_out = decoder_lstm_infer(decoder_embed, initial_state=[decoder_state_input_h, decoder_state_input_c])


decoder_dense_infer = Dense(vocab_size, activation='softmax')
decoder_output_infer = decoder_dense_infer(decoder_output)


# Decoder model to generate next tokens
decoder_model = Model([decoder_inputs] + [decoder_state_input_h, decoder_state_input_c],
                      [decoder_output_infer, state_h_out, state_c_out])




In [25]:
def generate_answer_from_question(question_text, tokenizer, encoder_model, decoder_model, max_answer_len):
    # Step 1: Tokenize and pad the input question
    new_question_seq = tokenizer.texts_to_sequences([question_text])
    new_question_seq = pad_sequences(new_question_seq, maxlen=max_question_len, padding='post')


    # Step 2: Encode the question
    states_value = encoder_model.predict(new_question_seq)


    # Step 3: Prepare the <start> token input
    start_token = tokenizer.word_index["<SOS>"]
    target_seq = np.zeros((1, 1), dtype='int32')
    target_seq[0, 0] = start_token


    # Step 4: Decode the answer word by word
    answer = []
    for _ in range(max_answer_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)


        sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, "<unk>")
        answer.append(sampled_word)


        # Break on <end> or <EOS>
        if sampled_word in ["<end>", "<EOS>"]:
            break


        # Prepare next token and update decoder state
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]


    # Clean output (remove <start> and <end> tokens if present)
    filtered_answer = [w for w in answer if w not in ["<start>", "<end>", "<SOS>", "<EOS>"]]
    return ' '.join(filtered_answer)


In [26]:
question = "Which year did the USSR cancel the N1 rocket program?"
generated = generate_answer_from_question(
    question,
    tokenizer=tokenizer,
    encoder_model=encoder_model,
    decoder_model=decoder_model,
    max_answer_len=max_answer_len
)


print("Q:", question)
print("Generated A:", generated)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [27]:
!zip seq2seq_model.zip seq2seq_checkpoint.keras


  adding: seq2seq_checkpoint.keras (deflated 15%)


In [28]:
history

<keras.src.callbacks.history.History at 0x791ef435f7d0>

In [29]:
from kaggle_secrets import UserSecretsClient
from IPython.display import FileLink

# Option 1: Direct link (simple)
FileLink('seq2seq_model.zip')


In [30]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

decoder_input_single = Input(shape=(1,), name="decoder_input_token")

# 2. Decoder state inputs
decoder_input_h = Input(shape=(256,), name="decoder_h")
decoder_input_c = Input(shape=(256,), name="decoder_c")

# 3. Shared embedding layer
decoder_embed_single = embedding_layer(decoder_input_single)

# 4. LSTM returns all 3 outputs (wrapped, no unpacking!)
decoder_lstm_out = decoder_lstm(
    decoder_embed_single,
    initial_state=[decoder_input_h, decoder_input_c]
)

# Use indexing to access outputs safely
decoder_output_tokens = decoder_lstm_out[0]
decoder_output_h = decoder_lstm_out[1]
decoder_output_c = decoder_lstm_out[2]

# 5. Dense output layer
decoder_softmax = decoder_dense(decoder_output_tokens)

# 6. Final decoder inference model
decoder_model = Model(
    inputs=[decoder_input_single, decoder_input_h, decoder_input_c],
    outputs=[decoder_softmax, decoder_output_h, decoder_output_c]
)

In [31]:
def decode_sequence(question_text):
    # Encode question
    seq = tokenizer.texts_to_sequences([question_text])
    seq = pad_sequences(seq, maxlen=max_question_len, padding='post')

    states_value = encoder_model.predict(seq)

    # Start with <SOS>
    target_seq = np.zeros((1, 1), dtype='int32')
    target_seq[0, 0] = tokenizer.word_index["<SOS>"]

    decoded_sentence = []
    stop_condition = False

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # ✅ fixed index: shape is (1, vocab_size)
        sampled_token_index = np.argmax(output_tokens[0])
        sampled_word = tokenizer.index_word.get(sampled_token_index, "<unk>")

        if sampled_word == "<EOS>" or len(decoded_sentence) > max_answer_len:
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        # Feed the next token
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [33]:
# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Apply the embedding layer to the decoder inputs
decoder_embed2 = embedding_layer(decoder_inputs)

# Apply the LSTM layer to the embedded decoder input and states
decoder_lstm_output = decoder_lstm(decoder_embed2, initial_state=decoder_states_inputs)

# The output from the LSTM is a tuple: (output_sequence, state_h, state_c)
decoder_outputs2 = decoder_lstm_output[0]
state_h2 = decoder_lstm_output[1]
state_c2 = decoder_lstm_output[2]

# Apply the dense layer
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Define the decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + [state_h2, state_c2]
)


In [39]:
def decode_sequence(input_sequence):
    # Step 1: Process the input sequence (e.g., tokenization, padding)
    target_seq = process_input(input_sequence)  # Shape: (1, 1), e.g., one token at a time
    
    # Step 2: Initialize states
    states_value = [initial_state_h, initial_state_c]  # State from the encoder
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        # Predict the next token and the new states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Extract the most probable token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_token_index[sampled_token_index]
        
        # Append the sampled token to the decoded sentence
        decoded_sentence += sampled_token + ' '
        
        # Update the target sequence (next token) and states
        target_seq = np.zeros((1, 1))  # Reset to zeros (for the next token prediction)
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]  # Update the states for the next time step
        
        # Define a stop condition (e.g., max length or end token)
        if sampled_token == '<end>' or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True
    
    return decoded_sentence


In [40]:
# Select the last 1000 records
last_1000_records = df.tail(1000)


In [41]:
# Extract questions and corresponding answers
questions = last_1000_records['question'].tolist()
ground_truth_answers = last_1000_records['answer_text'].tolist()


In [42]:
generated_answers = []
for question in questions:
    generated_answer = decode_sequence(question)
    generated_answers.append(generated_answer)


NameError: name 'process_input' is not defined

In [38]:
for i in range(10):  # Display first 10 results for verification
    print(f"Q: {questions[i]}")
    print(f"Ground Truth: {ground_truth_answers[i]}")
    print(f"Generated: {generated_answers[i]}")
    print()


Q: How many regional MPs have argued for the importance of Plymouth's train service?
Ground Truth: <SOS> three <EOS>


IndexError: list index out of range

In [None]:
from nltk.translate.bleu_score import sentence_bleu

bleu_scores = []
for i in range(1000):
    reference = ground_truth_answers[i].split()  # Reference answer
    hypothesis = generated_answers[i].split()  # Generated answer
    bleu_score = sentence_bleu([reference], hypothesis)
    bleu_scores.append(bleu_score)

print(f"Average BLEU Score: {np.mean(bleu_scores)}")


In [None]:
import matplotlib.pyplot as plt

plt.plot(bleu_scores)
plt.title("BLEU Scores for Generated Answers")
plt.xlabel("Sample Index")
plt.ylabel("BLEU Score")
plt.show()
