In [2]:
%pip install dataset

Defaulting to user installation because normal site-packages is not writeable
Collecting dataset
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting sqlalchemy<2.0.0,>=1.3.2 (from dataset)
  Downloading sqlalchemy-1.4.54.tar.gz (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting alembic>=0.6.2 (from dataset)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting banal>=1.0.1 (from dataset)
  Downloading banal-1.0.6-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=0.6.2->dataset)
  Using cached mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading dataset-1.6.2-py2.py3-none-any.whl (18 kB)
Downloading alembic-1.16.5-py3-none-any.whl (247 kB)
Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Using cached mako-1.3.10-py3-none-any.whl (78 kB)
Building whe

In [1]:
import io
import os
import re
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import (Dense, Embedding, GlobalAveragePooling1D, 
                                     LSTM, Bidirectional, Dropout, Input, Concatenate)
from tensorflow.keras.layers import TextVectorization
from datasets import load_dataset
# Load the "lucadiliello/newsqa" dataset through Hugging Face `datasets`.
# prepare_data() later reads its 'train' and 'validation' splits.
ds = load_dataset("lucadiliello/newsqa")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def prepare_data(dataset, max_samples=50000):
    """Collect raw text strings (contexts and questions) from a NewsQA-style dataset.

    Walks the 'train' split and then the 'validation' split. For every record
    the non-empty 'context' is taken first, followed by the non-empty
    'question'. Collection stops as soon as the cap is reached; later records
    and splits are not read at all.

    Args:
        dataset: Mapping from split name to an iterable of records, where each
            record is indexable by 'context' and 'question'.
        max_samples: Upper bound on the number of strings returned.

    Returns:
        A list of at most ``max_samples`` strings in encounter order.
    """
    collected = []

    for split_name in ('train', 'validation'):
        for record in dataset[split_name]:
            # Context first, then question; falsy values (None, '') are skipped.
            collected.extend(
                record[field] for field in ('context', 'question') if record[field]
            )
            if len(collected) >= max_samples:
                break

        # Checked again after every split so the next split is never touched
        # once the cap has been hit.
        if len(collected) >= max_samples:
            break

    # A record can push the list one past the cap, so trim before returning.
    return collected[:max_samples]

In [3]:
def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Applies three steps in order: lowercase the input, replace every literal
    '<br />' with a single space, and delete every character that appears in
    ``string.punctuation``.

    Args:
        input_data: String tensor (or tensor-like) accepted by ``tf.strings``.

    Returns:
        The standardized string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

In [4]:
# Vectorizer settings shared by the models below:
#   vocab_size      - cap on the vocabulary learned by the vectorizer; also the
#                     size of the embedding tables and softmax outputs.
#   sequence_length - every vectorized text is padded/truncated to this many token ids.
vocab_size = 10000
sequence_length = 100

# Maps raw strings to fixed-length integer token id sequences, using the
# notebook's own standardization function instead of the built-in default.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    output_mode='int',
    standardize=custom_standardization,
)

In [7]:
# Flatten the dataset into a plain list of strings (contexts and questions),
# capped at 50,000 entries. This list feeds both the vectorizer and the models.
texts = prepare_data(ds, max_samples=50000)

In [None]:
print("Adapting vectorizer...")
text_ds = tf.data.Dataset.from_tensor_slices(texts)
# adapt() walks the dataset element by element. Left unbatched, each element is
# a single string, so the vocabulary would be updated one text at a time.
# Batching yields the same vocabulary with far fewer eager steps.
vectorize_layer.adapt(text_ds.batch(1024))

# Learned vocabulary as a list of tokens, indexed by token id.
vocab = vectorize_layer.get_vocabulary()


Adapting vectorizer...


# LSTM training

In [None]:
def createSequencesLstm(texts, vectorize_layer, seq_length=50, max_texts=10000, batch_size=1024):
    """Build next-token training pairs from raw texts.

    Each text is vectorized, and every proper prefix of its non-padding token
    ids becomes one input sample with the following token id as its target.

    Args:
        texts: Sequence of raw strings.
        vectorize_layer: Adapted vectorization layer that maps a batch of
            strings to a 2-D tensor of integer token ids, with 0 as padding.
        seq_length: Length every input prefix is padded/truncated to.
        max_texts: Only the first ``max_texts`` texts are used (``None`` uses
            all of them). Defaults to the previously hard-coded 10000.
        batch_size: Number of texts vectorized per layer call; must be positive.

    Returns:
        Tuple ``(X, y)``: ``X`` is an int array of shape
        ``(num_pairs, seq_length)`` and ``y`` is a 1-D array of target token ids.
    """
    X, y = [], []
    subset = texts[:max_texts]

    # Vectorize in batches rather than one layer call per text; the per-row
    # output is the same but avoids thousands of eager calls.
    for start in range(0, len(subset), batch_size):
        batch = subset[start:start + batch_size]
        token_rows = vectorize_layer(tf.constant(batch)).numpy()

        for vectorized in token_rows:
            for i in range(1, len(vectorized)):
                # Token id 0 is padding: the real text ends here.
                if vectorized[i] == 0:
                    break
                X.append(vectorized[:i])
                y.append(vectorized[i])

    # Left-pad short prefixes. Prefixes longer than seq_length are truncated
    # from the front (pad_sequences' default), keeping the most recent tokens.
    X_padded = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=seq_length, padding='pre')
    return X_padded, np.array(y)

# --- Next-token training data ---
print("Creating LSTM training sequences...")
X_lstm, y_lstm = createSequencesLstm(texts, vectorize_layer)
print(f"Created {len(X_lstm)} sequences")

# --- Model: embedding -> two stacked bidirectional LSTMs -> softmax over the vocabulary ---
embedding_dim_2 = 128

lstm_model = Sequential()
lstm_model.add(Embedding(vocab_size, embedding_dim_2, name='lstm_embedding'))
lstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
lstm_model.add(Dropout(0.3))
lstm_model.add(Bidirectional(LSTM(64)))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(256, activation='relu'))
lstm_model.add(Dense(vocab_size, activation='softmax'))

# Targets are integer token ids, hence the sparse categorical loss.
lstm_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print("Training LSTM model...")
lstm_model.fit(X_lstm, y_lstm, epochs=5, batch_size=256, validation_split=0.1, verbose=1)

# --- Export the trained embedding table ---
# The embedding layer is looked up by name; its first weight array is the
# (vocab_size, embedding_dim_2) embedding matrix.
lstm_embeddings = lstm_model.get_layer('lstm_embedding').get_weights()[0]
# NOTE(review): save_embeddings_to_csv is not defined in the cells visible here.
# Confirm it is defined or imported before this cell runs on a fresh kernel.
save_embeddings_to_csv(lstm_embeddings, vocab, 'lstm_contextual_embeddings.csv')

# Multi-task embeddings

In [None]:
embedding_dim_4 = 128

# The model is trained on slices of X_lstm, which createSequencesLstm pads to
# its own seq_length (50 by default), not to the vectorizer's sequence_length
# (100). The input length is taken from the data so the declared shape always
# matches what fit() receives; a mismatch makes Keras reject the input.
input_layer = Input(shape=(X_lstm.shape[1],), name='input')
embedding = Embedding(vocab_size, embedding_dim_4, name='multitask_embedding')(input_layer)

# Task 1: Next word prediction
lstm_out = LSTM(64, return_sequences=True)(embedding)
next_word = GlobalAveragePooling1D()(lstm_out)
next_word_pred = Dense(vocab_size, activation='softmax', name='next_word')(next_word)

# Task 2: 3-way sequence classification head on the mean-pooled embeddings.
# The labels supplied below are random placeholders, so this head acts only as
# an auxiliary objective and does not learn a real length/complexity signal.
pooled = GlobalAveragePooling1D()(embedding)
classification = Dense(64, activation='relu')(pooled)
class_output = Dense(3, activation='softmax', name='classification')(classification)

multitask_model = Model(inputs=input_layer, outputs=[next_word_pred, class_output])
multitask_model.compile(
    optimizer='adam',
    loss={'next_word': 'sparse_categorical_crossentropy', 'classification': 'sparse_categorical_crossentropy'},
    loss_weights={'next_word': 1.0, 'classification': 0.5},
    metrics={'next_word': 'accuracy', 'classification': 'accuracy'}
)

# Prepare multi-task data: reuse the first 5000 LSTM prefix/target pairs.
X_multi = X_lstm[:5000]
y_next_word = y_lstm[:5000]
y_class = np.random.randint(0, 3, size=(len(X_multi),))  # Dummy classification labels

print("Training multi-task model...")
multitask_model.fit(X_multi, {'next_word': y_next_word, 'classification': y_class}, 
                   epochs=5, batch_size=256, validation_split=0.1, verbose=1)

# Extract embeddings: the first weight array of the named embedding layer.
multitask_embeddings = multitask_model.get_layer('multitask_embedding').get_weights()[0]
# NOTE(review): save_embeddings_to_csv is not defined in the cells visible here.
# Confirm it is defined or imported before this cell runs on a fresh kernel.
save_embeddings_to_csv(multitask_embeddings, vocab, 'multitask_embeddings.csv')