In [3]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Download NLTK resources (stopwords)
nltk.download("stopwords")

def clean_text(text):
    # Remove special characters and digits
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    # Remove extra whitespaces
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

def preprocess_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    # Join the tokens back into a text string
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text

def process_dataset(dataset, tokenizer, max_len):
    # Clean and preprocess document texts
    documents = [preprocess_text(clean_text(doc)) for doc in dataset['document']]

    # Clean and preprocess summaries
    summaries = [preprocess_text(clean_text(summary)) for summary in dataset['summary']]

    # Tokenize and pad sequences for documents
    X_seq = tokenizer.texts_to_sequences(documents)
    X_padded = pad_sequences(X_seq, maxlen=max_len, padding='post')

    # Tokenize and pad sequences for summaries
    Y_seq = tokenizer.texts_to_sequences(summaries)
    Y_padded = pad_sequences(Y_seq, maxlen=max_len, padding='post')

    return X_padded, Y_padded

# Load the dataset
multi_news_dataset = load_dataset('xsum')

# Accessing the train, validation, and test splits
train_dataset1 = multi_news_dataset['train']
validation_dataset = multi_news_dataset['validation']

test_dataset = multi_news_dataset['test']
train_dataset, unused_data = train_test_split(train_dataset1, test_size=0.007, random_state=42)

# Tokenize and preprocess the text data
max_words = 10000
max_len = 100  # Adjust as needed

# Tokenizer for word indexing
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_dataset['document'])

# Process training dataset
X_train, Y_train = process_dataset(train_dataset, tokenizer, max_len)

# Process validation dataset
X_validation, Y_validation = process_dataset(validation_dataset, tokenizer, max_len)

# Process test dataset
X_test, Y_test = process_dataset(test_dataset, tokenizer, max_len)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jassu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using custom data configuration default
Reusing dataset xsum (C:\Users\jassu\.cache\huggingface\datasets\xsum\default\1.2.0\32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)
100%|██████████| 3/3 [00:00<00:00,  6.75it/s]


In [5]:
# Add an end-of-sequence token (EOS) to the target sequences
EOS_token = len(tokenizer.word_index) + 1  # Choose an index not used in the vocabulary
Y_train_eos = np.column_stack([Y_train, np.full_like(Y_train[:, :1], EOS_token)])
Y_validation_eos = np.column_stack([Y_validation, np.full_like(Y_validation[:, :1], EOS_token)])
Y_test_eos = np.column_stack([Y_test, np.full_like(Y_test[:, :1], EOS_token)])

# Actual labels for training
actual_labels_train = Y_train_eos[:, 1:]

# Actual labels for validation
actual_labels_validation = Y_validation_eos[:, 1:]

# Actual labels for testing
actual_labels_test = Y_test_eos[:, 1:]

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

In [7]:
# Build the Seq2Seq model
def seq2seq_model_with_eos(input_vocab_size, output_vocab_size, embedding_dim, hidden_units, max_len_input, max_len_output):
    # Encoder
    encoder_input = Input(shape=(max_len_input,))
    encoder_embedding = Embedding(input_vocab_size, embedding_dim, input_length=max_len_input)(encoder_input)
    _, state_h, state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_input = Input(shape=(max_len_output,))
    decoder_embedding = Embedding(output_vocab_size, embedding_dim, input_length=max_len_output)(decoder_input)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(output_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_input, decoder_input], decoder_outputs)
    return model

# Vocabulary sizes
input_vocab_size = len(tokenizer.word_index) + 1
output_vocab_size = len(tokenizer.word_index) + 2  # Add one for EOS token

# Model parameters
embedding_dim = 50
hidden_units = 100

In [8]:
# Build the model
model = seq2seq_model_with_eos(input_vocab_size, output_vocab_size, embedding_dim, hidden_units, max_len_input = 100, max_len_output = 100)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# # Evaluate the model on the test set
# evaluation = model.evaluate([X_test, actual_labels_test], Y_test_eos[:, 1:])
# print("Test Loss:", evaluation[0])
# print("Test Accuracy:", evaluation[1])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 50)      16091350    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 50)      16091400    input_2[0][0]                    
______________________________________________________________________________________________

In [9]:
model.fit([X_train, actual_labels_train], Y_train_eos[:, 1:], epochs=1, batch_size=50)  # Adjust the batch size as needed


   8/4053 [..............................] - ETA: 495:08:08 - loss: 12.6700 - accuracy: 0.6698