In [4]:
# This is the initial dataset, used for the first training stage; after this stage the model should be able to complete text.

In [5]:
import os

from datasets import load_dataset

from torch.utils.data import Dataset, DataLoader

from preprocess.sequencing import create_sequences
from preprocess.tokenizer import BPETokenizer

from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Special tokens handed to the tokenizer when it is constructed.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
# File the fitted tokenizer is saved to / loaded from.
TEXT_COMPLETION_PATH = os.path.join("data", "text_completion.json")

# Dataset identity and the share of each split to load. Keeping them in one place
# means the comments, the code and both splits cannot drift apart.
DATASET_NAME = "abisee/cnn_dailymail"
DATASET_CONFIG = "3.0.0"
SUBSET_PERCENT = 2  # percentage of each split that is loaded

# Load the first SUBSET_PERCENT% of the training split
train_set = load_dataset(DATASET_NAME, DATASET_CONFIG, split=f"train[:{SUBSET_PERCENT}%]")

# Load the first SUBSET_PERCENT% of the validation split
valid_set = load_dataset(DATASET_NAME, DATASET_CONFIG, split=f"validation[:{SUBSET_PERCENT}%]")

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(valid_set)}")

In [4]:
# Text columns the tokenizer is trained on.
train_articles = train_set["article"]
train_highlights = train_set["highlights"]

tokenizer = BPETokenizer(
    special_tokens=SPECIAL_TOKENS,
    vocab_size=30000,
    min_frequency=2,
)

# Reuse a previously saved tokenizer when its file exists; otherwise fit a new
# one on articles + highlights and persist it for the next run.
tokenizer_is_cached = os.path.exists(TEXT_COMPLETION_PATH)
if tokenizer_is_cached:
    tokenizer.load(TEXT_COMPLETION_PATH)
else:
    training_corpus = train_articles + train_highlights
    tokenizer.fit(training_corpus)
    tokenizer.save(TEXT_COMPLETION_PATH)

In [5]:
# Collect the article text of every row, dropping rows whose article is None.
train_articles = [
    text
    for text in (row["article"] for row in tqdm(train_set, desc="Extracting Train Articles"))
    if text is not None
]
valid_articles = [
    text
    for text in (row["article"] for row in tqdm(valid_set, desc="Extracting Valid Articles"))
    if text is not None
]

def encode_article(article):
    """Encode a single article with the notebook-level ``tokenizer``.

    Defined as a plain module-level function because it is the callable that
    ``parallel_encode`` hands to its process pool.

    Args:
        article: The article to encode.

    Returns:
        Whatever ``tokenizer.encode`` returns for ``article``.
    """
    encoding = tokenizer.encode(article)
    return encoding

def parallel_encode(articles, desc, chunksize=1):
    """Encode articles in worker processes and return the results in input order.

    The previous implementation collected results with ``as_completed``, which
    yields futures in completion order. The returned list was therefore shuffled
    differently on every run and no longer lined up with ``articles``.
    ``Executor.map`` returns results in the order the inputs were given.

    Args:
        articles: Iterable of articles; each one is passed to ``encode_article``.
        desc: Label shown on the progress bar.
        chunksize: Number of articles sent to a worker per task. The default of 1
            keeps one task per article; larger values reduce inter-process
            overhead at the cost of a coarser progress bar.

    Returns:
        A list with one encoded result per input article, in input order.

    Raises:
        Any exception raised by ``encode_article`` in a worker is re-raised here.
    """
    # Materialise once so generators are accepted and len() is available for the bar.
    articles = list(articles)
    with ProcessPoolExecutor() as executor:
        ordered_results = executor.map(encode_article, articles, chunksize=chunksize)
        # Consume inside the `with` block so the pool is still alive while results arrive.
        return list(tqdm(ordered_results, total=len(articles), desc=desc))

# Encode both splits with the process pool; each call shows its own progress bar.
train_set_encoded = parallel_encode(articles=train_articles, desc="Encoding Train Set")
valid_set_encoded = parallel_encode(articles=valid_articles, desc="Encoding Valid Set")

Extracting Train Articles: 100%|██████████| 5742/5742 [00:00<00:00, 11547.67it/s]
Extracting Valid Articles: 100%|██████████| 267/267 [00:00<00:00, 10474.19it/s]
Encoding Train Set: 100%|██████████| 5742/5742 [00:08<00:00, 666.79it/s]
Encoding Valid Set: 100%|██████████| 267/267 [00:00<00:00, 536.23it/s]


In [6]:
def extract_token_ids(encoded_data):
    """
    Flatten a collection of encodings into one list of token IDs.

    Reads the ``ids`` attribute of every item in ``encoded_data``, in order, and
    concatenates them into a single list. A progress bar tracks the items as
    they are consumed.
    """
    return [
        token_id
        for encoded_item in tqdm(encoded_data, desc="Extracting Token IDs")
        for token_id in encoded_item.ids
    ]

# Flatten each split's encodings into one continuous list of token IDs
# (a progress bar is shown for each split).
train_token_ids = extract_token_ids(encoded_data=train_set_encoded)
valid_token_ids = extract_token_ids(encoded_data=valid_set_encoded)


Extracting Token IDs: 100%|██████████| 5742/5742 [00:00<00:00, 18991.07it/s]
Extracting Token IDs: 100%|██████████| 267/267 [00:00<00:00, 33581.60it/s]


In [7]:
# Window sizes handed to create_sequences. The "ELN" spelling is kept as-is
# because other cells refer to these names.
CONTEXT_ELN = 50  # maximum context length, in tokens
TARGET_ELN = 1  # maximum target length, in tokens

# Both splits are windowed with identical settings.
# NOTE(review): the exact effect of skip_processed is defined in
# preprocess.sequencing and is not visible here.
_sequence_options = {
    "max_context_length": CONTEXT_ELN,
    "max_target_length": TARGET_ELN,
    "skip_processed": True,
}

train_seq = create_sequences(tokenized_data=train_token_ids, **_sequence_options)

valid_seq = create_sequences(tokenized_data=valid_token_ids, **_sequence_options)

In [8]:
# Report how many samples were generated, then inspect the first six of them.
print(len(train_seq))
for sample_number, (context, target) in enumerate(train_seq, start=1):
    context_head = context[:10]
    print(f"Context: {context_head}... (Total: {len(context)} tokens)")
    print(f"Target: {target} (Total: {len(target)} token)")
    # Only the last ten decoded characters of the context are shown.
    decoded_context = tokenizer.decode(context)
    print(f"Decoded: ...{decoded_context[-10:]}")
    decoded_target = tokenizer.decode(target)
    print(f"Decoded: {decoded_target}")
    if sample_number == 6:
        break


4551174
Context: [20604, 16, 1354, 467, 9296, 8931, 13, 596, 3949, 11831]... (Total: 50 tokens)
Target: [306] (Total: 1 token)
Decoded: ... Radcliffe
Decoded:  as
Context: [16, 1354, 467, 9296, 8931, 13, 596, 3949, 11831, 874]... (Total: 50 tokens)
Target: [3949] (Total: 1 token)
Decoded: ...dcliffe as
Decoded:  Harry
Context: [1354, 467, 9296, 8931, 13, 596, 3949, 11831, 874, 3812]... (Total: 50 tokens)
Target: [11831] (Total: 1 token)
Decoded: ...e as Harry
Decoded:  Potter
Context: [467, 9296, 8931, 13, 596, 3949, 11831, 874, 3812, 24211]... (Total: 50 tokens)
Target: [231] (Total: 1 token)
Decoded: ...rry Potter
Decoded:  in
Context: [9296, 8931, 13, 596, 3949, 11831, 874, 3812, 24211, 13676]... (Total: 50 tokens)
Target: [366] (Total: 1 token)
Decoded: ... Potter in
Decoded:  "
Context: [8931, 13, 596, 3949, 11831, 874, 3812, 24211, 13676, 2623]... (Total: 50 tokens)
Target: [22955] (Total: 1 token)
Decoded: ...otter in "
Decoded: Harry


In [9]:
class TextCompletionDataset(Dataset):
    """Dataset over pre-built ``(context, target)`` token-ID pairs.

    Each item is converted to a pair of ``torch.long`` tensors when it is
    requested; the stored sequences themselves are left untouched.
    """

    def __init__(self, sequences):
        """Keep a reference to an indexable collection of ``(context, target)`` pairs."""
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(context_tensor, target_tensor)``."""
        context_ids, target_ids = self.sequences[idx]
        return (
            torch.tensor(context_ids, dtype=torch.long),
            torch.tensor(target_ids, dtype=torch.long),
        )

    
# Wrap each split's (context, target) pairs in a Dataset.
train_dataset = TextCompletionDataset(sequences=train_seq)
valid_dataset = TextCompletionDataset(sequences=valid_seq)

In [10]:
# Single source of truth for the batch size used by both loaders.
BATCH_SIZE = 64

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# Presumably the number of training epochs -- the training loop is not visible here.
EPOCHS = 10

# Sizes derived from earlier cells: the vocabulary size comes from the tokenizer,
# the maximum length equals the context window used to build the samples.
VOC_SIZE = tokenizer.get_vocab_size()
MAX_LEN = CONTEXT_ELN

# Model hyperparameters. NOTE(review): presumably consumed by a model defined in
# another cell (not visible here) -- confirm how each one is used.
D_MODEL = 512
N_HEAD = 8
N_LAYERS = 6
FFN_HIDDEN = 2048
DROP_PROB = 0.1

In [3]:
import torch
import numpy as np

# Function to check for NaN, infinite, or unstable values in datasets
def check_data_stability(data_loader, data_name="Dataset", large_threshold=1e6, small_threshold=1e-6):
    """Scan every batch of a loader and print counts of suspicious values.

    Both the context and the target of each batch are inspected for NaN values,
    infinite values and "unstable" magnitudes. Exact zeros are NOT treated as
    unstable: zero is an ordinary value, and flagging it made every 0 in integer
    input count as a problem.

    Args:
        data_loader: Iterable yielding ``(context, target)`` pairs of tensors.
        data_name: Label printed in front of the summary.
        large_threshold: Magnitudes strictly above this count as unstable.
        small_threshold: Non-zero magnitudes strictly below this count as unstable.

    Returns:
        None. The summary is printed to stdout.
    """

    def _as_numpy(tensor):
        # detach()/cpu() keep the conversion working for tensors that track
        # gradients or live on an accelerator; they are no-ops otherwise.
        return tensor.detach().cpu().numpy()

    def _count_unstable(values):
        magnitude = np.abs(values)
        too_large = magnitude > large_threshold
        # Tiny but non-zero magnitudes only; an exact 0 is a legitimate value.
        too_small = (magnitude < small_threshold) & (magnitude != 0)
        return int(np.count_nonzero(too_large | too_small))

    nan_count = 0
    inf_count = 0
    unstable_count = 0
    total_elements = 0

    for batch in data_loader:
        context, target = batch

        # Apply the same checks to both halves of the batch so every counter
        # covers the same elements as total_elements.
        for values in (_as_numpy(context), _as_numpy(target)):
            nan_count += int(np.isnan(values).sum())
            inf_count += int(np.isinf(values).sum())
            unstable_count += _count_unstable(values)
            total_elements += values.size

    print(f"{data_name} - Total Elements: {total_elements}")
    print(f"NaN Values: {nan_count}")
    print(f"Infinite Values: {inf_count}")
    print(f"Unstable Values: {unstable_count}")

# Check the training and validation datasets for stability
check_data_stability(data_loader=train_loader, data_name="Training Set")
check_data_stability(data_loader=valid_loader, data_name="Validation Set")


NameError: name 'train_loader' is not defined