In [29]:
import yaml
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader
from dataset import SingleStage  # Adjust if your dataset class is named differently
from utils import calculate_perplexity, calculate_distinct_n
from tqdm import tqdm
import os

In [33]:
# Read the run configuration from config.yaml in the working directory.
# NOTE(review): FullLoader resolves more YAML tags than SafeLoader. If the
# config only holds plain scalars, lists and mappings, yaml.safe_load would be
# the stricter choice — confirm before switching.
with open('config.yaml') as config_stream:
    config = yaml.load(config_stream, Loader=yaml.FullLoader)

training_args = config['training_args']

# The 'single_stage' checkpoint directory lives under the configured save
# path; model_path points at the weights file expected inside it.
checkpoint_dir = os.path.join(config['model']['save_path'], 'single_stage')
model_path = os.path.join(checkpoint_dir, 'model.safetensors')

In [34]:
decoder_name = config['model']['decoder']
# NOTE(review): the tokenizer is loaded from the base decoder name, not from
# checkpoint_dir. If extra tokens were added to the tokenizer during training
# they will be missing here — confirm against the training script.
tokenizer = GPT2Tokenizer.from_pretrained(decoder_name, padding_side="left")

# No fallback model is created in this cell: a missing weights file is an
# error. Raise straight away instead of first printing a message that claims
# a new model is being initialized.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Checkpoint not found in {checkpoint_dir}. Ensure the checkpoint exists before resuming.")

print("Loading model from checkpoint...")
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir, use_safetensors=True)


Loading model from checkpoint...


In [35]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Register '[PAD]' as the tokenizer's pad token, then size the model's
# embedding matrix to the tokenizer's vocabulary and record the pad id on the
# model config.
# NOTE(review): if the checkpoint's vocabulary is larger than len(tokenizer),
# this resize would shrink the embedding matrix — confirm the sizes match.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(new_num_tokens=len(tokenizer), mean_resizing=False)
model.config.pad_token_id = tokenizer.pad_token_id

In [36]:
# Build the dataset from the configured training path with the shared
# tokenizer, then wrap it in a shuffling loader that yields batches of 64.
train_dataset = SingleStage(config['dataset']['train_path'], tokenizer)
train_data_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)

In [37]:
sample_index = 0  # Pick another index to inspect a different training example
sample_data = train_dataset[sample_index]

# Fail early if the dataset item lacks either tensor used by the generation step
if 'input_ids' not in sample_data or 'attention_mask' not in sample_data:
    raise KeyError("'input_ids' or 'attention_mask' not found in sample_data")

# Move both tensors to the model's device. No unsqueeze is applied: the shape
# printed below shows whether a leading batch dimension is already present.
input_ids = sample_data['input_ids'].to(device)
attention_mask = sample_data['attention_mask'].to(device)

# Sanity-check the tensor shape before generating
print("Input IDs Shape:", input_ids.shape)

Input IDs Shape: torch.Size([1, 128])


In [38]:
# Gradients are not needed for generation, so run it with autograd disabled.
with torch.no_grad():
    # max_new_tokens caps how many tokens are appended after the prompt
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=20,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
    )


In [39]:
# Convert the first returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)


In [40]:
# Print the decoded prompt followed by the decoded model output so the two
# can be compared directly.
print(
    "Input Text:\n",
    tokenizer.decode(input_ids[0], skip_special_tokens=True),
)
print(
    "Generated Text:\n",
    generated_text,
)

Input Text:
 [ WP ] You 've finally managed to discover the secret to immortality . Suddenly , Death appears before you , hands you a business card , and says , `` When you realize living forever sucks , call this number , I 've got a job offer for you . '' <SEP> So many times have I walked on ruins , the remainings of places that I loved and got used to.. At first I was scared , each time I could feel my city , my current generation collapse , break into the black hole that thrives within it , I could feel humanity , the way I 'm able to feel my body.. After a
Generated Text:
 [ WP ] You 've finally managed to discover the secret to immortality . Suddenly , Death appears before you , hands you a business card , and says , `` When you realize living forever sucks , call this number , I 've got a job offer for you . '' <SEP> So many times have I walked on ruins , the remainings of places that I loved and got used to.. At first I was scared , each time I could feel my city , my current g