## Finetuing GPT2 on children stories
### Tutorial url: https://medium.com/@prashanth.ramanathan/fine-tuning-a-pre-trained-gpt-2-model-and-performing-inference-a-hands-on-guide-57c097a3b810
### Dataset Link: https://www.kaggle.com/datasets/edenbd/children-stories-text-corpus
### Generate and debug code with ChatGPT and Grok 3
### The fine-tuned model is uploaded here: https://drive.google.com/drive/folders/1iJQxaTG5OZbLDKDrSFhOoLJgrOij-hNb?usp=sharing

In [2]:
# train with custom text
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import os

# Verify CUDA setup
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("CUDA not available. Exiting.")
    exit(1)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Set device
device = torch.device("cuda")
print(f"Using device: {device}")

# Clear CUDA cache
torch.cuda.empty_cache()
print(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Load custom text dataset
custom_file = 'cleaned_merged_fairy_tales_without_eos.txt'  # Replace with your .txt file path
if not os.path.exists(custom_file):
    print(f"File {custom_file} not found. Creating a sample file.")
    with open(custom_file, 'w') as f:
        f.write("Artificial intelligence is transforming the world.\n")
        f.write("In the future, machines will collaborate with humans.\n")
        f.write("The possibilities of technology are endless.\n")

# Read text file
try:
    with open(custom_file, 'r', encoding='utf-8') as f:
        text_data = f.readlines()
    text_data = [line.strip() for line in text_data if line.strip()]  # Remove empty lines
    print(f"Loaded {len(text_data)} lines from {custom_file}")
except Exception as e:
    print(f"Error reading {custom_file}: {e}")
    exit(1)

if not text_data:
    print("No valid data found. Exiting.")
    exit(1)

# Create dataset and split
dataset = Dataset.from_dict({"text": text_data})
dataset_dict = dataset.train_test_split(test_size=0.1, seed=42)  # 90% train, 10% test

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Verify model device
print(f"Model device: {next(model.parameters()).device}")
print(f"GPU memory after model load: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=16)  # Balanced for 4GB
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

# Pre-tokenize
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, remove_columns=['text'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./children_stories_results_4_epochs',
    evaluation_strategy='no',  # Disable eval to save memory
    num_train_epochs=4,
    per_device_train_batch_size=4, 
    warmup_steps=10,
    weight_decay=0.0,
    logging_dir='./children_stories_logs_4_epochs',
    logging_steps=10,
    fp16=True,  # Trainer-managed mixed precision
    gradient_accumulation_steps=4,  # Effective batch size 16
    dataloader_num_workers=0,  # Avoid CPU bottleneck
    dataloader_pin_memory=True,  # Default, but explicit
    report_to="none",
    save_strategy="no"
)

# Initialize Trainer (no custom collator needed)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Train with error handling
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed: {e}")
finally:
    print(f"GPU memory at end: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Save model and tokenizer
model_output_dir = './children_stories_results_4_epochs/model'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

# Clear memory
torch.cuda.empty_cache()

PyTorch version: 1.12.1+cu113
CUDA available: True
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Total GPU memory: 4.00 GB
Using device: cuda
GPU memory before loading: 487.47 MB
Loaded 90329 lines from cleaned_merged_fairy_tales_without_eos.txt
Model device: cuda:0
GPU memory after model load: 487.47 MB


Map: 100%|██████████████████████████████████████████████████████████████████████████| 81296/81296 [00:03<00:00, 23104.15 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 9033/9033 [00:00<00:00, 19069.23 examples/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,5.2867
20,4.8415
30,4.1307
40,3.9766
50,3.9222
60,3.8063
70,3.8277
80,3.7323
90,3.8166
100,3.871


Training completed successfully!
GPU memory at end: 1914.61 MB


In [1]:
# inference
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Verify CUDA setup
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("CUDA not available. Exiting.")
    exit(1)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Set device
device = torch.device("cuda")
print(f"Using device: {device}")

# Clear CUDA cache
torch.cuda.empty_cache()
print(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Load tokenizer and model from trained checkpoint
model_path = './children_stories_results_4_epochs/model'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).to(device)

# If pad_token is not set or same as eos_token, define a new one
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = '[PAD]'

# Verify model device
print(f"Model device: {next(model.parameters()).device}")
print(f"GPU memory after model load: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Set model to evaluation mode
model.eval()

# Define inference function
def generate_text(prompt, max_new_tokens=1000, temperature=0.9, top_k=50):
    # Tokenize input with padding/truncation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Check input tensor shapes (for debugging)
    print(f"Input tensor shapes: {inputs['input_ids'].shape}, {inputs['attention_mask'].shape}")

    # Generate text
    with torch.no_grad():  # Disable gradient computation
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],  # Pass attention mask explicitly
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode and return
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
prompt = "Once upon a time, "
try:
    generated = generate_text(prompt, max_new_tokens=300)
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {generated}")
except Exception as e:
    print(f"Inference failed: {e}")
finally:
    print(f"GPU memory at end: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Clear memory
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 1.12.1+cu113
CUDA available: True
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Total GPU memory: 4.00 GB
Using device: cuda
GPU memory before loading: 0.00 MB


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model device: cuda:0
GPU memory after model load: 255.49 MB
Input tensor shapes: torch.Size([1, 6]), torch.Size([1, 6])

Prompt: Once upon a time, 
Generated: Once upon a time,    When the moonlight shone like stars,   And the blue waters were like the blue waters of the sea,   When the shore was high in the midst of the wide blue land,    And before the waters were as clear as pitch-black,  The birds sang and flitted across the meadows like stars,   And sang      The great yellow sky rose to full height above.
GPU memory at end: 255.49 MB


## Below are additional codes just for extra information

In [1]:
# Train with CPU - modified by Chatgpt
# wikitext (CPU version)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Verify CPU setup
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()} (not used in this run)")

# Set device to CPU
device = torch.device("cpu")
print(f"Using device: {device}")

# Load tiny dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:1%]')  # 1% for testing

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')  # stays on CPU

# Enable gradient checkpointing (optional on CPU, may not help much)
model.gradient_checkpointing_enable()

# Verify model device
print(f"Model device: {next(model.parameters()).device}")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize with minimal sequence length
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=8)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='no',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    warmup_steps=10,
    weight_decay=0.0,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,  # No mixed precision on CPU
    gradient_accumulation_steps=4,
    dataloader_num_workers=0,
    report_to="none",
    no_cuda=True,  # Important: disables CUDA usage in Trainer
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Train with error handling
try:
    trainer.train()
    print("Training completed successfully on CPU!")
except Exception as e:
    print(f"Training failed: {e}")

# Save model and tokenizer
model_output_dir = './results/model'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)


  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 1.12.1+cu113
CUDA available: True (not used in this run)
Using device: cpu


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Model device: cpu


Step,Training Loss
10,6.7078


KeyboardInterrupt: 

In [3]:
# train with wikitext
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Verify CUDA setup
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("CUDA not available. Exiting.")
    exit(1)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Set device
device = torch.device("cuda")
print(f"Using device: {device}")

# Clear CUDA cache
torch.cuda.empty_cache()
print(f"GPU memory before loading: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Load tiny dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:1%]')  # 1% for testing

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)  # No torch_dtype

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Verify model device
print(f"Model device: {next(model.parameters()).device}")
print(f"GPU memory after model load: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize with minimal sequence length
def tokenize_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=8)  # Ultra-low
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='no',  # Disable eval to save memory
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Minimal
    warmup_steps=10,  # Reduced
    weight_decay=0.0,  # Disabled
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,  # Mixed precision, let Trainer handle scaling
    gradient_accumulation_steps=4,  # Effective batch size 4
    dataloader_num_workers=0,  # Avoid overhead
    report_to="none",  # Disable external logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Train with error handling
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed: {e}")
finally:
    print(f"GPU memory at end: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

# Save model and tokenizer
model_output_dir = './results/model'
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

# Clear memory
torch.cuda.empty_cache()

PyTorch version: 1.12.1+cu113
CUDA available: True
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Total GPU memory: 4.00 GB
Using device: cuda
GPU memory before loading: 1915.11 MB
Model device: cuda:0
GPU memory after model load: 2402.58 MB




Step,Training Loss
10,6.8954
20,4.8639
30,3.4012
40,3.2192
50,4.4238
60,2.7625
70,3.4335
80,3.8883
90,4.349


Training completed successfully!
GPU memory at end: 1915.11 MB
