In [44]:
!pip install -q transformers datasets peft accelerate bitsandbytes

# Check GPU
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [45]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset

In [46]:
import glob

book_files = glob.glob('./data/*.txt')

# combine all books into one text
all_text = ""
for file_path in sorted(book_files):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        all_text += f.read() + "\n\n"
    
print(f"Total characters: {len(all_text):,}")
print(f"Total words: {len(all_text.split()):,}")

Total characters: 6,285,452
Total words: 1,101,476


In [47]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token


def create_chunks(all_text, chunk_size=512, stride=256):
    tokens = tokenizer.encode(all_text, add_special_tokens=False)
    chunks = []

    for i in range(0, len(tokens) - chunk_size, stride):
        chunk = tokens[i:i + chunk_size]
        chunks.append(chunk)
    
    return chunks


chunks = create_chunks(all_text, chunk_size=512, stride=256)
print(f"Number of chunks: {len(chunks)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1669304 > 1024). Running this sequence through the model will result in indexing errors


Number of chunks: 6519


In [48]:
# convert to labels(for causal LM, inputs=labels)
texts = [tokenizer.decode(chunk, skip_special_tokens=False) for chunk in chunks]

In [49]:
# create dataset
dataset = Dataset.from_dict({"text": texts})
dataset

Dataset({
    features: ['text'],
    num_rows: 6519
})

In [50]:
# tokenize properly for training

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

Map: 100%|██████████| 6519/6519 [00:06<00:00, 943.33 examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 6519
})





In [51]:
# for causal LM, labels = input_ids
tokenized_dataset = tokenized_dataset.map(
    lambda x: {"labels": x["input_ids"]},
    batched=True
)
tokenized_dataset

Map: 100%|██████████| 6519/6519 [00:00<00:00, 9596.12 examples/s]


Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6519
})

In [52]:
# train/val split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5867
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 652
    })
})

In [53]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [54]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn"],
    bias="none"
)

In [55]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 589,824 || all params: 125,029,632 || trainable%: 0.4717




In [57]:
small_train = split_dataset["train"].select(range(50))
small_eval = split_dataset["test"].select(range(10))

training_args = TrainingArguments(
    output_dir="./harry-potter-lora",
    learning_rate=5e-4,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    eval_steps=500
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_eval,
)

In [58]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=12, training_loss=3.4894612630208335, metrics={'train_runtime': 41.0501, 'train_samples_per_second': 3.654, 'train_steps_per_second': 0.292, 'total_flos': 39465595699200.0, 'train_loss': 3.4894612630208335, 'epoch': 3.0})

In [59]:
model.save_pretrained('./harry-potter-lora-final')
tokenizer.save_pretrained("./harry-potter-lora-final")

('./harry-potter-lora-final/tokenizer_config.json',
 './harry-potter-lora-final/special_tokens_map.json',
 './harry-potter-lora-final/vocab.json',
 './harry-potter-lora-final/merges.txt',
 './harry-potter-lora-final/added_tokens.json')

In [60]:
from peft import PeftModel

base_model = GPT2LMHeadModel.from_pretrained('gpt2')
model = PeftModel.from_pretrained(base_model, "./harry-potter-lora-final")
tokenizer = GPT2Tokenizer.from_pretrained("./harry-potter-lora-final")

In [62]:
def generate_text(prompt, max_length=200, temperature=0.8, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_text_base(prompt, max_length=150, temperature=0.8, top_p=0.9):
    inputs = tokenizer(prompt, return_tensors="pt")
    
    outputs = base_model.generate(
        **inputs,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


prompts = [
    "The boy with the lightning scar",
    "The magical castle",
    "He opened the dusty book",
]

for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    print("=" * 50)
    print("Base GPT-2:")
    print(generate_text_base(prompt))
    print("\nLoRA Fine-tuned:")
    print(generate_text(prompt))


Prompt: The boy with the lightning scar
Base GPT-2:
The boy with the lightning scar was in a state of shock and confusion.
It wasn't until later that he realized what had happened, so his mother took him to hospital for treatment. The wound has healed nicely since then. He is recovering well but needs all kinds from other injuries as an adult – most notably torn muscles (the ones found on those who have been attacked by monsters). Now we know about how it started: this kid came out after being beaten up at school once because there were more than one person involved or maybe just some bullies yelling 'I'm not you' when they heard my voice telling them I got beat down before running off! It's funny enough where kids start showing signs of aggression like walking around alone without taking care

LoRA Fine-tuned:
The boy with the lightning scar from his foot had no idea what he was looking at, but when a doctor asked about it to him and they saw that there were two little black holes in