## Dependencies

In [1]:
%pip install -qU transformers peft accelerate datasets trl einops sentencepiece bitsandbytes jinja2>=3.1.0 dotenv
# %pip install -U git+https://github.com/huggingface/transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Candidate base models. The trailing number in each comment is the list index
# used to pick MODEL_ID below; the notes are informal observations from earlier runs.
models = [
    "meta-llama/Meta-Llama-3-70B-Instruct", # 0: very slow, pretty much the same quality as the 8B on an A100
    "meta-llama/Meta-Llama-3-8B-Instruct",  # 1: good quality, pretty fast
    "openai/gpt-oss-20b",                   # 2: good quality, decently quick, but have to deal with thinking
    "Qwen/Qwen3-4B-Instruct-2507",          # 3: tends to generate the same post over and over (without tuning)
    "Qwen/Qwen3-30B-A3B-Instruct-2507",     # 4: pretty slow (> 1 min per post on an A100), good quality
    "google/gemma-3-4b-it",                 # 5: multimodal, good quality
    "google/gemma-3-27b-it",                # 6: multimodal, good quality
    "mistralai/Mistral-7B-v0.1",            # 7: WAITING FOR ACCESS
    "microsoft/phi-4",                      # 8: 14B, pretty slow, low-to-decent quality, generates the same post over and over
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # 9: very fast, have to deal with thinking, decent quality
    ]

MODEL_ID = models[5]  # Hub id passed to every from_pretrained() call in this notebook
OUTPUT_DIR = "./lora-style-outputs"  # where the trained LoRA adapter is saved and later reloaded from
PROMPT_TOKENS = 64  # NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed
MICRO_BATCH_SIZE = 1  # DataLoader batch size (examples per forward/backward pass)
GRAD_ACCUM_STEPS = 1  # micro-batches accumulated before each optimizer step
LEARNING_RATE = 2e-4  # learning rate passed to AdamW
NUM_TRAIN_STEPS = 1000  # total optimizer steps; also the length of the LR schedule
MAX_SEQ_LEN = 2048  # truncation limit for training sequences; also reused as max_new_tokens when generating

# Instruction shared by every training example and by the generation cells.
# It is used verbatim (never passed through str.format), so the {title} /
# {self_text} / {subreddit} placeholders reach the model as literal text.
PROMPT = "Please generate one reddit post. Use this format. \n\ntitle: {title}\n self_text: {self_text}\n subreddit: {subreddit}\n"

## Load data 
(Make sure to run `sampleposts.py` first; it produces the JSON dataset files that the next cell loads.)

In [73]:
from typing import List, Dict
import json
import re
import random

def load_datasets_proportional(datasets_dict: Dict[str, float], total_posts: int) -> List[dict]:
    """
    Load several Reddit post datasets and sample from each in a given proportion.

    Each dataset is read from ``../../datasets/<name>.json`` (a JSON list of
    dicts with keys ``title``, ``subreddit``, ``self_text`` and optionally
    ``image_url``). Only text posts are kept: a post needs a non-empty
    ``self_text`` and no ``image_url``.

    Args:
        datasets_dict: Dictionary mapping dataset names to their proportions (0-1).
            A warning is printed when the proportions do not sum to ~1.
        total_posts: Total number of posts desired across all datasets.

    Returns:
        Shuffled list of examples in the format
        ``{"instruction": PROMPT, "output": post}``. Datasets whose file is
        missing are skipped with a warning; datasets with fewer valid posts
        than requested contribute everything they have.
    """
    # Validate proportions sum to approximately 1
    total_proportion = sum(datasets_dict.values())
    if not (0.99 <= total_proportion <= 1.01):
        print(f"Warning: Proportions sum to {total_proportion:.3f}, not 1.0")

    examples: List[dict] = []

    for dataset_name, proportion in datasets_dict.items():
        # Number of posts for this dataset. The small epsilon keeps float
        # artefacts from losing a post (0.29 * 100 == 28.999999999999996, which
        # plain int() would truncate to 28); max() guards against a negative
        # proportion, which random.sample() would reject.
        target_count = max(0, int(total_posts * proportion + 1e-9))
        print(f"Loading {target_count} posts from {dataset_name} dataset ({proportion*100:.1f}%)")

        # Load sampled Reddit posts from JSON created by sample-posts.py
        # Each item is a dict with keys: title, subreddit, self_text
        try:
            with open(f"../../datasets/{dataset_name}.json", "r", encoding="utf-8") as f:
                reddit_posts: List[dict] = json.load(f)
        except FileNotFoundError:
            print(f"Warning: Could not find dataset file for {dataset_name}")
            continue

        # Filter valid posts (must have self_text and no image_url).
        # `or ""` also covers keys that are present but null in the JSON, so a
        # missing title never becomes the literal text "None" and a missing
        # subreddit never reaches re.sub() as None.
        valid_posts = []
        for p in reddit_posts:
            title = p.get("title") or ""
            self_text = p.get("self_text") or ""
            image_url = p.get("image_url") or ""

            if self_text and not image_url:
                subreddit = str(p.get("subreddit") or "")
                # Normalise " /r/name" and "/r/name" spellings to "r/name"
                subreddit = re.sub(r"\s*(/)?r/", "r/", subreddit)
                post = f"title: {title}\nself_text: {self_text}\nsubreddit: {subreddit}"
                valid_posts.append({"instruction": PROMPT, "output": post})

        print(f"Found {len(valid_posts)} valid posts in {dataset_name}")

        # Sample the target number of posts
        if len(valid_posts) >= target_count:
            # Randomly sample target_count posts
            sampled_posts = random.sample(valid_posts, target_count)
        else:
            # Use all available posts if we don't have enough
            print(f"Warning: Only {len(valid_posts)} posts available, using all")
            sampled_posts = valid_posts

        examples.extend(sampled_posts)

    # Shuffle the final dataset to mix posts from different datasets
    random.shuffle(examples)

    return examples

# Example usage - modify these values as needed.
# Keys are dataset names (resolved to ../../datasets/<name>.json by the loader);
# values are the share of total_posts drawn from each dataset.
datasets_dict = {
    "ucla": 0.5,  # 50% of the posts come from the ucla dataset
    "minecraft": 0.5,  # 50% of the posts come from the minecraft dataset
}
total_posts = 100  # Total number of posts desired

examples = load_datasets_proportional(datasets_dict, total_posts)

# Quick sanity check: how many examples were loaded and what one looks like
print(f"Total number of examples loaded: {len(examples)}")
if examples:
    print("Sample example:")
    print(examples[0])


Loading 50 posts from ucla dataset (50.0%)
Found 500 valid posts in ucla
Loading 50 posts from minecraft dataset (50.0%)
Found 500 valid posts in minecraft
Total number of examples loaded: 100
Sample example:
{'instruction': 'Please generate one reddit post. Use this format. \n\ntitle: {title}\n self_text: {self_text}\n subreddit: {subreddit}\n', 'output': 'title: How to spread Mycelium over very large areas?\nself_text: I want to turn my steampunk village into a land impoverished by massive industrial use, and I found Mycelium has got the right look to replace all the grass in my village.  The problem is, by eyeball I would need to replace somewhere between 20k and 30k grass blocks if I include the surrounding mountains. The prospect is tedious.  Is there any way to make Mycelium spread to blocks that are already covered in grass? Or is there any way to shortcut through the process or manually replacing the blocks (except with cheats/creative mode)? Is there an efficient way to do thi

# Load model

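Make sure to set the `HUGGING_FACE_HUB_TOKEN` environment variable before running the next cell (for example in a `.env` file, which the cell loads with `dotenv`).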

In [4]:
from huggingface_hub import login
import dotenv, os

# Pull variables from a local .env file (if one exists) into the environment.
dotenv.load_dotenv()

# Prefer the variable this notebook documents, but fall back to HF_TOKEN, the
# name huggingface_hub itself reads. If neither is set, token stays None and
# login() falls back to its own prompt, exactly as before.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [17]:
# Load tokenizer and model with proper device management
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Pick the weight dtype from the hardware:
#   - bfloat16 on GPUs with compute capability >= 8
#   - float16 on older CUDA GPUs
#   - float32 on CPU, where half precision is a poor fit
has_cuda = torch.cuda.is_available()
bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
if bf16:
    model_dtype = torch.bfloat16
elif has_cuda:
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

# Device configuration - choose single GPU or multi-GPU
USE_MULTI_GPU = True  # Set to True for multi-GPU training
if USE_MULTI_GPU and torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for training")
    device = torch.device("cuda:0")  # Primary device
    device_map = "auto"  # Let transformers handle multi-GPU distribution
else:
    # Single GPU configuration - explicitly set device
    device = torch.device("cuda:0" if has_cuda else "cpu")
    device_map = {"": device}  # Force all parameters to single device
    print(f"Using single device: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# The collate function needs a pad id; reuse EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=model_dtype,
    device_map=device_map,
    low_cpu_mem_usage=True,
)

# Disable the KV cache on the model config for training.
model.config.use_cache = False
print("Loaded:", MODEL_ID)
print(f"Model device configuration: {device_map}")
print(f"Available GPUs: {torch.cuda.device_count()}")


Using single device: cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded: google/gemma-3-4b-it
Model device configuration: {'': device(type='cuda', index=0)}
Available GPUs: 1


## Test model

### Text only

In [6]:
from transformers import TextStreamer

# prompt = "Please generate one reddit post (and nothing else). Make sure to stick to the format below exactly. Don't include any extraneous characters like asterisks or other symbols. \n\n title: {title} \n self_text: {self_text} \n subreddit: {subreddit} \n Here's an example of the format: \n\ntitle: This is the title of the post! \nself_text: Here's where the content of the post goes. \nsubreddit: This is the subreddit, or the name of the community the post belongs to."

# Both turns use the structured content layout: a list of typed parts.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": "You generate reddit posts in the given format."}],
}
user_turn = {
    "role": "user",
    "content": [{"type": "text", "text": PROMPT}],
}
messages = [system_turn, user_turn]

# Render the conversation with the model's chat template, ending on the
# generation prompt, and move the tensors to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Print tokens as they are produced; the prompt is skipped, special tokens are kept.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

sampling_kwargs = dict(
    max_new_tokens=MAX_SEQ_LEN,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
)
outputs = model.generate(**inputs, streamer=streamer, **sampling_kwargs)
# print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Okay, please provide me with the details for the Reddit post you want me to generate! I need the following information to fill in the template:

*   **title:** (The title of the post)
*   **self_text:** (The main body of the post - the actual text that will be posted)
*   **subreddit:** (The subreddit where you want to post it)

Once you give me those three things, I’ll format it exactly as you requested. 😊<end_of_turn>


## Training

In [None]:
# Configure PEFT LoRA
from peft import LoraConfig, get_peft_model, TaskType

# LoRA hyper-parameters: rank-16 adapters (alpha 32, dropout 0.05) on the
# attention projections and the MLP projections; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
)

# Wrap the loaded base model with the adapters and report the trainable share.
# Note: this rebinds `model`, so the cell is meant to run once per model load.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 163,840 || all params: 4,300,243,312 || trainable%: 0.0038


In [75]:
# Preprocess instruction/output dataset
from datasets import Dataset

# Turn the in-memory [{"instruction", "output"}] examples into a HF dataset.
dataset = Dataset.from_list(examples)


def tokenize_io(sample):
    """Tokenize one instruction/output pair for supervised fine-tuning.

    The instruction is rendered as a single user turn through the chat
    template (ending on the generation prompt); the output, followed by EOS
    when the tokenizer defines one, is appended after it. Prompt positions are
    labelled -100 so that only the output tokens contribute to the loss.

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``, each
    holding at most the last MAX_SEQ_LEN positions of the full sequence.
    """
    chat = [{"role": "user", "content": sample["instruction"]}]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=False,
    )

    # The rendered template is tokenized without adding special tokens again.
    prompt_ids = tokenizer(prompt_text, add_special_tokens=False)["input_ids"]
    answer_ids = tokenizer(sample["output"], add_special_tokens=False)["input_ids"]

    eos_id = tokenizer.eos_token_id
    eos_tail = [] if eos_id is None else [eos_id]
    completion_ids = answer_ids + eos_tail

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    # Over-long sequences keep their tail; slicing ids and labels with the same
    # bounds keeps them aligned.
    if len(input_ids) > MAX_SEQ_LEN:
        input_ids = input_ids[-MAX_SEQ_LEN:]
        labels = labels[-MAX_SEQ_LEN:]

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }


# Replace the raw text columns with the tokenized fields.
train_ds = dataset.map(tokenize_io, remove_columns=dataset.column_names)
train_ds


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [None]:
# Trainer setup and brief training
import math
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW


def collate_fn(features):
    """Right-pad a list of tokenized samples into rectangular batch tensors.

    ``input_ids`` is padded with the tokenizer's pad id, ``attention_mask``
    with 0 and ``labels`` with -100, each up to the longest sample in the batch.
    Returns a dict of three ``torch.long`` tensors shaped (batch, max_len).
    """
    n_rows = len(features)
    width = max(len(feat["input_ids"]) for feat in features)

    # Start from fully padded tensors, then copy each sample into its row.
    input_ids = torch.full((n_rows, width), tokenizer.pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((n_rows, width), dtype=torch.long)
    labels = torch.full((n_rows, width), -100, dtype=torch.long)

    for row, feat in enumerate(features):
        n_tokens = len(feat["input_ids"])
        input_ids[row, :n_tokens] = torch.tensor(feat["input_ids"], dtype=torch.long)
        attention_mask[row, :n_tokens] = torch.tensor(feat["attention_mask"], dtype=torch.long)
        labels[row, :n_tokens] = torch.tensor(feat["labels"], dtype=torch.long)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# Reshuffled every epoch; collate_fn pads each micro-batch to its longest sample.
train_loader = DataLoader(
    train_ds,
    batch_size=MICRO_BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
if len(train_loader) == 0:
    # Fail fast instead of silently saving an untrained adapter.
    raise ValueError("train_ds is empty; load some examples before training.")

# Hand the optimizer only the parameters that actually receive gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LEARNING_RATE)
# Total optimizer steps we intend to take
total_optim_steps = NUM_TRAIN_STEPS
num_warmup_steps = max(1, int(0.1 * total_optim_steps))
# Linear warmup over the first 10% of steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_optim_steps,
)

model.train()

# Ensure model is on the correct device(s)
if USE_MULTI_GPU and torch.cuda.device_count() > 1 and device_map == "auto":
    # For multi-GPU with device_map="auto", model is already distributed
    # Get the device of the first parameter for data placement
    model_device = next(model.parameters()).device
else:
    # For single GPU, ensure model is on the specified device
    model = model.to(device)
    model_device = device

print(f"Training on device: {model_device}")

# Number of passes over the data needed to reach total_optim_steps. A fixed
# epoch count would stop early (with an unfinished LR schedule) for small
# datasets or larger GRAD_ACCUM_STEPS values.
micro_batches_needed = total_optim_steps * GRAD_ACCUM_STEPS
num_epochs = max(1, math.ceil(micro_batches_needed / len(train_loader)))

optimizer.zero_grad()
optim_step = 0
accumulated = 0
running_loss = 0.0
for epoch in range(num_epochs):  # repeat over dataset until reaching desired steps
    for batch in train_loader:
        batch = {k: v.to(model_device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Scale so the accumulated gradient is the mean over the micro-batches.
        (loss / GRAD_ACCUM_STEPS).backward()
        running_loss += loss.item()
        accumulated += 1
        if accumulated % GRAD_ACCUM_STEPS == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if optim_step % 10 == 0:
                # Mean loss over the micro-batches of this optimizer step
                print(f"step {optim_step} loss {running_loss / GRAD_ACCUM_STEPS:.4f}")
            running_loss = 0.0
            optim_step += 1
            if optim_step >= total_optim_steps:
                break
    if optim_step >= total_optim_steps:
        break

# Persist the trained adapter so the next cell can reload it from OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
print("Saved LoRA adapter to:", OUTPUT_DIR)


Training on device: cuda:0
step 0 loss 10.9245
step 10 loss 10.3352
step 20 loss 10.5659
step 30 loss 6.9277
step 40 loss 5.0669
step 50 loss 5.4028
step 60 loss 3.4549
step 70 loss 2.2578
step 80 loss 3.1206
step 90 loss 2.3812
step 100 loss 3.5285
step 110 loss 1.9654
step 120 loss 2.5969
step 130 loss 2.5752
step 140 loss 3.5270
step 150 loss 2.2049
step 160 loss 3.4621
step 170 loss 2.8819
step 180 loss 2.3326
step 190 loss 2.9377
step 200 loss 2.5657
step 210 loss 1.9290
step 220 loss 2.5735
step 230 loss 2.4257
step 240 loss 2.2963
step 250 loss 3.1765
step 260 loss 1.7853
step 270 loss 2.2927
step 280 loss 2.5812
step 290 loss 2.2205
step 300 loss 2.4991
step 310 loss 2.1616
step 320 loss 2.1505
step 330 loss 2.7876
step 340 loss 2.4344
step 350 loss 3.2336
step 360 loss 1.9318
step 370 loss 2.7044
step 380 loss 3.1333
step 390 loss 2.8938
step 400 loss 2.7853
step 410 loss 2.4263
step 420 loss 2.4813
step 430 loss 2.0607
step 440 loss 2.6551
step 450 loss 2.3264
step 460 loss 2

In [None]:
from peft import PeftModel
from transformers import TextStreamer, AutoModelForCausalLM, AutoTokenizer
import torch 

# Pick the weight dtype from the hardware: bfloat16 on GPUs with compute
# capability >= 8, float16 on older CUDA GPUs, float32 on CPU (where half
# precision is a poor fit).
has_cuda = torch.cuda.is_available()
bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
if bf16:
    model_dtype = torch.bfloat16
elif has_cuda:
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Use the same device configuration as training
# (USE_MULTI_GPU is defined by the model-loading cell earlier in the notebook).
if USE_MULTI_GPU and torch.cuda.device_count() > 1:
    device = torch.device("cuda:0")  # Primary device
    device_map = "auto"  # Let transformers handle multi-GPU distribution
else:
    device = torch.device("cuda:0" if has_cuda else "cpu")
    device_map = {"": device}  # Force all parameters to single device

# Reload base + adapter: fresh base weights first, then the LoRA adapter that
# training saved to OUTPUT_DIR.
base = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=model_dtype,
    device_map=device_map,
    low_cpu_mem_usage=True,
)
base = PeftModel.from_pretrained(base, OUTPUT_DIR)
base.eval()

# For inference, get the correct device
if USE_MULTI_GPU and torch.cuda.device_count() > 1 and device_map == "auto":
    inference_device = next(base.parameters()).device
else:
    inference_device = device

print(f"Inference device: {inference_device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Inference device: cuda:0


In [86]:
# Single user turn carrying the shared PROMPT, rendered through the model's
# chat template and moved to the adapter model's device.
messages = [{"role": "user", "content": PROMPT}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(base.device)

# Print the completion as it is generated, without the prompt or special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Sampling settings for the fine-tuned model.
generation_kwargs = dict(
    max_new_tokens=MAX_SEQ_LEN,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)

with torch.no_grad():
    _ = base.generate(**inputs, streamer=streamer, **generation_kwargs)


title: How many classes can I take in a semester?
self_text: I'm considering taking 15 credit hours this fall, but I'm not sure if that's even possible. Some people seem to be able to take more than that! Can anyone tell me the maximum number of classes I can take in a single semester?
subreddit: ucla


In [12]:
## Multi-GPU Training Setup and Utils

# If you want to enable multi-GPU training, run this cell first:

def setup_multi_gpu_training():
    """
    Report the visible CUDA devices and whether multi-GPU training is possible.

    Prints the number of visible GPUs and the name of each one. This function
    only inspects the hardware; it does not change any training configuration.

    Returns:
        bool: True when at least two CUDA devices are visible, otherwise False
        (after printing a warning).
    """
    import torch

    gpu_count = torch.cuda.device_count()
    print(f"Available GPUs: {gpu_count}")
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    if gpu_count < 2:
        print("Warning: Less than 2 GPUs available. Multi-GPU training not possible.")
        return False

    return True

def enable_multi_gpu_mode():
    """
    Switch the notebook to multi-GPU mode for the current kernel session.

    Sets the module-level ``USE_MULTI_GPU`` flag to True and prints the
    available multi-GPU strategies. The flag lives in kernel memory, so it is
    lost on a kernel restart: re-run the model-loading cells in the same
    session instead. Note that any cell which assigns ``USE_MULTI_GPU`` itself
    overwrites this flag when it is re-run.
    """
    global USE_MULTI_GPU
    USE_MULTI_GPU = True
    # The previous message asked for a kernel restart, which would discard the
    # flag that was just set.
    print("Multi-GPU mode enabled for this kernel session. Re-run the model loading cells to apply it.")
    print("Alternative approaches for multi-GPU training:")
    print("1. Use device_map='auto' (current approach)")
    print("2. Use torch.nn.DataParallel (simpler but less efficient)")
    print("3. Use torch.nn.DistributedDataParallel (most efficient)")

# Check GPU setup: prints the visible GPUs and returns whether at least two
# are available.
setup_multi_gpu_training()

# Uncomment the next line to enable multi-GPU training:
# enable_multi_gpu_mode()


Available GPUs: 1
GPU 0: NVIDIA H200


False

# 