In [1]:
!pip -q install -U "transformers>=4.41" datasets accelerate peft bitsandbytes trl

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
from pathlib import Path

OUT_DIR = Path("/workspace/standup_scripts")   # 너가 python으로 extract한 폴더
json_files = sorted(OUT_DIR.rglob("script_*.json"))
print("script json count:", len(json_files))
print("sample:", json_files[:3])

script json count: 31
sample: [PosixPath('/workspace/standup_scripts/json/.ipynb_checkpoints/script__fJ3pO5CFxg-checkpoint.json'), PosixPath('/workspace/standup_scripts/json/script_1e9GH83u_Gc.json'), PosixPath('/workspace/standup_scripts/json/script_2KCnrU0S7Vw.json')]


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct" 

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # ✅ 4bit 양자화(메모리 크게 절약)
    bnb_4bit_quant_type="nf4",            # QLoRA에서 자주 쓰는 방식
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# 메모리 절약 옵션(긴 시퀀스에 특히 도움)
model.gradient_checkpointing_enable()
model.config.use_cache = False

print("loaded:", BASE_MODEL)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

loaded: Qwen/Qwen2.5-7B-Instruct


In [4]:
import json
from datasets import Dataset
from tqdm import tqdm

def timeline_to_text(obj):
    parts = []
    for ev in obj.get("timeline", []):
        c = (ev.get("content") or "").strip()
        if c:
            parts.append(c)
    return "\n".join(parts)

SYSTEM_PROMPT = (
    "You are a stand-up comedian writing a live performance script.\n"
    "Keep tags exactly as-is when present (e.g., [laughter], [applause], [SETUP#], [CALLBACK#]).\n"
    "Continue naturally with good pacing and callbacks."
)

PROMPT_LEN = 2048
ANSWER_LEN = 1024
STRIDE = 1024

samples = []
for fp in tqdm(json_files, desc="build samples"):
    obj = json.loads(Path(fp).read_text(encoding="utf-8"))
    full_text = timeline_to_text(obj)
    if len(full_text) < 2000:
        continue

    ids = tokenizer.encode(full_text)
    total = len(ids)
    win = PROMPT_LEN + ANSWER_LEN

    for start in range(0, max(0, total - win), STRIDE):
        prompt = tokenizer.decode(ids[start:start+PROMPT_LEN])
        answer = tokenizer.decode(ids[start+PROMPT_LEN:start+win])

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Continue the following stand-up script in the same style:\n\n" + prompt},
            {"role": "assistant", "content": answer},
        ]

        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
            text = tokenizer.apply_chat_template(messages, tokenize=False)
        else:
            text = (
                f"<|system|>\n{SYSTEM_PROMPT}\n"
                f"<|user|>\nContinue the following stand-up script in the same style:\n\n{prompt}\n"
                f"<|assistant|>\n{answer}\n"
            )

        samples.append({"text": text})

print("total samples:", len(samples))
ds = Dataset.from_list(samples).train_test_split(test_size=0.03, seed=42)
train_ds, eval_ds = ds["train"], ds["test"]
print(train_ds, eval_ds)


build samples: 100%|██████████| 31/31 [00:02<00:00, 13.78it/s]


total samples: 337
Dataset({
    features: ['text'],
    num_rows: 326
}) Dataset({
    features: ['text'],
    num_rows: 11
})


In [7]:
!mkdir -p /workspace/finetuning/qwen2.5-standup-sft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Qwen 계열도 보통 이 프로젝션들이 핵심 학습 지점
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

cfg = SFTConfig(
    output_dir="/workspace/finetuning/qwen2.5-standup-sft",
    dataset_text_field="text",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    num_train_epochs=1,    
    max_length=4096,
    logging_steps=10,
    save_steps=200,
    bf16=True,
    report_to=[],
)

trainer = SFTTrainer(
    model=model,
    args=cfg,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    peft_config=lora,
    processing_class=tokenizer,
)

trainer.train()

# 어댑터 저장(이게 결과물)
trainer.save_model("/workspace/finetuning/qwen25-standup-sft/adapter")
print("saved adapter to qwen25-standup-sft/adapter")


Adding EOS to train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/11 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/11 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/11 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,2.4421
20,2.3054


saved adapter to qwen25-standup-sft/adapter


In [12]:
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
ft = PeftModel.from_pretrained(base, "qwen25-standup-sft/adapter")
ft.eval()

messages = [
    {"role":"system","content": SYSTEM_PROMPT},
    {"role":"user","content": "Write a 15-minute stand-up set about university student. Include 3 callbacks. Use [laughter] tags."}
]

if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
else:
    prompt = f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{messages[-1]['content']}\n<|assistant|>\n"

inputs = tokenizer(prompt, return_tensors="pt").to(ft.device)

with torch.no_grad():
    out = ft.generate(
        **inputs,
        max_new_tokens=2000,
        do_sample=True,
        temperature=0.9,
        top_p=0.95
    )

print(tokenizer.decode(out[0], skip_special_tokens=False))


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

<|im_start|>system
You are a stand-up comedian writing a live performance script.
Keep tags exactly as-is when present (e.g., [laughter], [applause], [SETUP#], [CALLBACK#]).
Continue naturally with good pacing and callbacks.<|im_end|>
<|im_start|>user
Write a 15-minute stand-up set about university student. Include 3 callbacks. Use [laughter] tags.<|im_end|>
<|im_start|>assistant
[laughter]
Hi, I'm your dad's favorite son.
I got to be careful with what I say.
My dad's my favorite person in the world,
but my mom's my favorite person.
I love him to death,
but he's not my favorite.
He's got some real things that he's into
that I'm gonna tell you about today,
so bear with me for a little while.
So, when I was a kid,
I remember being very interested
in my dad's college experience,
and I loved asking questions.
And every time he'd tell me stories about it,
I would just get more and more excited.
Because I wanted to go to the same school.
But my dad never went to university,
so I always had t