In [1]:
!pip install -q -U transformers accelerate bitsandbytes trl peft

In [2]:
!pip install --upgrade transformers huggingface-hub

Collecting huggingface-hub
  Using cached huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import PeftModel
from datasets import Dataset
import json

model_name = "Qwen/Qwen3-0.6B"

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


In [7]:
data = None

with open("jokes_generate.json", "r") as file:
    data = json.load(file)

flat_data = []
for category in data:
    for item in category["Data"]:
        flat_data.append({
            "seed": item["seed"],
            "joke": item["joke"]
        })

dataset = Dataset.from_list(flat_data)

In [8]:
def preprocess(example):
    start_text = example["seed"]
    answer_text = example["joke"]

    prompt = f"Seed:{start_text}\nJoke:"
    completion = answer_text

    start_enc = tokenizer(prompt, add_special_tokens=False)
    comp_enc = tokenizer(completion, add_special_tokens=True)
    input_ids = start_enc["input_ids"] + comp_enc["input_ids"]
    attention_mask = [1] * len(input_ids)
    labels = [-100] * len(start_enc["input_ids"]) + comp_enc["input_ids"]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

dataset = dataset.map(preprocess)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir = "checkpoint_safe",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    logging_steps=100,
    fp16=False,
    bf16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    num_train_epochs=30,
    remove_unused_columns=False)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    peft_config=peft_config)

trainer.train()

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,2.9768
200,2.602
300,2.3573
400,2.1328
500,1.9276
600,1.7367
700,1.567
800,1.4275
900,1.3075
1000,1.2141




Step,Training Loss
100,2.9768
200,2.602
300,2.3573
400,2.1328
500,1.9276
600,1.7367
700,1.567
800,1.4275
900,1.3075
1000,1.2141




TrainOutput(global_step=1500, training_loss=1.6329814809163412, metrics={'train_runtime': 8407.3277, 'train_samples_per_second': 2.855, 'train_steps_per_second': 0.178, 'total_flos': 3553785381519360.0, 'train_loss': 1.6329814809163412, 'epoch': 30.0})

In [10]:
with open("jokes_example.json", "r") as file:
    data = json.load(file)

test_data = []
for category in data:
    for item in category["Data"]:
        test_data.append({
            "seed": item["seed"],
            "joke": item["joke"]
        })

test_data = Dataset.from_list(test_data)

test_data[0]

{'seed': 'Сидят два рыбака на льду',
 'joke': 'Один другому: "Ты чего без шапки? Уши отморозишь". — "Ага, я вчера был в шапке, мне мужики водку предлагали, а я не услышал".'}

In [11]:
with open("jokes_example.json", "r") as file:
    data = json.load(file)

test_data = []
for category in data:
    for item in category["Data"]:
        test_data.append((item["seed"], item["joke"]))

prompts = [f"Seed:{seed}\nJoke:" for seed, joke in test_data]

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained("checkpoint_safe/checkpoint-1500")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).eval()
model = PeftModel.from_pretrained(model, "checkpoint_safe/checkpoint-1500")

inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)


outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.8
)

result = {}
for (seed, joke), out in zip(test_data, outputs):
    result[seed] = {"joke": joke,
                    "output": tokenizer.decode(out, skip_special_tokens=True).split("\nJoke:")[1]}

result

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


{'Сидят два рыбака на льду': {'joke': 'Один другому: "Ты чего без шапки? Уши отморозишь". — "Ага, я вчера был в шапке, мне мужики водку предлагали, а я не услышал".',
  'output': 'утопно конкурентно конкурентно конкурентно. Сидят пара рыбаков, сплетая: "Ещё в море! Нет комарь?!" — возвращаются в воду, х'},
 'Разговаривают две блондинки': {'joke': '— А ты знаешь, что этот шарф связан из шерсти верблюда? \n— Ого! А я и не знала, что они умеют вязать.',
  'output': '— А ты знаешь, что этот шарф из цвета твоего真皮 заблуждаёт миллион людей?\n— Нет, я не знаю, он подумал, что это цвета моих. Сознан'},
 'Вызывает начальник подчиненного': {'joke': '— Иванов, вы уволены.\n— Уволен? Странно, я думал, рабов продают.',
  'output': '— Иванов, вы завещаны на смерть по призруянию.\n— А почему на смерть не уволив? — Тогда будет кровь на клasse. — Уволю, и о'},
 'Приходит муж домой под утро': {'joke': 'Жена: "Где был?". Муж: "В пробке". Жена: "В четыре утра?". Муж: "Ну ты же знаешь, как трудно пробиться

In [12]:
import re

pattern = re.compile(r"^\s*(\d+)\s+(.*)$")

def read_prompts(path: str):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            m = pattern.match(line)
            if not m:
                continue

            idx = int(m.group(1))
            prompt = m.group(2)
            data.append((idx, prompt))

    return data

test_data = read_prompts("prefixes.txt")
prompts = [f"Seed:{seed}\nJoke:" for idx, seed in test_data]

In [14]:
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.9,
    top_k=50,
    temperature=0.7,
    repetition_penalty=1.15,
    no_repeat_ngram_size=3
)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [15]:
with open("jokes.txt", "w", encoding="utf-8") as file:
  for idx, out in enumerate(outputs):
    idx, seed = test_data[idx]
    joke = tokenizer.decode(out, skip_special_tokens=True).split("\nJoke:")[1] + "\n"
    file.write(f"{idx} {joke}\n")

