In [1]:
import json
import random

In [2]:
with open("fine_tune_data.json", mode="r") as f:
    data = json.loads(f.read().replace("\n", ""))

In [3]:
len(data)

3894

In [13]:
weighted_prefix_length_dist = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3]
fine_tune_data = []
for line in data:
    line = line.replace("â€”", "")
    split_idx = random.randint(0, len(line.split(" "))-1)
    fine_tune_data.append({
        "ctx": " ".join(line.split(" ")[0:split_idx]),
        "prefix": line.split(" ")[split_idx][0:min(len(line.split(" ")[split_idx]), random.choice(weighted_prefix_length_dist))],
        "prediction": line.split(" ")[split_idx],
    })

with open("processed_ft_data.jsonl", mode="w") as f:
    f.write("\n".join([json.dumps(i) for i in fine_tune_data]).replace("\\u2019", "'"))

In [14]:
with open("processed_ft_data.jsonl", mode="r") as f:
    data = f.read().split("\n")
len(data)

3894

In [16]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# -----------------------
# 1. MODEL + TOKENIZER
# -----------------------
model_name = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    model_max_length=8192
)
tokenizer.pad_token = tokenizer.eos_token

# -----------------------
# 2. LOAD DATASET
# -----------------------
dataset = load_dataset("json", data_files="processed_ft_data.jsonl")["train"]

# -----------------------
# 3. FORMAT SAMPLES
# -----------------------
def format_example(example):
    example["text"] = (
        f"<task>\n"
        f"Context: {example['ctx']}\n"
        f"Prefix: {example['prefix']}\n"
        f"Next word: <answer>{example['prediction']}</answer>\n"
        f"</task>"
    )
    return example

dataset = dataset.map(format_example)

# -----------------------
# 4. TRAIN/TEST SPLIT
# -----------------------
dataset = dataset.shuffle(seed=42)
split = dataset.train_test_split(test_size=0.2)

train_dataset = split["train"]
eval_dataset = split["test"]

# -----------------------
# 5. LOAD MODEL (GPU SAFE)
# -----------------------
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# -----------------------
# 6. APPLY LoRA
# -----------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# -----------------------
# 7. TRAINING ARGS
# -----------------------
training_args = TrainingArguments(
    output_dir="./ham-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    fp16=True,
    save_strategy="no",
    logging_steps=50,
    weight_decay=0.01
)


# -----------------------
# 8. CREATE TRAINER
# -----------------------
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

# -----------------------
# 9. RUN TRAINING
# -----------------------
trainer.train()

# -----------------------
# 10. SAVE MODEL + TOKENIZER
# -----------------------
model.save_pretrained("./ham-lora-final")
tokenizer.save_pretrained("./ham-lora-final")

print("Training complete. Model saved.")

Adding EOS to train dataset:   0%|          | 0/3115 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3115 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3115 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/779 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/779 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/779 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
50,3.0713
100,1.6515
150,1.4704
200,1.4425
250,1.3456
300,1.286
350,1.2957


Training complete. Model saved.


In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

base_model = "Qwen/Qwen3-0.6B"
adapter_path = "./ham-lora-final"

tok = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

model = PeftModel.from_pretrained(model, adapter_path)
model = model.to("cuda")


<task>
Context: cq cq cq this is kh6lf .
Prefix: m
Next word: <answer>
man</answer>
</task>


In [40]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 1.291980504989624, 'eval_runtime': 9.4173, 'eval_samples_per_second': 82.72, 'eval_steps_per_second': 10.406, 'eval_entropy': 1.2529199287599446, 'eval_num_tokens': 243628.0, 'eval_mean_token_accuracy': 0.7461925489561898, 'epoch': 2.0}


In [79]:

prompt = """
<task>
Context: 
Prefix: 
Next word: <answer>
"""
inputs = tok(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)
    topk = torch.topk(probs, k=5)

tokens = topk.indices[0].tolist()
scores = topk.values[0].tolist()

for t, p in zip(tokens, scores):
    print(tok.decode([t]), float(p))



j 0.481201171875
jump 0.117919921875
jon 0.09326171875
" 0.059295654296875
jo 0.053131103515625
