<a href="https://colab.research.google.com/github/BinaryZee/word-ai-colab/blob/main/word-ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import json
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader

In [26]:
from datasets import load_dataset
import json

In [27]:
jsonl_file = "quill_actions_big.jsonl"

data = []
with open(jsonl_file, "r", encoding="utf8") as f:
    for line in f:
        data.append(json.loads(line))


In [28]:
train_data = []
for item in data:
    instruction = item["instruction"]
    response = json.dumps(item["output"], ensure_ascii=False)
    train_data.append({"instruction": instruction, "response": response})

In [29]:
with open("train.jsonl", "w", encoding="utf8") as f:
    for ex in train_data:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")

In [30]:
dataset = load_dataset("json", data_files="train.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [31]:
def format_example(example):
    text = f"Instruction: {example['instruction']}\nResponse: {example['response']}"
    example["text"] = text
    return example

dataset = dataset.map(format_example)

# Load tokenizer
model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize dataset
tokenized = dataset.map(
    lambda ex: tokenizer(ex["text"], truncation=True, padding="max_length", max_length=512),
    batched=True
)

Map:   0%|          | 0/1795 [00:00<?, ? examples/s]

Map:   0%|          | 0/1795 [00:00<?, ? examples/s]

In [32]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

def format_example(example):
    # simple instruction→response prompt
    text = f"Instruction: {example['instruction']}\nResponse: {example['response']}"
    example["text"] = text
    return example

dataset = dataset.map(format_example)
tokenized = dataset.map(
    lambda ex: tokenizer(ex["text"], truncation=True, padding="max_length", max_length=512),
    batched=True
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/1795 [00:00<?, ? examples/s]

Map:   0%|          | 0/1795 [00:00<?, ? examples/s]

In [12]:
for name, module in model.named_modules():
    if "attn" in name or "proj" in name or "linear" in name:
        print(name)

model.layers.0.self_attn
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.qkv_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp.gate_up_proj
model.layers.0.mlp.down_proj
model.layers.0.resid_attn_dropout
model.layers.1.self_attn
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.qkv_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp.gate_up_proj
model.layers.1.mlp.down_proj
model.layers.1.resid_attn_dropout
model.layers.2.self_attn
model.layers.2.self_attn.o_proj
model.layers.2.self_attn.qkv_proj
model.layers.2.self_attn.rotary_emb
model.layers.2.mlp.gate_up_proj
model.layers.2.mlp.down_proj
model.layers.2.resid_attn_dropout
model.layers.3.self_attn
model.layers.3.self_attn.o_proj
model.layers.3.self_attn.qkv_proj
model.layers.3.self_attn.rotary_emb
model.layers.3.mlp.gate_up_proj
model.layers.3.mlp.down_proj
model.layers.3.resid_attn_dropout
model.layers.4.self_attn
model.layers.4.self_attn.o_proj
model.layers.4.self_attn.qkv_proj
model.layers.4.self_a

In [23]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import torch

In [33]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.config.use_cache = False  # fix DynamicCache
model.print_trainable_parameters()

trainable params: 12,582,912 || all params: 3,833,662,464 || trainable%: 0.3282


In [20]:
def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
    labels = input_ids.clone()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [35]:
from torch.utils.data import DataLoader
import torch

# Custom collate function
def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
    labels = input_ids.clone()
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

dataloader = DataLoader(tokenized, batch_size=1, shuffle=True, collate_fn=collate_fn)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
grad_accum_steps = 8
max_steps = 500
optimizer.zero_grad()

for step, batch in enumerate(dataloader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss / grad_accum_steps
    loss.backward()

    if (step + 1) % grad_accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

    if step % 25 == 0:
        print(f"Step {step}, Loss: {loss.item()*grad_accum_steps}")

    if step >= max_steps:
        break




Step 0, Loss: 10.241780281066895
Step 25, Loss: 3.8031184673309326
Step 50, Loss: 0.31117966771125793
Step 75, Loss: 0.41492751240730286
Step 100, Loss: 0.31495794653892517
Step 125, Loss: 0.2879364490509033
Step 150, Loss: 0.24675379693508148
Step 175, Loss: 0.2119787037372589
Step 200, Loss: 0.18490320444107056
Step 225, Loss: 0.1539226770401001
Step 250, Loss: 0.1458243876695633
Step 275, Loss: 0.2033899426460266
Step 300, Loss: 0.12528036534786224
Step 325, Loss: 0.10514368116855621
Step 350, Loss: 0.11082557588815689
Step 375, Loss: 0.10434263944625854
Step 400, Loss: 0.08343463391065598
Step 425, Loss: 0.1029670238494873
Step 450, Loss: 0.08658599853515625
Step 475, Loss: 0.07320673018693924
Step 500, Loss: 0.10187433660030365


In [36]:
model.save_pretrained("./phi3-finetuned-quill")
tokenizer.save_pretrained("./phi3-finetuned-quill")

('./phi3-finetuned-quill/tokenizer_config.json',
 './phi3-finetuned-quill/special_tokens_map.json',
 './phi3-finetuned-quill/chat_template.jinja',
 './phi3-finetuned-quill/tokenizer.model',
 './phi3-finetuned-quill/added_tokens.json',
 './phi3-finetuned-quill/tokenizer.json')

In [37]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="./phi3-finetuned-quill",
    tokenizer=tokenizer,
    device_map="auto"
)

prompt = "Instruction: Make the text bold and red\nResponse:"
print(pipe(prompt, max_new_tokens=100)[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Instruction: Make the text bold and red
Response: [{"fn": "getSelectionRange", "args": []}, {"fn": "formatText", "args": ["<range.index>", "<range.length>", "bold", true]}, {"fn": "formatText", "args": ["<range.index>", "<range


In [38]:
!zip -r phi3-finetuned-quill.zip phi3-finetuned-quill

  adding: phi3-finetuned-quill/ (stored 0%)
  adding: phi3-finetuned-quill/adapter_config.json (deflated 56%)
  adding: phi3-finetuned-quill/added_tokens.json (deflated 62%)
  adding: phi3-finetuned-quill/chat_template.jinja (deflated 60%)
  adding: phi3-finetuned-quill/README.md (deflated 65%)
  adding: phi3-finetuned-quill/special_tokens_map.json (deflated 79%)
  adding: phi3-finetuned-quill/adapter_model.safetensors (deflated 8%)
  adding: phi3-finetuned-quill/tokenizer_config.json (deflated 86%)
  adding: phi3-finetuned-quill/tokenizer.json (deflated 85%)
  adding: phi3-finetuned-quill/runs/ (stored 0%)
  adding: phi3-finetuned-quill/runs/Oct25_00-08-37_6afc0188c741/ (stored 0%)
  adding: phi3-finetuned-quill/runs/Oct25_00-08-37_6afc0188c741/events.out.tfevents.1761350934.6afc0188c741.1440.0 (deflated 62%)
  adding: phi3-finetuned-quill/runs/Oct25_00-14-27_6afc0188c741/ (stored 0%)
  adding: phi3-finetuned-quill/runs/Oct25_00-14-27_6afc0188c741/events.out.tfevents.1761351274.6afc01