In [None]:
import json

import digitalhub as dh
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig


In [None]:
# Get (or create) the DigitalHub project that this notebook works in.
project = dh.get_or_create_project("ner-fine-tuning")
# Identifier of the base checkpoint; the tokenizer and the model weights are
# both loaded from this one name so that they match each other.
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Preparing the dataset for fine-tuning

In [None]:
# Read the raw JSON records from disk (decoded as UTF-8) and wrap them in a
# datasets.Dataset. Dataset.from_list is used, so the file is expected to
# contain a list of records.
with open("data/people_data.json", "r", encoding="utf-8") as fh:
    data = json.loads(fh.read())

ds = Dataset.from_list(data)

def to_text(ex):
    """Render one prompt/response record as a single chat-formatted string.

    Args:
        ex: Mapping with a "prompt" entry and a "response" entry. A response
            that is not already a string is serialized to JSON first.

    Returns:
        A dict with one key, "text", holding the result of applying the
        notebook-level tokenizer's chat template to the user/assistant
        exchange (rendered as text, not token ids).
    """
    answer = ex["response"]
    # Structured answers are stored as JSON text; ensure_ascii=False keeps
    # non-ASCII characters as-is instead of \u escapes.
    answer = answer if isinstance(answer, str) else json.dumps(answer, ensure_ascii=False)

    conversation = [
        dict(role="user", content=ex["prompt"]),
        dict(role="assistant", content=answer),
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Apply to_text to every record and drop all of the original columns, so
# only what to_text returns is kept.
dataset = ds.map(
    to_text,
    remove_columns=ds.column_names,
)

In [None]:
def tokenize_function(example, max_length=128):
    """Tokenize a batch of chat-formatted strings to fixed-length sequences.

    Uses the notebook-level ``tokenizer`` rather than re-loading one from the
    checkpoint on each call: with ``Dataset.map(..., batched=True)`` this
    function runs once per batch, so a per-call reload repeats identical work
    and duplicates the checkpoint name already held in ``model_name``.

    Args:
        example: Batch mapping whose "text" entry holds the strings to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        example["text"],
        padding="max_length",  # pad every sequence up to max_length
        truncation=True,  # and cut longer ones down to it
        max_length=max_length,
    )
# Tokenize in batches: with batched=True the function receives many rows
# per call instead of one.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

In [None]:
# Ask PyTorch to release cached, currently unused GPU memory before training.
# (torch is imported in the notebook's top import cell.)
torch.cuda.empty_cache()

In [None]:
# Training configuration; every option not listed keeps its SFTConfig default.
# NOTE: no explicit sequence-length cap (max_seq_length) is set here.
config = SFTConfig(
    output_dir="./phi3-finetune-json",
    num_train_epochs=3,
    per_device_train_batch_size=2,
)

# Train on the tokenized dataset. No tokenizer is passed to the trainer
# explicitly.
trainer = SFTTrainer(
    args=config,
    model=model,
    train_dataset=tokenized_dataset,
)
trainer.train()

# Save the trained model to its own directory, separate from output_dir.
trainer.save_model("./phi3-json-extractor")