In [None]:
# !pip install torch torchvision
# !pip install transformers
# !pip install datasets
# !pip install peft
# !pip install bitsandbytes
# !pip install accelerate -U
# !pip install wandb

In [5]:
import json
import os

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint
from datasets import Dataset
from peft import LoraConfig, get_peft_model

In [None]:
import wandb

# Authenticate this session with Weights & Biases before opening a run.
wandb.login()

# Open the tracking run; the handle is kept in `run` for later use.
run = wandb.init(
    project='Fine tuning microsoft phi -1.5',
    job_type="training",
    anonymous="allow",
)

In [6]:
# Identifier of the base model passed to `from_pretrained`.
model_name = "microsoft/phi-1_5"

# JSONL training file (one JSON record per line).
data_path = "../data/Kotlin/train.jsonl"

# Root directory for the final saved model; checkpoints go in its "output/" subfolder.
model_dir = "../model/"
model_output_dir = f"{model_dir}output/"

In [None]:
# 4-bit NF4 quantization with double quantization enabled; float16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base causal-LM with the quantization settings above.
phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 16, 5% dropout, no bias terms.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # NOTE(review): these names must match module names in the loaded Phi
    # implementation -- confirm against `phi_model.named_modules()`.
    target_modules=["fc1", "fc2", "Wqkv", "out_proj"],
)

# Wrap the base model with the adapters and report the trainable parameter count.
phi_model = get_peft_model(phi_model, lora_config)
phi_model.print_trainable_parameters()

In [9]:
def load_and_tokenize_dataset(file_path, tokenizer, use_subset=False, subset_ratio=0.1, max_length=None):
    """Load a JSONL file of functions and tokenize it for causal-LM fine-tuning.

    Every non-blank line of ``file_path`` must be a JSON object with a
    ``"signature"`` and a ``"body"`` field. Both fields are joined into ONE
    training sequence (``signature + " " + body + eos``), so ``input_ids`` and
    ``labels`` always have the same length and the body is part of what the
    model is trained to predict. Tokenizing the two fields separately would
    give ``labels`` a different width than ``input_ids`` and would leave the
    body out of ``input_ids`` entirely.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Tokenizer callable; must have a pad token configured because
            sequences are padded to a common length.
        use_subset: When true, keep only the leading ``subset_ratio`` fraction
            of the records.
        subset_ratio: Fraction of records kept when ``use_subset`` is true.
        max_length: Optional cap on the tokenized sequence length. ``None``
            defers to the tokenizer's own maximum.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns. ``labels`` is a copy of ``input_ids`` with padding positions
        set to -100 so they are ignored by the loss.

    Raises:
        ValueError: If no records remain after loading / subsetting.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        records = [json.loads(line) for line in file if line.strip()]

    if use_subset:
        records = records[:int(len(records) * subset_ratio)]

    if not records:
        raise ValueError(f"No samples to tokenize from {file_path!r}")

    # Ending each sample with EOS marks where a completed function stops.
    eos = tokenizer.eos_token or ""
    # A single space is used as the signature/body separator.
    texts = [f'{record["signature"]} {record["body"]}{eos}' for record in records]

    # Pad every row to one common length so the three columns stay rectangular
    # and can be batched without further per-column padding.
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

    # Causal-LM labels are the inputs themselves; -100 hides padding from the loss.
    labels = encoded["input_ids"].masked_fill(encoded["attention_mask"] == 0, -100)

    return Dataset.from_dict({
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels
    })


def display_dataset_info(dataset):
    """Print a short summary of ``dataset``.

    Writes a two-line header followed by the dataset's ``num_rows``,
    ``column_names`` and ``features`` attributes, one per line. Returns None.
    """
    print("Dataset Info")
    print("============")

    summary = (
        ("Number of samples:", dataset.num_rows),
        ("Column names:", dataset.column_names),
        ("Features:", dataset.features),
    )
    for label, value in summary:
        print(label, value)

In [10]:
# Load the tokenizer for the base model and reuse EOS as the padding token.
phi_tokenizer = AutoTokenizer.from_pretrained(model_name)
phi_tokenizer.pad_token = phi_tokenizer.eos_token

# Build the tokenized training set from a subset of the JSONL file
# (use_subset=True with the function's default subset_ratio).
kt_tokenized_inputs = load_and_tokenize_dataset(
    data_path,
    phi_tokenizer,
    use_subset=True,
)
display_dataset_info(kt_tokenized_inputs)

Dataset Info:
Number of samples: 54520
Column names: ['input_ids', 'attention_mask', 'labels']
Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [None]:
training_args = TrainingArguments(
    output_dir=model_output_dir,
    # Batching: 6 sequences per device x 2 accumulation steps per optimizer update.
    per_device_train_batch_size=6,
    gradient_accumulation_steps=2,
    # Schedule: 2 epochs (max_steps=-1 sets no step limit), cosine learning-rate schedule.
    num_train_epochs=2,
    max_steps=-1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Logging and checkpointing cadence, in optimizer steps.
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    disable_tqdm=False,
    # NOTE(review): per the TrainingArguments docs this field is not consumed by
    # Trainer itself; resuming requires trainer.train(resume_from_checkpoint=...) -- confirm.
    resume_from_checkpoint=model_output_dir,
)

# Collator for causal language modelling (mlm=False).
lm_collator = DataCollatorForLanguageModeling(phi_tokenizer, mlm=False)

trainer = Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=kt_tokenized_inputs,
    data_collator=lm_collator,
)

In [None]:
# Trainer only resumes when a checkpoint is passed to train() directly, so look
# for the newest "checkpoint-*" folder in the output directory. When the
# directory is missing or holds no checkpoint, `None` starts a fresh run.
checkpoint_root = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Write the trained model to the top-level model directory.
trainer.save_model(output_dir=model_dir)