In [None]:
# !pip install torch torchvision
# !pip install transformers
# !pip install datasets
# !pip install peft
# !pip install bitsandbytes
# !pip install accelerate -U
# !pip install wandb

In [1]:
import os
from typing import Callable

import requests
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [None]:
import wandb

# Log in to Weights & Biases, then open the run that this notebook's
# training job will be tracked under. The run handle is kept in `run`.
wandb.login()
run = wandb.init(
    project="Fine tuning phi-1.5 for Kotlin code completion",
    job_type="training",
    anonymous="allow",
)

In [2]:
# Identifier of the base model to fine-tune.
model_name = "microsoft/phi-1_5"

# Both remote resources live in the same GitHub repository, so build their
# raw-content URLs from one shared prefix.
_kotcomplete_raw_root = "https://raw.githubusercontent.com/DaveS24/KotComplete/main"
data_url = f"{_kotcomplete_raw_root}/data/Kotlin/train.jsonl"
data_parser_url = f"{_kotcomplete_raw_root}/src/data_parser.py"

# Local directories: the final model goes to model_dir, intermediate
# training output to its "output/" subdirectory.
model_dir = "./model/"
model_output_dir = f"{model_dir}output/"

In [3]:
# Download the project's data-parser script and execute it in this notebook so
# that `load_and_tokenize_dataset` and `display_dataset_info` become available
# to the cells below.
#
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being executed as code.
response_data_parser = requests.get(data_parser_url, timeout=30)
response_data_parser.raise_for_status()
data_parser_code = response_data_parser.text

# Bare annotations only: they announce the names to linters/IDEs but do not
# define them. The exec() below is what has to create the functions.
load_and_tokenize_dataset: Callable
display_dataset_info: Callable

# SECURITY: exec() runs remotely fetched code with this kernel's privileges.
# Only point data_parser_url at a source you trust (ideally a pinned commit).
exec(data_parser_code, globals())

# Fail here with a clear message instead of with a NameError in a later cell
# when the downloaded script does not define the helpers this notebook needs.
_expected_helpers = ("load_and_tokenize_dataset", "display_dataset_info")
_missing_helpers = [
    helper_name
    for helper_name in _expected_helpers
    if not callable(globals().get(helper_name))
]
if _missing_helpers:
    raise ImportError(
        f"Code fetched from {data_parser_url} did not define: "
        + ", ".join(_missing_helpers)
    )

In [None]:
# Quantization settings applied while loading the base model:
# 4-bit weights using the "nf4" quantization type with double quantization
# enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the causal-LM weights for `model_name` with the quantization config above.
phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
# LoRA adapter configuration (rank 16, alpha 16, 5% dropout, no bias training)
# for causal language modelling.
#
# target_modules lists the layers that receive adapters. Two naming schemes
# are included on purpose:
#   * "Wqkv" / "out_proj" are the attention projection names of the legacy
#     remote-code Phi implementation;
#   * "q_proj" / "k_proj" / "v_proj" / "dense" are the attention projection
#     names of the Phi implementation built into transformers, which is what
#     gets loaded when `trust_remote_code` is not passed.
# "fc1" / "fc2" (the MLP layers) are shared by both. With only the legacy
# names, the built-in implementation would match just the MLP layers and the
# attention projections would silently get no adapter.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "fc1", "fc2",                          # MLP (both implementations)
        "Wqkv", "out_proj",                    # attention, legacy naming
        "q_proj", "k_proj", "v_proj", "dense", # attention, built-in naming
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters and report how many parameters
# are actually trainable.
phi_model = get_peft_model(phi_model, lora_config)
phi_model.print_trainable_parameters()

In [4]:
# Load the tokenizer that belongs to the base model.
phi_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
# NOTE(review): with pad == eos, the language-modeling collator may exclude
# EOS positions from the loss along with the padding — confirm this is intended.
phi_tokenizer.pad_token = phi_tokenizer.eos_token

# Both helpers are expected to come from the data-parser code executed earlier
# in the notebook; their exact signatures are not visible here.
# Builds the tokenized training set from data_url (use_subset=True is passed
# through to the helper) and prints a summary of it.
kt_tokenized_inputs = load_and_tokenize_dataset(data_url, phi_tokenizer, use_subset=True)
display_dataset_info(kt_tokenized_inputs)

NameError: name 'load_and_tokenize_dataset' is not defined

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and other training output are written.
    output_dir=model_output_dir,
    # NOTE(review): per the TrainingArguments documentation this field is not
    # consumed by Trainer itself; resuming has to be requested through
    # trainer.train(resume_from_checkpoint=...) — confirm.
    resume_from_checkpoint=model_output_dir,
    # Batching: 6 samples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=6,
    gradient_accumulation_steps=2,
    # Optimisation schedule.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=2,
    max_steps=-1,
    # Logging and checkpoint cadence (in steps).
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    disable_tqdm=False,
)

# Collator for causal language modelling (masked-LM mode switched off).
causal_lm_collator = DataCollatorForLanguageModeling(phi_tokenizer, mlm=False)

# Tie the adapter-wrapped model, the arguments, the tokenized dataset and the
# collator together.
trainer = Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=kt_tokenized_inputs,
    data_collator=causal_lm_collator,
)

In [None]:
# Resume from the newest checkpoint in the trainer's output directory when one
# exists; otherwise start from scratch. Passing None is the same as calling
# train() without arguments. The isdir() guard is needed because
# get_last_checkpoint() lists the directory and would fail if it is missing.
_checkpoint_root = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(_checkpoint_root)
    if os.path.isdir(_checkpoint_root)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Persist the trained model into model_dir through the Trainer.
# NOTE(review): phi_model is PEFT-wrapped, so this presumably writes only the
# adapter weights; the tokenizer is not handed to the Trainer and is therefore
# likely not saved alongside — confirm before relying on model_dir for inference.
trainer.save_model(model_dir)