The trained model is saved in Google Drive at the following location: [Trained_Models](https://drive.google.com/drive/folders/1F1zcr6L9NhoKbQVZEBYnRluLWs6jprRf?usp=sharing)

To view detailed visualizations and logs of the training process, please visit the [WandB dashboard](https://wandb.ai/david-spannagl/Fine-tuning_phi-1.5_for_Kotlin-code-completion/kotlin-train) associated with this training run.

In [None]:
# !pip install torch torchvision
# !pip install transformers
# !pip install datasets
# !pip install peft
# !pip install bitsandbytes
# !pip install accelerate -U
# !pip install wandb

In [None]:
import torch
import requests

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from typing import Callable

In [None]:
# Weights & Biases is used for experiment tracking; the training cell below
# opens a run with wandb.init().
import wandb
# Authenticate this session with W&B before any run is created.
wandb.login()

In [None]:
# Remote resources: the helper module that is downloaded and exec()'d by this
# notebook, and the JSONL file handed to load_jsonl_from_url().
dataset_loader_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/src/dataset_loader.py"
data_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/data/Kotlin/train.jsonl"

# Output directory passed to TrainingArguments, and the Hugging Face model id
# used for both the model and the tokenizer.
model_log_dir = "/content/model/training/output/"
model_id = "microsoft/phi-1_5"

In [None]:
# Download the dataset helper module's source from `dataset_loader_url`.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() aborts on HTTP errors so that an error
# page (e.g. a 404 body) is never handed to exec() as if it were Python code.
response_data_parser = requests.get(dataset_loader_url, timeout=30)
response_data_parser.raise_for_status()
dataset_loader_code = response_data_parser.text

# Bare annotations (no assignment): they only announce the names that the
# downloaded module is expected to define, so linters/IDEs do not flag the
# later calls as undefined.
load_jsonl_from_url: Callable
create_and_tokenize_dataset: Callable
dataset_summary: Callable

# SECURITY NOTE(review): this executes code fetched over the network in the
# notebook's namespace. It is only as trustworthy as the remote file; consider
# pinning the URL to a specific commit hash instead of a moving branch.
exec(dataset_loader_code)

In [None]:
# LoRA adapter settings: rank 16 with alpha 16, 5% dropout on the adapter
# inputs, no bias training, configured for a causal language-model task.
# NOTE(review): the target module names must match the module names of the
# model implementation that actually gets loaded -- confirm against
# `phi_model.named_modules()`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["fc1", "fc2","Wqkv", "out_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Quantization settings for loading the base model: 4-bit weights using the
# "nf4" quant type with double quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the base checkpoint with the quantization config and wrap it with the
# LoRA adapters in one step; only the wrapped model is kept under `phi_model`.
phi_model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config),
    lora_config,
)

# Report how many parameters are trainable after wrapping.
phi_model.print_trainable_parameters()

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
# NOTE(review): presumably the checkpoint ships without a pad token -- confirm.
phi_tokenizer.pad_token = phi_tokenizer.eos_token

In [None]:
# Fetch the training records from `data_url` using the downloaded helper.
# NOTE(review): use_subset/subset_ratio presumably restrict loading to 25% of
# the file -- confirm against dataset_loader.py.
train_data = load_jsonl_from_url(
    data_url,
    use_subset=True,
    subset_ratio=0.25,
)

# Turn the raw records into a tokenized dataset with the phi tokenizer.
train_dataset = create_and_tokenize_dataset(
    train_data,
    phi_tokenizer,
)

# Show the helper's summary of the resulting dataset.
dataset_summary(train_dataset)

In [None]:
# Optimisation and logging settings for the run; checkpoints and logs go to
# `model_log_dir`.
training_args = TrainingArguments(
    output_dir=model_log_dir,
    # Cosine learning-rate schedule starting from 2e-4.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # 8 samples per device x 2 accumulation steps = 16 samples per optimizer
    # step on each device.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    logging_steps=50,
)

# Batch collator for language modelling with masked-LM masking turned off
# (mlm=False).
causal_lm_collator = DataCollatorForLanguageModeling(phi_tokenizer, mlm=False)

# Tie the LoRA-wrapped model, the arguments, the dataset and the collator
# together.
trainer = Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=causal_lm_collator,
)

In [None]:
# Open the W&B run as a context manager so it is always closed, even when
# trainer.train() raises or the cell is interrupted. Previously run.finish()
# was only reached on success, which left a dangling active run after a
# failure. On an exception the context manager also marks the run as failed.
# `run` stays bound after the block for later inspection.
with wandb.init(
    project='Fine-tuning_phi-1.5_for_Kotlin-code-completion',
    name='kotlin-train',
    job_type="training",
    anonymous="allow",
) as run:
    trainer.train()

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# model_save_dir = "/content/drive/My Drive/Trained_Models/Phi-1.5/"
# trainer.save_model(model_save_dir)