The trained model is saved in Google Drive at the following location: [Trained_Models](https://drive.google.com/drive/folders/14sKTTKcrkvs6pQmzMvICq2Q_hdVItTNU?usp=sharing)

To view detailed visualizations and logs of the training process, please visit the [WandB dashboard](https://wandb.ai/david-spannagl/Fine-tuning_phi-1.5_for_Kotlin-code-completion/runs/47pxy4uk) associated with this training run.

In [None]:
# !pip install torch torchvision
# !pip install transformers
# !pip install datasets
# !pip install peft
# !pip install bitsandbytes
# !pip install accelerate -U
# !pip install wandb

In [2]:
import torch
import requests

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from typing import Callable

In [None]:
import wandb
wandb.login()

In [4]:
model_id = "microsoft/phi-1_5"
model_log_dir = "/content/model/training/output/"

data_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/data/Kotlin/train.jsonl"
dataset_loader_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/src/dataset_loader.py"

In [5]:
response_data_parser = requests.get(dataset_loader_url)
dataset_loader_code = response_data_parser.text

load_jsonl_from_url: Callable
create_and_tokenize_dataset: Callable
dataset_summary: Callable

exec(dataset_loader_code)

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["fc1", "fc2","Wqkv", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
phi_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
phi_model = get_peft_model(phi_model, lora_config)
phi_model.print_trainable_parameters()

In [None]:
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)
phi_tokenizer.pad_token = phi_tokenizer.eos_token

In [9]:
train_data = load_jsonl_from_url(data_url, use_subset=True, subset_ratio=0.25)
train_dataset = create_and_tokenize_dataset(train_data, phi_tokenizer)

dataset_summary(train_dataset)

Dataset Info
Number of samples: 13224
Column names: ['input_ids', 'attention_mask', 'labels']
Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [10]:
training_args = TrainingArguments(
    output_dir=model_log_dir,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=50,
    disable_tqdm=True
)

trainer = Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(phi_tokenizer, mlm=False)
)

In [11]:
run = wandb.init(project='Fine-tuning_phi-1.5_for_Kotlin-code-completion', name='kotlin-train', job_type="training", anonymous="allow")

trainer.train()

run.finish()

{'loss': 2.4513, 'grad_norm': 3.360219955444336, 'learning_rate': 2.535498186973417e-06, 'epoch': 2.782819116757411}
{'loss': 2.4199, 'grad_norm': 3.1693801879882812, 'learning_rate': 1.3138137898047965e-06, 'epoch': 2.8433151845130067}
{'loss': 2.4279, 'grad_norm': 2.618941307067871, 'learning_rate': 4.88542572549755e-07, 'epoch': 2.9038112522686026}
{'loss': 2.3873, 'grad_norm': 3.093620777130127, 'learning_rate': 6.299957250064382e-08, 'epoch': 2.9643073200241985}
{'train_runtime': 2986.5611, 'train_samples_per_second': 13.284, 'train_steps_per_second': 0.83, 'train_loss': 2.708823637388905, 'epoch': 2.9981851179673322}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▂▃▁▄▃▃▄▃▁▃▂▃▃▃▃▄▅▇▅▃▅▅▄▄▃▄▅▆▆▆▄▅▅▆▅█▅▇▇▇
train/learning_rate,███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▅▄▄▅▄▄▄▄▃▄▃▃▃▃▃▃▃▃▂▂▂▂▃▂▂▂▁▂▁▁▁▁▁▁▂▂▂▁▁

0,1
total_flos,4.651877505171456e+16
train/epoch,2.99819
train/global_step,2478.0
train/grad_norm,3.09362
train/learning_rate,0.0
train/loss,2.3873
train_loss,2.70882
train_runtime,2986.5611
train_samples_per_second,13.284
train_steps_per_second,0.83


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# model_save_dir = "/content/drive/My Drive/Trained_Models/Phi-1.5/"
# trainer.save_model(model_save_dir)