The trained model is saved in Google Drive at the following location: [Trained_Models](https://drive.google.com/drive/folders/1F1zcr6L9NhoKbQVZEBYnRluLWs6jprRf?usp=sharing)

To view detailed visualizations and logs of the training process, please visit the [WandB dashboard](https://wandb.ai/spannagl/Fine-tuning_phi-1.5_for_Kotlin-code-completion?nw=nwuserdavidspannagl) associated with this training run.

In [None]:
# !pip install torch torchvision
# !pip install transformers
# !pip install datasets
# !pip install peft
# !pip install bitsandbytes
# !pip install accelerate -U
# !pip install wandb

In [2]:
import torch
import requests

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from typing import Callable

In [3]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
model_id = "microsoft/phi-1_5"
model_log_dir = "/content/model/training/output/"

data_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/data/Kotlin/train.jsonl"
dataset_loader_url = "https://raw.githubusercontent.com/DaveS24/KotComplete/main/src/dataset_loader.py"

In [5]:
response_data_parser = requests.get(dataset_loader_url)
dataset_loader_code = response_data_parser.text

load_jsonl_from_url: Callable
create_and_tokenize_dataset: Callable
dataset_summary: Callable

exec(dataset_loader_code)

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["fc1", "fc2","Wqkv", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [7]:
phi_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
phi_model = get_peft_model(phi_model, lora_config)
phi_model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

trainable params: 7,864,320 || all params: 1,426,135,040 || trainable%: 0.5514428703750243


In [8]:
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)
phi_tokenizer.pad_token = phi_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
train_data = load_jsonl_from_url(data_url, use_subset=True, subset_ratio=0.25)
train_dataset = create_and_tokenize_dataset(train_data, phi_tokenizer)

dataset_summary(train_dataset)

Dataset Info
Number of samples: 13224
Column names: ['input_ids', 'attention_mask', 'labels']
Features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [10]:
training_args = TrainingArguments(
    output_dir=model_log_dir,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=50,
    disable_tqdm=True
)

trainer = Trainer(
    model=phi_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(phi_tokenizer, mlm=False)
)

In [11]:
run = wandb.init(project='Fine-tuning_phi-1.5_for_Kotlin-code-completion', name='kotlin-train', job_type="training", anonymous="allow")

trainer.train()

run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mdavid-spannagl[0m ([33mspannagl[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,3.9283
100,3.2268
150,3.1102
200,3.1063
250,3.1537
300,3.0033
350,3.0305
400,2.9766
450,3.0236
500,3.0128




VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▂▃▁▃▃▃▄▃▁▃▂▃▃▄▃▃▅▇▅▃▅▅▄▃▃▄▅▆▆▆▄▅▅▆▅█▄▇▇▇
train/learning_rate,███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train/loss,█▅▄▄▅▄▄▄▄▃▄▃▃▃▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▁▁▁▁▁▁▂▂▂▁▁

0,1
total_flos,4.651877505171456e+16
train/epoch,2.99819
train/global_step,2478.0
train/grad_norm,3.10321
train/learning_rate,0.0
train/loss,2.3842
train_loss,2.70532
train_runtime,3031.4002
train_samples_per_second,13.087
train_steps_per_second,0.817


In [23]:
# from google.colab import drive
# drive.mount('/content/drive')

# model_save_dir = "/content/drive/My Drive/Trained_Models/Phi-1.5/"
# trainer.save_model(model_save_dir)

Mounted at /content/drive


