In [None]:
!pip install torch torchvision
!pip install transformers

In [None]:
import json
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
# Hugging Face Hub checkpoint that is fine-tuned in this notebook.
model_name = "microsoft/phi-1_5"

# Load the pretrained tokenizer and causal-LM weights for that checkpoint.
phi_tokenizer = AutoTokenizer.from_pretrained(model_name)
phi_model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding is requested during tokenization (padding=True) and by the LM data
# collator; both raise if the tokenizer has no pad token. Fall back to the EOS
# token when none is defined.
# NOTE(review): with pad == EOS the LM collator is expected to mask EOS
# positions out of the loss as well — confirm this is acceptable.
if phi_tokenizer.pad_token is None:
    phi_tokenizer.pad_token = phi_tokenizer.eos_token

In [None]:
def read_dataset(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded text file holding one JSON
            document per line.

    Returns:
        list: The parsed JSON values, in the order they appear in the file.
        Blank and whitespace-only lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Empty lines (for example stray newlines at the end of the file)
            # are not records; skip them instead of crashing in json.loads.
            if not line.strip():
                continue
            dataset.append(json.loads(line))
    return dataset


def preprocess_and_tokenize(dataset, tokenizer, separator=" "):
    """Tokenize each record's signature together with its body.

    Previously only the signatures were tokenized and the bodies were
    collected but never used, so a causal LM trained on the result never saw
    any body text. Each training text is now ``signature + separator + body``.

    Args:
        dataset: Iterable of mappings, each with string values under the
            "signature" and "body" keys.
        tokenizer: Callable tokenizer invoked once on the full list of texts
            with ``return_tensors="pt"``, ``padding=True`` and
            ``truncation=True``.
        separator: String placed between signature and body.
            NOTE(review): a single space is an assumption about how the two
            fields join in the data — confirm against the dataset.

    Returns:
        Whatever the tokenizer returns for the batch of combined texts.

    Raises:
        KeyError: If a record lacks the "signature" or "body" key.
    """
    combined_texts = [
        data["signature"] + separator + data["body"]
        for data in dataset
    ]

    tokenized_inputs = tokenizer(
        combined_texts, return_tensors="pt", padding=True, truncation=True
    )
    return tokenized_inputs

In [None]:
# Location of the JSON Lines training split.
data_path = "/content/data/train.jsonl"

# Parse the raw records first, then convert them into padded token tensors.
kotlin_dataset = read_dataset(file_path=data_path)
kotlin_tokenized_inputs = preprocess_and_tokenize(
    dataset=kotlin_dataset,
    tokenizer=phi_tokenizer,
)

In [None]:
# Define training arguments.
# Periodic evaluation is left at its default (disabled): no eval_dataset is
# passed to the Trainer below, and requesting step-based evaluation without
# one makes the Trainer raise instead of training.
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    logging_dir="./logs",
    disable_tqdm=False,
)

# Define data collator (mlm=False selects the causal-LM objective rather than
# masked-LM).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=phi_tokenizer, mlm=False
)

# The tokenizer output is a single batch-level mapping of field name -> tensor
# with one row per example. The Trainer needs an indexable collection of
# per-example feature dicts instead: len() of the batch mapping is the number
# of fields, and integer indexing does not yield a feature dict. Row indexing
# returns views, so the token data is not copied.
train_examples = [
    {field: values[i] for field, values in kotlin_tokenized_inputs.items()}
    for i in range(len(kotlin_tokenized_inputs["input_ids"]))
]

# Define optimizer
optimizer = torch.optim.AdamW(
    phi_model.parameters(),
    lr=5e-5,
    eps=1e-8
)

# Define scheduler
# NOTE(review): gamma=0.1 with step_size=500 divides the learning rate by ten
# every 500 scheduler steps — confirm this aggressive decay is intended.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=500,
    gamma=0.1
)

# Create Trainer instance
trainer = Trainer(
    model=phi_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_examples,
    optimizers=(optimizer, scheduler)
)

In [None]:
# Run fine-tuning with the Trainer built from phi_model and training_args;
# the call is the cell's last expression, so its return value is displayed.
trainer.train()

In [None]:
# Persist the fine-tuned weights. The Trainer was not handed the tokenizer
# directly, so also write the tokenizer files explicitly; that way the
# directory can be reloaded on its own with from_pretrained.
trainer.save_model("./fine_tuned_model")
phi_tokenizer.save_pretrained("./fine_tuned_model")