In [None]:
!pip install unsloth peft transformers datasets trl accelerate evaluate

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# paths configured further down are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from unsloth import FastLanguageModel
import torch
import json
from datasets import Dataset,DatasetDict
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import Trainer,TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
import os

In [3]:
# Root folder (on the mounted Drive) that holds every dataset variant.
project_directory = "/content/drive/MyDrive/Colab Notebooks/Diplomska/als_datasets"

emglab_dataset = os.path.join(project_directory, "emglab_dataset_bb")

# One sub-directory per classification task; each variable is named after the
# sub-directory it points to.
(
    control_als_dataset,
    control_myopathy_dataset,
    als_myopathy_dataset,
    control_als_myopathy_dataset,
) = (
    os.path.join(project_directory, folder_name)
    for folder_name in (
        "control_als_dataset",
        "control_myopathy_dataset",
        "als_myopathy_dataset",
        "control_als_myopathy_dataset",
    )
)

In [4]:
# --- Base checkpoints that can be fine-tuned -------------------------------
LLAMA_MODEL = "unsloth/Llama-3.2-1B-Instruct"
DEEPSEEK_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Large language model used for classification (pick one of the ids above).
MODEL_PATH = LLAMA_MODEL

# --- Tokenisation / task ----------------------------------------------------
TOKEN_LEN = 256  # max_length passed to the tokenizer (longer inputs are truncated)
NUM_CLASSES = 2  # 2 if using a binary dataset, 3 if using a multiclass dataset

# --- Optimisation hyper-parameters -----------------------------------------
lr, batch_size, num_epochs = 1e-4, 16, 50

In [6]:
# Directory whose train.jsonl / test.jsonl are loaded below. When changing it,
# also update CURRENT_MAPPING_FUNCTION (and NUM_CLASSES) so the labels match.
current_dataset=control_myopathy_dataset  # Which dataset to fine-tune the LLM on
def mapping_control_myopathy(example):
    """Label mapping for control_myopathy_dataset.

    Returns 1 when the example's "completion" field is exactly "Myopathy",
    and 0 for any other value.
    """
    if example["completion"] == "Myopathy":
        return 1
    return 0

def mapping_als(example):
    """Label mapping for control_als_dataset and als_myopathy_dataset.

    Returns 1 when the example's "completion" field is exactly "ALS",
    and 0 for any other value.
    """
    if example["completion"] == "ALS":
        return 1
    return 0

# Function that turns an example's "completion" string into an integer label;
# it is called for every example when the dataset columns are renamed.
CURRENT_MAPPING_FUNCTION=mapping_control_myopathy  # Choose mapping function based on the dataset used

def _read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-empty line is parsed with ``json.loads``. Blank lines (e.g. a
    stray empty line at the end of the file) are skipped instead of raising
    ``json.JSONDecodeError``. The file is always decoded as UTF-8 so the
    result does not depend on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


# Raw records (lists of dicts) for the two splits of the selected dataset.
train_dataset = _read_jsonl(os.path.join(current_dataset, "train.jsonl"))
test_dataset = _read_jsonl(os.path.join(current_dataset, "test.jsonl"))

In [None]:
# Wrap the raw record lists in Hugging Face Dataset objects (the names are
# rebound from list -> Dataset) and group both splits in a DatasetDict.
train_dataset = Dataset.from_list(train_dataset)
test_dataset = Dataset.from_list(test_dataset)

dataset = DatasetDict()
dataset["train"] = train_dataset
dataset["test"] = test_dataset
def rename_features_2_classes(example):
    """Convert one raw example into the columns used for classification.

    The "prompt" field becomes "text"; the integer "target" is produced by
    CURRENT_MAPPING_FUNCTION from the same example.
    """
    text = example["prompt"]
    target = CURRENT_MAPPING_FUNCTION(example)
    return {"text": text, "target": target}


# Apply the conversion to every split and drop the original columns.
dataset = dataset.map(
    rename_features_2_classes,
    remove_columns=["prompt", "completion"],
)

In [None]:
# Columns dropped from the dataset once the text has been tokenized.
col_to_delete = ["text"]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Reuse the end-of-sequence token as the padding token (both id and string).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token


def preprocessing_function(examples):
    """Tokenize a batch of examples, truncating each text to TOKEN_LEN tokens."""
    texts = examples['text']
    return tokenizer(texts, truncation=True, max_length=TOKEN_LEN)


# Tokenize every split in batches, drop the raw text column and rename the
# target column to "label".
tokenized_datasets = (
    dataset
    .map(preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("target", "label")
)
tokenized_datasets.set_format("torch")

# Pads each batch at collation time using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the base checkpoint with a sequence-classification head that has
# NUM_CLASSES outputs. device_map="auto" delegates device placement to the
# library; "offload" is the folder used if weights have to be offloaded.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_CLASSES,
    device_map="auto",
    offload_folder="offload",
    trust_remote_code=True,
)

In [None]:
# The classification head needs to know which token id is padding. Take it
# from the tokenizer (which pads with its EOS token) rather than from
# model.config.eos_token_id: the config value can be a list of ids for some
# checkpoints and is not guaranteed to equal the id the tokenizer pads with.
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapters on the attention query/value projections only.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "v_proj",
    ],
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Load the metric implementations once. compute_metrics runs after every
# evaluation pass, so loading inside the function repeats the work each time.
_METRICS = {
    metric_name: evaluate.load(metric_name)
    for metric_name in ("precision", "recall", "f1", "accuracy")
}


def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy".

    Predictions are the argmax over the last logits axis. With two output
    classes the metrics use ``average="binary"`` (the library default, so
    binary results are unchanged). With more than two classes that default
    raises an error, so macro averaging is used instead.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)

    average = "binary" if logits.shape[-1] <= 2 else "macro"

    precision = _METRICS["precision"].compute(
        predictions=predictions, references=labels, average=average
    )["precision"]
    recall = _METRICS["recall"].compute(
        predictions=predictions, references=labels, average=average
    )["recall"]
    f1 = _METRICS["f1"].compute(
        predictions=predictions, references=labels, average=average
    )["f1"]
    accuracy = _METRICS["accuracy"].compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed. TrainingArguments is a
# dataclass, so check which field this installation actually defines.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="lora-token-classification",
    learning_rate=lr,
    # NOTE(review): the plain "constant" schedule does not apply warmup, so
    # warmup_ratio below has no effect; switch to "constant_with_warmup" if
    # warmup is actually intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
    **{_eval_strategy_arg: "epoch"},
)

# Gradient checkpointing is switched on by the Trainer *after* the model was
# wrapped with LoRA. The embedding layer is frozen, so without this hook the
# inputs of the checkpointed layers do not require grad and the adapter
# weights inside them may receive no gradient.
if training_args.gradient_checkpointing:
    model.enable_input_require_grads()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning. Evaluation and checkpointing happen once per epoch, as
# configured in training_args.
trainer.train()

In [None]:
# Report which configuration produced the metrics below. The labels are
# derived from the config variables so they cannot go stale when MODEL_PATH
# or current_dataset is changed.
print("Model: ", MODEL_PATH)
print("Dataset: ", os.path.basename(current_dataset))
trainer.evaluate()