In [None]:
# !pip install -q --upgrade transformers
# !pip install -q --upgrade peft
!pip install -q -U "transformers==4.40.0" "peft==0.10.0"
!pip install -q --upgrade datasets
!pip install -q --upgrade accelerate
!pip install -q --upgrade bitsandbytes
!pip install -q --upgrade torch

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset, DatasetDict, Value
from peft import PeftModel, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

In [None]:
import os

# Set WANDB_DISABLED before any training object is created so the
# Weights & Biases integration stays switched off for this run.
os.environ.update({"WANDB_DISABLED": "true"})

# Import the CSV datasets


In [None]:
# Read the train / validation / test splits from CSV files located in the
# current working directory, in that order.
train_df, val_df, test_df = map(pd.read_csv, ('train.csv', 'val.csv', 'test.csv'))

# Convert from Pandas DataFrame to Hugging Face Dataset

In [None]:
# Wrap each pandas split as a Hugging Face Dataset, then group the three
# splits under the keys 'train', 'validation' and 'test'.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)
dataset = DatasetDict(train=dataset_train, validation=dataset_val, test=dataset_test)

# Load the tokenizer and model with a LoRA config

In [None]:
# Hugging Face Hub identifier of the checkpoint; the same name is used to load
# both the tokenizer and the model below.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

In [None]:
# LoRA adapter configuration.
# The base model is loaded with AutoModelForCausalLM and trained to *generate*
# the label text, so the PEFT task type must be CAUSAL_LM. SEQ_CLS would wrap
# the model as a sequence classifier, which does not match this setup.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # LoRA scaling factor
    # Adapters are attached to the attention projection layers only.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Load the tokenizer that belongs to the checkpoint; sequences are padded on
# the left side.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    trust_remote_code=True,
)

In [None]:
# Only fall back to EOS as the padding token when the tokenizer does not ship
# one. Unconditionally aliasing pad to EOS makes the two ids identical, so any
# later step that masks "pad" positions in the labels would also mask the real
# end-of-turn token and the model would never be trained to stop generating.
# Assigning pad_token also updates pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the causal-LM checkpoint and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
)

# Training-time config tweaks (independent of each other).
model.config.use_cache = False
model.config.pretraining_tp = 1
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Run PEFT's k-bit training preparation on the loaded model.
# NOTE(review): the model above is loaded without any quantization config, so
# it is not actually a k-bit model — confirm this call is still wanted.
model = prepare_model_for_kbit_training(model)

In [None]:
# Wrap the base model with the LoRA adapters described by `lora_config`.
model = get_peft_model(model, lora_config)

# Prompt

In [None]:
# Distinct label values found in the training split; interpolated into every
# prompt as the list of allowed categories.
categories = train_df['label'].unique().tolist()
# System message placed at the start of every chat-formatted sample.
instruction = 'You are Qwen an advanced model specializing classify text.'
def make_prompt(text):
    """Build the user-facing classification prompt for one input text.

    The prompt lists the allowed categories (taken from the module-level
    `categories`), states the answering rules, and ends with the text to
    classify followed by a trailing newline.
    """
    prompt_lines = (
        "You are an expert text classifier.",
        "Classify the following academic abstract into **exactly one** of the following categories.",
        "Your answer must be **only one of the following labels**, spelled **exactly as shown** — no explanations, no extra words, and no made-up categories.",
        f"Categories: {categories}",
        "If the text fits into more than one, choose the most relevant one.",
        "If the text does not fit exactly, pick the **closest matching** category from the list.",
        "Do not invent new labels. Do not return anything outside the list.",
        f"inputtext: {text}",
    )
    # Each line, including the last, is newline-terminated.
    return "\n".join(prompt_lines) + "\n"

In [None]:
def prompt_template(text, label=None):
    """Render one sample as a chat-formatted string.

    Args:
        text: The input text to classify.
        label: Optional gold label. When given, it is appended as the
            assistant turn, producing a complete training sample. When
            omitted (None or empty string), the string ends with an open
            assistant header so the model can generate the answer.

    Returns:
        The chat-template string (not tokenized).
    """
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": make_prompt(text=text)},
    ]

    # Test explicitly against None / "" instead of truthiness so that falsy
    # but valid labels (e.g. the integer 0) are not silently dropped.
    has_label = label is not None and label != ""
    if has_label:
        # Chat message content must be a string; labels read from a CSV may
        # be numeric.
        messages.append({"role": "assistant", "content": str(label)})

    # Only open a new assistant turn when there is no answer yet. Adding the
    # generation prompt after a completed assistant turn would leave a
    # dangling assistant header at the end of every training sample.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=not has_label,
    )

# Tokenize the dataset for Qwen

In [None]:
def qwen_preprocess(df):
    """Tokenize one example and attach causal-LM labels.

    Args:
        df: A single dataset example providing the 'text' and 'label' fields
            (the function is applied per example by `dataset.map` below).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        'labels' entry: a copy of the input ids where padding positions are
        replaced by -100 so they are ignored by the loss.
    """
    prompt = prompt_template(df['text'], df['label'])
    # NOTE(review): truncation is applied to the full chat string, so for very
    # long inputs the trailing assistant answer may be cut off — confirm that
    # the inputs fit in 512 tokens.
    tokens = tokenizer(prompt, truncation=True, max_length=512, padding="max_length")

    # Mask padding by attention mask rather than by token id: if the pad id is
    # shared with a real token (e.g. EOS), comparing ids would also mask the
    # genuine occurrences of that token.
    tokens["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

tokenized_dataset = dataset.map(qwen_preprocess, remove_columns=["text", "label"])

# Define training arguments

In [None]:
# Hyperparameters and checkpointing policy: evaluate and save once per epoch,
# and reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="lora-results",
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# The tokenized dataset already carries a `labels` column (built in
# qwen_preprocess) and every example is padded to the same length, so a plain
# batching collator is sufficient. DataCollatorForLanguageModeling(mlm=False)
# would rebuild `labels` from `input_ids` and discard the precomputed ones.
data_collator = default_data_collator

In [None]:
# Assemble the Trainer: model and hyperparameters first, then the data splits
# and the components used to batch them.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset['validation'],
    train_dataset=tokenized_dataset['train'],
)

# Run trainer

In [None]:
# Run the fine-tuning loop; the value returned by the trainer is kept in
# `train_result` for later inspection.
train_result = trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in the
# "qwen2.5-finetuned" directory (relative to the current working directory).
model.save_pretrained("qwen2.5-finetuned")
tokenizer.save_pretrained("qwen2.5-finetuned")

In [None]:
# Colab-only helpers: `drive` mounts Google Drive at /content/drive, and
# `files` is used by the following cells to download archives to the browser.
from google.colab import drive, files
drive.mount('/content/drive')

In [None]:
%cd /content/qwen2.5-finetuned
!zip -r qwen2.5-finetuned.zip /content/qwen2.5-finetuned
files.download('qwen2.5-finetuned.zip')

In [None]:
%cd /content/lora-results
!zip -r lora-results.zip /content/lora-results
files.download('lora-results.zip')