In [11]:
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [3]:
# Load the processed training and validation CSV files.
train_df = pd.read_csv('../dataset/processed/train.csv')
val_df = pd.read_csv('../dataset/processed/validation.csv')

# Read the label mapping with an explicit encoding so decoding does not
# depend on the platform's default locale.
with open('../dataset/processed/label_mapping.json', 'r', encoding='utf-8') as f:
    label_mapping = json.load(f)

# Invert the mapping: the values of label_mapping become the lookup keys.
reversed_label_mapping = {v: k for k, v in label_mapping.items()}

In [4]:
def create_formatted_prompt(row, reversed_label_mapping):
    """
    Build the full training text for a single example.

    The text consists of three parts separated by blank lines: an instruction
    that lists every category name, the tweet, and the category for this row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the 'cleaned_text' and 'label' entries.
        reversed_label_mapping: Dict keyed by the values found in
            row['label']; its values are the category names.

    Returns:
        str: The assembled prompt, ending with this row's category name.
    """
    # The list is interpolated as-is, so it appears in the prompt in its
    # Python list representation.
    category_names = [name for name in reversed_label_mapping.values()]
    tweet_text = row['cleaned_text']
    target_category = reversed_label_mapping[row['label']]

    instruction_part = f"Classify the following Indonesian tweet into one of these categories: {category_names}.\n\n"
    tweet_part = f"Tweet: {tweet_text}\n\n"
    answer_part = f"Category: {target_category}"
    return instruction_part + tweet_part + answer_part

# Add a 'prompt' column to both frames; the label lookup table is forwarded
# to create_formatted_prompt as an extra positional argument.
for frame in (train_df, val_df):
    frame['prompt'] = frame.apply(
        create_formatted_prompt,
        axis=1,
        args=(reversed_label_mapping,),
    )

# Wrap each DataFrame in a Hugging Face Dataset
train_dataset, val_dataset = map(Dataset.from_pandas, (train_df, val_df))

In [None]:
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'

# 4-bit NF4 quantization settings; computation is done in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load tokenizer
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint repository -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the EOS token for padding and pad on the right side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projection layers only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA adapters to the model
model = get_peft_model(model, lora_config)

print("Model loaded and prepared for fine-tuning successfully!")
# print_trainable_parameters() prints its own summary and returns None, so it
# is called directly; interpolating its result would print
# "Trainable parameters: None".
model.print_trainable_parameters()