In [None]:
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [2]:
# Load the processed train/validation splits (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../dataset/processed/train.csv')
val_df = pd.read_csv('../dataset/processed/validation.csv')

# JSON text is UTF-8; pass the encoding explicitly so the read does not depend on
# the platform's locale default.
with open('../dataset/processed/label_mapping.json', 'r', encoding='utf-8') as f:
    label_mapping = json.load(f)

# Invert the mapping so that a value of label_mapping can be used to look up its key.
reversed_label_mapping = {v: k for k, v in label_mapping.items()}

# Inverting a dict silently drops entries when two keys share a value; fail loudly
# here rather than train against an incomplete category list.
assert len(reversed_label_mapping) == len(label_mapping), (
    "label_mapping.json maps several keys to the same value; it cannot be inverted"
)

In [5]:
def create_formatted_prompt(row, reversed_label_mapping):
    """
    Build the full training text for one example: instruction, tweet and gold label.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            'cleaned_text' and 'label'.
        reversed_label_mapping: Dict whose keys are the values found in 'label'
            and whose values are the category names shown to the model.

    Returns:
        A single string containing the instruction (with the list of all
        categories), the tweet text, and the category of this row.

    Raises:
        KeyError: If row['label'] is not a key of reversed_label_mapping, or if
            the row lacks 'cleaned_text' / 'label'.

    NOTE(review): the returned text includes the answer. Whether the trainer
    computes the loss on the whole text or only on the part after "Category:"
    depends on how the trainer / data collator is configured -- confirm this
    where the trainer is set up.
    """
    # Rendered via the list's repr, e.g. "['a', 'b']"; kept as-is so the prompt
    # format stays unchanged.
    categories = list(reversed_label_mapping.values())

    text = row['cleaned_text']
    # An empty cell in a CSV is read back by pandas as NaN, which an f-string
    # would render as the literal word "nan" inside the tweet. Use an empty
    # string for missing text instead.
    if pd.isna(text):
        text = ''

    return (
        f"Classify the following Indonesian tweet into one of these categories: {categories}.\n\n"
        f"Tweet: {text}\n\n"
        f"Category: {reversed_label_mapping[row['label']]}"
    )

# Add a 'prompt' column to each split holding the full text built by
# create_formatted_prompt; the label mapping is forwarded to it through `args`.
train_df['prompt'] = train_df.apply(
    create_formatted_prompt, axis=1, args=(reversed_label_mapping,)
)
val_df['prompt'] = val_df.apply(
    create_formatted_prompt, axis=1, args=(reversed_label_mapping,)
)

# Wrap both splits as Hugging Face Dataset objects
train_dataset, val_dataset = (
    Dataset.from_pandas(train_df),
    Dataset.from_pandas(val_df),
)