In [1]:
import pandas as pd
import json
from datasets import Dataset

In [2]:
# Load the train and validation tables from the processed dataset directory.
train_df = pd.read_csv('../dataset/processed/train.csv')
val_df = pd.read_csv('../dataset/processed/validation.csv')

# Pin the encoding: without it, open() falls back to the platform's locale
# default, which can mis-decode non-ASCII category names on some systems.
with open('../dataset/processed/label_mapping.json', 'r', encoding='utf-8') as f:
    label_mapping = json.load(f)

# Invert the mapping (value -> key). Inverting a dict silently drops entries
# when two keys share a value, so fail loudly instead of losing a category.
reversed_label_mapping = {v: k for k, v in label_mapping.items()}
assert len(reversed_label_mapping) == len(label_mapping), (
    "label_mapping values are not unique; reversing it would drop categories"
)

In [3]:
def create_prompt(row, label_mapping, reversed_label_mapping):
    """Build one classification prompt and its target label from a record.

    Args:
        row: Record supporting key access (e.g. a pandas Series or a dict)
            with a 'cleaned_text' entry (the tweet text) and a 'label' entry.
        label_mapping: Mapping whose keys are listed in the prompt as the
            candidate categories, in the mapping's iteration order.
        reversed_label_mapping: Mapping used to translate ``row['label']``
            into the value stored under "label" in the result.

    Returns:
        dict with two entries: "prompt" (the instruction text ending in
        "Category:") and "label" (``reversed_label_mapping[row['label']]``).

    Raises:
        KeyError: if ``row['label']`` is not a key of
            ``reversed_label_mapping`` (when it is a plain dict).
    """
    categories = list(label_mapping.keys())

    # pandas reads empty CSV fields back as NaN; formatting that directly
    # would put the literal word "nan" into the prompt as if it were the
    # tweet. Treat missing text as an empty tweet instead.
    text = row['cleaned_text']
    if pd.isna(text):
        text = ""

    # The category list is intentionally rendered with its Python list repr
    # so the prompt text for non-missing tweets is unchanged.
    instruction = (
        f"Classify the following Indonesian tweet into one of these categories: {categories}.\n\n"
        f"Tweet: {text}\n\nCategory:"
    )
    return {"prompt": instruction, "label": reversed_label_mapping[row['label']]}

# Generate one prompt record per row of a dataframe
def generate_prompts(dataframe, label_mapping, reversed_label_mapping):
    """Build a prompt record for every row of ``dataframe``.

    Args:
        dataframe: pandas DataFrame; each row is passed to ``create_prompt``
            as a plain dict keyed by column name.
        label_mapping: Forwarded unchanged to ``create_prompt``.
        reversed_label_mapping: Forwarded unchanged to ``create_prompt``.

    Returns:
        list of the values returned by ``create_prompt``, in row order.
    """
    # to_dict("records") yields one plain dict per row. Unlike iterrows() it
    # does not construct a Series for every row (slow on large frames) and
    # does not coerce the values of a row to a common dtype.
    return [
        create_prompt(record, label_mapping, reversed_label_mapping)
        for record in dataframe.to_dict("records")
    ]

# Create a Hugging Face dataset from a list of prompt records
def create_hf_dataset(prompts):
    """Convert prompt records into a Hugging Face ``Dataset``.

    Args:
        prompts: Records accepted by the ``pd.DataFrame`` constructor
            (here, the list of dicts built by ``generate_prompts``).

    Returns:
        The ``Dataset`` produced by ``Dataset.from_pandas`` on the
        intermediate DataFrame.
    """
    prompt_frame = pd.DataFrame(prompts)
    hf_dataset = Dataset.from_pandas(prompt_frame)
    return hf_dataset

# Build the prompt records for both splits (train first, then validation)
train_prompts, val_prompts = [
    generate_prompts(split_df, label_mapping, reversed_label_mapping)
    for split_df in (train_df, val_df)
]

# Wrap each split's prompt records in a Hugging Face dataset
train_dataset, val_dataset = [
    create_hf_dataset(split_prompts)
    for split_prompts in (train_prompts, val_prompts)
]