In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [2]:
# Read the pre-split train / validation / test CSVs, in that order.
split_paths = (
    '../dataset/processed/train.csv',
    '../dataset/processed/validation.csv',
    '../dataset/processed/test.csv',
)
train_df, val_df, test_df = (pd.read_csv(csv_path) for csv_path in split_paths)

In [3]:
# Report the (rows, columns) shape of every split, then preview the training rows.
for split_name, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_name} shape:", split_frame.shape)

print("\nTrain DataFrame head:")
print(train_df.head())

Train shape: (3500, 2)
Validation shape: (750, 2)
Test shape: (750, 2)

Train DataFrame head:
                                        cleaned_text  label
0  warga kp bayam itu yang lahannya dipakai buat ...      5
1  team pesona bobby kertanegara mulai turun tang...      5
2  kapolri jenderal listyo sigit prabowo menghadi...      6
3  ini lah hasil dari kinerja pak prabowo selama ...      5
4  puluhan ribu masyarakat menyambut kedatangan a...      5


In [4]:
# Pretrained checkpoint to fine-tune and the number of target classes.
model_name = "indobenchmark/indobert-large-p2"
num_labels = 8

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model built from the same checkpoint; the classifier
# head is sized by `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Convert pandas DataFrames to Hugging Face Datasets and tokenize

# Drop rows with missing cleaned_text (the tokenizer cannot handle NaN entries)
train_df_clean = train_df.dropna(subset=['cleaned_text'])
val_df_clean = val_df.dropna(subset=['cleaned_text'])
test_df_clean = test_df.dropna(subset=['cleaned_text'])

# Columns exposed as torch tensors when the datasets are indexed
columns = ['input_ids', 'attention_mask', 'label']


def tokenize_function(batch):
    """Tokenize the ``cleaned_text`` entries of a batch.

    Each text is truncated and padded to exactly 256 tokens, so every
    example ends up with the same sequence length.

    Args:
        batch: Mapping with a ``'cleaned_text'`` key holding the texts to
            tokenize (this is what ``Dataset.map(..., batched=True)`` passes).

    Returns:
        The output of the module-level ``tokenizer`` for the batch.
    """
    return tokenizer(batch['cleaned_text'], padding='max_length', truncation=True, max_length=256)


def _to_tokenized_dataset(df_clean, torch_columns):
    """Turn a cleaned DataFrame into a tokenized, torch-formatted Dataset.

    Args:
        df_clean: DataFrame with ``cleaned_text`` and ``label`` columns.
        torch_columns: Column names to expose as torch tensors.

    Returns:
        A ``datasets.Dataset`` with tokenizer outputs added and its format
        set to ``'torch'`` for ``torch_columns``.
    """
    # preserve_index=False: after dropna the index is no longer a contiguous
    # RangeIndex, and from_pandas would otherwise store it as a stray
    # `__index_level_0__` column in the dataset.
    dataset = Dataset.from_pandas(df_clean[['cleaned_text', 'label']], preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=torch_columns)
    return dataset


# Build the three splits with one shared pipeline
train_dataset = _to_tokenized_dataset(train_df_clean, columns)
val_dataset = _to_tokenized_dataset(val_df_clean, columns)
test_dataset = _to_tokenized_dataset(test_df_clean, columns)

Map:   0%|          | 0/3494 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]