In [None]:
!pip install -q transformers datasets scikit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the labelled captions, keep only complete rows whose label is one of the
# three known sentiments, then switch to the column names used downstream.
df = (
    pd.read_csv('/content/LabeledText.csv')
    .dropna(subset=["LABEL", "Caption"])
    .loc[lambda frame: frame["LABEL"].isin(["negative", "neutral", "positive"])]
    .rename(columns={"Caption": "tweet", "LABEL": "label"})
)

In [None]:
# Encode the string sentiment labels as integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Cast the encoded ids to plain ints so the mapping prints as {'negative': 0, ...}
# instead of showing NumPy scalar reprs such as np.int64(0).
print(
    "Label mapping:",
    {cls: int(code) for cls, code in zip(le.classes_, le.transform(le.classes_))},
)
print("Distribusi label:\n", df['label'].value_counts())

Label mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}
Distribusi label:
 label
1    1771
2    1646
0    1452
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet'],
    df['label'],
    random_state=42,
    stratify=df['label'],
    test_size=0.2,
)

In [None]:
import warnings

import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()``); ``labels`` is an indexable sequence of
    class ids with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested example ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... plus the target under the 'labels' key as an int64 tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
from sklearn.preprocessing import label_binarize
from scipy.special import softmax

def compute_metrics(pred):
    """Compute accuracy, weighted F1 and ROC AUC from an evaluation prediction.

    Args:
        pred: Object exposing ``label_ids`` (integer class ids in {0, 1, 2})
            and ``predictions`` (per-class logits of shape (n_samples, 3), or
            a tuple whose first element is that array).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'`` (weighted average) and
        ``'roc_auc'``. ``'roc_auc'`` is ``nan`` (and a ``RuntimeWarning`` is
        emitted) when scikit-learn rejects the inputs, e.g. because a class
        is absent from ``label_ids``.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # NOTE(review): predictions may arrive as a tuple when the model returns
    # extra outputs; the logits are assumed to be its first element.
    if isinstance(logits, tuple):
        logits = logits[0]

    probs = softmax(logits, axis=1)
    preds = np.argmax(probs, axis=1)

    try:
        # With binarized (indicator) targets roc_auc_score scores each class
        # column against the rest and averages the per-class AUCs.
        labels_binarized = label_binarize(labels, classes=[0, 1, 2])
        auc = roc_auc_score(labels_binarized, probs, multi_class='ovr')
    except ValueError as exc:
        # Report the failure instead of silently returning a plausible-looking
        # score; nan cannot be mistaken for a real AUC of 0.0.
        warnings.warn(f"ROC AUC could not be computed: {exc}", RuntimeWarning)
        auc = float('nan')

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'roc_auc': auc
    }

In [None]:
def train_transformer_model(model_name, train_texts, val_texts, train_labels, val_labels):
    """Fine-tune a pretrained checkpoint for 3-class classification and evaluate it.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForSequenceClassification.from_pretrained``.
        train_texts: Iterable of training strings.
        val_texts: Iterable of validation strings.
        train_labels: Integer class ids for ``train_texts``; must expose
            ``.tolist()``.
        val_labels: Integer class ids for ``val_texts``; must expose
            ``.tolist()``.

    Returns:
        The metrics dict returned by ``Trainer.evaluate()`` on the validation
        dataset after training.

    Side effects:
        Writes training output under ``./results_<slug>``, logs under
        ``./logs``, and the fine-tuned model together with its tokenizer under
        ``./saved_models/<slug>``, where ``<slug>`` is ``model_name`` with
        ``/`` replaced by ``_``.
    """
    # Used in directory names; '/' is replaced so the name stays a single
    # path component.
    model_slug = model_name.replace("/", "_")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
    val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

    train_dataset = TweetDataset(train_encodings, train_labels.tolist())
    val_dataset = TweetDataset(val_encodings, val_labels.tolist())

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    training_args = TrainingArguments(
        output_dir='./results_' + model_slug,
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        eval_strategy='epoch',
        logging_strategy='epoch',
        save_strategy='no'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    save_path = f"./saved_models/{model_slug}"
    trainer.save_model(save_path)
    # The Trainer was not given the tokenizer, so save it explicitly; otherwise
    # the directory cannot be reloaded with AutoTokenizer.from_pretrained.
    tokenizer.save_pretrained(save_path)

    eval_result = trainer.evaluate()
    return eval_result

In [None]:
# Checkpoints to compare, keyed by the display name used in the report.
models = {
    "RoBERTa": "roberta-base",
    "BERT": "bert-base-uncased",
    "DistilBERT": "distilbert-base-uncased",
}

# Fine-tune each checkpoint in turn and keep its final validation metrics.
results = {}
for name in models:
    model_id = models[name]
    print(f"\n========== Training {name} ==========\n")
    metrics = train_transformer_model(
        model_id,
        train_texts,
        val_texts,
        train_labels,
        val_labels,
    )
    results[name] = metrics

# Summary report: one section per model, in training order.
print("\n\n==== Hasil Evaluasi Semua Model ====\n")
for model_name in results:
    metric = results[model_name]
    print(f"{model_name}:")
    print(f"  Accuracy : {metric['eval_accuracy']:.4f}")
    print(f"  F1 Score : {metric['eval_f1']:.4f}")
    print(f"  ROC AUC  : {metric['eval_roc_auc']:.4f}\n")






Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.9031,0.731462,0.674538,0.666111,0.861783
2,0.5692,0.622527,0.743326,0.741096,0.899893






Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.8076,0.562659,0.75154,0.744705,0.914576
2,0.4347,0.545074,0.788501,0.787311,0.925816






Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.813,0.581327,0.758727,0.750595,0.908869
2,0.4385,0.543466,0.782341,0.780479,0.923096




==== Hasil Evaluasi Semua Model ====

RoBERTa:
  Accuracy : 0.7433
  F1 Score : 0.7411
  ROC AUC  : 0.8999

BERT:
  Accuracy : 0.7885
  F1 Score : 0.7873
  ROC AUC  : 0.9258

DistilBERT:
  Accuracy : 0.7823
  F1 Score : 0.7805
  ROC AUC  : 0.9231



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
# TF-IDF vectorisation: unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 features.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Learn the vocabulary on the training texts only, then reuse it unchanged
# for the validation texts.
X_train_tfidf = tfidf.fit_transform(train_texts)
X_val_tfidf = tfidf.transform(val_texts)


In [None]:
# Train a one-vs-rest logistic-regression baseline on the TF-IDF features.
# LogisticRegression's `multi_class` argument is deprecated (scikit-learn 1.5)
# and scheduled for removal, so the one-vs-rest scheme is expressed with the
# OneVsRestClassifier wrapper instead of multi_class='ovr'.
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
clf.fit(X_train_tfidf, train_labels)




In [None]:
# Score the TF-IDF + logistic-regression baseline on the validation split.
preds = clf.predict(X_val_tfidf)
probs = clf.predict_proba(X_val_tfidf)

acc = accuracy_score(val_labels, preds)
f1 = f1_score(val_labels, preds, average='weighted')
try:
    auc = roc_auc_score(val_labels, probs, multi_class='ovr')
except ValueError as exc:
    # Report the failure instead of silently printing a plausible-looking
    # score; nan cannot be mistaken for a real AUC of 0.0.
    warnings.warn(f"ROC AUC could not be computed: {exc}", RuntimeWarning)
    auc = float('nan')

print("\nTF-IDF + Logistic Regression Evaluation:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {auc:.4f}")



TF-IDF + Logistic Regression Evaluation:
Accuracy: 0.6869
F1 Score: 0.6878
ROC AUC: 0.8437


In [None]:
# Optional (left commented out): reload a saved checkpoint and re-run evaluation.
# from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# model_path = "./saved_models/roberta-base"  # example
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)  # needs the tokenizer files to be present in model_path

# trainer = Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     # args and datasets: same as in the earlier training run
# )

# metrics = trainer.evaluate()
# print(metrics)
