In [1]:
import os
import pandas as pd
import torch
import numpy as np

import re, string, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('berita_scraped.csv')
data.head()

Unnamed: 0,Teks,Media,Label,Link,Teks_Artikel
0,Sepakbola,Detik.com,Liga Inggris,https://sport.detik.com/sepakbola/liga-inggris...,Daftar IsiKlasemen Liga InggrisJadwal Liga Ing...
1,Sepakbola,Detik.com,Liga Inggris,https://sport.detik.com/sepakbola/liga-inggris...,Liverpool-ManajerLiverpoolArne Slottanpa ragu ...
2,Sepakbola,Detik.com,Liga Inggris,https://sport.detik.com/sepakbola/liga-inggris...,Manchester-PenampilanManchester Unitedmasih na...
3,20Detik,Detik.com,Liga Inggris,https://20.detik.com/detikupdate/20250106-2501...,Pelatih Liverpool memuji permainan Manchester ...
4,Sepakbola,Detik.com,Liga Inggris,https://sport.detik.com/sepakbola/liga-inggris...,London-Para pemain topArsenalseperti Bukayo Sa...


In [4]:
data.groupby('Label')['Label'].count()

Label
Liga Indonesia             23
Liga Inggris               22
Liga Italia                22
Liga Spanyol               22
Olahraga Non Sepak Bola    24
Name: Label, dtype: int64

In [5]:
data.isna().sum()

Teks            0
Media           0
Label           0
Link            0
Teks_Artikel    0
dtype: int64

In [3]:
def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    return text

def tokenize(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    #Khusus stopword dalam bahasa Indonesia
    stop_words = set(stopwords.words('indonesian'))
    return [word for word in tokens if word not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

df = data[['Label', 'Teks_Artikel']]
df.rename(columns={'Teks_Artikel': 'Teks'}, inplace=True)
df['cleaned_text'] = df['Teks'].apply(clean_text)
df['tokens'] = df['cleaned_text'].apply(tokenize)
df['tokens'] = df['tokens'].apply(remove_stopwords)
df['text'] = df['tokens'].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Teks_Artikel': 'Teks'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['Teks'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['cleaned_text'].apply(tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [4]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"],
    df["Label"],
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

In [5]:
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(texts, labels, tokenizer, max_len=128):
    # Convert all texts to strings
    if hasattr(texts, "astype"):
        texts = texts.astype(str)
    else:
        texts = [str(t) for t in texts]

    # Tokenize
    tokens = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )

    # Convert labels to tensor
    if hasattr(labels, "values"):
        labels = labels.values
    labels = torch.tensor(labels, dtype=torch.long)

    return tokens, labels

In [6]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


In [11]:
unique_labels_1stage = df["Label"].unique()
label_mapping_1stage = {label: idx for idx, label in enumerate(unique_labels_1stage)}
print("Label mapping (1-stage):", label_mapping_1stage)

train_labels_1stage = train_labels.map(label_mapping_1stage)
test_labels_1stage = test_labels.map(label_mapping_1stage)

tokens_train_1stage, labels_train_1stage = tokenize_data(
    train_texts, train_labels_1stage, tokenizer
)
tokens_test_1stage, labels_test_1stage = tokenize_data(
    test_texts, test_labels_1stage, tokenizer
)

train_dataset_1stage = CustomDataset(tokens_train_1stage, labels_train_1stage)
test_dataset_1stage = CustomDataset(tokens_test_1stage, labels_test_1stage)

model_1stage = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping_1stage)
)


training_args_1stage = TrainingArguments(
    output_dir="./results_1stage",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_1stage",
    logging_steps=10,
    load_best_model_at_end=True,
    no_cuda=True, 
)

trainer_1stage = Trainer(
    model=model_1stage,
    args=training_args_1stage,
    train_dataset=train_dataset_1stage,
    eval_dataset=test_dataset_1stage,
    compute_metrics=compute_metrics
)

Label mapping (1-stage): {'Liga Inggris': 0, 'Liga Italia': 1, 'Liga Spanyol': 2, 'Liga Indonesia': 3, 'Olahraga Non Sepak Bola': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print("====== Fine-Tuning Classification with One Stage Model ======")
trainer_1stage.train()

print("====== Evaluating Classification with One Stage Model ======")
predictions_1stage = trainer_1stage.predict(test_dataset_1stage)
y_pred_1stage = np.argmax(predictions_1stage.predictions, axis=-1)
print("Classification Report (1-stage model):\n")
print(classification_report(labels_test_1stage, y_pred_1stage))



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.629014,0.173913
2,1.630500,1.620166,0.173913
3,1.630500,1.616298,0.173913




Classification Report (1-stage model):

              precision    recall  f1-score   support

           0       0.17      1.00      0.30         4
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         5

    accuracy                           0.17        23
   macro avg       0.03      0.20      0.06        23
weighted avg       0.03      0.17      0.05        23



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 2 stage model

In [9]:
def map_to_binary_labels(label):
    # 0 -> "Olahraga Non Sepak Bola", 1 -> otherwise
    return 0 if label == "Olahraga Non Sepak Bola" else 1

binary_train_labels = train_labels.map(map_to_binary_labels)
binary_test_labels = test_labels.map(map_to_binary_labels)

tokens_train_binary, labels_train_binary = tokenize_data(
    train_texts, binary_train_labels, tokenizer
)
tokens_test_binary, labels_test_binary = tokenize_data(
    test_texts, binary_test_labels, tokenizer
)

train_dataset_binary = CustomDataset(tokens_train_binary, labels_train_binary)
test_dataset_binary = CustomDataset(tokens_test_binary, labels_test_binary)

model_binary = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

training_args_binary = TrainingArguments(
    output_dir="./results_binary",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_binary",
    logging_steps=10,
    load_best_model_at_end=True,
    no_cuda=True,
)

trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=train_dataset_binary,
    eval_dataset=test_dataset_binary,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
print("====== Fine-Tuning Binary Model First Stage ======")
trainer_binary.train()

print("====== Evaluating Binary Model First Stage ======")
predictions_binary = trainer_binary.predict(test_dataset_binary)
y_pred_binary = np.argmax(predictions_binary.predictions, axis=-1)
print("Classification Report (Binary model):\n")
print(classification_report(labels_test_binary, y_pred_binary))



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.556959,0.782609
2,0.547900,0.52099,0.782609
3,0.547900,0.511324,0.782609




Classification Report (Binary model):

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.78      1.00      0.88        18

    accuracy                           0.78        23
   macro avg       0.39      0.50      0.44        23
weighted avg       0.61      0.78      0.69        23



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
train_texts_stage2 = train_texts[binary_train_labels == 1]
train_labels_stage2 = train_labels[binary_train_labels == 1]
test_texts_stage2 = test_texts[binary_test_labels == 1]
test_labels_stage2 = test_labels[binary_test_labels == 1]

print(f"\nTrain Stage 2 Dataset Size: {len(train_texts_stage2)}")
print(f"Test Stage 2 Dataset Size: {len(test_texts_stage2)}")

league_labels = {
    "Liga Inggris": 0,
    "Liga Indonesia": 1,
    "Liga Spanyol": 2,
    "Liga Italia": 3
}


Train Stage 2 Dataset Size: 71
Test Stage 2 Dataset Size: 18


In [11]:
train_labels_stage2_mapped = train_labels_stage2.map(league_labels)
test_labels_stage2_mapped = test_labels_stage2.map(league_labels)

tokens_train_2stage, labels_train_2stage = tokenize_data(
    train_texts_stage2, train_labels_stage2_mapped, tokenizer
)
tokens_test_2stage, labels_test_2stage = tokenize_data(
    test_texts_stage2, test_labels_stage2_mapped, tokenizer
)

train_dataset_2stage = CustomDataset(tokens_train_2stage, labels_train_2stage)
test_dataset_2stage = CustomDataset(tokens_test_2stage, labels_test_2stage)

model_2stage = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4
)

training_args_2stage = TrainingArguments(
    output_dir="./results_2stage",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_2stage",
    logging_steps=10,
    load_best_model_at_end=True,
    no_cuda=True,  # <--- force CPU
    # seed=42
)

trainer_2stage = Trainer(
    model=model_2stage,
    args=training_args_2stage,
    train_dataset=train_dataset_2stage,
    eval_dataset=test_dataset_2stage,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print("====== Fine-Tuning 4-League Model Second Stage ======")
trainer_2stage.train()

print("====== Evaluating 4-League Model Second Stage ======")
predictions_2stage = trainer_2stage.predict(test_dataset_2stage)
y_pred_2stage = np.argmax(predictions_2stage.predictions, axis=-1)
print("Classification Report (2-stage, league model):\n")
print(classification_report(labels_test_2stage, y_pred_2stage))




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.383295,0.222222
2,1.390600,1.382902,0.222222
3,1.390600,1.380077,0.611111




Classification Report (2-stage, league model):

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         4
           1       0.80      0.80      0.80         5
           2       1.00      0.20      0.33         5
           3       0.50      0.50      0.50         4

    accuracy                           0.61        18
   macro avg       0.70      0.62      0.57        18
weighted avg       0.72      0.61      0.57        18

