# Import Library

In [1]:
import torch
import pandas as pd
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
# Load the fake-news dataset from a CSV file located in the working directory.
df = pd.read_csv('fake_news_dataset.csv')

In [3]:
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [4]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   title     20000 non-null  str  
 1   text      20000 non-null  str  
 2   date      20000 non-null  str  
 3   source    19000 non-null  str  
 4   author    19000 non-null  str  
 5   category  20000 non-null  str  
 6   label     20000 non-null  str  
dtypes: str(7)
memory usage: 33.8 MB


In [5]:
# Per-column summary statistics (count / unique / top / freq for string columns).
df.describe()

Unnamed: 0,title,text,date,source,author,category,label
count,20000,20000,20000,19000,19000,20000,20000
unique,20000,20000,1096,8,17051,7,2
top,Foreign Democrat final.,more tax development both store agreement lawy...,2023-08-31,Daily News,Michael Smith,Health,fake
freq,1,1,32,2439,12,2922,10056


# Data Preparation & Deep Understanding

## Eksplorasi & Label Encoding

Pertama, kita muat dataset dan gabungkan fitur teksnya. Menggabungkan title dan text seringkali meningkatkan akurasi karena model mendapatkan konteks penuh sejak dari judul.

In [3]:
# Simple feature engineering: join the headline and the article body into one
# string so the model sees both. "[SEP]" is BERT's special separator token and
# marks the boundary between the two parts.
df['combined_text'] = df['title'].str.cat(df['text'], sep=" [SEP] ")

In [4]:
# Label encoding: convert the string labels into integer class ids
# (real -> 0, fake -> 1).
label_map = {'real': 0, 'fake': 1}
encoded_labels = df['label'].map(label_map)

# Series.map() returns NaN for any value that is not a key of the mapping
# (e.g. a missing label or a different spelling). Fail loudly here instead of
# letting NaN reach the tensor conversion later on.
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {bad_values}")

# With no NaN left the column can be stored with an explicit integer dtype.
df['label_idx'] = encoded_labels.astype('int64')

print(f"Data ready: {df.shape[0]} baris")
print(df[['combined_text', 'label_idx']].head())

Data ready: 20000 baris
                                       combined_text  label_idx
0  Foreign Democrat final. [SEP] more tax develop...          0
1  To offer down resource great point. [SEP] prob...          1
2  Himself church myself carry. [SEP] them identi...          1
3  You unit its should. [SEP] phone which item ya...          1
4  Billion believe employee summer how. [SEP] won...          1


# Configuration & Setup

In [5]:
# Hyperparameters and runtime configuration shared by the cells below.
MAX_LEN = 128          # tokens per example; longer inputs are truncated, shorter ones padded
BATCH_SIZE = 16        # examples per DataLoader batch
EPOCHS = 3             # full passes over the training set
LEARNING_RATE = 2e-5   # AdamW step size
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the random number generators so that shuffling and the randomly
# initialised classification head are repeatable between runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## Tokenization

BERT tidak membaca kata per kata seperti manusia, melainkan menggunakan Subword Tokenization.

Konsep: WordPiece Tokenization

Jika ada kata yang tidak terdapat di kosakata, misalnya "internship", BERT dapat memecahnya menjadi potongan yang dikenal seperti intern dan ##ship. Dengan cara ini hampir tidak ada kata yang menjadi "unknown" (OOV - Out of Vocabulary).

Selain memecah kata, Tokenizer BERT menghasilkan tiga hal penting:

1. Input IDs: Representasi angka unik untuk setiap token.

2. Attention Mask: Deretan angka 0 dan 1. Angka 1 berarti token asli, 0 berarti padding (kosong). Ini memberitahu model: "Hanya perhatikan posisi bernilai 1, abaikan posisi bernilai 0".

3. Special Tokens: BERT butuh token [CLS] di awal kalimat untuk klasifikasi dan [SEP] untuk pemisah.

Kita akan menggunakan library transformers.

In [6]:
from transformers import BertTokenizer

# Load the tokenizer that was pretrained by Google for the
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
def preprocess_function(text, max_length=None):
    """Tokenize ``text`` with the notebook-level BERT ``tokenizer``.

    Args:
        text: The input to encode; passed to the tokenizer unchanged.
        max_length: Number of tokens each sequence is padded or truncated to.
            When omitted, the notebook-wide ``MAX_LEN`` is used so this helper
            stays in sync with the rest of the pipeline.

    Returns:
        The tokenizer output holding PyTorch tensors (``return_tensors="pt"``).
    """
    if max_length is None:
        max_length = MAX_LEN
    return tokenizer(
        text,
        padding='max_length',   # pad every sequence up to max_length tokens
        truncation=True,        # cut off anything beyond max_length tokens
        max_length=max_length,  # caps sequence length to keep memory use bounded
        return_tensors="pt",    # return PyTorch tensors
    )

Uji fungsi di atas pada satu kalimat contoh:

In [8]:
# Demonstrate the tokenizer on a single sentence.
sample_text = "Breaking news: Mars is green!"
encoded = preprocess_function(sample_text)

# One print call, two lines of output: the token ids and the attention mask.
print(
    f"Token IDs: {encoded['input_ids']}",
    f"Attention Mask: {encoded['attention_mask']}",
    sep="\n",
)

Token IDs: tensor([[ 101, 4911, 2739, 1024, 7733, 2003, 2665,  999,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0,

## Data Splitting

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Stratifying on the label keeps the
# real/fake ratio the same in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['combined_text'], df['label_idx'],
    stratify=df['label_idx'],
    random_state=42,
    test_size=0.2,
)

In [10]:
# Class distribution of the training labels.
y_train.value_counts()

label_idx
1    8045
0    7955
Name: count, dtype: int64

In [11]:
# Class distribution of the test labels.
y_test.value_counts()

label_idx
1    2011
0    1989
Name: count, dtype: int64

# Membuat Custom Dataset & DataLoader

Konsep: PyTorch Dataset
Bayangkan Dataset sebagai sebuah gudang yang menyimpan data, dan DataLoader sebagai kurir yang mengantarkan data dalam porsi kecil (Batch) ke model.

Batch Size: Kita akan mengirim data, misalnya 16 atau 32 baris sekali jalan.

Iterasi: Model akan belajar dari batch demi batch sampai seluruh data habis (1 Epoch).

Kita akan membuat kelas Python untuk membungkus data agar sesuai dengan standar PyTorch dan Hugging Face.

In [12]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: The texts to classify. May be a pandas Series/Index or any
            other sequence (list, tuple, numpy array).
        labels: Integer class ids, in the same order and of the same length
            as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **kwargs)``; its
            result must provide ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Number of tokens each example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = self._to_array(texts)
        self.labels = self._to_array(labels)
        # A length mismatch would otherwise pair texts with the wrong labels
        # or surface much later as an IndexError inside a DataLoader worker.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _to_array(values):
        """Return ``values`` as a positionally indexable numpy array."""
        # pandas objects keep their own conversion so their original index
        # labels are dropped; everything else goes through numpy.
        if hasattr(values, 'to_numpy'):
            return values.to_numpy()
        return np.asarray(values)

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return its tensors and label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` flattened to
            1-D tensors, and ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            str(self.texts[item]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer is asked for 'pt' tensors; flatten drops the
            # leading batch dimension so the DataLoader can stack examples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }

# Mendefinisikan Arsitektur Model (Fine-Tuning BERT)

Menambahkan satu lapisan terakhir (Classification Head) yang tugasnya cuma satu: menentukan apakah fitur yang ditangkap BERT itu mengarah ke "Fake" atau "Real".

In [13]:
# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both partitions in the custom dataset (train first, then test).
train_ds, test_ds = (
    FakeNewsDataset(texts, labels, tokenizer, MAX_LEN)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)

# Pretrained BERT encoder plus a two-class classification head, moved to the
# selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
).to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 346.47it/s, Materializing param=bert.pooler.dense.weight]                               
[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- M

AdamW adalah optimizer yang umum digunakan untuk fine-tuning BERT. Dua hyperparameter penting yang perlu diperhatikan:

- Learning Rate (LR): Kecepatan belajar. Kita gunakan nilai yang sangat kecil (misal: 2e-5) karena kita tidak ingin merusak pengetahuan bahasa yang sudah dimiliki BERT, kita hanya ingin memolesnya sedikit.

- Epochs: Berapa kali model akan melihat seluruh dataset. Untuk BERT, biasanya 3-4 kali sudah cukup.

# Training & Evaluation

In [14]:

# Linearly decay the learning rate from LEARNING_RATE to 0 over every
# optimisation step of the run. The scheduler helper is imported at the top of
# the notebook; without it the learning rate would stay constant.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1} / {EPOCHS} ---")

    # --- Training phase ---
    model.train()
    total_train_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # advance the schedule once per optimiser step

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # --- Evaluation phase ---
    model.eval()
    predictions, true_labels = [], []

    # No gradients are needed anywhere in evaluation, so disable them for the
    # whole loop rather than per forward pass.
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are never moved
            # to the device.
            true_labels.extend(batch['labels'].tolist())

    print("\nHasil Evaluasi:")
    print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))


--- Epoch 1 / 3 ---


Training:   9%|▊         | 86/1000 [11:26<2:03:00,  8.08s/it]

# SIMPAN MODEL

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory with from_pretrained().
output_dir = './model_save_hoaks/'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\nModel sukses disimpan di {output_dir}")