# Import Library

In [2]:
# Install NLTK into the running kernel's environment; the import cell below pulls stopwords and PorterStemmer from it
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as numpy
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load Dataset

In [4]:
# Read the fake-news CSV (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='fake_news_dataset.csv')

In [5]:
# Preview the first five rows to check the columns look as expected
df.head(n=5)

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [6]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   title     20000 non-null  str  
 1   text      20000 non-null  str  
 2   date      20000 non-null  str  
 3   source    19000 non-null  str  
 4   author    19000 non-null  str  
 5   category  20000 non-null  str  
 6   label     20000 non-null  str  
dtypes: str(7)
memory usage: 33.8 MB


In [7]:
# Per-column summary; for this all-text frame the rows are count / unique / top / freq
df.describe()

Unnamed: 0,title,text,date,source,author,category,label
count,20000,20000,20000,19000,19000,20000,20000
unique,20000,20000,1096,8,17051,7,2
top,Foreign Democrat final.,more tax development both store agreement lawy...,2023-08-31,Daily News,Michael Smith,Health,fake
freq,1,1,32,2439,12,2922,10056


# Data Preparation & Deep Understanding

## Eksplorasi & Label Encoding

Pertama, kita muat dataset dan gabungkan fitur teksnya. Menggabungkan title dan text seringkali meningkatkan akurasi karena model mendapatkan konteks penuh sejak dari judul.

In [8]:
# Simple feature engineering:
# join the headline and the article body into one string per row so the
# model sees the full context. The literal " [SEP] " placed between the two
# parts is BERT's special separator token.
df['combined_text'] = df['title'].str.cat(df['text'], sep=" [SEP] ")

In [9]:
# Label encoding: 'real' -> 0, 'fake' -> 1
label_to_id = {'real': 0, 'fake': 1}

# Series.map turns every value that is missing from the dict into NaN, so
# running this cell a second time (labels already 0/1) would silently wipe
# the whole column. Only encode while the column still holds something other
# than the target ids, which makes the cell safe to re-run.
if not df['label'].isin(list(label_to_id.values())).all():
    encoded_label = df['label'].map(label_to_id)
    unmapped = list(df.loc[encoded_label.isna(), 'label'].unique())
    if unmapped:
        # Fail loudly instead of carrying NaN labels into the split/training
        raise ValueError(f"Unexpected label values: {unmapped}")
    df['label'] = encoded_label.astype('int64')

print(f"Data ready: {df.shape[0]} baris")
print(df[['combined_text', 'label']].head())

Data ready: 20000 baris
                                       combined_text  label
0  Foreign Democrat final. [SEP] more tax develop...      0
1  To offer down resource great point. [SEP] prob...      1
2  Himself church myself carry. [SEP] them identi...      1
3  You unit its should. [SEP] phone which item ya...      1
4  Billion believe employee summer how. [SEP] won...      1


## Tokenization

BERT tidak membaca kata per kata seperti manusia, melainkan menggunakan Subword Tokenization.

Konsep: WordPiece Tokenization

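Jika ada kata yang tidak ada di vocabulary, misalnya "internship", BERT mungkin memecahnya menjadi intern dan ##ship. Dengan cara ini hampir tidak ada kata yang menjadi "unknown" (OOV - Out of Vocabulary); token [UNK] hanya dipakai jika potongan katanya pun tidak ditemukan di vocabulary.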

Selain memecah kata, Tokenizer BERT menghasilkan tiga hal penting:

1. Input IDs: Representasi angka unik untuk setiap token.

2. Attention Mask: Deretan angka 0 dan 1. Angka 1 berarti itu token asli, 0 berarti itu padding (kosong). Ini memberitahu model: "Hanya perhatikan angka 1, abaikan angka 0".

3. Special Tokens: BERT butuh token [CLS] di awal kalimat untuk klasifikasi dan [SEP] untuk pemisah.

Kita akan menggunakan library transformers.

In [11]:
%pip install transformers

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp313-cp313-win_amd64.whl.metadata (2.4 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting typer-slim (from transformers)
  Downloading typer_slim-0.23.0-py3-none-any.whl.metadata (4.2 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=1.3.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Collecting httpx<1,>=0.23.0 (from huggingface-hub<2.0,>=1.3.0->transformers)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting 


[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that ships with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def preprocess_function(text, max_length: int = 128):
    """Tokenize text with the notebook-level BERT ``tokenizer``.

    Args:
        text: Text to encode. This notebook passes a single string; a list
            of strings presumably works as a batch - verify against the
            tokenizer before relying on it.
        max_length: Length, counted in tokens (not words), that every
            encoded sequence is padded or truncated to. Defaults to 128.

    Returns:
        The tokenizer's output; indexing it with ``'input_ids'`` and
        ``'attention_mask'`` yields PyTorch tensors.
    """
    return tokenizer(
        text,
        padding='max_length',   # pad shorter inputs up to max_length
        truncation=True,        # cut inputs that are longer than max_length
        max_length=max_length,  # token budget; keeps memory use bounded
        return_tensors="pt"     # return PyTorch tensors
    )

Test the function on a sample sentence:

In [14]:
# Show how the tokenizer wrapper behaves on a single short sentence
sample_text = "Breaking news: Mars is green!"
encoded = preprocess_function(sample_text)

# Display the token ids and the padding mask, one per line
print(
    f"Token IDs: {encoded['input_ids']}",
    f"Attention Mask: {encoded['attention_mask']}",
    sep="\n",
)

Token IDs: tensor([[ 101, 4911, 2739, 1024, 7733, 2003, 2665,  999,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0,

## Data Splitting

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': df['label'],
}
X_train, X_test, y_train, y_test = train_test_split(
    df['combined_text'],
    df['label'],
    **split_options,
)

In [19]:
# Class counts in the training split, to check the stratified split kept the labels balanced
y_train.value_counts()

label
1    8045
0    7955
Name: count, dtype: int64

In [20]:
# Class counts in the test split, for comparison with the training split
y_test.value_counts()

label
1    2011
0    1989
Name: count, dtype: int64