In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt_tab')
nltk.download('stopwords')

!pip install sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from wordcloud import WordCloud
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from imblearn.over_sampling import SMOTE
import pickle

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset from the mounted Google Drive
df = pd.read_excel("/content/drive/MyDrive/News Dataset.xlsx")
# NOTE(review): 'Status.1' is presumably a duplicate of the 'Status' column
# (pandas appends ".1" to repeated header names) — confirm against the spreadsheet.
df = df.drop(columns='Status.1')

# Inspect the first rows
df.head()

Unnamed: 0,Date,Status,Link,Title,Description
0,"June 18, 2024",PENIPUAN,https://turnbackhoax.id/2024/06/18/penipuan-su...,Surat Undangan Rakerkesnas Mengatasnamakan RSCM,Hasil periksa fakta Rahmah. RSUPN Dr. Cipto Ma...
1,"August 28, 2020",ACARA,https://turnbackhoax.id/2020/08/28/acara-lawan...,“Lawan Berita Palsu!”,Facebook Indonesia & Box2Box Indonesia: “Nyama...
2,"August 27, 2020",ACARA,https://turnbackhoax.id/2020/08/27/acara-refle...,“REFLEKSI INFODEMI DI KALA PANDEMI”,"Klinik Misinformasi: Webinar | Rabu, 26 Agustu..."
3,"August 17, 2020",ACARA,https://turnbackhoax.id/2020/08/17/acara-merde...,"“Merdeka dari Hoaks, Apa Bisa?”","Indorelawan, 14 Agustus 2020. Simak di: View t..."
4,"August 16, 2020",ACARA,https://turnbackhoax.id/2020/08/16/acara-gande...,"“Gandeng MAFINDO, Bawaslu Bekali Pemuda tentan...","Surakarta, 13 Agustus 2020, selengkapnya di: V..."


# **Cleaning Data**

**Menangani Missing Value**

In [None]:
# Drop rows that have a missing value in any of the key columns.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells (df['Cleaned_Text'] = ..., etc.) no longer raise
# SettingWithCopyWarning.
df = df.dropna(subset=['Title', 'Description', 'Status']).copy()

df.shape

(16445, 5)

**Normalisasi Label**

**Remove Special Characters, Symbols, and Hashtags**

In [None]:
def clean_text(text):
    """Normalize a raw text string for downstream tokenization.

    Steps, in order: lowercase, remove URLs, remove every character that is
    not an ASCII letter or whitespace, then collapse whitespace runs into a
    single space and trim the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # Lowercase first so the [^a-z\s] filter below keeps all letters
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix is treated as a URL; an unescaped dot also matches a
    # space, which would delete the word that follows a bare "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove non-alphabetic characters (digits, punctuation, symbols)
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse repeated whitespace and trim leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply the cleaner to the 'Description' column; the result is stored in a
# new 'Cleaned_Text' column.
df['Cleaned_Text'] = df['Description'].apply(clean_text)

print("Contoh teks setelah membersihkan karakter khusus:")
df['Cleaned_Text']

Contoh teks setelah membersihkan karakter khusus:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cleaned_Text'] = df['Description'].apply(clean_text)


Unnamed: 0,Cleaned_Text
0,hasil periksa fakta rahmah rsupn dr cipto mang...
1,facebook indonesia boxbox indonesia nyaman di ...
2,klinik misinformasi webinar rabu agustus wib s...
3,indorelawan agustus simak di view this post on...
4,surakarta agustus selengkapnya di view this po...
...,...
16454,mafindo pernah men debunk kompascom karena kom...
16455,update dari artikel berikut setelah menelusuri...
16456,terdapat informasi terbaru dari artikel periks...
16457,direktur jenderal pencemaran dan kerusakan lin...


**Tokenisasi**

In [None]:
# Tokenize each cleaned text into a list of word tokens using NLTK
df['Tokenized'] = df['Cleaned_Text'].apply(word_tokenize)

print("Contoh teks setelah tokenisasi:")
print(df['Tokenized'].head(5))

Contoh teks setelah tokenisasi:
0    [hasil, periksa, fakta, rahmah, rsupn, dr, cip...
1    [facebook, indonesia, boxbox, indonesia, nyama...
2    [klinik, misinformasi, webinar, rabu, agustus,...
3    [indorelawan, agustus, simak, di, view, this, ...
4    [surakarta, agustus, selengkapnya, di, view, t...
Name: Tokenized, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Tokenized'] = df['Cleaned_Text'].apply(word_tokenize)


**Stopword removal**

In [None]:
# Load NLTK's Indonesian stopword list into a set
stop_words = set(stopwords.words('indonesian'))

def remove_stopwords(text):
    """Return the tokens of `text` that are not in `stop_words`.

    Args:
        text: An iterable of word tokens (despite the name, not a raw string).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept_tokens = []
    for token in text:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Remove stopwords from the token lists in the 'Tokenized' column
df['No_Stopwords'] = df['Tokenized'].apply(remove_stopwords)

print("Contoh teks setelah menghapus stopwords:")
df['No_Stopwords'].head(5)

Contoh teks setelah menghapus stopwords:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['No_Stopwords'] = df['Tokenized'].apply(remove_stopwords)


Unnamed: 0,No_Stopwords
0,"[hasil, periksa, fakta, rahmah, rsupn, dr, cip..."
1,"[facebook, indonesia, boxbox, indonesia, nyama..."
2,"[klinik, misinformasi, webinar, rabu, agustus,..."
3,"[indorelawan, agustus, simak, view, this, post..."
4,"[surakarta, agustus, selengkapnya, view, this,..."


**Lematisasi**

In [None]:
# Install NLTK if it is not already present
# NOTE(review): nltk is already imported in the first cell, so this install
# looks redundant — confirm before removing.
!pip install nltk

# Import NLTK and the lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download the required resources if they are not already present
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer
# NOTE(review): WordNet is an English lexical database while the corpus is
# Indonesian, so most tokens presumably pass through unchanged — confirm
# whether the Sastrawi stemmer imported in the first cell was intended here.
lemmatizer = WordNetLemmatizer()

# Lemmatization helper
def lemmatize_words(text):
    """Lemmatize every token in `text` with the module-level `lemmatizer`.

    Args:
        text: An iterable of word tokens.

    Returns:
        A list with one lemmatized token per input token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))

# Apply lemmatization to the stopword-free token lists
df['Lemmatized'] = df['No_Stopwords'].apply(lemmatize_words)

# Join the lemmatized tokens back into one space-separated string per row
df['Processed_Text'] = df['Lemmatized'].apply(lambda x: ' '.join(x))

print("Contoh teks setelah lemmatization:")
df['Processed_Text'].head(5)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Contoh teks setelah lemmatization:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lemmatized'] = df['No_Stopwords'].apply(lemmatize_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Processed_Text'] = df['Lemmatized'].apply(lambda x: ' '.join(x))


Unnamed: 0,Processed_Text
0,hasil periksa fakta rahmah rsupn dr cipto mang...
1,facebook indonesia boxbox indonesia nyaman sos...
2,klinik misinformasi webinar rabu agustus wib s...
3,indorelawan agustus simak view this post on in...
4,surakarta agustus selengkapnya view this post ...


**Pengelompokan Label**

In [None]:
# Mapping from raw 'Status' labels to a validity category
status_mapping = {
    # High validity
    'ACARA': 'validitas tinggi',
    'Admin Post': 'validitas tinggi',
    'BENAR': 'validitas tinggi',
    'Benar': 'validitas tinggi',
    'BERITA': 'validitas tinggi',
    'BERITA, EDUKASI': 'validitas tinggi',
    'Cek Fakta': 'validitas tinggi',
    'CekFakta': 'validitas tinggi',
    'KLARIFIKASI': 'validitas tinggi',
    'Klarifikasi': 'validitas tinggi',
    'KLARIFIKASI, EDUKASI': 'validitas tinggi',
    'EDUKASI': 'validitas tinggi',
    'EDUKASI, BERITA': 'validitas tinggi',
    'INFORMASI': 'validitas tinggi',
    'INFORMASI & EDUKASI': 'validitas tinggi',
    'INFORMASI, EDUKASI': 'validitas tinggi',
    'FAKTA': 'validitas tinggi',
    'KOREKSI': 'validitas tinggi',
    'RILIS PERS': 'validitas tinggi',
    'Siaran Pers': 'validitas tinggi',
    'UPDATE': 'validitas tinggi',
    'EVENT': 'validitas tinggi',
    'INFO': 'validitas tinggi',

    # Medium validity
    'BELUM TERBUKTI': 'validitas sedang',
    'Belum Terbukti': 'validitas sedang',
    'ISU': 'validitas sedang',
    'Isu': 'validitas sedang',
    'DOKUMENTASI': 'validitas sedang',
    'PARODI': 'validitas sedang',
    'SATIRE': 'validitas sedang',

    # Low validity
    'PENIPUAN': 'validitas rendah',
    'DISINFORMASI': 'validitas rendah',
    'Disinformasi': 'validitas rendah',
    'DISINFORMASI & HASUT': 'validitas rendah',
    'DISINFORMASI + HASUT': 'validitas rendah',
    'DISINFORMASI, HASUT': 'validitas rendah',
    'DISINFORMASI/MISINFORMASI': 'validitas rendah',
    'DISINFORMASI+FITNAH': 'validitas rendah',
    'DISINFORMASI+FRAMING': 'validitas rendah',
    'DISINFORMASI+HASUT': 'validitas rendah',
    'EDUKASI, HOAX': 'validitas rendah',
    'EDUKASI,HOAX': 'validitas rendah',
    'SALAH': 'validitas rendah',
    # Trailing-space variant kept as-is; lookups strip their input, so the
    # 'SALAH' entry above is the one that actually matches.
    'SALAH ': 'validitas rendah',
    'FITNAH': 'validitas rendah',
    'FITNAH / HASUT': 'validitas rendah',
    'FITNAH+HASUT': 'validitas rendah',
    'FITNAH+HOAX': 'validitas rendah',
    'Foto Gus Mus': 'validitas rendah',
    'FRAMING': 'validitas rendah',
    'HASUT': 'validitas rendah',
    'HASUT+FITNAH': 'validitas rendah',
    'HOAKS': 'validitas rendah',
    'HOAX': 'validitas rendah',
    'Hoax': 'validitas rendah',
    'HOAX + HASUT': 'validitas rendah',
    'HOAX + LOGICAL FALLACY': 'validitas rendah',
    'HOAX KILLS': 'validitas rendah',
    'HOAX, HASUT': 'validitas rendah',
    'HOAX, HASUT, & PROVOKASI': 'validitas rendah',
    'HOAX/FITNAH': 'validitas rendah',
    'HOAX/HASUT': 'validitas rendah',
    'HOAX+FITNAH': 'validitas rendah',
    'HOAX+HASUT': 'validitas rendah',
    'MISINFORMASI': 'validitas rendah',
    'Misinformasi': 'validitas rendah',
    'SCAM': 'validitas rendah',
    'Campuran; Disinformasi, Hasut, & Fakta': 'validitas rendah',
}

# Case-insensitive view of status_mapping, built once when this cell runs.
# Keys that differ only by case or surrounding whitespace map to the same
# category in status_mapping, so folding them loses no information.
# Re-run this cell after editing status_mapping to refresh the index.
_status_mapping_folded = {
    key.strip().casefold(): value for key, value in status_mapping.items()
}

# Categorization function
def categorize_status_exact(status):
    """Map a raw Status label to its validity category.

    The label is stripped of surrounding whitespace and looked up exactly in
    `status_mapping`. If that fails, a case-insensitive lookup is tried so
    that casing variants not listed explicitly (e.g. 'hoax', 'Salah') are
    still categorized instead of silently falling through.

    Args:
        status: The raw label. Non-string values (e.g. NaN) are accepted.

    Returns:
        The mapped category string, or 'tidak terklasifikasi' when the label
        is not a string or has no match.
    """
    if not isinstance(status, str):
        # Avoid AttributeError on .strip() for NaN / numeric cells
        return 'tidak terklasifikasi'
    cleaned = status.strip()
    if cleaned in status_mapping:
        return status_mapping[cleaned]
    return _status_mapping_folded.get(cleaned.casefold(), 'tidak terklasifikasi')

# Apply the mapping to the 'Status' column
df['Validitas'] = df['Status'].apply(categorize_status_exact)

# Show the resulting category distribution
print("Distribusi Validitas:")
df['Validitas'].value_counts()

Distribusi Validitas:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Validitas'] = df['Status'].apply(categorize_status_exact)


Unnamed: 0_level_0,count
Validitas,Unnamed: 1_level_1
validitas rendah,15291
validitas tinggi,1097
validitas sedang,57


**Menampilkan Validitas**

In [None]:
# Main validity labels to display
validitas_labels = ['validitas tinggi', 'validitas sedang', 'validitas rendah']

# Helper that prints sample rows for one validity category
def lihat_isi_validitas(validitas_label, data=None, n_rows=10):
    """Print a preview of the rows that belong to one validity category.

    Prints a header with the upper-cased label, the first `n_rows` rows of
    the 'Title' and 'Status' columns, and the total number of matching rows.

    Args:
        validitas_label: Category value to match in the 'Validitas' column.
        data: DataFrame to inspect. Defaults to the notebook-level `df`, so
            existing one-argument calls behave as before.
        n_rows: Number of preview rows to print (default 10).

    Returns:
        None; the output is printed.
    """
    # Fall back to the global frame only when no frame is passed explicitly
    frame = df if data is None else data
    subset = frame[frame['Validitas'] == validitas_label]
    print(f"\n=== {validitas_label.upper()} ===")
    print(subset[['Title', 'Status']].head(n_rows))  # Show the top n_rows rows
    print(f"Jumlah data: {len(subset)}")

# Print the preview for each validity category
for label in validitas_labels:
    lihat_isi_validitas(label)


=== VALIDITAS TINGGI ===
                                                Title Status
1                               “Lawan Berita Palsu!”  ACARA
2                 “REFLEKSI INFODEMI DI KALA PANDEMI”  ACARA
3                     “Merdeka dari Hoaks, Apa Bisa?”  ACARA
4   “Gandeng MAFINDO, Bawaslu Bekali Pemuda tentan...  ACARA
5            Campaign “Jaga Jogja” MAFINDO Yogyakarta  ACARA
6   Unlimited Media Partners Unlimited Talks “Perl...  ACARA
7                         Sekolah.mu “Fakta vs Hoaks”  ACARA
8   Muhammadiyah COVID-19 Command Center “Literasi...  ACARA
9   MAFINDO Malang Bincang Online “Cek Fakta Seput...  ACARA
10  Kenari Djaja Bincang Kenari 5 “Turn Back Hoax ...  ACARA
Jumlah data: 1097

=== VALIDITAS SEDANG ===
                                                Title          Status
19          Penemuan Makam Nabi di Tembok Besar China  BELUM TERBUKTI
20   Keluarga Jokowi Terlibat Kasus Korupsi Pertamina  BELUM TERBUKTI
21  Zat Klorin hingga BPA dalam Pembalut Picu Gan