Scraping Berita dari Platform Berita CNBC dengan Tema IKN

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# =====================================
# Header untuk request (biar tidak diblokir)
# =====================================
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
}

# =====================================
# Fungsi untuk ambil isi artikel
# =====================================
def get_article_content(link):
    try:
        res = requests.get(link, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'lxml')

        # Cari konten utama artikel
        div_content = soup.find('div', class_='detail-text')
        if not div_content:
            return ''

        paragraphs = div_content.find_all('p')
        # Gabungkan isi paragraf dengan pemisah baris
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs])
        return content
    except Exception as e:
        print(f"⚠️ Error ambil artikel: {link} | {e}")
        return ''

# =====================================
# Step 1: Ambil halaman pertama untuk cari total halaman
# =====================================
url_first = 'https://www.cnbcindonesia.com/search?query=ikn&page=1&fromdate=2024/01/01&todate=2025/09/30'
res_first = requests.get(url_first, headers=headers)
soup_first = BeautifulSoup(res_first.text, 'lxml')

# Cari angka halaman terbesar
page_numbers = []
for a in soup_first.find_all('a'):
    try:
        num = int(a.text.strip())
        page_numbers.append(num)
    except:
        continue

last_page = max(page_numbers) if page_numbers else 1
print(f"🔎 Total halaman ditemukan: {last_page}")

# =====================================
# Step 2: Simpan hasil ke CSV
# =====================================
max_articles = 1000
count = 0

with open('cnbc_ikn_full_article.csv', 'w', newline='', encoding='utf-8') as f:
    # Gunakan titik koma agar Excel bisa baca kolom dengan benar
    writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Title', 'Link', 'Time', 'Content'])

    # Loop tiap halaman
    for page in range(1, last_page + 1):
        if count >= max_articles:
            break

        print(f"\n📄 Scraping halaman {page}...")
        url = f'https://www.cnbcindonesia.com/search?query=ikn&page={page}&fromdate=2024/03/01&todate=2025/04/30'

        try:
            res = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(res.text, 'lxml')
            articles = soup.find_all('article')

            for art in articles:
                if count >= max_articles:
                    break

                try:
                    a_tag = art.find('a')
                    if not a_tag:
                        continue

                    # Ambil data
                    title_tag = a_tag.find('h2')
                    if not title_tag:
                        continue

                    title = title_tag.text.strip()
                    link = a_tag['href']
                    time_info = a_tag.find_all('span')[-1].text.strip() if a_tag.find_all('span') else ''
                    content = get_article_content(link)

                    # Tulis ke CSV
                    writer.writerow([title, link, time_info, content])
                    count += 1
                    print(f"✅ {count}. {title}")

                    # jeda biar tidak kena rate-limit
                    time.sleep(1)

                except Exception as e:
                    print(f"⚠️ Error parsing artikel di halaman {page}: {e}")
                    continue

            # jeda antar halaman
            time.sleep(1)

        except Exception as e:
            print(f"❌ Gagal akses page {page}: {e}")
            continue

print(f"\n✅ Selesai! Total artikel disimpan: {count}")


🔎 Total halaman ditemukan: 169

📄 Scraping halaman 1...
✅ 1. Absen Bagi Dividen, PTPP Fokus Perkuat Struktur Modal
✅ 2. Satgas IKN Era Jokowi Dibubarkan, Ini Gantinya
✅ 3. Satgas Pembangunan IKN Tiba-Tiba Dibubarkan, Ternyata Ini Alasannya
✅ 4. 2 Alasan Satgas Pembangunan IKN Dibubarkan, Dibentuk di Era Jokowi
✅ 5. Video: Menpan RB Ungkap Rencana ASN Pindah ke IKN
✅ 6. Menteri PANRB Blak-blakan Soal Penundaaan Pemindahan ASN ke IKN
✅ 7. Video:PNS Belum Direstui ke IKN - China Pulangkan Pesawat Boeing ke AS
✅ 8. 30 Tower Baru untuk ASN di IKN Bakal Dibangun, Nilainya Rp 6 Triliun!
✅ 9. Video: Pemindahan PNS ke IKN Belum Dapat Izin Prabowo
✅ 10. Menteri PANRB soal Gaji PNS Naik 16%: Tak Ada Diskusi
✅ 11. Video: Ditunda! Kemenpan-RB Rancang Ulang Pemindahan ASN ke IKN
✅ 12. Ditegur DPR, OIKN Buka Suara Soal Tugu 'Lorem Ipsum' di IKN

📄 Scraping halaman 2...
✅ 13. Menteri PANRB: Satu ASN Bakal Dapat Satu Unit Hunian Dinas di IKN
✅ 14. Otorita IKN Targetkan Kantor Pemerintah Rampung Juni 20

Case Folding dari Hasil Scraping dari Platform Berita CNBC dengan Tema IKN

In [2]:
import pandas as pd

# =====================================
# 1. Baca data hasil scraping (pakai delimiter ;)
# =====================================
df = pd.read_csv("/content/cnbc_ikn_full_article.csv", sep=';', encoding='utf-8')

# =====================================
# 2. Simpan kolom asli sebelum case folding
# =====================================
df['Title_Original'] = df['Title']
df['Content_Original'] = df['Content']

# =====================================
# 3. Buat kolom baru hasil case folding
# =====================================
df['Title_Casefolded'] = df['Title_Original'].astype(str).str.lower()
df['Content_Casefolded'] = df['Content_Original'].astype(str).str.lower()

# =====================================
# 4. Pilih dan urutkan kolom biar rapi
# =====================================
df_final = df[['Title_Original', 'Title_Casefolded',
               'Link', 'Time',
               'Content_Original', 'Content_Casefolded']]

# =====================================
# 5. Simpan hasil ke file baru (pakai ; agar rapi di Excel)
# =====================================
df_final.to_csv("cnbc_ikn_casefolding.csv", sep=';', index=False, encoding='utf-8')

print("✅ Case folding selesai! Hasil disimpan ke cnbc_ikn_casefolding.csv")


✅ Case folding selesai! Hasil disimpan ke cnbc_ikn_casefolding.csv
