In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

In [2]:
def scrape_kompas(url):
    """Download one Kompas article page and extract its title and body text.

    Args:
        url: URL of the article page to fetch.

    Returns:
        A dict with the keys ``title``, ``content`` and ``url``. When the
        title or content element is not found, the corresponding value is a
        placeholder string. Returns ``None`` (after printing the error) if
        the request or the parsing raised an exception.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def _clean_text(tag):
        # get_text(strip=True) joins the stripped text nodes with an empty
        # separator, fusing words around inline tags
        # ("Kata <a>tautan</a>" -> "Katatautan"). Collapsing whitespace on the
        # unstripped text keeps the original word boundaries.
        return ' '.join(tag.get_text().split())

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try the alternative title selectors; look the element up only once.
        title_tag = soup.find('h1', {'class': ['read__title', 'headline__title']})
        title = _clean_text(title_tag) if title_tag else 'Judul tidak ditemukan'

        # Try the alternative content selectors.
        content_div = soup.find('div', class_='read__content') or soup.find('div', class_='article__content')
        if content_div:
            paragraph_texts = (_clean_text(p) for p in content_div.find_all('p'))
            # Skip paragraphs that contain only whitespace.
            content = '\n'.join(text for text in paragraph_texts if text)
        else:
            content = 'Konten tidak ditemukan'

        return {
            'title': title,
            'content': content,
            'url': url
        }

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this article instead of aborting the whole run.
        print(f"Error saat scraping {url}: {str(e)}")
        return None

In [3]:
def main(base_url="https://regional.kompas.com/",
         output_path='kompas_regional_1.csv',
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/regional/' in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/144414978/pemkab-pamekasan-tunggak-iuran-bpjs-2024-senilai-rp-42-m-baru-dilunasi
Success: Pemkab Pamekasan Tunggak Iuran BPJS 2024 Senilai Rp 42 M, Baru Dilunasi Tahun Ini...

Scraping: https://regional.kompas.com/read/2025/07/23/144151778/annar-salahuddin-terdakwa-uang-palsu-uin-makassar-ngaku-telepon-kapolsek
Success: Annar Salahuddin, Terdakwa Uang Palsu UIN Makassar Ngaku Telepon Kapolsek saat Rumah Digerebek...

Scraping: https://regional.kompas.com/read/2025/07/23/143608278/jokowi-dicecar-penyidik-soal-foto-ijazah-di-medsos-hingga-dosen-pembimbing
Success: Jokowi Dicecar Penyidik soal Foto Ijazah di Medsos hingga Dosen Pembimbing Skripsi, Ini Jawabannya...

Scraping: https://medan.kompas.com/read/2025/07/23/143050278/diperiksa-3-jam-kasus-korupsi-jalan-topan-ginting-eks-sekda-sebagai-ketua-tim
Success: Diperiksa 3 Jam Kasus Korupsi Jalan Topan Ginting, Eks Sekda: Sebagai Ketua Tim Anggaran...

Scraping: ht

In [4]:
def main(base_url="https://regional.kompas.com/?page=2",
         output_path="kompas_regional_2.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/144414978/pemkab-pamekasan-tunggak-iuran-bpjs-2024-senilai-rp-42-m-baru-dilunasi
Success: Pemkab Pamekasan Tunggak Iuran BPJS 2024 Senilai Rp 42 M, Baru Dilunasi Tahun Ini...

Scraping: https://regional.kompas.com/read/2025/07/23/13254191/bupati-trk-berikan-penghargaan-kepada-147-mahasiswa-berprestasi-asal-nagan
Success: Bupati TRK Berikan Penghargaan Kepada 147 Mahasiswa Berprestasi Asal Nagan Raya...

Scraping: https://yogyakarta.kompas.com/read/2025/07/23/132111478/motor-viar-tertabrak-dua-mobil-di-sleman-pengendara-tewas-di-tempat
Success: Motor Viar Tertabrak Dua Mobil di Sleman, Pengendara Tewas di Tempat...

Scraping: https://bandung.kompas.com/read/2025/07/23/131952878/berlaga-jadi-korban-laka-2-pelaku-curanmor-sasar-mahasiswa-di-jatinangor
Success: Berlaga Jadi Korban Laka, 2 Pelaku Curanmor Sasar Mahasiswa di Jatinangor Sumedang...

Scraping: https://surabaya.kompas.com/read/2025/07/23/131927778/mendik

In [5]:
def main(base_url="https://regional.kompas.com/?page=3",
         output_path="kompas_regional_3.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/144414978/pemkab-pamekasan-tunggak-iuran-bpjs-2024-senilai-rp-42-m-baru-dilunasi
Success: Pemkab Pamekasan Tunggak Iuran BPJS 2024 Senilai Rp 42 M, Baru Dilunasi Tahun Ini...

Scraping: https://medan.kompas.com/read/2025/07/23/122651778/seorang-prajurit-tni-diduga-membunuh-istrinya-di-deli-serdang
Success: Seorang Prajurit TNI Diduga Membunuh Istrinya di Deli Serdang...

Scraping: https://bandung.kompas.com/read/2025/07/23/122206278/sakit-hati-diputus-cinta-pria-di-ciamis-sebar-video-porno-ke-wali-kelas-dan
Success: Sakit Hati Diputus Cinta, Pria di Ciamis Sebar Video Porno ke Wali Kelas dan Teman...

Scraping: https://regional.kompas.com/read/2025/07/23/121255778/dulu-pendiam-di-sekolah-eks-marinir-satria-arta-kini-menyesal-gabung
Success: Dulu Pendiam di Sekolah, Eks Marinir Satria Arta Kini Menyesal Gabung Tentara Rusia...

Scraping: https://surabaya.kompas.com/read/2025/07/23/121123578/cak-ji-gelar-mediasi-l

In [6]:
def main(base_url="https://regional.kompas.com/?page=4",
         output_path="kompas_regional_4.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/145531278/kapal-cepat-banyuwangi-denpasar-mulai-berlayar-ini-jadwal-dan-tarifnya
Success: Kapal Cepat Banyuwangi-Denpasar Mulai Berlayar, Ini Jadwal dan Tarifnya...

Scraping: https://surabaya.kompas.com/read/2025/07/23/113127278/pemkab-puluhan-tambang-galian-c-di-sumenep-tidak-kantongi-izin
Success: Pemkab: Puluhan Tambang Galian C di Sumenep Tidak Kantongi Izin...

Scraping: https://surabaya.kompas.com/read/2025/07/23/112557278/gapasdap-minta-pembangunan-dermaga-baru-di-pelabuhan-ketapang-gilimanuk
Success: Gapasdap Minta Pembangunan Dermaga Baru di Pelabuhan Ketapang-Gilimanuk...

Scraping: https://regional.kompas.com/read/2025/07/23/112542078/isi-dua-surat-putri-karlina-usai-insiden-pernikahan-di-pendopo-garut
Success: Isi Dua Surat Putri Karlina Usai Insiden Pernikahan di Pendopo Garut......

Scraping: https://regional.kompas.com/read/2025/07/23/112410478/usai-dua-kali-kebakaran-bigmall-samarinda-kembali-di

In [7]:
def main(base_url="https://regional.kompas.com/?page=5",
         output_path="kompas_regional_5.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/145531278/kapal-cepat-banyuwangi-denpasar-mulai-berlayar-ini-jadwal-dan-tarifnya
Success: Kapal Cepat Banyuwangi-Denpasar Mulai Berlayar, Ini Jadwal dan Tarifnya...

Scraping: https://makassar.kompas.com/read/2025/07/23/102022578/kampus-jadi-pabrik-uang-palsu-sidang-digelar-di-perpustakaan-uin-alauddin
Success: Kampus Jadi Pabrik Uang Palsu, Sidang Digelar di Perpustakaan UIN Alauddin Makassar...

Scraping: https://regional.kompas.com/read/2025/07/23/101827178/eks-marinir-satria-kumbara-ternyata-alumni-smk-dr-tjipto-ambarawa-kepala
Success: Eks Marinir Satria Kumbara Ternyata Alumni SMK Dr. Tjipto Ambarawa, Kepala Sekolah Ungkap Sosoknya...

Scraping: https://regional.kompas.com/read/2025/07/23/100404178/kabut-asap-parah-melanda-rokan-hulu-sekolah-diliburkan
Success: Kabut Asap Parah Melanda Rokan Hulu, Sekolah Diliburkan...

Scraping: https://regional.kompas.com/read/2025/07/23/100002078/apa-itu-beras-oplosan-i

In [8]:
def main(base_url="https://regional.kompas.com/?page=6",
         output_path="kompas_regional_6.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 30 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/145531278/kapal-cepat-banyuwangi-denpasar-mulai-berlayar-ini-jadwal-dan-tarifnya
Success: Kapal Cepat Banyuwangi-Denpasar Mulai Berlayar, Ini Jadwal dan Tarifnya...

Scraping: https://regional.kompas.com/read/2025/07/23/083212478/4-orang-jadi-tersangka-korupsi-dana-pip-di-sman-7-cirebon-uang-rp-368-juta
Success: 4 Orang Jadi Tersangka Korupsi Dana PIP di SMAN 7 Cirebon, Uang Rp 368 Juta Disita...

Scraping: https://regional.kompas.com/read/2025/07/23/081547778/gunung-ile-lewotolok-meletus-11-kali-pada-rabu-pagi
Success: Gunung Ile Lewotolok Meletus 11 Kali pada Rabu Pagi...

Scraping: https://surabaya.kompas.com/read/2025/07/23/080322078/anggota-dpr-semprot-dirjen-hubla-soal-pengawasan-pelayaran-kmp-tunu-pratama
Success: Anggota DPR "Semprot" Dirjen Hubla Soal Pengawasan Pelayaran KMP Tunu Pratama Jaya...

Scraping: https://denpasar.kompas.com/read/2025/07/23/075114578/58-sd-negeri-di-buleleng-kekurangan-siswa-d

In [9]:
def main(base_url="https://regional.kompas.com/?page=7",
         output_path="kompas_regional_7.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 30 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/150512978/embun-upas-mulai-terlihat-di-kawasan-wisata-gunung-bromo
Success: Embun Upas Mulai Terlihat di Kawasan Wisata Gunung Bromo...

Scraping: https://regional.kompas.com/read/2025/07/23/060020378/kisah-inspiratif-pasutri-di-kulon-progo-lulus-dari-pkh-berkat-bertani
Success: Kisah Inspiratif Pasutri di Kulon Progo, Lulus dari PKH Berkat Bertani Pepaya...

Scraping: https://regional.kompas.com/read/2025/07/23/060000078/kagetnya-sugi-dapat-ganti-rugi-tol-jogja-bawen-rp-5-4-m-dari-tanah-yang
Success: Kagetnya Sugi Dapat Ganti Rugi Tol Jogja-Bawen Rp 5,4 M dari Tanah yang Dibelinya Rp 250 Juta...

Scraping: https://regional.kompas.com/read/2025/07/23/054812478/kolaborasi-dengan-kemenbud-gubernur-sulteng-anwar-hafid-kearifan-lokal
Success: Kolaborasi dengan Kemenbud, Gubernur Sulteng Anwar Hafid: Kearifan Lokal Kunci Kesejahteraan...

Scraping: https://regional.kompas.com/read/2025/07/23/053807078/pemprov-jawa-te

In [10]:
def main(base_url="https://regional.kompas.com/?page=8",
         output_path="kompas_regional_8.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 30 news links

Scraping: https://surabaya.kompas.com/read/2025/07/23/150512978/embun-upas-mulai-terlihat-di-kawasan-wisata-gunung-bromo
Success: Embun Upas Mulai Terlihat di Kawasan Wisata Gunung Bromo...

Scraping: https://regional.kompas.com/read/2025/07/22/220535578/3-hari-kebingungan-di-hutan-bangka-pencari-burung-ditemukan-lemas
Success: 3 Hari Kebingungan di Hutan Bangka, Pencari Burung Ditemukan Lemas...

Scraping: https://medan.kompas.com/read/2025/07/22/215921878/1-dari-26-pelaku-penjarahan-besi-pabrik-di-medan-pensiunan-tni-al-perannya
Success: 1 dari 26 Pelaku Penjarahan Besi Pabrik di Medan Pensiunan TNI AL, Perannya Penadah...

Scraping: https://regional.kompas.com/read/2025/07/22/215851078/guru-madin-zuhdi-sudah-bayar-denda-usai-tampar-murid-malah-diteror-lsm
Success: Guru Madin Zuhdi Sudah Bayar Denda Usai Tampar Murid, Malah Diteror LSM Pakai Ancaman Masuk Penjara...

Scraping: https://surabaya.kompas.com/read/2025/07/22/215826878/17-rekomendasi-knkt-terkait-trage

In [11]:
def main(base_url="https://regional.kompas.com/?page=9",
         output_path="kompas_regional_9.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://regional.kompas.com/read/2025/07/23/151634578/pemerintah-rohil-masih-pertimbangkan-sekolah-diliburkan-akibat-karhutla
Success: Pemerintah Rohil Masih Pertimbangkan Sekolah Diliburkan Akibat Karhutla...

Scraping: https://surabaya.kompas.com/read/2025/07/22/205822578/knkt-beber-kondisi-tak-laik-kmp-tunu-pratama-jaya-sebelum-tenggelam
Success: KNKT Beber Kondisi Tak Laik KMP Tunu Pratama Jaya Sebelum Tenggelam...

Scraping: https://regional.kompas.com/read/2025/07/22/205756978/pantau-karhutla-gunakan-helikopter-menteri-hanif-kebakaran-besar-titik-api
Success: Pantau Karhutla Gunakan Helikopter, Menteri Hanif: Kebakaran Besar, Titik Api di Bukit...

Scraping: https://regional.kompas.com/read/2025/07/22/205517678/2-kali-cabuli-anak-di-bawah-umur-di-pantai-pria-maluku-tengah-divonis-9
Success: 2 Kali Cabuli Anak di Bawah Umur di Pantai, Pria Maluku Tengah Divonis 9 Tahun Penjara...

Scraping: https://yogyakarta.kompas.com/read/2025/07/22/205054078/korb

In [12]:
def main(base_url="https://regional.kompas.com/?page=10",
         output_path="kompas_regional_10.csv",
         delay_seconds=5):
    """Scrape every article linked from one Kompas Regional listing page into a CSV.

    Note: each scraping cell in this notebook redefines ``main``; only the
    defaults differ between cells.

    Args:
        base_url: Listing page whose anchors are scanned for article links.
        output_path: CSV file the scraped articles are written to.
        delay_seconds: Seconds to sleep before each article request.

    Returns:
        None. Progress and errors are printed; the CSV is written only when
        at least one article was scraped.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Without this check an HTTP error page would be parsed as if it
        # were the listing and silently yield no (or wrong) links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Collect links with a broad selector. Dict keys de-duplicate in
        # O(1) while preserving first-seen order.
        unique_links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if "/read/" in href or "/regional/" in href:
                unique_links[urljoin(base_url, href)] = None
        news_links = list(unique_links)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Best-effort notebook run: report the failure instead of raising.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://regional.kompas.com/read/2025/07/23/151634578/pemerintah-rohil-masih-pertimbangkan-sekolah-diliburkan-akibat-karhutla
Success: Pemerintah Rohil Masih Pertimbangkan Sekolah Diliburkan Akibat Karhutla...

Scraping: https://regional.kompas.com/read/2025/07/22/194556378/presiden-prabowo-tugasi-menteri-hanif-3-hari-pantau-kabut-asap-kebakaran
Success: Presiden Prabowo Tugasi Menteri Hanif 3 Hari Pantau Kabut Asap Kebakaran Hutan Riau...

Scraping: https://regional.kompas.com/read/2025/07/22/193937278/tak-terima-disalip-empat-pemuda-di-berau-keroyok-pengendara-motor
Success: Tak Terima Disalip, Empat Pemuda di Berau Keroyok Pengendara Motor...

Scraping: https://bandung.kompas.com/read/2025/07/22/193302578/permintaan-maaf-motovlogger-atas-konten-hoaks-gerebek-asusila-di-pakansari
Success: Permintaan Maaf Motovlogger atas Konten Hoaks Gerebek Asusila di Pakansari Bogor...

Scraping: https://surabaya.kompas.com/read/2025/07/22/193027978/puluhan-guru-sd-di

In [1]:
import pandas as pd
import ollama
from tqdm import tqdm

In [14]:
# Per-page CSV files written by the ten scraping cells (pages 1-10).
file_paths = [f"kompas_regional_{page}.csv" for page in range(1, 11)]

# Read and concatenate all per-page files. The listing pages overlap (the
# recorded runs show the same headline article linked from several pages),
# so keep only the first row for each article URL.
all_data = (
    pd.concat((pd.read_csv(file) for file in file_paths), ignore_index=True)
    .drop_duplicates(subset="url")
    .reset_index(drop=True)
)

# Save the merged result to a new file.
all_data.to_csv("kompas_regional.csv", index=False, encoding="utf-8-sig")

In [2]:
# Use a lighter-weight model
MODEL_NAME = "phi3"

# Optimized function
def generate_qa_light(text):
    """Ask the Ollama model for one short question/answer pair about ``text``.

    Only the first 1000 characters of ``text`` are placed in the prompt.

    Args:
        text: Source text the question and answer should be based on.

    Returns:
        The model's raw response string (the prompt asks for a
        "Pertanyaan: ..." line followed by a "Jawaban: ..." line), or
        ``None`` if the call raised an exception (the error is printed).
    """
    # Shorter excerpt to save memory. This note must stay outside the
    # f-string: anything written inside it (including "# ...") is sent to the
    # model verbatim as part of the prompt.
    excerpt = text[:1000]
    prompt = f"""
    Buat 1 pertanyaan dan jawaban singkat (maks 20 kata) dari teks berikut:
    {excerpt}
    Format: Pertanyaan: <pertanyaan>\nJawaban: <jawaban>
    """

    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.3,  # Lower creativity for more consistent results
                "num_ctx": 1024  # Limit the context window
            }
        )
        return response['response']
    except Exception as e:
        print(f"Error: {e}")
        return None

In [4]:
# Process the CSV with memory optimization
def process_csv(input_path, output_path, sample_size=None, random_state=None):
    """Generate one question/answer pair per article and write them to a CSV.

    Args:
        input_path: CSV file with a ``content`` column.
        output_path: Destination CSV with the columns ``content``,
            ``question`` and ``answer``.
        sample_size: If truthy, process at most this many randomly sampled
            rows instead of the whole file.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            sample can be reproduced.

    Returns:
        None. Rows whose model response is missing or does not contain
        "Pertanyaan: " are skipped.
    """
    df = pd.read_csv(input_path)

    def _has_article_text(value):
        # Missing content would reach the model as the literal string "nan";
        # the scraper's "not found" placeholder is not article text either.
        return pd.notna(value) and str(value).strip() not in ('', 'Konten tidak ditemukan')

    df = df[df['content'].map(_has_article_text).astype(bool)]
    if sample_size:
        df = df.sample(min(sample_size, len(df)), random_state=random_state)

    results = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        generated = generate_qa_light(str(row['content']))
        if not generated:
            continue
        _, marker, after_question = generated.partition("Pertanyaan: ")
        if not marker:
            # The response did not follow the requested format; skip it.
            continue
        q = after_question.split("\n")[0].strip()
        a = generated.split("Jawaban: ")[1].strip() if "Jawaban: " in generated else "-"
        results.append({
            'content': row['content'],
            'question': q,
            'answer': a
        })

    # Explicit columns keep the header even when no pair was generated, so
    # the output file can always be read back.
    pd.DataFrame(results, columns=['content', 'question', 'answer']).to_csv(output_path, index=False)

# Build Q&A pairs for a random sample of up to 50 rows of the merged CSV.
process_csv(
    input_path='kompas_regional.csv',
    output_path='kr_qa.csv',
    sample_size=50,
)

100%|██████████| 50/50 [42:26<00:00, 50.93s/it]
