In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

In [2]:
def scrape_kompas(url):
    """Download one Kompas article page and pull out its headline and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys 'title', 'content' and 'url', or None when the
        request or the parsing raised (the error is printed). Placeholder
        strings are stored when the headline or the body container cannot
        be located in the page.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    title_classes = ['read__title', 'headline__title']

    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        soup = BeautifulSoup(page.text, 'html.parser')

        # Headline: accept either of the known <h1> class names.
        heading = soup.find('h1', {'class': title_classes})
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = 'Judul tidak ditemukan'

        # Body: try the primary container first, then the alternative one.
        body = soup.find('div', class_='read__content') or soup.find('div', class_='article__content')
        if body:
            paragraph_texts = (p.get_text(strip=True) for p in body.find_all('p'))
            # Skip paragraphs that are empty once whitespace is stripped.
            content = '\n'.join(text for text in paragraph_texts if text)
        else:
            content = 'Konten tidak ditemukan'

        return {'title': title, 'content': content, 'url': url}

    except Exception as e:
        print(f"Error saat scraping {url}: {str(e)}")
        return None

In [3]:
def main(base_url="https://megapolitan.kompas.com/", output_csv="kompas_megapolitan_1.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Keep hrefs containing '/read/' or '/megapolitan/', resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/megapolitan/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/14045151/penampakan-jlnt-pluit-yang-mangkrak-10-tahun-dinding-penuh-coretan
Success: Penampakan JLNT Pluit yang Mangkrak 10 Tahun, Dinding Penuh Coretan...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/14000241/pengendara-motor-tewas-ditabrak-mobil-di-kebayoran-baru-terpental-6-meter
Success: Pengendara Motor Tewas Ditabrak Mobil di Kebayoran Baru Terpental 6 Meter...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/13471821/kubu-roy-suryo-soroti-perlakuan-khusus-polisi-terhadap-jokowi-soal-kasus
Success: Kubu Roy Suryo Soroti Perlakuan Khusus Polisi terhadap Jokowi soal Kasus Ijazah...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/13354921/konflik-bisn

In [None]:
def main(base_url="https://megapolitan.kompas.com/?page=2", output_csv="kompas_megapolitan_2.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/11410661/flyover-latumenten-dibangun-agustus-begini-potret-kemacetannya-kini
Success: Flyover Latumenten Dibangun Agustus, Begini Potret Kemacetannya Kini...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/11354121/kubu-roy-suryo-jokowi-bilang-sakit-saat-dipanggil-polda-tapi-muncul-di
Success: Kubu Roy Suryo: Jokowi Bilang Sakit Saat Dipanggil Polda, tapi Muncul di Kongres PSI...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/11185191/adik-bunuh-kakak-kandung-di-jaktim-karena-narkoba-pisau-hingga-pakaian
Success: Adik Bunuh Kakak Kandung di Jaktim karena Narkoba, Pisau hingga Pakaian Disita...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/11180011/banso

In [None]:
def main(base_url="https://megapolitan.kompas.com/?page=3", output_csv="kompas_megapolitan_3.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/09144201/benarkah-dana-operasional-rt-rw-jakarta-naik-dua-kali-lipat-oktober-nanti
Success: Benarkah Dana Operasional RT/RW Jakarta Naik Dua Kali Lipat Oktober Nanti?...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/09133791/gagalkan-jambret-rp-300-juta-dua-warga-depok-dapat-penghargaan-dari
Success: Gagalkan Jambret Rp 300 Juta, Dua Warga Depok Dapat Penghargaan dari Polisi...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/09090221/nelayan-muara-angke-tradisi-nadran-bukan-untuk-mengotori-laut
Success: Nelayan Muara Angke: Tradisi Nadran Bukan untuk Mengotori Laut...

Scraping: https://megapolitan.kompas.com/read/2025/07/23/09042901/detik-detik-kekejian-3-pembunu

In [None]:
def main(base_url="https://megapolitan.kompas.com/?page=4", output_csv="kompas_megapolitan_4.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/23100111/lolos-verifikasi-ahmed-zaki-iskandar-jadi-calon-tunggal-ketua-dpd-golkar
Success: Lolos Verifikasi, Ahmed Zaki Iskandar Jadi Calon Tunggal Ketua DPD Golkar Jakarta...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/22584241/polisi-bongkar-sindikat-oli-palsu-di-jakbar-tiga-pelaku-ditangkap
Success: Polisi Bongkar Sindikat Oli Palsu di Jakbar, Tiga Pelaku Ditangkap...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/22464461/perempuan-di-cisauk-sempat-teriak-minta-tolong-sebelum-dibunuh-tiga
Success: Perempuan di Cisauk Sempat Teriak Minta Tolong Sebelum Dibunuh Tiga Pelaku...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/22233051/normalisasi-kali-

In [None]:
def main(base_url="https://megapolitan.kompas.com/?page=5", output_csv="kompas_megapolitan_5.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/20041531/besok-jokowi-bakal-diperiksa-di-solo-terkait-tudingan-ijazah-palsu
Success: Besok, Jokowi Bakal Diperiksa di Solo Terkait Tudingan Ijazah Palsu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/19571771/sepinya-sekolah-swasta-di-depok-dari-kuota-29-hanya-dapat-4-siswa
Success: Sepinya Sekolah Swasta di Depok: Dari Kuota 29, Hanya Dapat 4 Siswa...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/19474041/warga-nilai-bansos-beras-lebih-bermanfaat-ketimbang-makan-bergizi-gratis
Success: Warga Nilai Bansos Beras Lebih Bermanfaat Ketimbang Makan Bergizi Gratis...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/19363221/warga-muara-angke-gelar-tradisi-n

In [None]:
def main(base_url="https://megapolitan.kompas.com/?page=6", output_csv="kompas_megapolitan_6.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/17233141/kompolnas-ungkap-isi-pertemuan-dengan-penyidik-polda-metro-soal-kematian
Success: Kompolnas Ungkap Isi Pertemuan dengan Penyidik Polda Metro Soal Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/17163491/petugas-damkar-terlempar-nasi-uduk-dan-mata-kena-sambal-saat-chaos
Success: Petugas Damkar Terlempar Nasi Uduk dan Mata Kena Sambal Saat Chaos Kebakaran Tambora...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/17131391/tetangga-kos-sebut-situasi-sunyi-saat-malam-tewasnya-diplomat-kemlu
Success: Tetangga Kos Sebut Situasi Sunyi Saat Malam Tewasnya Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/17101141

In [9]:
def main(base_url="https://megapolitan.kompas.com/?page=7", output_csv="kompas_megapolitan_7.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/15404101/pramono-teken-aturan-dana-operasional-rt-rw-cair-mulai-oktober
Success: Pramono Teken Aturan Dana Operasional RT/RW, Cair Mulai Oktober...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/15390161/warga-murka-saat-rekonstruksi-pembunuhan-wanita-terborgol-di-cisauk
Success: Warga Murka Saat Rekonstruksi Pembunuhan Wanita Terborgol di Cisauk, Sempat Lempar Botol...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/15345061/dukung-penuh-persija-pramono-kalau-prestasi-enggak-baik-jangan-salahkan
Success: Dukung Penuh Persija, Pramono: Kalau Prestasi Enggak Baik, Jangan Salahkan Pemprov DKI Lagi...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/15311281/

In [10]:
def main(base_url="https://megapolitan.kompas.com/?page=8", output_csv="kompas_megapolitan_8.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/13572831/kompolnas-minta-penjaga-kos-peragakan-posisi-kunci-kamar-diplomat-kemlu
Success: Kompolnas Minta Penjaga Kos Peragakan Posisi Kunci Kamar Diplomat Kemlu, Ini Temuannya...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/13570041/diduga-oplos-dan-kurangi-takaran-beras-pt-food-station-dipanggil-pramono
Success: Diduga Oplos dan Kurangi Takaran Beras, PT Food Station Dipanggil Pramono Sore Ini...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/13514801/cara-pramono-berantas-preman-dan-copet-jakarta-pakai-teknologi-digital
Success: Cara Pramono Berantas Preman dan Copet Jakarta Pakai Teknologi Digital...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/

In [11]:
def main(base_url="https://megapolitan.kompas.com/?page=9", output_csv="kompas_megapolitan_9.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/11424641/rano-karno-ungkap-8-langkah-pemprov-untuk-atasi-kemacetan-jakarta
Success: Rano Karno Ungkap 8 Langkah Pemprov untuk Atasi Kemacetan Jakarta...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/11382281/transaksi-digital-di-jakarta-tembus-rp-22-miliar-naik-180-persen-dari
Success: Transaksi Digital di Jakarta Tembus Rp 2,2 Miliar, Naik 180 Persen dari 2024...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/11372201/pengguna-qris-di-jakarta-tembus-62-juta-tertinggi-se-indonesia
Success: Pengguna QRIS di Jakarta Tembus 6,2 Juta, Tertinggi se-Indonesia...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/11304521/kuasa-hukum-jokowi-permintaan-gelar-perka

In [12]:
def main(base_url="https://megapolitan.kompas.com/?page=10", output_csv="kompas_megapolitan_10.csv", delay_seconds=5):
    """Scrape the articles linked from one Kompas Megapolitan index page into a CSV.

    Args:
        base_url: Index page whose anchors are scanned for article links.
        output_csv: CSV path written when at least one article was scraped.
        delay_seconds: Seconds to sleep before each article request.

    Any exception raised while fetching the index page, scraping, or writing
    the CSV is caught and reported on stdout rather than propagated.
    """
    all_news_data = []

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep hrefs containing "/read/" or "/megapolitan/", resolved against base_url.
        # dict.fromkeys drops repeats in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor["href"])
            for anchor in soup.find_all("a", href=True)
            if "/read/" in anchor["href"] or "/megapolitan/" in anchor["href"]
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(delay_seconds)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_csv, index=False, encoding="utf-8-sig")
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://megapolitan.kompas.com/read/2025/07/23/07312731/belum-pernah-diungkap-ini-temuan-kompolnas-di-tkp-kematian-diplomat-kemlu
Success: Belum Pernah Diungkap, Ini Temuan Kompolnas di TKP Kematian Diplomat Kemlu...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/08432401/lurah-berharap-taman-terbengkalai-di-cilincing-bisa-segera-diperbaiki
Success: Lurah Berharap Taman Terbengkalai di Cilincing Bisa Segera Diperbaiki...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/08420131/lokasi-samsat-keliling-di-jakarta-untuk-pemutihan-pajak-kendaraan-selasa
Success: Lokasi Samsat Keliling di Jakarta untuk Pemutihan Pajak Kendaraan Selasa 22 Juli 2025...

Scraping: https://megapolitan.kompas.com/read/2025/07/22/08381291/sehari-tambora-membara
Success: Sehari Tambora Membara......

Scraping: https://megapolitan.kompas.com/read/2025/07/22/08361831/mengapa-taman-terbengkalai-di-cilincing-tak-kunjung-diperbaiki
Success: Mengapa Taman Terbengkalai d

In [None]:
import pandas as pd
import ollama
from tqdm import tqdm

In [14]:
# Per-page CSV files to merge (kompas_megapolitan_1.csv ... kompas_megapolitan_10.csv).
file_paths = [f'kompas_megapolitan_{page}.csv' for page in range(1, 11)]

# Read and concatenate every file. The same article can be linked from several
# index pages (the scraping logs show the lead article on every page), so keep
# only the first row for each article URL.
all_data = (
    pd.concat((pd.read_csv(file) for file in file_paths), ignore_index=True)
    .drop_duplicates(subset='url')
    .reset_index(drop=True)
)

# Save the merged result to a new file.
all_data.to_csv("kompas_megapolitan.csv", index=False, encoding="utf-8-sig")

In [12]:
# Use a lighter-weight model.
MODEL_NAME = "phi3"


def generate_qa_light(text):
    """Ask the Ollama model for one short question/answer pair about ``text``.

    Only the first 1000 characters of ``text`` are placed in the prompt, to
    keep the request small.

    Args:
        text: Source text (a string) to generate the question from.

    Returns:
        The raw model response string, or None if the Ollama call raised
        (the error is printed).
    """
    # Keep remarks out of the f-string: everything inside it, including any
    # '#' text, is sent to the model as part of the prompt.
    prompt = f"""
    Buat 1 pertanyaan dan jawaban singkat (maks 20 kata) dari teks berikut:
    {text[:1000]}
    Format: Pertanyaan: <pertanyaan>\nJawaban: <jawaban>
    """

    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.3,  # lower creativity for more consistent output
                "num_ctx": 1024  # limit the context window
            }
        )
        return response['response']
    except Exception as e:
        print(f"Error: {e}")
        return None

In [13]:
# Turn a CSV of articles into a CSV of generated question/answer pairs.
def process_csv(input_path, output_path, sample_size=None, random_state=None):
    """Generate one Q&A pair per article row and write the results to a CSV.

    Args:
        input_path: CSV file with a 'content' column holding the article text.
        output_path: Destination CSV with columns 'content', 'question', 'answer'.
        sample_size: If truthy, process a random sample of at most this many
            rows instead of the whole file.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced.

    Rows are skipped when the content is missing, when the generator returns
    nothing, or when the response lacks the "Pertanyaan: " marker.
    """
    df = pd.read_csv(input_path)
    if sample_size:
        df = df.sample(min(sample_size, len(df)), random_state=random_state)

    results = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        content = row['content']
        if pd.isna(content):
            # An empty CSV cell is read back as NaN; str(NaN) would send the
            # literal text "nan" to the model.
            continue

        generated = generate_qa_light(str(content))
        if not generated:
            continue

        question_parts = generated.split("Pertanyaan: ")
        if len(question_parts) < 2:
            # The model ignored the requested format; nothing to extract.
            continue

        question = question_parts[1].split("\n")[0].strip()
        answer = generated.split("Jawaban: ")[1].strip() if "Jawaban: " in generated else "-"
        results.append({
            'content': content,
            'question': question,
            'answer': answer
        })

    # Explicit columns so the header is written even when no row produced a result.
    pd.DataFrame(results, columns=['content', 'question', 'answer']).to_csv(output_path, index=False)

# Generate Q&A pairs for a random sample of at most 50 rows of the merged
# article CSV and write them to km_qa.csv.
process_csv('kompas_megapolitan.csv', 'km_qa.csv', sample_size=50)

100%|██████████| 50/50 [43:00<00:00, 51.60s/it]  
