In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

In [2]:
def scrape_kompas(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Coba selector alternatif untuk judul
        title = soup.find('h1', {'class': ['read__title', 'headline__title']}).get_text(strip=True) if soup.find('h1', {'class': ['read__title', 'headline__title']}) else 'Judul tidak ditemukan'
        
        # Coba selector alternatif untuk konten
        content_div = soup.find('div', class_='read__content') or soup.find('div', class_='article__content')
        if content_div:
            paragraphs = content_div.find_all('p', recursive=True)
            content = '\n'.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
        else:
            content = 'Konten tidak ditemukan'
        
        return {
            'title': title,
            'content': content,
            'url': url
        }
    
    except Exception as e:
        print(f"Error saat scraping {url}: {str(e)}")
        return None

In [3]:
def main():
    base_url = "https://www.kompas.com/global"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_1.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 36 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/05/100827370/tegang-pilot-united-airlines-serukan-mayday-berulang-setelah-lepas-landas
Success: Tegang, Pilot United Airlines Serukan “Mayday” Berulang Setelah Lepas Landas...

Scraping: https://www.kompas.com/global/read/2025/08/05/091700070/di-perundingan-damai-thailand-kamboja-masih-saling-tuding-lakukan
Success: Di Perundingan Damai, Thailand-Kamboja Masih Saling Tuding Lakukan Pelanggaran...

Scraping: https://www.kompas.com/global/read/2025/08/05/085719970/ingin-akhiri-perang-di-gaza-ratusan-eks-pejabat-militer-israel-desak
Success: Ingin Akhiri Perang di Gaza, Ratusan Eks Pejabat Militer Israel Desak Trump Tekan Netanyahu...

Scraping: https://www.kompas.com/global/read/2025/08/05/080100870

In [4]:
def main():
    base_url = "https://www.kompas.com/global?page=2"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_2.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 34 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/04/144423770/adu-kekuatan-kapal-selam-nuklir-as-vs-rusia-mana-lebih-unggul
Success: Adu Kekuatan Kapal Selam Nuklir AS Vs Rusia, Mana Lebih Unggul?...

Scraping: https://www.kompas.com/global/read/2025/08/04/143705170/4-pekerja-tewas-jatuh-ke-lubang-got-saat-inspeksi-pipa-limbah-di-jepang
Success: 4 Pekerja Tewas Jatuh ke Lubang Got Saat Inspeksi Pipa Limbah di Jepang...

Scraping: https://www.kompas.com/global/read/2025/08/04/143419770/80-tahun-berlalu-derita-bom-atom-hiroshima-masih-terasa
Success: 80 Tahun Berlalu, Derita Bom Atom Hiroshima Masih Terasa...

Scraping: https://www.kompas.com/global/read/2025/08/04/135650170/jaga-jaga-diserang-israel-lagi-iran-bentuk-dewan-pertahanan-nasional
Succ

In [5]:
def main():
    base_url = "https://www.kompas.com/global?page=3"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_3.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/04/072600770/menteri-sayap-kanan-israel-pimpin-ibadah-di-kompleks-al-aqsa-langgar
Success: Menteri Sayap Kanan Israel Pimpin Ibadah di Kompleks Al Aqsa, Langgar Aturan...

Scraping: https://www.kompas.com/global/read/2025/08/04/065907470/terus-beli-minyak-moskwa-as-tuduh-india-danai-perang-rusia-di-ukraina
Success: Terus Beli Minyak Moskwa, AS Tuduh India Danai Perang Rusia di Ukraina...

Scraping: https://www.kompas.com/global/read/2025/08/04/063100570/pria-40-tahun-tewas-terjatuh-saat-reuni-oasis-konser-tetap-lanjut
Success: Pria 40 Tahun Tewas Terjatuh saat Reuni Oasis, Konser Tetap Lanjut...

Scraping: https://www.kompas.com/global/read/2025/08/04/062247270/hamas-tuntut-israel-penuhi-syarat-se

In [6]:
def main():
    base_url = "https://www.kompas.com/global?page=4"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_4.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/03/091047070/trump-terkesima-anwar-ibrahim-sukses-damaikan-thailand-kamboja
Success: Trump Terkesima, Anwar Ibrahim Sukses Damaikan Thailand-Kamboja...

Scraping: https://www.kompas.com/global/read/2025/08/03/080000670/rivalitas-geopolitik-konflik-thailand-kamboja
Success: Rivalitas Geopolitik Konflik Thailand-Kamboja...

Scraping: https://www.kompas.com/global/read/2025/08/03/072901170/trump-ngamuk-pecat-pejabat-biro-statistik-tak-terima-lapangan-kerja-as
Success: Trump Ngamuk Pecat Pejabat Biro Statistik, Tak Terima Lapangan Kerja AS Disebut Turun...

Scraping: https://www.kompas.com/global/read/2025/08/03/062255770/unik-global-bayi-lahir-berusia-30-tahun-makam-kuno-2600-tahun
Success: [UNIK GLO

In [7]:
def main():
    base_url = "https://www.kompas.com/global?page=5"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_5.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/02/104600970/ditinggal-kakek-nenek-belanja-bayi-3-tahun-jatuh-dari-lantai-18
Success: Ditinggal Kakek-Nenek Belanja, Bayi 3 Tahun Jatuh dari Lantai 18...

Scraping: https://www.kompas.com/global/read/2025/08/02/102758370/buka-data-negatif-trump-pecat-pejabat-biro-statistik-tenaga-kerja
Success: Buka Data Negatif, Trump Pecat Pejabat Biro Statistik Tenaga Kerja...

Scraping: https://www.kompas.com/global/read/2025/08/02/095600070/tambang-tembaga-bawah-tanah-terbesar-di-dunia-runtuh-penyelamat-berpacu
Success: Tambang Tembaga Bawah Tanah Terbesar di Dunia Runtuh, Penyelamat Berpacu Waktu...

Scraping: https://www.kompas.com/global/read/2025/08/02/090600270/putin-bikin-rudal-hipersonik-terbaru-dipasa

In [8]:
def main():
    base_url = "https://www.kompas.com/global?page=6"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_6.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/08/01/143200870/warga-gaza-terpaksa-ikat-batu-bata-di-perut-demi-redakan-lapar
Success: Warga Gaza Terpaksa Ikat Batu Bata di Perut demi Redakan Lapar...

Scraping: https://www.kompas.com/global/read/2025/08/01/140313970/jepang-tarik-16000-pistol-mainan-impor-dari-china-bisa-tembakkan-peluru
Success: Jepang Tarik 16.000 Pistol Mainan Impor dari China, Bisa Tembakkan Peluru Betulan...

Scraping: https://www.kompas.com/global/read/2025/08/01/133927470/4-pramugari-gugat-boeing-atas-ledakan-panel-kabin-max-9
Success: 4 Pramugari Gugat Boeing atas Ledakan Panel Kabin MAX 9...

Scraping: https://www.kompas.com/global/read/2025/08/01/130600670/gali-jalan-depan-rumah-pekerja-temukan-kerangka-manusia-1.000-ta

In [9]:
def main():
    base_url = "https://www.kompas.com/global?page=7"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_7.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/07/31/204600170/kamboja-minta-malaysia-bantu-bebaskan-20-tentaranya-di-thailand
Success: Kamboja Minta Malaysia Bantu Bebaskan 20 Tentaranya di Thailand...

Scraping: https://www.kompas.com/global/read/2025/07/31/201500070/trump-akan-hadiri-ktt-asean-oktober-mendatang
Success: Trump Akan Hadiri KTT ASEAN Oktober Mendatang...

Scraping: https://www.kompas.com/global/read/2025/07/31/191520770/jet-tempur-siluman-f-35-as-jatuh-insiden-kedua-tahun-ini
Success: Jet Tempur Siluman F-35 AS Jatuh, Insiden Kedua Tahun Ini...

Scraping: https://www.kompas.com/global/read/2025/07/31/184508570/serangan-as-ke-iran-berlanjut-donald-trump-jatuhkan-115-sanksi
Success: "Serangan" AS ke Iran Berlanjut, Donald Trump Jat

In [10]:
def main():
    base_url = "https://www.kompas.com/global?page=8"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_8.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/07/31/113600470/warisan-sriwijaya-sapa-warga-melbourne-lewat-festival-indonesia-2025
Success: Warisan Sriwijaya Sapa Warga Melbourne lewat Festival Indonesia 2025...

Scraping: https://www.kompas.com/global/read/2025/07/31/111323670/suhu-turkiye-lampaui-50-celsius-warga-teriak-tagihan-ac-bengkak
Success: Suhu Turkiye Lampaui 50 Celsius, Warga Teriak Tagihan AC Bengkak...

Scraping: https://www.kompas.com/global/read/2025/07/31/111321270/tni-dan-as-gelar-pelatihan-biorisiko-di-medan
Success: TNI dan AS Gelar Pelatihan Biorisiko di Medan...

Scraping: https://www.kompas.com/global/read/2025/07/31/110534370/gencatan-senjata-thailand-kamboja-rapuh-kedua-kubu-saling-tuduh
Success: Gencatan Senjata Thailan

In [11]:
def main():
    base_url = "https://www.kompas.com/global?page=9"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_9.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/07/30/170103270/gempa-rusia-viral-video-dokter-tetap-operasi-pasien-meski-berguncang
Success: Gempa Rusia: Viral Video Dokter Tetap Operasi Pasien meski Berguncang...

Scraping: https://www.kompas.com/global/read/2025/07/30/162305370/indonesia-masuk-51-wilayah-ini-berpotensi-diterjang-tsunami-usai-gempa
Success: Indonesia Masuk, 51 Wilayah Ini Berpotensi Diterjang Tsunami Usai Gempa Rusia...

Scraping: https://www.kompas.com/global/read/2025/07/30/160347370/gempa-rusia-m-88-sapu-bangunan-kota-pesisir-kamchatka
Success: Gempa Rusia M 8,8 Sapu Bangunan Kota Pesisir Kamchatka...

Scraping: https://www.kompas.com/global/read/2025/07/30/152409270/indonesia-siaga-tsunami-imbau-warga-di-5-provinsi-ini-mengu

In [12]:
def main():
    base_url = "https://www.kompas.com/global?page=10"
    all_news_data = []
    
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Cari link dengan selector yang lebih umum
        news_links = []
        for article in soup.find_all('a', href=True):
            link = article['href']
            if '/read/' in link or '/global/' in link:
                full_url = urljoin(base_url, link)
                if full_url not in news_links:
                    news_links.append(full_url)
        
        print(f"Found {len(news_links)} news links")
        
        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")
        
        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv('kompas_global_10.csv', index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")
    
    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/08/04/113600770/jet-tempur-as-tembakkan-suar-usir-pesawat-di-atas-lapangan-golf-trump
Success: Jet Tempur AS Tembakkan Suar, Usir Pesawat di Atas Lapangan Golf Trump...

Scraping: https://www.kompas.com/global/read/2025/07/30/081958770/perancis-inggris-akan-akui-palestina-apa-artinya-dan-bagaimana-respons
Success: Perancis-Inggris Akan Akui Palestina, Apa Artinya dan Bagaimana Respons Israel?...

Scraping: https://www.kompas.com/global/read/2025/07/30/075829070/gempa-rusia-direvisi-magnitudo-jadi-87-dan-picu-peringatan-tsunami-1
Success: Gempa Rusia Direvisi, Magnitudo Jadi 8,7 dan Picu Peringatan Tsunami 1 Meter di Jepang...

Scraping: https://www.kompas.com/global/read/2025/07/30/072653870/gempa-magnitudo-8-guncang-rusia-peringatan-tsunami-dikeluarkan-untuk
Success: Gempa Magnitudo 8 Guncang Rusia, Peringatan Tsunami Dikeluarkan untuk Pasifik dan Alaska...

Scraping: https://www.kompas.com/global/read/2025/07/30/0

In [3]:
import pandas as pd
import ollama
from tqdm import tqdm
import time
import re

In [14]:
# List semua file CSV yang ingin digabungkan
file_paths = [
    'kompas_global_1.csv', 'kompas_global_2.csv', 'kompas_global_3.csv', 'kompas_global_4.csv', 'kompas_global_5.csv', 'kompas_global_6.csv', 'kompas_global_7.csv', 'kompas_global_8.csv', 'kompas_global_9.csv', 'kompas_global_10.csv'
]

# Membaca dan menggabungkan semua file
all_data = pd.concat((pd.read_csv(file) for file in file_paths), ignore_index=True)

# Menyimpan hasil gabungan ke file baru
all_data.to_csv('kompas_global.csv', index=False, encoding='utf-8-sig')

In [4]:
MODEL_NAME = "phi3"
NUM_QUESTIONS = 2

# Variabel pelacakan
failed_generations = 0
parsing_failures = 0
success_count = 0

def generate_multiple_qa(text):
    """Fungsi generasi QA dengan error handling lebih baik"""
    global failed_generations
    
    # Validasi input
    text = str(text).strip()
    if not text or len(text) < 20:  # Skip teks terlalu pendek
        return None
    
    # Buat prompt
    prompt = f"""
    Buat {NUM_QUESTIONS} pertanyaan berbeda beserta jawaban singkat (maks 15 kata) dari teks berikut:
    {text}
    
    Format wajib:
    1. Pertanyaan: <pertanyaan1>
       Jawaban: <jawaban1>
    2. Pertanyaan: <pertanyaan2>
       Jawaban: <jawaban2>
    """
    
    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.7,
                "num_ctx": 2048,  # Context window lebih besar
                "repeat_penalty": 1.3,
                "num_keep": 4,
                "timeout": 120  # Timeout lebih panjang
            }
        )
        return response['response']
    except Exception as e:
        print(f"\nError generasi: {str(e)[:100]}...")
        failed_generations += 1
        return None

In [5]:
def parse_multiple_qa(generated_text):
    """Fungsi parsing yang lebih toleran"""
    global parsing_failures
    qa_pairs = []
    
    # Bersihkan dan normalisasi teks
    cleaned_text = re.sub(r'\n+', '\n', generated_text.strip())
    lines = [line.strip() for line in cleaned_text.split('\n') if line.strip()]
    
    # Cari pasangan QA dengan regex fleksibel
    question_pattern = re.compile(r'(\d+\.)?\s*pertanyaan:\s*(.+)', re.IGNORECASE)
    answer_pattern = re.compile(r'jawaban:\s*(.+)', re.IGNORECASE)
    
    i = 0
    while i < len(lines) - 1:
        # Cari pertanyaan
        q_match = question_pattern.match(lines[i])
        if q_match:
            question = q_match.group(2).strip()
            
            # Cari jawaban di line berikutnya
            if i+1 < len(lines):
                a_match = answer_pattern.match(lines[i+1])
                answer = a_match.group(1).strip() if a_match else "-"
                qa_pairs.append((question, answer))
                i += 2  # Loncat ke pertanyaan berikutnya
                continue
        i += 1
    
    if len(qa_pairs) < NUM_QUESTIONS:
        parsing_failures += 1
        
    return qa_pairs[:NUM_QUESTIONS]

In [7]:
def process_csv(input_path, output_path):
    """Fungsi utama pemrosesan CSV"""
    global success_count
    
    # Baca data
    try:
        df = pd.read_csv(input_path, encoding='latin1')
    except Exception as e:
        print(f"Gagal membaca CSV: {str(e)}")
        return
    
    # Validasi kolom
    if 'content' not in df.columns:
        print("Error: Kolom 'content' tidak ditemukan")
        return
    
    results = []
    problematic_rows = []
    
    # Proses setiap baris
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        content = str(row['content'])
        generated = generate_multiple_qa(content)
        
        if not generated:
            problematic_rows.append({
                'row_id': idx,
                'content': content[:200] + "...",
                'error': 'Generasi gagal'
            })
            continue
        
        qa_pairs = parse_multiple_qa(generated)
        if not qa_pairs:
            problematic_rows.append({
                'row_id': idx,
                'content': content[:200] + "...",
                'generated_text': generated[:200] + "...",
                'error': 'Parsing gagal'
            })
            continue
        
        for q, a in qa_pairs:
            results.append({
                'original_content': content[:300] + "...",
                'question': q,
                'answer': a,
                'row_id': idx
            })
            success_count += 1
        
        time.sleep(0.5)  # Jeda antar request
    
    # Simpan hasil
    if results:
        pd.DataFrame(results).to_csv(output_path, index=False)
        print(f"\nSukses: {success_count} QA pairs tersimpan di {output_path}")
    else:
        print("Tidak ada hasil yang berhasil digenerate")
    
    # Simpan log error jika ada
    if problematic_rows:
        error_df = pd.DataFrame(problematic_rows)
        error_path = output_path.replace('.csv', '_errors.csv')
        error_df.to_csv(error_path, index=False)
        print(f"Log error ({len(problematic_rows)} baris) tersimpan di {error_path}")
    
    # Print summary
    print("\n=== Summary ===")
    print(f"Total konten diproses: {len(df)}")
    print(f"Generasi gagal: {failed_generations}")
    print(f"Parsing gagal: {parsing_failures}")
    print(f"QA pairs sukses: {success_count} (harusnya {len(df)*NUM_QUESTIONS})")

# Eksekusi
if __name__ == '__main__':
    process_csv('kompas_global.csv', 'kg_qa_fixed.csv')

100%|██████████| 390/390 [10:35:33<00:00, 97.78s/it]   

Tidak ada hasil yang berhasil digenerate
Log error (390 baris) tersimpan di kg_qa_fixed_errors.csv

=== Summary ===
Total konten diproses: 390
Generasi gagal: 0
Parsing gagal: 383
QA pairs sukses: 0 (harusnya 780)



