In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

In [2]:
def scrape_kompas(url):
    """Download one Kompas article page and extract its title and body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys 'title', 'content' and 'url', or None when the
        request or the parsing raises (the error is printed to stdout).
        A page without a recognised headline or body is not a failure: the
        placeholder strings 'Judul tidak ditemukan' / 'Konten tidak ditemukan'
        are returned in the corresponding field instead.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try both known headline classes; query the document only once.
        title_tag = soup.find('h1', {'class': ['read__title', 'headline__title']})
        title = title_tag.get_text(strip=True) if title_tag else 'Judul tidak ditemukan'

        # Try both known article-body containers.
        content_div = soup.find('div', class_='read__content') or soup.find('div', class_='article__content')
        if content_div:
            # Extract each paragraph's text once and drop the empty ones.
            paragraph_texts = (
                p.get_text(strip=True)
                for p in content_div.find_all('p', recursive=True)
            )
            content = '\n'.join(text for text in paragraph_texts if text)
        else:
            content = 'Konten tidak ditemukan'

        return {
            'title': title,
            'content': content,
            'url': url
        }

    except Exception as e:
        # Best effort: report the failure and let the caller skip this URL.
        print(f"Error saat scraping {url}: {str(e)}")
        return None

In [None]:
def main():
    """Scrape page 1 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_1.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global"
    # Page N is stored as kompas_global_N.csv, the name the merge cell reads.
    # 'kompas_global.csv' is the merged output and must not be used here.
    output_csv = 'kompas_global_1.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 36 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/23/085208270/dubes-as-sebut-pembunuhan-warga-palestina-amerika-oleh-pemukim-israel
Success: Dubes AS Sebut Pembunuhan Warga Palestina-Amerika oleh Pemukim Israel sebagai Aksi Terorisme...

Scraping: https://www.kompas.com/global/read/2025/07/23/080305070/pelajar-bangladesh-demo-tuntut-jawaban-atas-tragedi-jet-tempur-menewaskan
Success: Pelajar Bangladesh Demo, Tuntut Jawaban atas Tragedi Jet Tempur Menewaskan 25 Anak...

Scraping: https://www.kompas.com/global/read/2025/07/23/070700270/pesawat-tempur-supersonik-mirage-ukraina-jatuh-pilot-selamat
Success: Pesawat Tempur Supersonik Mirage Ukraina Jatuh, Pilot Selamat...

Scraping: https://www.kompas.com/global/read/2025/07/23/060700470/as-indonesia-sepakati-pemangkasan-tar

In [5]:
def main():
    """Scrape page 2 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_2.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=2"
    # Page N is stored as kompas_global_N.csv (as pages 3-10 do); the merge
    # cell reads 'kompas_global_2.csv' for this page.
    output_csv = 'kompas_global_2.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 34 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/22/155730270/iran-ganti-pertahanan-udara-yang-rusak-usai-perang-lawan-israel-pakai
Success: Iran Ganti Pertahanan Udara yang Rusak Usai Perang Lawan Israel, Pakai Buatan Rusia?...

Scraping: https://www.kompas.com/global/read/2025/07/22/154937670/planet-baru-telah-lahir-kali-pertama-disaksikan-para-astronom
Success: Planet Baru Telah Lahir, Kali Pertama Disaksikan Para Astronom...

Scraping: https://www.kompas.com/global/read/2025/07/22/151637270/insinyur-china-curi-teknologi-deteksi-rudal-nuklir-as-terancam-penjara-10
Success: Insinyur China Curi Teknologi Deteksi Rudal Nuklir AS, Terancam Penjara 10 Tahun...

Scraping: https://www.kompas.com/global/read/2025/07/22/150204370/spesifikasi-jet-tempur-f-7-bgi-buatan-china-m

In [13]:
def main():
    """Scrape page 3 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_3.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=3"
    output_csv = 'kompas_global_3.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/22/055922870/jet-tempur-jatuh-di-bangladesh-20-tewas-termasuk-pilot-dan-pelajar
Success: Jet Tempur Jatuh di Bangladesh, 20 Tewas Termasuk Pilot dan Pelajar...

Scraping: https://www.kompas.com/global/read/2025/07/22/052925570/populer-global-jet-delta-hindari-tabrakan-pesawat-pengebom-warga
Success: [POPULER GLOBAL] Jet Delta Hindari Tabrakan Pesawat Pengebom | Warga Australia Gugat Suplemen Blackm...

Scraping: https://www.kompas.com/global/read/2025/07/21/210149070/air-india-kecelakaan-lagi-pesawat-tergelincir-dan-rusak-parah
Success: Air India Kecelakaan Lagi, Pesawat Tergelincir dan Rusak Parah...

Scraping: https://www.kompas.com/global/read/2025/07/21/203514370/pisang-rp-101-miliar-di-museum-perancis-dimakan-pengun

In [14]:
def main():
    """Scrape page 4 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_4.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=4"
    output_csv = 'kompas_global_4.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/21/132703670/suku-badui-mundur-dari-sweida-konvoi-bantuan-kemanusiaan-mulai-masuk
Success: Suku Badui Mundur dari Sweida, Konvoi Bantuan Kemanusiaan Mulai Masuk Suriah...

Scraping: https://www.kompas.com/global/read/2025/07/21/130318370/kisah-f-117-jet-tempur-siluman-pertama-di-dunia-dari-proyek-rahasia
Success: Kisah F-117, Jet Tempur Siluman Pertama di Dunia, dari Proyek Rahasia hingga Medan Perang...

Scraping: https://www.kompas.com/global/read/2025/07/21/130124070/arab-badui-siap-patuhi-gencatan-senjata-dengan-druze-suriah-tapi
Success: Arab Badui Siap Patuhi Gencatan Senjata dengan Druze Suriah, tapi......

Scraping: https://www.kompas.com/global/read/2025/07/21/121149770/benjamin-netanyahu-keracunan-makanan-sidan

In [15]:
def main():
    """Scrape page 5 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_5.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=5"
    output_csv = 'kompas_global_5.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://internasional.kompas.com/read/2025/07/20/144352870/gempa-m-74-guncang-lepas-pantai-rusia-picu-peringatan-tsunami
Success: Gempa M 7,4 Guncang Lepas Pantai Rusia, Picu Peringatan Tsunami...

Scraping: https://internasional.kompas.com/read/2025/07/20/141700770/sekelompok-pria-bersenjata-serang-bar-di-kota-wisata-ekuador-9-orang
Success: Sekelompok Pria Bersenjata Serang Bar di Kota Wisata Ekuador, 9 Orang Tewas...

Scraping: https://internasional.kompas.com/read/2025/07/20/133353570/bentrokan-di-wilayah-druze-memanas-suriah-upayakan-gencatan-senjata
Success: Bentrokan di Wilayah Druze Memanas, Suriah Upayakan Gencatan Senjata...

Scraping: https://internasional.kompas.com/read/2025/07/20/124700970/ceo-astronomer-mundur-ketahuan-selingkuh-di-konser-coldplay


In [16]:
def main():
    """Scrape page 6 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_6.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=6"
    output_csv = 'kompas_global_6.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://internasional.kompas.com/read/2025/07/19/162659970/temuan-baru-jatuhnya-air-india-saklar-bahan-bakar-diduga-dimatikan
Success: Temuan Baru Jatuhnya Air India, Saklar Bahan Bakar Diduga Dimatikan Kapten Pilot...

Scraping: https://internasional.kompas.com/read/2025/07/19/151055370/batu-mars-langka-terbesar-di-bumi-terjual-rp-865-miliar-berat-24-kg
Success: Batu Mars Langka Terbesar di Bumi Terjual Rp 86,5 Miliar, Berat 24 Kg...

Scraping: https://internasional.kompas.com/read/2025/07/19/140502270/eks-presiden-brasil-bolsonaro-harus-pakai-gelang-kaki-agar-tak-kabur
Success: Eks Presiden Brasil Bolsonaro Harus Pakai Gelang Kaki agar Tak Kabur dari Penyelidikan...

Scraping: https://internasional.kompas.com/read/2025/07/19/134844070/usai-gereja-katolik-di-gaz

In [17]:
def main():
    """Scrape page 7 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_7.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=7"
    output_csv = 'kompas_global_7.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://internasional.kompas.com/read/2025/07/18/170422270/hujan-deras-di-korea-selatan-tewaskan-4-orang-lebih-dari-1300
Success: Hujan Deras di Korea Selatan Tewaskan 4 Orang, Lebih dari 1.300 Dievakuasi...

Scraping: https://internasional.kompas.com/read/2025/07/18/160932470/pasukan-suriah-siap-dikerahkan-lagi-ke-sweida-meski-ada-peringatan
Success: Pasukan Suriah Siap Dikerahkan Lagi ke Sweida meski Ada Peringatan dari Israel...

Scraping: https://internasional.kompas.com/read/2025/07/18/152800970/alasan-lindungi-druze-apa-alasan-sebenarnya-israel-serang-suriah-
Success: Alasan Lindungi Druze, Apa Alasan Sebenarnya Israel Serang Suriah?...

Scraping: https://internasional.kompas.com/read/2025/07/18/151327270/kapal-perang-jepang-era-pd-ii-ditemukan-di-dasar-lau

In [18]:
def main():
    """Scrape page 8 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_8.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=8"
    output_csv = 'kompas_global_8.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://internasional.kompas.com/read/2025/07/17/220200270/israel-serang-satu-satunya-gereja-katolik-di-gaza-italia-murka
Success: Israel Serang Satu-satunya Gereja Katolik di Gaza, Italia Murka...

Scraping: https://internasional.kompas.com/read/2025/07/17/213018770/siapa-druze-dan-kenapa-israel-menyerang-suriah
Success: Siapa Druze dan Kenapa Israel Menyerang Suriah?...

Scraping: https://internasional.kompas.com/read/2025/07/17/211300870/kisah-aldrich-ames-agen-cia-pengkhianat-bocorkan-rahasia-ke-uni-soviet
Success: Kisah Aldrich Ames, Agen CIA Pengkhianat, Bocorkan Rahasia ke Uni Soviet...

Scraping: https://internasional.kompas.com/read/2025/07/17/202411370/presiden-suriah-tak-takut-perang-kecam-israel-biang-keladi-kekacauan
Success: Presiden Suriah Tak Taku

In [28]:
def main():
    """Scrape page 9 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_9.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=9"
    output_csv = 'kompas_global_9.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/17/130850970/hindari-perang-lawan-israel-suriah-tarik-pasukan
Success: Hindari Perang Lawan Israel, Suriah Tarik Pasukan...

Scraping: https://www.kompas.com/global/read/2025/07/17/122122170/serangan-rusia-hantam-toko-ukraina-saat-ramai-belanja-2-orang-tewas
Success: Serangan Rusia Hantam Toko Ukraina Saat Ramai Belanja, 2 Orang Tewas...

Scraping: https://www.kompas.com/global/read/2025/07/17/122011570/malaysia-deportasi-181-wni-kjri-johor-bahru-fasilitasi-pemulangan
Success: Malaysia Deportasi 181 WNI, KJRI Johor Bahru Fasilitasi Pemulangan...

Scraping: https://www.kompas.com/global/read/2025/07/17/113000770/paranormal-top-as-meninggal-mendadak-saat-tur-boneka-annabelle
Success: Paranormal Top AS Meninggal Mendadak Sa

In [29]:
def main():
    """Scrape page 10 of the Kompas Global index and save the articles to CSV.

    Downloads the listing page, collects every anchor whose href contains
    '/read/' or '/global/', scrapes each unique URL with ``scrape_kompas``
    (sleeping 5 seconds before each request) and writes the successfully
    scraped articles to 'kompas_global_10.csv'. Any exception is caught and
    printed; the function returns nothing.
    """
    base_url = "https://www.kompas.com/global?page=10"
    output_csv = 'kompas_global_10.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx instead of parsing an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate links with a deliberately broad filter,
        # keeping first-seen order and dropping repeats.
        news_links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if '/read/' in href or '/global/' in href:
                full_url = urljoin(base_url, href)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    news_links.append(full_url)

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # throttle requests to the site
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_csv, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 40 news links

Scraping: https://www.kompas.com/global/read/2025/07/22/105214670/temuan-terbaru-insiden-jeju-air-pilot-salah-matikan-mesin
Success: Temuan Terbaru Insiden Jeju Air: Pilot Salah Matikan Mesin...

Scraping: https://www.kompas.com/global/read/2025/07/16/192808270/jet-f-15-israel-hampir-gagal-serang-iran-rusak-saat-masuk-perbatasan
Success: Jet F-15 Israel Hampir Gagal Serang Iran, Rusak Saat Masuk Perbatasan...

Scraping: https://www.kompas.com/global/read/2025/07/16/183100370/wni-asal-madura-dikeroyok-di-kuala-lumpur-dubes-minta-tak-ada-aksi
Success: WNI Asal Madura Dikeroyok di Kuala Lumpur, Dubes Minta Tak Ada Aksi Balasan...

Scraping: https://www.kompas.com/global/read/2025/07/16/180720870/jepang-sebut-ancaman-militer-china-rusia-dan-korut-terburuk-sejak-perang
Success: Jepang Sebut Ancaman Militer China, Rusia, dan Korut Terburuk Sejak Perang Dunia II...

Scraping: https://www.kompas.com/global/read/2025/07/16/174100370/buntut-air-india-jatuh-ada-usul-pasang-ka

In [None]:
import pandas as pd
import ollama
from tqdm import tqdm

In [None]:
# Per-page CSV files to merge: kompas_global_1.csv ... kompas_global_10.csv.
file_paths = [f'kompas_global_{page}.csv' for page in range(1, 11)]

# Read every page file that exists. A scraping cell writes no file when it
# could not scrape anything, so a missing file is reported and skipped
# instead of aborting the whole merge.
page_frames = []
missing_files = []
for file in file_paths:
    try:
        page_frames.append(pd.read_csv(file))
    except FileNotFoundError:
        missing_files.append(file)

if missing_files:
    print(f"Skipping missing files: {', '.join(missing_files)}")
if not page_frames:
    raise FileNotFoundError(
        "None of the per-page CSV files exist; run the scraping cells first"
    )

all_data = pd.concat(page_frames, ignore_index=True)

# The same article can be linked from several listing pages (the scraping
# outputs show one URL repeated on every page); keep its first copy only.
if 'url' in all_data.columns:
    all_data = all_data.drop_duplicates(subset='url').reset_index(drop=True)

# Save the merged result to a new file.
all_data.to_csv('kompas_global.csv', index=False, encoding='utf-8-sig')

In [None]:
# Use a lightweight model.
MODEL_NAME = "phi3"

def generate_qa_light(text):
    """Ask the Ollama model for one short question/answer pair about ``text``.

    Only the first 1000 characters of ``text`` are put into the prompt to
    keep the request small.

    Args:
        text: Article text (a string).

    Returns:
        The raw model response string (expected to follow the
        'Pertanyaan: ...' / 'Jawaban: ...' format requested in the prompt),
        or None when the Ollama call raises (the error is printed).
    """
    # The truncation note is kept out of the f-string: written inside the
    # literal it would be sent to the model as part of the prompt.
    prompt = f"""
    Buat 1 pertanyaan dan jawaban singkat (maks 20 kata) dari teks berikut:
    {text[:1000]}
    Format: Pertanyaan: <pertanyaan>\nJawaban: <jawaban>
    """

    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.3,  # lower creativity for more consistent output
                "num_ctx": 1024  # limit the context window
            }
        )
        return response['response']
    except Exception as e:
        print(f"Error: {e}")
        return None

In [None]:
# Build a question/answer CSV from the scraped articles.
def process_csv(input_path, output_path, sample_size=None, random_state=None):
    """Generate one question/answer pair per article and write them to CSV.

    Args:
        input_path: CSV file with a 'content' column.
        output_path: Destination CSV; always written with the columns
            'content', 'question' and 'answer', even when no row succeeds.
        sample_size: If truthy, process a random sample of at most this
            many rows instead of the whole file.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            sample can be reproduced; None keeps it random.

    Rows are skipped when the content is missing or blank, when
    ``generate_qa_light`` returns nothing, or when the response does not
    contain the 'Pertanyaan: ' marker. A response without 'Jawaban: ' is
    kept with '-' as the answer.
    """
    df = pd.read_csv(input_path)
    if sample_size:
        df = df.sample(min(sample_size, len(df)), random_state=random_state)

    results = []
    for content in tqdm(df['content'], total=len(df)):
        # Empty CSV cells are read back as NaN; str(NaN) would send the
        # literal text "nan" to the model, so skip those rows.
        if pd.isna(content) or not str(content).strip():
            continue

        generated = generate_qa_light(str(content))
        if not generated:
            continue

        question_parts = generated.split("Pertanyaan: ")
        if len(question_parts) < 2:
            # The response did not follow the requested format.
            continue
        question = question_parts[1].split("\n")[0].strip()

        answer_parts = generated.split("Jawaban: ")
        answer = answer_parts[1].strip() if len(answer_parts) > 1 else "-"

        results.append({
            'content': content,
            'question': question,
            'answer': answer
        })

    # Explicit columns keep the header present even with zero results.
    pd.DataFrame(results, columns=['content', 'question', 'answer']).to_csv(output_path, index=False)

# Generate Q&A pairs for a sample of at most 50 rows of kompas_global.csv and save them to kg_qa.csv.
process_csv('kompas_global.csv', 'kg_qa.csv', sample_size=50)

100%|██████████| 50/50 [39:06<00:00, 46.93s/it]
