In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

In [3]:
def scrape_kompas(url):
    """Download one Kompas article page and extract its headline and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``title``, ``content`` and ``url``. When the
        headline or the body container is not found, the corresponding value
        is an Indonesian placeholder string. Returns ``None`` (after printing
        the error) if any exception is raised while fetching or parsing.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    title_classes = ['read__title', 'headline__title']

    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Headline: an <h1> carrying either of the two title classes.
        heading = soup.find('h1', {'class': title_classes})
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = 'Judul tidak ditemukan'

        # Body: try the primary container first, then the alternative one.
        body = soup.find('div', class_='read__content')
        if not body:
            body = soup.find('div', class_='article__content')

        if body:
            # Join the non-empty paragraph texts, one paragraph per line.
            texts = (p.get_text(strip=True) for p in body.find_all('p'))
            content = '\n'.join(t for t in texts if t)
        else:
            content = 'Konten tidak ditemukan'

        return {'title': title, 'content': content, 'url': url}

    except Exception as e:
        print(f"Error saat scraping {url}: {str(e)}")
        return None

In [4]:
def main(page=1, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/23/11523801/hut-ke-80-ri-tak-digelar-di-ikn-komisi-ii-dpr-jakarta-masih-ibu-kota
Success: HUT Ke-80 RI Tak Digelar di IKN, Komisi II DPR: Jakarta Masih Ibu Kota...

Scraping: https://nasional.kompas.com/read/2025/07/23/11381711/beasiswa-wirausaha-pegadaian-batch-2-dorong-tumbuhnya-pelaku-usaha-muda
Success: Beasiswa Wirausaha Pegadaian Batch 2, Dorong Tumbuhnya Pelaku Usaha Muda...

Scraping: https://nasional.kompas.com/read/2025/07/23/11373351/eks-jaksa-agung-jadi-amicus-curiae-hasto-kapuspenkum-itu-hak-beliau-sifatnya
Success: Eks Jaksa Agung Jadi Amicus Curiae Hasto, Kapuspenkum: Itu Hak Beliau, Sifatnya Pribadi...

Scraping: https://nasional.kompas.com/read/2025/07/23/11340761/kenapa-kpk-bersurat-ke-presiden-dan-ketua-dpr-so

In [6]:
def main(page=2, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/23/09444421/kemenpan-rb-setujui-219364-formasi-jabatan-fungsional-kemenag-guru-paling
Success: Kemenpan-RB Setujui 219.364 Formasi Jabatan Fungsional Kemenag, Guru Paling Banyak...

Scraping: https://nasional.kompas.com/read/2025/07/23/09385541/lantik-2000-perwira-tni-polri-prabowo-terima-kasih-telah-merelakan-putra
Success: Lantik 2.000 Perwira TNI/Polri, Prabowo: Terima Kasih Telah Merelakan Putra-putrinya...

Scraping: https://nasional.kompas.com/read/2025/07/23/09361161/anggota-dpr-eks-marinir-satria-langgar-sumpah-sapta-marga-prajurit
Success: Anggota DPR: Eks Marinir Satria Langgar Sumpah Sapta Marga Prajurit...

Scraping: https://nasional.kompas.com/read/2025/07/23/09325791/polisi-dan-jaksa-jadi-saksi-kasus-korupsi-jalan-

In [7]:
def main(page=3, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/23/05412321/cak-imin-boyong-agam-rinjani-ke-bimtek-pkb-ini-aktivis-gunung-yang-tolong
Success: Cak Imin Boyong Agam Rinjani ke Bimtek PKB: Ini Aktivis Gunung yang Tolong Banyak Pendaki WNA...

Scraping: https://nasional.kompas.com/read/2025/07/23/05281951/cak-imin-undang-prabowo-gibran-anies-di-harlah-ke-27-pkb-malam-ini
Success: Cak Imin Undang Prabowo, Gibran, Anies di Harlah ke-27 PKB Malam Ini...

Scraping: https://nasional.kompas.com/read/2025/07/22/23031161/3-napi-filipina-yang-dipenjara-seumur-hidup-di-ri-berpotensi-dipulangkan
Success: 3 Napi Filipina yang Dipenjara Seumur Hidup di RI Berpotensi Dipulangkan...

Scraping: https://nasional.kompas.com/read/2025/07/22/23023771/cak-imin-doakan-tom-lembong-dapat-keadilan-di-ti

In [8]:
def main(page=4, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 30 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/20241481/prabowo-sebut-relasi-politik-kakak-adik-politikus-pdip-tak-harus-serumah
Success: Prabowo Sebut Relasi Politik Kakak-Adik, Politikus PDIP: Tak Harus Serumah...

Scraping: https://nasional.kompas.com/read/2025/07/22/20210941/pdip-gerindra-disebut-kakak-adik-deddy-sitorus-sinyal-yang-ditujukan-prabowo
Success: PDIP-Gerindra Disebut Kakak Adik, Deddy Sitorus: Sinyal yang Ditujukan Prabowo...

Scraping: https://nasional.kompas.com/read/2025/07/22/20114971/guru-besar-ui-romo-magnis-hingga-eks-jaksa-agung-kirim-amicus-curiae-untuk
Success: Guru Besar UI, Romo Magnis, hingga Eks Jaksa Agung Kirim Amicus Curiae untuk Hasto...

Scraping: https://nasional.kompas.com/read/2025/07/22/20102781/komisi-ii-dpr-akan-kaji-usul-morator

In [9]:
def main(page=5, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/18164071/jaga-stabilitas-harga-dan-inflasi-mendagri-tekankan-pentingnya-cadangan
Success: Jaga Stabilitas Harga dan Inflasi, Mendagri Tekankan Pentingnya Cadangan Pangan Pemerintah Daerah...

Scraping: https://nasional.kompas.com/read/2025/07/22/18160311/mendagri-minta-pemda-wajib-dukung-program-strategis-nasional
Success: Mendagri Minta Pemda Wajib Dukung Program Strategis Nasional...

Scraping: https://nasional.kompas.com/read/2025/07/22/18114011/kejagung-ajukan-ekstradisi-untuk-jurist-tan-tersangka-kasus-chromebook
Success: Kejagung Ajukan Ekstradisi untuk Jurist Tan Tersangka Kasus Chromebook...

Scraping: https://nasional.kompas.com/read/2025/07/22/18111371/bahlil-tanggapi-potensi-pdi-p-masuk-kabinet-hak-prerogatif-presi

In [10]:
def main(page=6, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/16570061/menaker-beberkan-strategi-besar-ketenagakerjaan-2025-2029-di-forum-pemred
Success: Menaker Beberkan Strategi Besar Ketenagakerjaan 2025–2029 di Forum Pemred...

Scraping: https://nasional.kompas.com/read/2025/07/22/16545361/kasasi-ditolak-eks-dirut-pt-timah-tetap-dihukum-20-tahun-bui
Success: Kasasi Ditolak, Eks Dirut PT Timah Tetap Dihukum 20 Tahun Bui...

Scraping: https://nasional.kompas.com/read/2025/07/22/16533131/hikmahanto-pemerintah-bisa-saja-kembalikan-status-wni-eks-marinir-satria
Success: Hikmahanto: Pemerintah Bisa Saja Kembalikan Status WNI Eks Marinir Satria Arta...

Scraping: https://nasional.kompas.com/read/2025/07/22/16440861/kubu-tom-lembong-sebut-negara-justru-untung-rp-900-m-dari-impor-gula-mentah

In [11]:
def main(page=7, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/15023141/demokrat-terima-kasih-gibran-jenguk-sby-doa-baiknya-percepat-pemulihan
Success: Demokrat Terima Kasih Gibran Jenguk SBY: Doa Baiknya Percepat Pemulihan...

Scraping: https://nasional.kompas.com/read/2025/07/22/14592811/posisi-wakapolri-masih-kosong-kompolnas-imbau-segera-diisi
Success: Posisi Wakapolri Masih Kosong, Kompolnas Imbau Segera Diisi...

Scraping: https://nasional.kompas.com/read/2025/07/22/14561641/satria-eks-marinir-minta-dipulangkan-komisi-i-pastikan-dulu-kewarganegaraan
Success: Satria Eks Marinir Minta Dipulangkan, Komisi I: Pastikan Dulu Kewarganegaraan dan Loyalitasnya...

Scraping: https://nasional.kompas.com/read/2025/07/22/14505901/prabowo-panggil-sejumlah-menteri-dan-dirut-bumn-bahas-kawasan-eko

In [12]:
def main(page=8, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 31 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/12185361/kebakaran-km-barcelona-komisi-v-usut-tuntas
Success: Kebakaran KM Barcelona, Komisi V: Usut Tuntas!...

Scraping: https://nasional.kompas.com/read/2025/07/22/12062431/keakraban-prabowo-jokowi-di-solo-tegaskan-relasi-keduanya-hangat-bantah
Success: “Keakraban Prabowo-Jokowi di Solo Tegaskan Relasi Keduanya Hangat, Bantah Rumor Renggang"...

Scraping: https://nasional.kompas.com/read/2025/07/22/12010031/adhi-perkuat-diversifikasi-investasi-strategis-untuk-pertumbuhan-yang
Success: ADHI Perkuat Diversifikasi Investasi Strategis untuk Pertumbuhan yang Berkelanjutan...

Scraping: https://nasional.kompas.com/read/2025/07/22/11530731/kpk-ingatkan-ormas-agama-patuhi-regulasi-tata-kelola-tambang
Success: KPK Ingatkan Ormas Ag

In [13]:
def main(page=9, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/07573061/cak-imin-sarankan-warga-tak-mampu-yang-tercoret-dari-pbi-jkn-lapor-ke-dinsos
Success: Cak Imin Sarankan Warga Tak Mampu yang Tercoret dari PBI JKN Lapor ke Dinsos...

Scraping: https://nasional.kompas.com/read/2025/07/22/07455661/bebas-dari-myanmar-selebgram-ap-diminta-bijak-saat-berada-di-negara-lain
Success: Bebas dari Myanmar, Selebgram AP Diminta Bijak Saat Berada di Negara Lain...

Scraping: https://nasional.kompas.com/read/2025/07/22/07332501/eks-marinir-satria-arta-minta-pulang-tni-al-tegaskan-sudah-diberhentikan
Success: Eks Marinir Satria Arta Minta Pulang, TNI AL Tegaskan Sudah Diberhentikan Tidak Hormat...

Scraping: https://nasional.kompas.com/read/2025/07/22/07315741/menanti-aksi-aparat-usut-beras-oplosa

In [14]:
def main(page=10, output_path=None):
    """Scrape one Kompas Nasional listing page and save its articles to CSV.

    Args:
        page: Listing page number. Page 1 uses the bare section URL; any other
            value is requested as ``?page=<page>``.
        output_path: Destination CSV path. When omitted,
            ``kompas_nasional_<page>.csv`` is used.

    Progress and every error are reported with ``print``; exceptions are not
    propagated to the caller.
    """
    section_url = "https://nasional.kompas.com/"
    base_url = section_url if page == 1 else f"{section_url}?page={page}"
    if output_path is None:
        output_path = f'kompas_nasional_{page}.csv'
    all_news_data = []

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(base_url, headers=headers, timeout=10)
        # Stop on an HTTP error status instead of harvesting links from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect every anchor whose href looks like an article link.
        # dict.fromkeys removes duplicates in linear time and keeps first-seen order.
        news_links = list(dict.fromkeys(
            urljoin(base_url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if '/read/' in anchor['href'] or '/nasional/' in anchor['href']
        ))

        print(f"Found {len(news_links)} news links")

        for link in news_links:
            print(f"\nScraping: {link}")
            time.sleep(5)  # wait 5 s before each article request
            news_data = scrape_kompas(link)
            if news_data:
                print(f"Success: {news_data['title'][:100]}...")
                all_news_data.append(news_data)
            else:
                print("Failed to scrape this article")

        if all_news_data:
            df = pd.DataFrame(all_news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"\nBerhasil menyimpan {len(df)} berita ke CSV")
        else:
            print("Tidak ada data yang berhasil di-scrape")

    except Exception as e:
        # Report the failure with print; nothing is re-raised.
        print(f"Main function error: {str(e)}")

if __name__ == "__main__":
    main()

Found 32 news links

Scraping: https://nasional.kompas.com/read/2025/07/22/10123071/keakraban-prabowo-jokowi-di-solo-tepis-isu-pecah-kongsi
Success: Keakraban Prabowo-Jokowi di Solo, Tepis Isu Pecah Kongsi?...

Scraping: https://nasional.kompas.com/read/2025/07/22/01103991/bos-bos-bank-daerah-jadi-tersangka-karena-beri-kredit-ke-pt-sritex-meski-tak
Success: Bos-bos Bank Daerah Jadi Tersangka karena Beri Kredit ke PT Sritex meski Tak Layak...

Scraping: https://nasional.kompas.com/read/2025/07/22/00490231/kejagung-duga-eks-direktur-sritex-cairkan-kredit-pakai-invoice-fiktif
Success: Kejagung Duga Eks Direktur Sritex Cairkan Kredit Pakai Invoice Fiktif...

Scraping: https://nasional.kompas.com/read/2025/07/22/00464461/eks-bos-keuangan-sritex-allan-severino-pakai-kredit-bank-buat-bayar-utang
Success: Eks Bos Keuangan Sritex Allan Severino Pakai Kredit Bank buat Bayar Utang, Bukan Modal Kerja...

Scraping: https://nasional.kompas.com/read/2025/07/22/00263971/kejagung-ungkap-kerugian-keuang

In [4]:
import pandas as pd
import ollama
from tqdm import tqdm

In [15]:
# Per-page CSV files to merge (pages 1 through 10).
file_paths = [f'kompas_nasional_{page}.csv' for page in range(1, 11)]

# Read every per-page file and stack the rows into one frame.
# The captured scraper output shows the same article URL being collected on
# more than one listing page, so keep only the first row for each URL.
all_data = (
    pd.concat((pd.read_csv(file) for file in file_paths), ignore_index=True)
    .drop_duplicates(subset='url')
    .reset_index(drop=True)
)

# Save the merged, de-duplicated result to a new file.
all_data.to_csv('kompas_nasional.csv', index=False, encoding='utf-8-sig')

In [5]:
# Use a lighter-weight model.
MODEL_NAME = "phi3"

def generate_qa_light(text, max_chars=1000):
    """Ask the Ollama model for one short question/answer pair about ``text``.

    Args:
        text: Source text; only the first ``max_chars`` characters are sent.
        max_chars: Maximum number of characters of ``text`` placed in the
            prompt. NOTE(review): ``num_ctx`` is fixed at 1024 below, so much
            larger values may not fit the context window — confirm before raising.

    Returns:
        The ``'response'`` field of the ``ollama.generate`` result, or ``None``
        (after printing the error) if any exception is raised.
    """
    # Truncate up front. The explanatory note about the shorter text used to
    # sit inside the f-string and was therefore sent to the model as prompt text.
    excerpt = text[:max_chars]
    prompt = f"""
    Buat 1 pertanyaan dan jawaban singkat (maks 20 kata) dari teks berikut:
    {excerpt}
    Format: Pertanyaan: <pertanyaan>\nJawaban: <jawaban>
    """

    try:
        response = ollama.generate(
            model=MODEL_NAME,
            prompt=prompt,
            options={
                "temperature": 0.3,  # lower creativity for more consistent results
                "num_ctx": 1024  # limit the context window
            }
        )
        return response['response']
    except Exception as e:
        print(f"Error: {e}")
        return None

In [6]:
# Turn the scraped articles into question/answer pairs.
def process_csv(input_path, output_path, sample_size=None, random_state=None):
    """Generate one question/answer pair per article and write them to CSV.

    Args:
        input_path: CSV file with a ``content`` column.
        output_path: Destination CSV with the columns ``content``,
            ``question`` and ``answer``.
        sample_size: If truthy, process a random sample of at most this many rows.
        random_state: Optional seed passed to ``DataFrame.sample`` so the same
            sample can be drawn again.

    Rows are skipped when ``content`` is missing, when generation returns
    nothing, or when the reply lacks the ``Pertanyaan: `` marker.
    """
    df = pd.read_csv(input_path)
    # A missing content cell would otherwise reach the model as the text "nan".
    df = df.dropna(subset=['content'])
    if sample_size:
        df = df.sample(min(sample_size, len(df)), random_state=random_state)

    results = []
    for content in tqdm(df['content'], total=len(df)):
        generated = generate_qa_light(str(content))
        if not generated or "Pertanyaan: " not in generated:
            # Generation failed or the model ignored the requested format.
            continue
        # Question: first line after the marker. Answer: text after its marker, or "-".
        q = generated.split("Pertanyaan: ")[1].split("\n")[0].strip()
        a = generated.split("Jawaban: ")[1].strip() if "Jawaban: " in generated else "-"
        results.append({
            'content': content,
            'question': q,
            'answer': a
        })

    # Explicit columns keep the header row even when no pair was produced.
    pd.DataFrame(results, columns=['content', 'question', 'answer']).to_csv(output_path, index=False)

process_csv('kompas_nasional.csv', 'kn_qa.csv', sample_size=50)

100%|██████████| 50/50 [45:04<00:00, 54.10s/it]
