## CRAWL dari CNBC

### Def untuk crawling dari Portal CNBC
Note : 
- ganti User-Agent pada headers -> ke user agent browser sendiri

In [5]:
import os
import requests
from bs4 import BeautifulSoup
import csv
import time
import urllib.parse


headers_utama = {
        #'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
}

def cnbc_news_search(keyword, n_berita):
    headers = headers_utama

    data_cnbc = []
    seen_urls = set()  # Untuk track URL yang sudah disimpan
    page = 1

    print(f"Mulai scraping berita dari CNBC Indonesia dengan kata kunci '{keyword}'...")

    while len(data_cnbc) < n_berita:
        encoded_keyword = urllib.parse.quote(keyword)
        search_url = f'https://www.cnbcindonesia.com/search?query={encoded_keyword}&page={page}'
        print(f"\n🔄 Mengambil halaman {page} dari URL: {search_url}")

        try:
            res = requests.get(search_url, headers=headers, timeout=20)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"❌ Error saat mengakses halaman pencarian: {e}")
            break

        soup = BeautifulSoup(res.content, 'html.parser')

        # Selector artikel untuk CNBC Indonesia
        selectors = [
            'div.list_search',
            'div[class*="list"]',
            'div[class*="search"]',
            'article',
            'div[class*="item"]'
        ]

        articles = []
        for selector in selectors:
            articles = soup.select(selector)
            if articles:
                print(f"✅ Menemukan artikel dengan selector: {selector}")
                break

        if not articles:
            print("❌ Tidak ada artikel yang ditemukan di halaman ini. Coba periksa struktur website.")
            print("HTML snippet untuk debugging:")
            print(soup.prettify()[:500])
            break

        for article in articles:
            if len(data_cnbc) >= n_berita:
                break

            try:
                title_elem = None
                url_berita = None

                # Selector untuk title dan URL CNBC Indonesia
                title_selectors = [
                    'h4 a',
                    'h3 a',
                    'h2 a',
                    'a[href*="/news/"]',
                    'a[href*="cnbcindonesia.com"]',
                    'div.title a',
                    '.media_list_link'
                ]

                for selector in title_selectors:
                    element = article.select_one(selector)
                    if element and element.get('href'):
                        title_elem = element
                        url_berita = element['href']
                        break

                if not title_elem or not url_berita:
                    continue

                # Pastikan URL lengkap
                if url_berita.startswith('/'):
                    url_berita = 'https://www.cnbcindonesia.com' + url_berita

                judul = title_elem.get_text(strip=True)

                # Skip artikel tanpa judul
                if not judul:
                    continue

                # Cek duplikat URL
                if url_berita in seen_urls:
                    continue
                seen_urls.add(url_berita)

                # Ambil tanggal
                tanggal = 'Tanggal tidak ditemukan'
                date_selectors = [
                    'span.date',
                    'time',
                    'div.date',
                    'span[class*="date"]',
                    'div[class*="date"]',
                    '[class*="time"]'
                ]

                for selector in date_selectors:
                    date_elem = article.select_one(selector)
                    if date_elem:
                        tanggal = date_elem.get_text(strip=True)
                        break

                print(f"[{len(data_cnbc) + 1}] Memproses berita: '{judul}'")

                # Ambil konten detail berita
                try:
                    detail_res = requests.get(url_berita, headers=headers, timeout=20)
                    detail_res.raise_for_status()
                    detail_soup = BeautifulSoup(detail_res.content, 'html.parser')

                    # Selector untuk konten CNBC Indonesia - Lebih komprehensif
                    content_selectors = [
                        'div.detail_text',
                        'div.content_detail',
                        'div.post_content',
                        'div.entry-content',
                        'div.article-content',
                        'div[class*="detail"]',
                        'div[class*="content"]',
                        'div[class*="text"]',
                        'div[class*="body"]',
                        'section[class*="content"]',
                        'article',
                        '.detail_area',
                        '.post-content',
                        '.entry-content'
                    ]

                    berita = ""
                    content_found = False

                    for selector in content_selectors:
                        content_div = detail_soup.select_one(selector)
                        if content_div:
                            print(f"🔍 Menggunakan selector: {selector}")

                            # Hapus elemen yang tidak diinginkan (ads, related articles, etc.)
                            unwanted_selectors = [
                                'script', 'style', 'noscript',
                                '.ads', '.advertisement', '[class*="ads"]', '[id*="ads"]',
                                '.related', '.recommend', '[class*="related"]',
                                '.social', '.share', '[class*="social"]', '[class*="share"]',
                                '.comment', '[class*="comment"]',
                                '.navigation', '.nav', '[class*="nav"]',
                                '.footer', '[class*="footer"]',
                                '.sidebar', '[class*="sidebar"]',
                                '.widget', '[class*="widget"]',
                                'iframe', 'embed', 'object'
                            ]

                            for unwanted_selector in unwanted_selectors:
                                unwanted_elements = content_div.select(unwanted_selector)
                                for elem in unwanted_elements:
                                    elem.decompose()

                            # Coba berbagai cara ekstraksi konten
                            paragraphs = content_div.find_all(['p', 'div'])
                            if paragraphs:
                                # Filter paragraf yang memiliki teks substansial
                                valid_paragraphs = []
                                for p in paragraphs:
                                    text = p.get_text(strip=True)
                                    if len(text) > 20 and not any(skip_word in text.lower() for skip_word in
                                        ['baca juga', 'lihat juga', 'subscribe', 'follow', 'share', 'comment']):
                                        valid_paragraphs.append(text)

                                if valid_paragraphs:
                                    berita = ' '.join(valid_paragraphs)
                                    content_found = True

                            # Jika tidak ada paragraf yang valid, ambil semua teks
                            if not content_found:
                                berita = content_div.get_text(separator=' ', strip=True)
                                # Clean up extra whitespace
                                berita = ' '.join(berita.split())

                            print(f"📝 Panjang konten: {len(berita)} karakter")
                            if len(berita) > 200:  # Threshold lebih tinggi
                                content_found = True
                                break

                    # Fallback: ambil semua paragraf dari halaman
                    if not content_found or len(berita) < 100:
                        print("🔄 Mencoba fallback method: mengambil semua paragraf dari halaman.")

                except requests.exceptions.RequestException as e:
                    print(f"⚠ Gagal mengambil konten dari URL: {url_berita}. Error: {e}")
                    berita = "Error mengambil konten."

                # Simpan data jika berita valid
                if len(berita.strip()) > 20:
                    data_cnbc.append({
                        'judul': judul,
                        'berita': berita,
                        'url': url_berita,
                        'tanggal': tanggal
                    })

            except Exception as e:
                print(f"⚠ Error saat memproses artikel: {e}")
                continue

        # Jeda antar halaman
        time.sleep(3)
        page += 1

        # Safety break untuk menghindari infinite loop
        if page > 1000:
            print("⚠ Mencapai batas maksimal halaman (1000). Menghentikan scraping.")
            break

    # Simpan hasil ke CSV
    if data_cnbc:
        
        save_dir = r'E:\$7th\TA\Multimodal_Process_Exploration\DATA\corpus'
        os.makedirs(save_dir, exist_ok=True)  # bikin folder kalau belum ada
        filename = os.path.join(save_dir, f'cnbc_news_search_{keyword.lower().replace(" ", "_")}.csv')
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['judul', 'berita', 'url', 'tanggal'])
            writer.writeheader()
            writer.writerows(data_cnbc)
        print(f"\n✅ {len(data_cnbc)} berita berhasil disimpan ke {filename}")
    else:
        print("\n❌ Tidak ada data yang ditemukan. Kemungkinan:")
        print("1. Struktur website telah berubah")
        print("2. Ada pembatasan akses (rate limiting)")
        print("3. Kata kunci tidak menghasilkan hasil")
        print("4. Halaman pencarian kosong")

    return data_cnbc

def cnbc_advanced_search(keyword, n_berita, start_page):
    """
    Versi advanced dengan lebih banyak opsi kustomisasi
    """
    headers = headers_utama

    data_cnbc = []
    seen_urls = set()
    page = start_page
    retry_count = 0
    max_retries = 3

    print(f"Mulai scraping berita CNBC Indonesia (Advanced)")
    print(f"Kata kunci: '{keyword}' | Target: {n_berita} berita | Mulai dari halaman: {start_page}")

    while len(data_cnbc) < n_berita:
        encoded_keyword = urllib.parse.quote(keyword)
        search_url = f'https://www.cnbcindonesia.com/search?query={encoded_keyword}&page={page}'

        try:
            print(f"\n🔄 Halaman {page} - Target tersisa: {n_berita - len(data_cnbc)} berita")
            res = requests.get(search_url, headers=headers, timeout=25)
            res.raise_for_status()
            retry_count = 0  # Reset retry count on success

        except requests.exceptions.RequestException as e:
            retry_count += 1
            if retry_count <= max_retries:
                print(f"⚠ Retry {retry_count}/{max_retries} untuk halaman {page}: {e}")
                time.sleep(5 * retry_count)  # Exponential backoff
                continue
            else:
                print(f"❌ Gagal mengakses halaman {page} setelah {max_retries} percobaan: {e}")
                break

        soup = BeautifulSoup(res.content, 'html.parser')

        # Multiple selectors untuk fleksibilitas
        article_selectors = [
            'div.list_search',
            'div[class*="media_list"]',
            'div[class*="search_item"]',
            'div[class*="list"]',
            'article'
        ]

        articles = []
        for selector in article_selectors:
            articles = soup.select(selector)
            if articles:
                print(f"✅ Ditemukan {len(articles)} artikel dengan selector: {selector}")
                break

        if not articles:
            print(f"❌ Halaman {page} kosong atau struktur berubah")
            if page == start_page:
                print("Debug HTML snippet:")
                print(soup.prettify()[:800])
            page += 1
            continue

        articles_processed = 0
        for article in articles:
            if len(data_cnbc) >= n_berita:
                break

            try:
                # Extract title and URL
                title_elem = None
                url_berita = None

                link_selectors = [
                    'h4 a', 'h3 a', 'h2 a',
                    'a[href*="/news/"]',
                    'a[href*="/market/"]',
                    'a[href*="/tech/"]',
                    '.media_list_link'
                ]

                for selector in link_selectors:
                    element = article.select_one(selector)
                    if element and element.get('href'):
                        title_elem = element
                        url_berita = element['href']
                        break

                if not title_elem or not url_berita:
                    continue

                if url_berita.startswith('/'):
                    url_berita = 'https://www.cnbcindonesia.com' + url_berita

                judul = title_elem.get_text(strip=True)
                if not judul or url_berita in seen_urls:
                    continue

                seen_urls.add(url_berita)

                print(f"[{len(data_cnbc) + 1}] {judul[:60]}...")

                # Get article content
                berita = get_article_content(url_berita, headers)

                if len(berita.strip()) > 30:
                    data_cnbc.append({
                        'judul': judul,
                        'berita': berita,
                        'url': url_berita,
                    })
                    articles_processed += 1

            except Exception as e:
                print(f"⚠ Error memproses artikel: {e}")
                continue

        print(f"📝 Halaman {page}: {articles_processed} artikel berhasil diproses")

        if articles_processed == 0:
            print(f"⚠ Tidak ada artikel valid di halaman {page}")

        page += 1
        time.sleep(2)  # Reduced delay

        if page > start_page + 100:  # Safety limit
            print("⚠ Mencapai batas halaman maksimal")
            break

    # Save results
    if data_cnbc:
        timestamp = time.strftime("%Y%m%d_%H%M%S")

        save_dir = r'E:\$7th\TA\Multimodal_Process_Exploration\DATA\corpus'
        os.makedirs(save_dir, exist_ok=True)
        filename = os.path.join(save_dir, f'cnbc_news_{keyword.lower().replace(" ", "")}{timestamp}.csv')


        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['judul', 'berita', 'url', 'tanggal'])
            writer.writeheader()
            writer.writerows(data_cnbc)

        print(f"\n✅ Scraping selesai!")
        print(f"📄 {len(data_cnbc)} berita disimpan ke: {filename}")
        print(f"🔍 Kata kunci: {keyword}")
        print(f"📊 Halaman yang diproses: {start_page} - {page-1}")

    return data_cnbc

def get_article_content(url, headers):
    """
    Helper function untuk mengambil konten artikel
    """
    try:
        detail_res = requests.get(url, headers=headers, timeout=15)
        detail_res.raise_for_status()
        detail_soup = BeautifulSoup(detail_res.content, 'html.parser')

        content_selectors = [
            'div.detail_text',
            'div.content_detail',
            'div.post_content',
            'div.entry-content',
            'div.article-content',
            'div[class*="detail"]',
            'div[class*="content"]',
            'div[class*="text"]',
            'div[class*="body"]',
            'section[class*="content"]',
            'article',
            '.detail_area',
            '.post-content',
            '.entry-content'
        ]

        for selector in content_selectors:
            content_div = detail_soup.select_one(selector)
            if content_div:
                # Clean unwanted elements lebih agresif
                unwanted_selectors = [
                    'script', 'style', 'noscript',
                    '.ads', '.advertisement', '[class*="ads"]', '[id*="ads"]',
                    '.related', '.recommend', '[class*="related"]',
                    '.social', '.share', '[class*="social"]', '[class*="share"]',
                    '.comment', '[class*="comment"]',
                    '.navigation', '.nav', '[class*="nav"]',
                    'iframe', 'embed', 'object'
                ]

                for unwanted_selector in unwanted_selectors:
                    for unwanted in content_div.select(unwanted_selector):
                        unwanted.decompose()

                # Extract paragraphs dengan filtering
                paragraphs = content_div.find_all(['p', 'div'])
                if paragraphs:
                    valid_content = []
                    for p in paragraphs:
                        text = p.get_text(strip=True)
                        # Filter konten yang relevan
                        if (len(text) > 20 and
                            not any(skip in text.lower() for skip in
                                   ['baca juga', 'lihat juga', 'subscribe', 'follow', 'share'])):
                            valid_content.append(text)

                    if valid_content:
                        content = ' '.join(valid_content)
                    else:
                        content = content_div.get_text(separator=' ', strip=True)
                        content = ' '.join(content.split())  # Clean whitespace
                else:
                    content = content_div.get_text(separator=' ', strip=True)
                    content = ' '.join(content.split())

                if len(content) > 200:  # Threshold lebih tinggi
                    return content

        # Fallback method - ambil semua paragraf dari halaman
        all_paragraphs = detail_soup.find_all('p')
        if all_paragraphs:
            fallback_content = []
            for p in all_paragraphs:
                text = p.get_text(strip=True)
                if len(text) > 30:  # Paragraf substansial saja
                    fallback_content.append(text)

            if fallback_content:
                return ' '.join(fallback_content)

        return "Konten tidak dapat diambil - struktur website mungkin berubah."

    except Exception as e:
        return f"Error: {str(e)}"

if __name__ == "_main_":
    # Contoh penggunaan
    print("=== CNBC Indonesia News Scraper ===\n")

    # Opsi 1: Basic scraping
    # cnbc_news_search(keyword="IKN", n_berita=50)

    # Opsi 2: Advanced scraping dengan kontrol lebih banyak
    cnbc_advanced_search(keyword="food estate", n_berita=50, start_page=38)

In [None]:
# Basic
cnbc_news_search(keyword="ihsg", n_berita=20)

In [None]:
# Advanced
cnbc_advanced_search(keyword="ihsg", n_berita=1000, start_page=500)

Mulai scraping berita CNBC Indonesia (Advanced)
Kata kunci: 'ihsg' | Target: 1000 berita | Mulai dari halaman: 500

🔄 Halaman 500 - Target tersisa: 1000 berita
✅ Ditemukan 12 artikel dengan selector: div[class*="list"]
[1] VIDEOJika Suku Bunga Tinggi Berlangsung Lama, Rupiah Kian Vo...
[2] IHSG 6.700, Asing Ramai-Ramai Borong Saham IniMarket1 tahun ...
[3] IHSG Masih Merah, Asing Kurangi Porsi di Deretan Saham IniMa...
[4] Investasi Masih Suram, 2 Sektor Ini PemicunyaMarket1 tahun y...
[5] VIDEODana Asing Deras Tinggalkan RI, Rupiah Tertekan Sampai ...
[6] VIDEOMarket Focus: IHSG Lesu Hingga Pengusaha Minta Evaluasi...
[7] MARKET COMMENTARYIHSG Terkapar Lagi, 6 Saham Big Cap Ini Bia...
[8] Asing Makin Gencar Tarik Dana Keluar RI, IHSG Kian TertekanM...
📝 Halaman 500: 8 artikel berhasil diproses

🔄 Halaman 501 - Target tersisa: 992 berita
✅ Ditemukan 12 artikel dengan selector: div[class*="list"]
[9] Ini Hantu dari AS yang Bikin Pasar Keuangan RI GemetarMarket...
[10] Rupiah Babak Belur