In [None]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [10]:
# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Candidate locations, tried in order. Plain strings are CSS selectors whose
# text is used; tuples are (tag, attribute, value) lookups whose ``content``
# attribute is used.
_TITLE_SELECTORS = ['h1', 'h2', ('meta', 'property', 'og:title')]
_DATE_SELECTORS = [
    'div.date',
    'span.date',
    'time',
    ('meta', 'property', 'article:published_time'),
]
_CONTENT_SELECTORS = [
    'div.detail__body-text',
    'div.text-detail',
    'div.article-content',
    'div.content',
]
# Elements removed from the article body before its paragraphs are read.
_UNWANTED_IN_CONTENT = 'script, style, div.ads, figure, iframe, ul.related'


def _fetch_soup(session, url, timeout):
    """Download ``url`` and return it parsed; raises on network or HTTP errors."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _collect_article_links(index_soup, base_url):
    """Return the unique '/nasional/' links found on an index page, in page order."""
    containers = index_soup.find_all(
        ['article', 'div'], class_=re.compile(r'(article|news|berita)')
    )
    # A dict is used as an ordered set: O(1) de-duplication, page order kept.
    links = {}
    for container in containers:
        # Inspect every anchor: a matching container may wrap several
        # articles, or start with a link that is not an article.
        for anchor in container.find_all('a', href=True):
            # urljoin resolves relative and protocol-relative hrefs against
            # the index URL and leaves absolute URLs untouched.
            link = urljoin(base_url, anchor['href'].strip())
            if '/nasional/' not in link:
                continue
            # Index/pagination pages live under /nasional/ too but are not articles.
            if '/nasional/indeks' in link:
                continue
            links.setdefault(link, None)
    return list(links)


def _first_value(soup, selectors):
    """Return the first non-empty value produced by ``selectors``, or None."""
    for selector in selectors:
        if isinstance(selector, tuple):
            tag_name, attribute, expected = selector
            tag = soup.find(tag_name, {attribute: expected})
            value = (tag.get('content') or '').strip() if tag else ''
        else:
            # select_one understands CSS selectors such as 'div.date';
            # find() would treat that string as a literal tag name.
            tag = soup.select_one(selector)
            value = tag.get_text(strip=True) if tag else ''
        if value:
            return value
    return None


def _extract_content(soup):
    """Return the article body as one whitespace-normalised string, or None."""
    for selector in _CONTENT_SELECTORS:
        container = soup.select_one(selector)
        if container is None:
            continue
        # extract() detaches the node but leaves it intact, so a match nested
        # inside an earlier match can still be processed safely.
        for unwanted in container.select(_UNWANTED_IN_CONTENT):
            unwanted.extract()
        text = ' '.join(p.get_text(strip=True) for p in container.find_all('p'))
        text = re.sub(r'\s+', ' ', text).strip()
        if text:
            return text
        # Empty container: fall through and try the next candidate selector.
    return None


def _scrape_article(session, news_url, timeout):
    """Fetch one article page and return its title/content/date/url record."""
    soup = _fetch_soup(session, news_url, timeout)
    # Evaluation order matters: the title is read before _extract_content
    # strips elements from the page, and the date is read after.
    return {
        'title': _first_value(soup, _TITLE_SELECTORS) or 'No Title',
        'content': _extract_content(soup) or 'No Content',
        'date': _first_value(soup, _DATE_SELECTORS) or 'No Date',
        'url': news_url,
    }


def scrape_cnn_news(url, output_path='cnn_nasional_1.csv', delay=3, timeout=10):
    """Scrape the articles linked from a CNN Indonesia index page into a CSV file.

    The index page at ``url`` is fetched, every '/nasional/' article link found
    inside article-like containers is visited, and one row per article
    (title, content, date, url) is written to ``output_path``. Progress and
    errors are reported with ``print``; no exception propagates to the caller.

    Args:
        url: Index page listing the articles to scrape.
        output_path: Destination CSV file (written as UTF-8 with BOM).
        delay: Seconds to sleep before each article request, to avoid
            sending requests too quickly.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook cell.

    Returns:
        None.
    """
    try:
        # One session reuses the connection and carries the headers for all requests.
        with requests.Session() as session:
            session.headers.update(_REQUEST_HEADERS)

            index_soup = _fetch_soup(session, url, timeout)
            news_links = _collect_article_links(index_soup, url)
            print(f"Found {len(news_links)} article links")

            news_data = []
            for news_url in news_links:
                try:
                    time.sleep(delay)
                    record = _scrape_article(session, news_url, timeout)
                except Exception as e:
                    # Best effort: one failing article must not abort the run.
                    print(f"Error scraping {news_url}: {str(e)}")
                    continue

                news_data.append(record)
                print(f"Scraped: {record['title']}")
                print(f"Content length: {len(record['content'])} characters")

        if news_data:
            df = pd.DataFrame(news_data)
            df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"Successfully saved {len(news_data)} articles")
        else:
            print("No articles were scraped")

    except Exception as e:
        print(f"Error: {str(e)}")

# Target URL: the CNN Indonesia "nasional" index (page 3, judging by the URL path).
target_url = "https://www.cnnindonesia.com/nasional/indeks/3"
# Run the scraper on that index page; it prints its progress and returns nothing.
scrape_cnn_news(target_url)

No articles were scraped
