In [None]:
!pip install requests beautifulsoup4

import asyncio
import csv
from urllib.parse import quote_plus, urljoin

import aiohttp
from bs4 import BeautifulSoup
import nest_asyncio



In [None]:
nest_asyncio.apply()  # Allow nested event loops in Jupyter Notebook

# Prompt user to input the keyword
keyword = input("Enter the exact phrase to search for: ").strip()

# Prompt user to input the maximum number of pages to search
max_pages = int(input("Enter the maximum number of pages to search: "))

# Set the base URL for search results on CNBC Indonesia; the page number is
# appended later. quote_plus turns spaces into '+' (as before) and also
# percent-encodes reserved characters such as '&', '#', '+' and '%', which
# would otherwise be interpreted as URL syntax and corrupt the query.
base_url = f'https://www.cnbcindonesia.com/search/?query={quote_plus(keyword)}&page='

# Function to fetch page content asynchronously
async def fetch(session, url):
    """Download *url* through *session* and return the response body as text.

    Best-effort: any exception raised while requesting or reading the
    response is printed and an empty string is returned instead.
    """
    try:
        async with session.get(url) as resp:
            body = await resp.text()
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        body = ""
    return body

# Helpers that work on an already-parsed article page, so one download can
# serve the phrase check, the title lookup and the date lookup.
def _publish_date_from_soup(article_soup):
    """Return the ``content`` of the ``publishdate`` meta tag, or None if absent."""
    meta_tag = article_soup.find('meta', attrs={'name': 'publishdate'})
    if meta_tag is None:
        return None
    return meta_tag.get('content')


def _normalize_text(text):
    """Collapse every run of whitespace in *text* to one space and lower-case it."""
    return ' '.join(text.split()).lower()


def _soup_contains_phrase(article_soup, phrase):
    """Return True if *phrase* occurs, case-insensitively, in the page's <main> element."""
    content = article_soup.find('main', class_='mx-auto w-full max-w-6xl flex-1 py-9')
    if content is None:
        return False
    # get_text(strip=True) without a separator glues the text of adjacent
    # tags together ("naik<a>utang</a>" -> "naikutang"), so phrases that span
    # a tag boundary never match. Join the pieces with a space instead and
    # normalise whitespace on both sides of the comparison.
    page_text = content.get_text(' ', strip=True)
    return _normalize_text(phrase) in _normalize_text(page_text)


# Function to extract the article's publication date from the article page's meta tag
async def extract_article_date(session, article_url):
    """Fetch *article_url* and return its ``publishdate`` meta value.

    Returns None when the tag is missing or when parsing fails (the error
    is printed rather than raised).
    """
    try:
        article_response = await fetch(session, article_url)
        article_soup = BeautifulSoup(article_response, 'html.parser')
        return _publish_date_from_soup(article_soup)
    except Exception as e:
        print(f"Error extracting date for {article_url}: {e}")
    return None


# Function to check if the exact phrase exists in the article body
async def contains_exact_phrase(session, article_url, phrase):
    """Fetch *article_url* and report whether *phrase* appears in its main content.

    The comparison is case-insensitive and whitespace-normalised. Returns
    False when the content container is missing or when parsing fails (the
    error is printed rather than raised).
    """
    try:
        article_response = await fetch(session, article_url)
        article_soup = BeautifulSoup(article_response, 'html.parser')
        return _soup_contains_phrase(article_soup, phrase)
    except Exception as e:
        print(f"Error checking phrase for {article_url}: {e}")
    return False


# Function to process a single article
async def process_article(session, article, phrase, writer):
    """Write a CSV row for *article* if its page contains *phrase*.

    Args:
        session: HTTP session passed through to ``fetch``.
        article: anchor element (anything with ``.get('href')``) taken from
            a search results page.
        phrase: phrase that must appear in the article's main content.
        writer: ``csv.writer``-like object; receives ``[title, link, date]``.

    Returns:
        True if a row was written, False otherwise (non-HTTP link, phrase
        not found, no title element, or an error, which is printed).
    """
    try:
        href = article.get('href')
        if not href:
            return False

        # urljoin resolves relative hrefs against the site root and leaves
        # absolute URLs untouched, unlike plain string concatenation.
        link = urljoin('https://www.cnbcindonesia.com/', href.strip())
        # Skip javascript:, mailto:, tel: and similar pseudo-links.
        if not link.startswith(('http://', 'https://')):
            return False

        # Download and parse the page once; previously it was fetched again
        # for the title and a third time for the publication date.
        article_response = await fetch(session, link)
        if not article_response:
            return False
        article_soup = BeautifulSoup(article_response, 'html.parser')

        if not _soup_contains_phrase(article_soup, phrase):
            return False

        title_tag = article_soup.find('h1', class_='mb-4 text-32 font-extrabold')
        if title_tag is None:
            return False

        title = title_tag.get_text(strip=True)
        article_date = _publish_date_from_soup(article_soup)
        writer.writerow([title, link, article_date])
        return True
    except Exception as e:
        print(f"Error processing article: {e}")
    return False

# Main function to handle asynchronous scraping
async def main():
    """Scrape CNBC Indonesia search results and save matching articles to CSV.

    Walks result pages 1..``max_pages`` (built from the module-level
    ``base_url``), passes every link not seen before to ``process_article``
    concurrently, and lets it write one row per matching article to
    ``<keyword>_articles.csv``. Stops early when a page contains no links or
    yields no new matching article. An error on one page is printed and the
    loop moves on to the next page.
    """
    csv_filename = f'{keyword.replace(" ", "_")}_articles.csv'
    total_articles = 0
    # hrefs already handed to process_article. Result pages repeat links
    # (site navigation, several anchors per article), and without this set a
    # repeated link is downloaded again and a matching article can be
    # written to the CSV more than once.
    seen_links = set()

    async with aiohttp.ClientSession() as session:
        with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Title', 'Link', 'Published On'])

            for page_num in range(1, max_pages + 1):
                url = base_url + str(page_num)
                print(f"Fetching page {page_num}...")

                try:
                    page_content = await fetch(session, url)
                    soup = BeautifulSoup(page_content, 'html.parser')
                    articles = soup.find_all('a', href=True)

                    # Stop loop if no articles are found
                    if not articles:
                        print(f"No articles found on page {page_num}. Stopping...")
                        break

                    # Keep only the first anchor for each href, across all pages.
                    new_articles = []
                    for article in articles:
                        href = article['href']
                        if href not in seen_links:
                            seen_links.add(href)
                            new_articles.append(article)

                    tasks = [process_article(session, article, keyword, writer) for article in new_articles]
                    results = await asyncio.gather(*tasks)

                    articles_added = sum(results)
                    total_articles += articles_added

                    print(f"Page {page_num}: {articles_added} articles added.")

                    # Stop if no articles were added from this page
                    if articles_added == 0:
                        print(f"No matching articles added from page {page_num}. Stopping...")
                        break

                except Exception as e:
                    print(f"Error fetching page {page_num}: {e}")

    print(f"\nTotal articles processed: {total_articles}")
    print(f"Results saved to {csv_filename}")

# Run the main function
if __name__ == '__main__':
    # asyncio.run creates and closes the event loop itself, replacing the
    # deprecated get_event_loop()/run_until_complete() pattern. Inside
    # Jupyter this relies on the nest_asyncio.apply() call made earlier in
    # the notebook, which lets it run while the notebook's loop is active.
    asyncio.run(main())

Enter the exact phrase to search for: Hutang
Enter the maximum number of pages to search: 2
Fetching page 1...
Page 1: 12 articles added.
Fetching page 2...
Page 2: 12 articles added.

Total articles processed: 24
Results saved to Hutang_articles.csv
