In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import queue

In [2]:
# FIFO queue of (title, date, url) tuples for articles that still have to be processed
article_queue = queue.Queue()
# URLs that have already been queued, so the same article is not queued twice
seen = set()

In [3]:
def click_show_more_button(driver, timeout=10, pause_seconds=20):
    """Click the "show more" control of the search page until it is hidden.

    The control is located once (CSS selector ``div.search-more``). While it
    reports itself as displayed, the function waits for the document to finish
    loading, waits for the search results via ``wait_results``, clicks the
    control and then pauses to let the next batch of results load.

    Args:
        driver: Selenium WebDriver positioned on the search page.
        timeout: Maximum number of seconds for each explicit wait
            (button visibility and document ready state).
        pause_seconds: Seconds to sleep after every click.

    Returns:
        None. Any exception raised while waiting or clicking is caught and
        reported with ``print`` instead of being propagated.
    """
    try:
        # Wait until the "show more" control is visible.
        show_more_button = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "div.search-more"))
        )

        # Keep clicking for as long as the control is still displayed.
        # NOTE(review): the element is located only once; if the page re-renders
        # the control after a click, the next call on it presumably raises a
        # stale-element error, which ends the loop through the handler below.
        while show_more_button.is_displayed():
            WebDriverWait(driver, timeout).until(
                lambda d: d.execute_script('return document.readyState') == 'complete'
            )
            wait_results(driver)
            show_more_button.click()
            # Give the newly requested results time to load.
            time.sleep(pause_seconds)

    except Exception as e:
        print(f"Erreur lors du clic sur le bouton 'Afficher plus': {str(e)}")


def wait_results(driver, timeout=10):
    """Wait until the search results container is present in the page.

    Args:
        driver: Selenium WebDriver positioned on the search page.
        timeout: Maximum number of seconds to wait for ``div.search-results``.

    Returns:
        None. If the wait fails, the exception is caught and reported with
        ``print`` instead of being propagated.
    """
    try:
        # Only the wait matters; the located element itself is not used.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.search-results'))
        )
    except Exception as e:
        print(f"Erreur lors du chargement des resultats de rechereche: {str(e)}")


def add_to_queue(title, date, url):
    """Queue an article unless its URL has been queued before.

    Puts the ``(title, date, url)`` tuple on the module-level ``article_queue``
    and records ``url`` in the module-level ``seen`` set. A URL that is already
    in ``seen`` is ignored.
    """
    if url in seen:
        # Already queued once: nothing to do.
        return
    article_queue.put((title, date, url))
    seen.add(url)

def scrape_forbes_search(query):
    """Scrape the Forbes search results for ``query`` into a DataFrame.

    Opens the Forbes search page in Chrome, expands the result list with
    ``click_show_more_button``, queues every result that has a title link and
    a date through ``add_to_queue``, then visits each queued article and keeps
    the text of the first ``<p>`` found inside its ``div.body-container``.

    Args:
        query: Raw (not yet URL-encoded) search text. It is converted with
            ``str()`` and percent-encoded before being placed in the URL.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date``, ``URL``, ``Title``
        and ``Content`` (one row per processed article), or ``None`` when the
        scrape fails; in that case the error is printed. An article whose page
        cannot be loaded or parsed is kept with an empty ``Content``.

    Note:
        Articles are read from the module-level ``article_queue``. URLs that
        ``add_to_queue`` has already recorded in ``seen`` are not queued again.
    """
    # Function-scope import: nothing else in this file needs urllib.
    from urllib.parse import quote_plus, urljoin

    # Encode the query so that spaces, '&', '#', ... cannot corrupt the URL.
    search_url = f"https://www.forbes.com/search/?q={quote_plus(str(query))}"
    chromedriver_path = '/usr/local/bin/chromedriver/chromedriver.exe'
    # Open the browser through Selenium.
    chrome_options = webdriver.ChromeOptions()
    # NOTE(review): this string is handed to Chrome as a command-line argument;
    # it presumably does not select the chromedriver binary (Selenium 4 expects
    # Service(executable_path=...)). Left unchanged -- confirm before relying on it.
    chrome_options.add_argument(f"executable_path={chromedriver_path}")
    driver = webdriver.Chrome(options=chrome_options)

    def extract_articles(soup):
        """Queue every search result that exposes a title link and a date."""
        articles = soup.find_all("article", class_="stream-item et-promoblock-removeable-item et-promoblock-star-item")
        for article in articles:
            title_elem = article.find("a", class_="stream-item__title")
            date_elem = article.find("div", class_="stream-item__date")
            if not (title_elem and date_elem):
                continue

            href = title_elem.get('href')
            if not href:
                # Without a link there is no article page to visit.
                continue

            # Resolve relative links against the search page; absolute links
            # are returned unchanged by urljoin.
            full_url = urljoin(search_url, href)
            add_to_queue(title_elem.text.strip(), date_elem.text.strip(), full_url)

    def get_article_content(article_url):
        """Return the first paragraph of the article body, or "" if absent."""
        driver.get(article_url)
        article_soup = BeautifulSoup(driver.page_source, "html.parser")
        content_elem = article_soup.find("div", class_="body-container")
        content_elem_desc = content_elem.find("p") if content_elem else None
        return content_elem_desc.text.strip() if content_elem_desc else ""

    try:
        # Navigation happens inside the try block so that the browser is
        # always closed by the finally clause, even when the page fails to load.
        driver.get(search_url)
        click_show_more_button(driver)
        extract_articles(BeautifulSoup(driver.page_source, "html.parser"))

        records = []
        # Drain the queue completely, even when an individual article fails,
        # so that no stale item is left behind for a later call.
        while not article_queue.empty():
            title, date, article_url = article_queue.get()
            print(f"Traitement de l'article: {title}")
            try:
                content = get_article_content(article_url)
            except Exception as e:
                print(f"Erreur lors du chargement de l'article {article_url} : {str(e)}")
                content = ""
            records.append({
                'Date': date,
                'URL': article_url,
                'Title': title,
                'Content': content,
            })

        # Explicit columns keep the frame's layout stable when nothing was found.
        return pd.DataFrame(records, columns=['Date', 'URL', 'Title', 'Content'])

    except Exception as e:
        print(f"Erreur lors du scraping de la page Forbes : {str(e)}")
        return None

    finally:
        driver.quit()



In [None]:
# Run the scraper for a specific search query.
# scrape_forbes_search prints the error and returns None when scraping fails,
# so `df` may be None here instead of a DataFrame.
df = scrape_forbes_search("Finance")
df