In [100]:
import re
import random
import time
import logging
import requests
import os

from bs4 import BeautifulSoup
from bs4.element import Comment

In [60]:
RECENT_ARTICLES_BASE_URL = "https://bonpote.com/articles/?e-page-55ad0bf="
ARTICLE_CLASS_REGEX = re.compile('.*post type-post.*')
MAX_SLEEP_TIME = 5

In [61]:
all_article_links = set()
current_page_nr = 1

while True:
    print(f"Scrapping page {current_page_nr} ...")
    
    page_url = RECENT_ARTICLES_BASE_URL + str(current_page_nr)
    page = requests.get(page_url)
    page_soup = BeautifulSoup(page.content, "html.parser")
    article_divs = page_soup.find_all("div",{"class" : ARTICLE_CLASS_REGEX})
    current_page_articles = [div.find('a', href=True)["href"] for div in article_divs]

    if len(current_page_articles) == 0:  # if no links are found, break the loop
        print(f"No articles found, exiting")
        break
        
    print(f"Scrapping page {current_page_nr} done: found {len(current_page_articles)} articles")
    
    all_article_links.update(current_page_articles)
    current_page_nr += 1
    
    time.sleep(random.randint(0, MAX_SLEEP_TIME))

Scrapping page 1 ...
Scrapping page 1 done: found 10 articles
Scrapping page 2 ...
Scrapping page 2 done: found 10 articles
Scrapping page 3 ...
Scrapping page 3 done: found 10 articles
Scrapping page 4 ...
Scrapping page 4 done: found 10 articles
Scrapping page 5 ...
Scrapping page 5 done: found 10 articles
Scrapping page 6 ...
Scrapping page 6 done: found 10 articles
Scrapping page 7 ...
Scrapping page 7 done: found 10 articles
Scrapping page 8 ...
Scrapping page 8 done: found 10 articles
Scrapping page 9 ...
Scrapping page 9 done: found 10 articles
Scrapping page 10 ...
Scrapping page 10 done: found 10 articles
Scrapping page 11 ...
Scrapping page 11 done: found 10 articles
Scrapping page 12 ...
Scrapping page 12 done: found 10 articles
Scrapping page 13 ...
Scrapping page 13 done: found 10 articles
Scrapping page 14 ...
Scrapping page 14 done: found 10 articles
Scrapping page 15 ...
Scrapping page 15 done: found 10 articles
Scrapping page 16 ...
Scrapping page 16 done: found 10 art

In [62]:
all_article_links

{'https://bonpote.com/10-actions-simples-pour-devenir-ecolo/',
 'https://bonpote.com/10-chiffres-a-connaitre-sur-lavion-et-le-climat/',
 'https://bonpote.com/10-erreurs-de-communication-sur-le-climat-a-rectifier-durgence/',
 'https://bonpote.com/10-idees-recues-sur-la-sobriete-des-modes-de-vie/',
 'https://bonpote.com/10-notions-indispensables-a-connaitre-sur-le-climat/',
 'https://bonpote.com/10-reponses-sur-le-programme-economique-du-nouveau-front-populaire/',
 'https://bonpote.com/100-entreprises-sont-elles-responsables-de-71-des-emissions/',
 'https://bonpote.com/2023-annee-record-pour-les-emissions-mondiales-de-co2/',
 'https://bonpote.com/2e-rapport-du-giec-les-medias-encore-une-fois-pas-au-niveau/',
 'https://bonpote.com/3-millions-de-dollar-pour-le-champion-du-monde-fortnite/',
 'https://bonpote.com/33-milliards-detres-humains-exposes-au-changement-climatique-le-nouveau-rapport-du-giec-est-sans-appel/',
 'https://bonpote.com/4-infographies-pour-comprendre-lagriculture/',
 'http

In [63]:
with open('bonpote_articles_urls.txt', 'w') as f:
    for line in list(all_article_links):
        f.write(f"{line}\n")

In [114]:
# https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(soup: BeautifulSoup):
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)  
    return '\n'.join(t.strip() for t in visible_texts)

In [115]:
ARTICLE_CONTENT_REGEX = re.compile('.*elementor-widget-theme-post-content.*')
all_article_texts = {}

for article_link in all_article_links:
    print(f"Scrapping article {article_link} ...")
    
    article_page = requests.get(article_link)
    article_soup = BeautifulSoup(article_page.content, "html.parser")
    article_content_div = article_soup.find("div",{"class" : ARTICLE_CONTENT_REGEX})
    article_content = text_from_html(article_content_div)

    if not article_content:
        print(f"WARNING: Haven't found text for article {article_link}")
        
    print(f"Scrapping done for article {article_link}")
    
    all_article_texts[article_link] = article_content
    
    time.sleep(random.randint(0, MAX_SLEEP_TIME))

Scrapping article https://bonpote.com/droit-de-reponse-au-texte-ecologie-arretons-de-culpabiliser-les-francais/ ...
Scrapping done for article https://bonpote.com/droit-de-reponse-au-texte-ecologie-arretons-de-culpabiliser-les-francais/
Scrapping article https://bonpote.com/sante-et-climat-7-bonnes-raisons-de-lutter-contre-le-rechauffement-climatique/ ...
Scrapping done for article https://bonpote.com/sante-et-climat-7-bonnes-raisons-de-lutter-contre-le-rechauffement-climatique/
Scrapping article https://bonpote.com/le-pari-de-pascal/ ...
Scrapping done for article https://bonpote.com/le-pari-de-pascal/
Scrapping article https://bonpote.com/climat-on-aura-besoin-de-tout-le-monde/ ...
Scrapping done for article https://bonpote.com/climat-on-aura-besoin-de-tout-le-monde/
Scrapping article https://bonpote.com/climat-peut-on-vraiment-faire-confiance-au-giec/ ...
Scrapping done for article https://bonpote.com/climat-peut-on-vraiment-faire-confiance-au-giec/
Scrapping article https://bonpote

In [116]:
for article_link, article_text in all_article_texts.items():
    with open(os.path.join("data", "articles", article_link.split("/")[-2]), 'w') as f:
        f.write(article_text)

In [65]:
article_link = next(iter(all_article_links))

In [66]:
article_link

'https://bonpote.com/droit-de-reponse-au-texte-ecologie-arretons-de-culpabiliser-les-francais/'

In [67]:
article_page = requests.get(article_link)
article_soup = BeautifulSoup(article_page.content, "html.parser")

In [69]:
ARTICLE_CONTENT_REGEX = re.compile('.*elementor-widget-theme-post-content.*')

In [86]:
article_content_divs = article_soup.find("div",{"class" : ARTICLE_CONTENT_REGEX})

In [105]:
text_from_html(article_content_div)

'\n\nGetting your\nTrinity Audio\nplayer ready...\nCe texte sur les canicules marines est de\nRaphael Seguin\n, biologiste marin à l’UMR Marbec de l’Université de Montpellier.\nDurant l’été\n2022\n, la France a subi\n33 jours de canicule\n, un record absolu. Mais ce n’est rien comparé à ce qu’il s’est passé sous l’eau.\nDurant cette même période, la\nmer Méditerranée\na enduré plus de 70 jours de canicule marine, soit deux fois plus que sur terre. En France, au large de Marseille, des températures de plus de 5°C au-dessus de la normale ont été observées.\xa0Cette vague de chaleur marine est\nprobablement à l’origine\ndes\norages\nextrêmement violents et meurtriers\xa0 qui ont frappé la Corse le 18 août 2022.\nEt ce ne sera pas sans conséquences. Pour reprendre David Diaz, chercheur à l’Institut Espagnol d’océanographie, les vagues de chaleur\n“ressemblent à des incendies sous-marins, avec une faune et flore qui meurent comme si elles étaient brûlées”.\nLe record de température de la su