In [1]:
import bs4
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
def parse_soup(url, timeout=30):
    """
        Download a page and parse it with BeautifulSoup
        @param url : string containing the url of the page to fetch
        @param timeout : seconds to wait for the server before requests gives up
                         (requests waits indefinitely when no timeout is given)
        @return : bs4.BeautifulSoup object built from the response text with the "lxml" parser
        @raise : exceptions raised by requests.get (including the timeout) are not caught here
    """
    # Without a timeout a single stalled connection would block the whole crawl
    req = requests.get(url, timeout=timeout)
    # HTTP error pages are not rejected: whatever body comes back is parsed
    soup = bs4.BeautifulSoup(req.text, "lxml")
    return soup

def create_csv(content, name):
    """
        Write a collection to a semicolon-separated csv file
        @param content : data handed to the pandas.DataFrame constructor (e.g. a list)
        @param name : path of the csv file to write
        @return : None, the file is produced as a side effect
    """
    frame = pd.DataFrame(content)
    # to_csv keeps its defaults apart from the separator, so the index column is written too
    frame.to_csv(name, sep=';')

In [3]:
# Build the list of article URLs
def _is_relevant_href(href, required_words, key_words):
    """
        Tell whether a link target should be kept
        @param href : value of the "href" attribute, or None when the tag has none
        @param required_words : at least one of these must appear in the lower-cased href
        @param key_words : at least one of these must also appear in the lower-cased href
        @return : True when href is set, has no '#' and matches both word groups
    """
    if href is None or '#' in href:
        return False
    lowered = href.lower()
    return (any(word in lowered for word in required_words)
            and any(word in lowered for word in key_words))


def get_url_articles_jdle(soup_link_website,website_link,nb_max_pages):
    """
        Walk the numbered listing pages of one section and collect the article links
        whose url mentions plastic/waste together with a water-related key word
        @param soup_link_website : base url of the section; '/<page number>' is appended to it
        @param website_link : prefix prepended to every href that is kept
        @param nb_max_pages : stop value of the page range; pages 1 .. nb_max_pages - 1 are fetched
        @return :  A list of links of articles, without duplicates and in no particular order
    """
    required_words = ('plastique', 'dechets')
    key_words = ['-eau-', '-eaux-','-ocean-','-oceans-','-mer-','pacifique','pollution','continent']
    link_articles = set()

    # Get all pages of the website
    for page_number in range(1, nb_max_pages):
        page_url = soup_link_website + '/' + str(page_number)
        try:
            soup = parse_soup(page_url)
            hrefs = [anchor.get("href") for anchor in soup.find_all('a')]
        except Exception as e:
            # A failure on one page must not discard the remaining pages of the section:
            # report it and carry on with the next page
            print("Skipping page {}: {}".format(page_url, e))
            continue

        # Select only the link of the articles
        for href in hrefs:
            if _is_relevant_href(href, required_words, key_words):
                link_articles.add(website_link + href)

    return list(link_articles)

# Sections of the site to crawl and, position by position, the page bound handed to the crawler
# NOTE(review): "politque-sante" looks like a misspelled section slug - confirm against the site before changing it
themes = ["air","eau","dechets","climat","risques-sante","sites-sols","energie","politque-sante"]
nb_pages_by_themes = [200,370,260,180,500,470,500,500]
website_link= "http://www.journaldelenvironnement.net"
path = '../data/links/url_articles_jdle.csv'

# The two lists are read in parallel: fail now rather than partway through a long crawl
assert len(themes) == len(nb_pages_by_themes), "themes and nb_pages_by_themes must have the same length"

links_result = []
# tqdm wraps the list itself (enumerate() has no length) so the bar can show a total and an ETA
for i,theme in enumerate(tqdm(themes)):
    links_result.extend(get_url_articles_jdle(website_link+'/'+theme,website_link,nb_pages_by_themes[i]))

1it [02:06, 126.42s/it]

http://www.journaldelenvironnement.net/article/extraire-les-plastiques-flottants-une-goutte-d-eau-dans-l-ocean,93520
http://www.journaldelenvironnement.net/article/plastiques-marins-le-monstre-du-pacifique-grossit-a-vue-d-il,91033
http://www.journaldelenvironnement.net/article/90-des-bouteilles-d-eau-contiennent-du-plastique,90919
http://www.journaldelenvironnement.net/article/des-fontaines-d-eau-contre-les-dechets-plastique,90031
http://www.journaldelenvironnement.net/article/du-plastique-cache-dans-l-eau-du-robinet,85864
http://www.journaldelenvironnement.net/article/premiere-cartographie-de-la-pollution-plastique-des-oceans,81606
http://www.journaldelenvironnement.net/article/les-oiseaux-de-mer-attires-par-l-odeur-des-micro-plastiques,76527
http://www.journaldelenvironnement.net/article/microplastiques-le-royaume-uni-alimente-l-ocean-arctique,72324
http://www.journaldelenvironnement.net/article/continents-de-plastique-par-ici-la-sortie,70368
http://www.journaldelenvironnement.net/ar

2it [07:56, 193.56s/it]

http://www.journaldelenvironnement.net/article/extraire-les-plastiques-flottants-une-goutte-d-eau-dans-l-ocean,93520
http://www.journaldelenvironnement.net/article/plastiques-marins-le-monstre-du-pacifique-grossit-a-vue-d-il,91033
http://www.journaldelenvironnement.net/article/90-des-bouteilles-d-eau-contiennent-du-plastique,90919
http://www.journaldelenvironnement.net/article/des-fontaines-d-eau-contre-les-dechets-plastique,90031
http://www.journaldelenvironnement.net/article/du-plastique-cache-dans-l-eau-du-robinet,85864
http://www.journaldelenvironnement.net/article/premiere-cartographie-de-la-pollution-plastique-des-oceans,81606
http://www.journaldelenvironnement.net/article/les-oiseaux-de-mer-attires-par-l-odeur-des-micro-plastiques,76527
http://www.journaldelenvironnement.net/article/microplastiques-le-royaume-uni-alimente-l-ocean-arctique,72324
http://www.journaldelenvironnement.net/article/continents-de-plastique-par-ici-la-sortie,70368
http://www.journaldelenvironnement.net/ar

3it [10:27, 180.88s/it]

http://www.journaldelenvironnement.net/article/microplastiques-le-royaume-uni-alimente-l-ocean-arctique,72324


5it [20:09, 248.03s/it]

http://www.journaldelenvironnement.net/article/plastiques-en-mer-la-sante-des-huitres-s-en-ressent,66678


8it [32:48, 226.86s/it]


In [4]:
# Drop duplicates across themes; sorting keeps the csv identical from one run to the next
create_csv(sorted(set(links_result)),path)
# Read back the file that was just written (same `path`) to check its size
pd.read_csv(path,sep=';').shape

(18, 2)