In [22]:
import bs4
import requests
import pandas as pd
from tqdm import tqdm_notebook
import collections as col
import spacy 
nlp = spacy.load('fr_core_news_sm')
import math
import datetime

### Fonctions communes à tous les types de journaux

In [23]:
def parse_soup(url):
    """
        The function parse the page with beautifulsoup
        @param :  string containing the url of the rss feed
        @return : object containing the parse page
    """
    try:
        req = requests.get(url)
        data = req.text
        soup = bs4.BeautifulSoup(data, "lxml")
        return(soup)
    except Exception as e:
        print(e)
    
def clean_text(text):
    """
        Remove the inapropriate caracters/words in order to clean the content
    """
    try:
        return nlp(text).text
    except Exception as e:
        print(e)


def create_json_file(articles,name,path):
    """
        Transform the list containing info on articles into  json file
        @param articles : all articles scraped
        @param i : the rough article id
        @param path : path where the json files are stored
    """
    try:
        date_creation = datetime.datetime.now().strftime("%Y-%m-%d")
        df = pd.DataFrame(articles)
        filename = 'art_'+date_creation+'_' + name+'.json'
        with open(path + filename, 'w', encoding='utf-8') as file:
            df.to_json(file, orient='index',force_ascii=False) 
    except Exception as e:
        print(e)

## Journal de l'environnement

In [32]:
def get_url_articles_jdle(soup_link_website, website_link, nb_max_pages):
    """
        Based on the links passed in parameter, we get the url articles of the website
        @return :  A list of links of articles
    """
    try:
        link_articles = []
        key_words = ['plastique','microplastique']
        
        # Get all pages of the website
        for i in tqdm_notebook(range(1,nb_max_pages)):
            # Get all articles in a page
            soup = parse_soup(soup_link_website + '/' + str(i))

            #Select only the link of the articles
            for article_link in soup.find_all('a'):
                if article_link.get("href") != None and '#' not in article_link.get("href") and any(word in article_link.get("href").lower() for word in key_words):
                    new_article = website_link + article_link.get("href")
                    link_articles.append(new_article)
    except Exception as e:
        print(e)
    return list(set(link_articles))

def get_info_articles_jdle(link_articles):
    """
        We get the content of the different articles
        @return : dictionnary with info on articles
    """
    new_articles = []
    for index,link in tqdm_notebook(enumerate(link_articles)):
        # Initilialize variables
        contents = []
        description = title = pub_date = authors = ""

        # Get the content of the page relative to an article
        soup = parse_soup(link_articles)

        Get the description
        if soup.find('strong') != None:
            description= soup.find('strong').text

        # Get title
        if soup.find('h1', {'class':"articleTitre"}) != None:
                title=soup.find('h1', {'class':"articleTitre"}).text

        # Get the authors and publication_date
        posssible_authors = [soup.find('div',{"class":"articleHautPageAuthor"}),soup.find("span",{"class":"regular11px"})]
        for author in posssible_authors:
            if author != None:
                authors = author.text.split('par')[1] # get only the author
                pub_date = author.text.replace(' ','').split('par')[0][2:] # get only the date

        # Get the date at the right format
        dictio_month = {'janvier':'01','février':'02','mars':'03','avril':'04','mai':'05','juin':'06','juillet':'07','août':'08','septembre':'09','octobre':'10','novembre':'11','décembre':'12'}
        year = pub_date[-4:]
        day = pub_date[0:2]
        month = dictio_month[pub_date[2:len(pub_date)-4]]
        pub_date = str(year+'/'+month+'/'+day)

        # Get contents   
        for bal in soup.find_all("p", {"class":"MsoNormal"}):
            for cont in bal.find_all('span'):
                contents.append(cont.text)
            
        # Clean the string variable
        contents = clean_text(' '.join(contents))
        title = clean_text(title)
        description = clean_text(description)
        authors = clean_text(authors)
        
        new_articles.append({
            "id_article":index,
            "link":link,
            "title":title,
            "description":description,
            "content":contents,
            "authors":authors,
            "publication_date":pub_date
        })
    return new_articles

<b> On se concentre sur les thèmes eau, déchets et climat proposés par le journal de l'environnement 
dans le but de récupérer les liens qui nous sont utiles.

In [30]:
themes = ["eau","dechets","climat"]
nb_pages = [370,260,180]
link= "http://www.journaldelenvironnement.net"
path = '../data/links/url_articles_jdle.csv'

link_articles_jdle = []
for i,theme in tqdm_notebook(enumerate(themes)):
    link_articles_jdle.append(get_url_articles_jdle(link+'/'+themes[i],link,nb_pages[i]))
    
link_articles_jdle = list(set(link_articles_jdle[0]))

In [33]:
clean_articles_jdle = get_info_articles_jdle(link_articles_jdle)
#create_json_file(clean_articles_jdle,'jlde','../data/articles/')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

No connection adapters were found for '['http://www.journaldelenvironnement.net/article/micro-plastiques-une-pollution-generalisee-a-la-surface-des-oceans,47764', 'http://www.journaldelenvironnement.net/article/la-concentration-des-dechets-plastique-a-explose-dans-le-pacifique,28952', 'http://www.journaldelenvironnement.net/article/vers-une-reduction-de-80-de-l-utilisation-des-sacs-plastique-en-europe,43835', 'http://www.journaldelenvironnement.net/article/la-nouvelle-faune-de-nos-rivieres-est-en-plastique,75825', 'http://www.journaldelenvironnement.net/article/en-fondant-la-banquise-pourrait-liberer-des-plastiques,46411', 'http://www.journaldelenvironnement.net/article/continents-de-plastique-par-ici-la-sortie,70368', 'http://www.journaldelenvironnement.net/article/les-moules-de-la-braderie-de-lille-ont-des-relents-de-plastique,36247', 'http://www.journaldelenvironnement.net/article/la-mediterranee-est-asphyxiee-aux-dechets-plastique,92098', 'http://www.journaldelenvironnement.net/art





AttributeError: 'NoneType' object has no attribute 'find'

## Le Monde

In [None]:
def get_url_articles_lmde(soup_link_website, article_link, start, end):
    """
        Based on the links passed in parameter, we get the url articles of the website
        @return :  A list of links of articles
    """
    link_articles = []
    key_words = ['plastique','microplastique']
    # Get all pages of the website
    for i in tqdm_notebook(range(start,end)):
        try:
            # Get all articles in a page
            soup = parse_soup(soup_link_website + str(i) + '.html')
            
            #Select only the link of the articles
            for bal in soup.find_all('h3',{"class":" "}):
                link = bal.find('a').get('href')
                if link != None and 'article' in link and any(word in link for word in key_words):
                    new_article = article_link + link
                    link_articles.append(new_article)
                    
        except Exception as e:
            print(e,bal)
    
    return list(set(link_articles))


def get_info_articles_lmde(link_articles):
    """
        We get the content of the different articles
        @return ! dictionnary with the sentences by articles
    """
    new_articles = []
    for index,link in tqdm_notebook(enumerate(link_articles)):
        # Initilialize variables
        contents = []
        description = title = pub_date = authors = ""
        
        # Get the content of the page relative to an article
        soup = parse_soup(link)

        # Get the description
        if soup.find('p',{"class":"article__desc"}) != None:
            description= soup.find('p',{"class":"article__desc"}).text

        # Get title
        if soup.find('h1',{"class":"article_title"}) != None:
                title = soup.find('h1',{"class":"article_title"}).text

        # Get publication date
        pub_date = link[40:50]

        # Get the authors
        if soup.find('span',{"class":"meta__author"}) != None:
            authors = soup.find('span',{"class":"meta__author"}).text

        # Get contents   
        if soup.find("section", {"class":"article__content"}) != None:
            article_content = soup.find("section", {"class":"article__content"})
        for content in article_content.find_all('p'):
            if content != None:
                contents.append(content.text)    
        
        # Clean the string variable
        contents = clean_text(' '.join(contents))
        title = clean_text(title)
        description = clean_text(description)
        authors = clean_text(authors)
        
        new_articles.append({
            "id_article":index,
            "link":link,
            "title":title,
            "description":description,
            "content":contents,
            "authors":authors,
            "publication_date":pub_date
        })
    return new_articles

<b> On dans un premier temps avec n pages (selectionnées) sur la catégorie planète du journal le monde

In [None]:
link = "https://www.lemonde.fr/"
path = '../data/links/url_articles_lmde.csv'
lmde_articles = []
start_page = 1
end_page=20 # choisir un nombre qui fonctionne

link_articles_lmde = get_url_articles_lmde(link+'planete/',link,start_page,end_page)

<b> On récupère les liens jugées utiles, puis on place le contenu de chaque article dans un fichier json

In [None]:
clean_articles_lmde = get_info_articles_lmde(link_articles_lmde)
create_json_file(clean_articles_lmde,'lmde','../data/articles/')

## Create csv files to keep the url

In [None]:
# Jounral de l'environnement
pd.DataFrame(link_articles_jdle).to_csv('../data/links/',sep=';')

# Le Monde
pd.DataFrame(link_articles_lmde).to_csv('../data/links/',sep=';')