In [1]:
import bs4
import requests
import pandas as pd
from tqdm import tqdm_notebook
import collections as col
import spacy 
nlp = spacy.load('fr_core_news_sm')
import math
import datetime
from dateutil.parser import parse

### Fonctions communes à tous les types de journaux

In [1]:
def parse_soup(url, timeout=30):
    """
        Download a page and parse it with BeautifulSoup (lxml parser).

        @param url : string containing the url of the page to fetch
        @param timeout : number of seconds to wait for the server before
                         giving up (passed to requests.get)
        @return : the BeautifulSoup object built from the response text, or
                  None when the request or the parsing raised (the error
                  is printed, not re-raised)
    """
    try:
        # Without a timeout, requests may wait indefinitely on a stalled server
        req = requests.get(url, timeout=timeout)
        return bs4.BeautifulSoup(req.text, "lxml")
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller decide
        print(e)
        return None
    
def clean_text(text):
    """
        Run the text through the spaCy pipeline `nlp` and give back the
        text of the resulting document.

        @param text : string to process
        @return : the `.text` attribute of the document produced by `nlp`,
                  or None when the processing raised (the error is printed)
    """
    try:
        document = nlp(text)
        cleaned = document.text
    except Exception as error:
        print(error)
        return None
    return cleaned


def create_json_file(articles,name,path):
    """
        Write the scraped articles into the json file
        '<path>/rough_art_<name>.json' (one entry per article, keyed by
        its row index).

        @param articles : all articles scraped (anything pd.DataFrame accepts,
                          here a list of dictionaries)
        @param name : identifier of the source, inserted in the file name
        @param path : directory where the json file is stored, with or
                      without a trailing separator
        Errors are printed, not raised.
    """
    import os  # local import so the function does not depend on another cell

    try:
        df = pd.DataFrame(articles)
        filename = 'rough_art_' + name + '.json'
        # os.path.join also works when `path` has no trailing slash, where
        # plain concatenation would glue the directory and the file name together
        with open(os.path.join(path, filename), 'w', encoding='utf-8') as file:
            df.to_json(file, orient='index', force_ascii=False)
    except Exception as e:
        print(e)

## Journal de l'environnement

In [3]:
def get_url_articles_jdle(website_link, nb_max_pages):
    """
        Collect the article links (and their dates) listed on the
        "plastiques" search-result pages of the website.

        Pages 1 .. nb_max_pages - 1 are visited. A result is kept when its
        href contains no '#' and contains at least one of the key words.

        @param website_link : root url of the website; hrefs and the search
                              path are appended to it as-is
        @param nb_max_pages : exclusive upper bound of the page numbers
        @return :  A list of [article url, 'YYYY/MM/DD' date] pairs
    """
    articles = []
    key_words = ['plastique','microplastique','dechet','ocean','pacifique']

    # Get all pages of the website
    for i in tqdm_notebook(range(1,nb_max_pages)):

        # Get all articles in a page
        soup = parse_soup(website_link + "/recherche=plastiques-" + str(i) + "?sort=pertinence")
        if soup is None:
            # The page could not be fetched/parsed: skip it instead of crashing
            continue

        title_tags = soup.find_all("h2", {"class":"titreRecherche"})
        date_tags = soup.find_all('span' ,{'class':"color909090"})

        # Titles and dates are paired by their position in the page
        for title_tag, date_tag in zip(title_tags, date_tags):
            anchor = title_tag.find('a')
            href = anchor.get("href") if anchor is not None else None

            # Select only the link of the articles
            if href is None or '#' in href:
                continue
            if not any(word in href.lower() for word in key_words):
                continue

            # Parse the date once and zero-pad month and day
            parsed = parse(date_tag.text.lstrip(), fuzzy=True)
            pub_date = '{}/{:02d}/{:02d}'.format(parsed.year, parsed.month, parsed.day)

            articles.append([website_link + href, pub_date])

    return articles

def extract_info_articles_jdle(articles_info):
    """
        Download every article and extract its title, description, author
        and text.

        @param articles_info : list whose items hold the article url at
                               index 0 and its publication date at index 1
        @return : list of dictionaries, one per article for which some
                  content was found; "id_article" is the position of the
                  article in articles_info
    """
    new_articles = []
    # enumerate() wraps tqdm (not the reverse) so the bar can know the total
    for index,info in enumerate(tqdm_notebook(articles_info)):
        # Initialize variables
        contents = []
        description = title = authors = ""

        # Get the content of the page relative to an article
        soup = parse_soup(info[0])
        if soup is None:
            # The page could not be fetched/parsed: skip this article
            continue

        # Get the description
        strong_tag = soup.find('strong')
        if strong_tag is not None:
            description = strong_tag.text

        # Get title
        title_tag = soup.find('h1', {'class':"articleTitre"})
        if title_tag is not None:
            title = title_tag.text

        # Get the authors
        possible_authors = [soup.find('div',{"class":"articleHautPageAuthor"}),soup.find("span",{"class":"regular11px"})]
        for author in possible_authors:
            if author is None:
                continue
            author_text = author.text
            # The test must be done on the text: `'par' in tag` only looks
            # for a direct child equal to 'par', which is practically never true
            if 'par' in author_text and len(author_text.replace(' ','').split('par')[0]) < 20:
                authors = author_text.split('par')[1] # get only the author

        # Get publication date
        pub_date = info[1]

        # Get contents : first method
        for bal in soup.find_all("p", {"class":"MsoNormal"}):
            for cont in bal.find_all('span'):
                contents.append(cont.text)

        # Get contents : second method
        if not contents:
            for cont in soup.find_all("p"):
                if cont.find('strong') is None and len(cont.text) > 80:
                    contents.append(cont.text)
            # Drop the last three paragraphs kept by this method
            # NOTE(review): presumably page boilerplate - confirm
            contents = contents[:-3]

        # We only analyse article with a content
        if contents:
            # Clean the string variable
            contents = clean_text(' '.join(contents))
            title = clean_text(title)
            description = clean_text(description)
            authors = clean_text(authors)

            new_articles.append({
                "id_article":index,
                "link":info[0],
                "title":title,
                "description":description,
                "content":contents,
                "authors":authors,
                "newspaper":'JournalEnvironnement',
                "publication_date":pub_date
            })
    return new_articles

In [4]:
# Root url of the Journal de l'environnement website
link= "http://www.journaldelenvironnement.net"
# Exclusive upper bound of the page range: pages 1 .. max_pages - 1 are scraped
max_pages = 73 # specify the right number
# List of [article url, publication date] pairs
articles_jdle = get_url_articles_jdle(link,max_pages)

HBox(children=(IntProgress(value=0, max=72), HTML(value='')))




In [5]:
# Download every collected article, then store the result as
# '../data/articles/rough_art_jdle.json'
clean_articles_jdle = extract_info_articles_jdle(articles_jdle)
create_json_file(clean_articles_jdle,'jdle','../data/articles/')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Le Monde

In [6]:
def get_url_articles_lmde(website_link, nb_max_pages):
    """
        Collect the article links listed on the "plastique ocean"
        search-result pages of the website.

        Pages 1 .. nb_max_pages - 1 are visited; only hrefs containing
        'article' are kept. An error on a page is printed and the page
        is skipped.

        @param website_link : root url of the website; the search path and
                              the hrefs are appended to it as-is
        @param nb_max_pages : exclusive upper bound of the page numbers
        @return :  A list of unique article urls, in order of first appearance
    """
    link_articles = []
    # Get all pages of the website
    for i in tqdm_notebook(range(1,nb_max_pages)):
        try:
            # Get all articles in a page
            soup = parse_soup(website_link + '/recherche/?keywords=plastique+ocean&page_num=' +str(i) + '&operator=and&exclude_keywords=&qt=recherche_texte_titre&author=&period=since_1944&start_day=01&start_month=01&start_year=1944&end_day=07&end_month=02&end_year=2019&sort=pertinence')
            if soup is None:
                # The page could not be fetched/parsed: nothing to extract
                continue

            # Select only the link of the articles
            for bal in soup.find_all('h3',{"class":"txt4_120"}):
                anchor = bal.find('a')
                if anchor is None:
                    # A heading without link must not discard the rest of the page
                    continue
                link = anchor.get('href')
                if link is not None and 'article' in link:
                    link_articles.append(website_link + link)

        except Exception as e:
            print(e)

    # Remove duplicates while keeping a stable order (a set would make the
    # order, and therefore the article ids derived from it, vary between runs)
    return list(dict.fromkeys(link_articles))


def get_info_articles_lmde(link_articles):
    """
        Download every article and extract its title, description, authors,
        publication date and text.

        The publication date is read from the url: the 11 characters that
        follow the first occurrence of 'article' are parsed as a date.

        @param link_articles : list of article urls
        @return : list of dictionaries, one per article for which some
                  content was found; "id_article" is the position of the
                  url in link_articles
    """
    new_articles = []
    # enumerate() wraps tqdm (not the reverse) so the bar can know the total
    for index,link in enumerate(tqdm_notebook(link_articles)):
        # Initialize variables
        contents = []
        description = title = authors = ""

        # Get the content of the page relative to an article
        soup = parse_soup(link)
        if soup is None:
            # The page could not be fetched/parsed: skip this article
            continue

        # Get the description
        description_tag = soup.find('p',{"class":"article__desc"})
        if description_tag is not None:
            description = description_tag.text

        # Get title
        title_tag = soup.find('title')
        if title_tag is not None:
            title = title_tag.text

        # Get publication date (parsed once, month and day zero-padded)
        parsed = parse(link.split('article')[1][0:11], fuzzy=True)
        pub_date = '{}/{:02d}/{:02d}'.format(parsed.year, parsed.month, parsed.day)

        # Get the authors
        author_tag = soup.find('span',{"class":"meta__author"})
        if author_tag is not None:
            authors = author_tag.text

        # Get contents: only read the section of THIS page; reusing the
        # variable of a previous iteration would copy another article's text
        article_content = soup.find("section", {"class":"article__content"})
        if article_content is not None:
            for content in article_content.find_all('p'):
                contents.append(content.text)

        # We only analyse article with a content
        if not contents:
            continue

        # Clean the string variable
        contents = clean_text(' '.join(contents))
        title = clean_text(title)
        description = clean_text(description)
        authors = clean_text(authors)

        new_articles.append({
            "id_article":index,
            "link":link,
            "title":title,
            "description":description,
            "content":contents,
            "authors":authors,
            "newspaper":'LeMonde',
            "publication_date":pub_date
        })
    return new_articles

In [7]:
# Root url of the Le Monde website
link = "https://www.lemonde.fr"
# Exclusive upper bound of the page range: pages 1 .. max_pages - 1 are scraped
max_pages = 56 # specify the right number
# List of article urls
articles_lmde = get_url_articles_lmde(link,max_pages)

HBox(children=(IntProgress(value=0, max=55), HTML(value='')))




In [16]:
# Download every collected article, then store the result as
# '../data/articles/rough_art_lmde.json'
clean_articles_lmde = get_info_articles_lmde(articles_lmde)
create_json_file(clean_articles_lmde,'lmde','../data/articles/')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## 7econtinent 

In [12]:
def get_url_articles_7econ(website_link, nb_max_pages):
    """
        Collect the article links listed on the "plastique ocean"
        search-result pages of the website.

        Pages 1 .. nb_max_pages - 1 are visited. An error on a page is
        printed and the page is skipped.

        @param website_link : root url of the website; 'page/<n>/...' is
                              appended to it as-is
        @param nb_max_pages : exclusive upper bound of the page numbers
        @return :  A list of unique article urls, in order of first appearance
    """
    link_articles = []
    # Get all pages of the website
    for i in tqdm_notebook(range(1,nb_max_pages)):
        try:
            # Get all articles in a page
            soup = parse_soup(website_link + 'page/'+str(i)+'/?s=plastique+ocean')
            if soup is None:
                # The page could not be fetched/parsed: nothing to extract
                continue

            # Select only the link of the articles
            for bal in soup.find_all('div',{"class":"search-entry-thumb"}):
                anchor = bal.find('a')
                if anchor is None:
                    # A thumbnail without link must not discard the rest of the page
                    continue
                link = anchor.get('href')
                if link is not None:
                    link_articles.append(link)

        except Exception as e:
            print(e)

    # Remove duplicates while keeping a stable order (a set would make the
    # order, and therefore the article ids derived from it, vary between runs)
    return list(dict.fromkeys(link_articles))


def get_info_articles_7econ(link_articles):
    """
        Download every article and extract its title, publication date
        and text. The author is always set to "7eme Continent".

        @param link_articles : list of article urls
        @return : list of dictionaries, one per article for which some
                  content is left after trimming; "id_article" is the
                  position of the url in link_articles
    """
    new_articles = []
    # enumerate() wraps tqdm (not the reverse) so the bar can know the total
    for index,link in enumerate(tqdm_notebook(link_articles)):
        # Initialize variables
        contents = []
        title = pub_date = ""

        # Get the content of the page relative to an article
        soup = parse_soup(link)
        if soup is None:
            # The page could not be fetched/parsed: skip this article
            continue

        # Get publication date: first 10 characters of the datetime attribute
        # (read from the attribute itself, so a <time> tag without datetime
        # gives an empty date instead of an IndexError)
        time_tag = soup.find("time")
        if time_tag is not None:
            pub_date = (time_tag.get("datetime") or "")[:10]

        # Get title
        title_tag = soup.find("title")
        if title_tag is not None:
            title = title_tag.text

        # Get the authors
        authors = "7eme Continent"

        # Get contents
        for content in soup.find_all("p"):
            if content.find("strong") is None:
                contents.append(content.text)

        # Drop the last ten paragraphs BEFORE testing for emptiness, so a
        # page left with nothing is not stored with an empty content
        # NOTE(review): the trailing paragraphs are presumably page boilerplate - confirm
        contents = contents[:-10]

        if contents: # check if the article isn't a video or has a content
            # Clean the string variable
            contents = clean_text(' '.join(contents))
            title = clean_text(title)
            authors = clean_text(authors)

            new_articles.append({
                "id_article":index,
                "link":link,
                "title":title,
                "content":contents,
                "authors":authors,
                "newspaper":'7emeContinent',
                "publication_date":pub_date
            })
    return new_articles

In [13]:
# Root url of the 7eme Continent website (trailing slash included:
# get_url_articles_7econ appends 'page/<n>/...' directly to it)
link = "http://www.septiemecontinent.com/"
# Exclusive upper bound of the page range: pages 1 .. max_pages - 1 are scraped
max_pages = 17 # specify the right number (16)
# List of article urls
articles_7econ = get_url_articles_7econ(link,max_pages)

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))




In [17]:
# Download every collected article, then store the result as
# '../data/articles/rough_art_7econ.json'
clean_articles_7econ = get_info_articles_7econ(articles_7econ)
create_json_file(clean_articles_7econ,'7econ','../data/articles/')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Create csv files to keep the useful links

In [25]:
# Date of the export (YYYY-MM-DD), used as suffix of the csv file names
date_creation = datetime.datetime.now().strftime("%Y-%m-%d")

# Journal de l'environnement
pd.DataFrame(articles_jdle).to_csv('../data/links/link_jdle_'+str(date_creation)+'.csv',sep=';',encoding='utf-8')

# Le Monde
# NOTE(review): the file name says 'ldme' while the rest of the notebook uses 'lmde' - confirm before renaming
pd.DataFrame(articles_lmde).to_csv('../data/links/link_ldme_'+str(date_creation)+'.csv',sep=';')

# 7eme continent
pd.DataFrame(articles_7econ).to_csv('../data/links/link_7econ_'+str(date_creation)+'.csv',sep=';')