In [1]:
import collections as col
import datetime
import math
import os

import bs4
import pandas as pd
import requests
import spacy
from tqdm import tqdm

nlp = spacy.load('fr_core_news_sm')

In [2]:
def parse_soup(url, timeout=30):
    """
        Download a page and parse it with BeautifulSoup.

        @param url : string containing the url of the page to fetch
        @param timeout : number of seconds to wait for the server before
                         giving up (forwarded to requests.get; None disables
                         the timeout)
        @return : BeautifulSoup object built from the response text with the
                  "lxml" parser
        @raise : any exception raised by requests.get (timeout, connection
                 failure, ...) is propagated to the caller
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # unresponsive server would hang the whole scraping run.
    req = requests.get(url, timeout=timeout)

    # NOTE: the HTTP status is not checked here, so an error page is parsed
    # like any other page.
    return bs4.BeautifulSoup(req.text, "lxml")

def get_contents(path, tag, out_dir='../data/articles/jdle/', min_length=35):
    """
        Scrape every article listed in a csv file and store each one as json.

        For each url, the page is fetched with parse_soup, the text of every
        `tag` element that passes the filter is collected, the result is
        cleaned with clean_content and written with create_json_files.

        @param path : csv file (';' separated) whose column named '0' holds
                      the article urls
        @param tag : name of the html tag that carries the article text
        @param out_dir : directory handed to create_json_files
        @param min_length : an element is kept only when its text is strictly
                            longer than this many characters
        @return : dictionary {position of the url in the csv: list of the raw
                  texts kept for that article}; articles that failed before
                  any text was collected have no entry
    """
    contents = col.defaultdict(list)
    data = pd.read_csv(path, sep=';')

    # tqdm wraps the column itself (not the enumerate object) so that it can
    # read the length and display a real progress bar with a total.
    for i, url in enumerate(tqdm(data['0'])):
        # The try block is per article: one unreachable or malformed page is
        # reported and skipped instead of ending the whole run.
        try:
            # Get the content of the page relative to an article
            soup = parse_soup(url)

            for bal in soup.find_all(tag):
                text = bal.text
                # Short snippets and anything containing '*' are discarded.
                if len(text) > min_length and '*' not in text:
                    contents[i].append(str(text))

            # Get clean content by article
            content = clean_content(contents[i])

            # Create the json file, containing an article's content
            create_json_files(content, i, out_dir)

        except Exception as e:
            print('Article {} ({}) skipped: {}'.format(i, url, e))

    return contents

def clean_content(content):
    """
        Build the cleaned text of one article from its list of paragraphs.

        @param content : list of paragraph strings, in page order
        @return : one string holding the remaining words separated by a
                  single space ('' when nothing is left)
    """
    # The last three paragraphs are dropped: per the original author's note
    # they carry the website's cookie information, not article text.
    paragraphs = content[:-3]

    # split() without argument also collapses newlines, tabs and repeated
    # spaces between words.
    words = ' '.join(paragraphs).split()

    # Remove every word that contains a literal backslash.
    kept = [word for word in words if '\\' not in word]

    # The text is returned directly. Passing it through nlp(...).text would
    # give back the very same string (spaCy tokenisation is non-destructive)
    # while paying for a full pipeline run, so no spaCy call is made here.
    # Joining also avoids the leading space and the quadratic cost of
    # growing a string word by word.
    return ' '.join(kept)

def create_json_files(content,i,path):
    """
        Write one article to its own json file.

        The file is named art_<i>_<YYYY-MM-DD>.json, using today's date, and
        holds {"<i>": {"content": <content>}} encoded in utf-8.

        @param content : text of the article
        @param i : the rough article id, used in the file name and as the
                   index of the stored record
        @param path : directory where the json files are stored; it is
                      created when missing and may be given with or without
                      a trailing separator

        Any error is printed and swallowed, so a file that cannot be written
        does not interrupt the caller.
    """
    try:
        date_creation = datetime.datetime.now().strftime("%Y-%m-%d")
        filename = 'art_' + str(i) + '_' + date_creation + '.json'

        # os.path.join works whether or not `path` ends with a separator,
        # unlike plain string concatenation.
        target = os.path.join(path, filename)

        # An empty path means the current directory; makedirs('') would fail.
        if path:
            os.makedirs(path, exist_ok=True)

        df = pd.DataFrame([content],
                          index=[i],
                          columns=['content'])

        with open(target, 'w', encoding='utf-8') as file:
            # force_ascii=False keeps accented characters readable in the file.
            df.to_json(file, orient='index', force_ascii=False)

    except Exception as e:
        print('Article {} not written: {}'.format(i, e))

In [3]:
# Scrape every article listed in the JDLE links file, reading the text of <p> tags.
# The trailing ';' keeps the cell from echoing any return value.
get_contents(path='../data/links/url_articles_jdle.csv', tag='p');

26it [00:14,  1.77it/s]
