In [278]:
import json
import ssl
import unicodedata
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Replaces the ssl module's default HTTPS context factory with the unverified
# one, i.e. certificate verification is switched off for code that uses it.
# NOTE(review): this patches a private attribute and weakens TLS security;
# presumably the `requests` calls in this notebook do not go through this
# hook at all — confirm whether the override is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [279]:
# Fetch the "Le direct" listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook
# early instead of parsing an HTTP error page that contains no article links.
response = requests.get("https://www.lesoir.be/18/sections/le-direct", timeout=30)
response.raise_for_status()
soup = bs(response.content, "html.parser")


#### Create a list with the URLs of the news articles on the first page

In [298]:
# Collect the absolute URL of every headline link (<h3><a href=...>) as one
# {'source_url': ...} record per article.
urls = []
base_url = "https://www.lesoir.be"
links = soup.select("h3 > a")
for link in links:
    href = link.get('href')
    if not href:
        # An anchor without an href has nothing to scrape; skip it instead of
        # failing on a None value.
        continue
    # urljoin leaves absolute URLs untouched, resolves site-relative paths
    # against base_url and also handles protocol-relative ("//host/path") links.
    urls.append({'source_url': urljoin(base_url, href)})
print(urls)


In [281]:
# Passing `columns` guarantees that 'source_url' exists even when no links
# were scraped, so the later df['source_url'] lookups cannot raise a KeyError.
df = pd.DataFrame(urls, columns=['source_url'])
df.head(20)

Unnamed: 0,source_url
0,https://www.lesoir.be/529252/article/2023-08-0...
1,https://www.lesoir.be/529248/article/2023-08-0...
2,https://www.lesoir.be/529241/article/2023-08-0...
3,http://soirmag.lesoir.be/529239/article/2023-0...
4,https://www.lesoir.be/529235/article/2023-08-0...
5,https://www.lesoir.be/529233/article/2023-08-0...
6,https://www.lesoir.be/529231/article/2023-08-0...
7,https://geeko.lesoir.be/2023/08/03/netflix-va-...
8,http://soirmag.lesoir.be/529228/article/2023-0...
9,https://geeko.lesoir.be/2023/08/03/avec-lintro...


In [285]:
def geeko_selector(url: str, timeout: float = 30.0) -> tuple:  # selector for geeko.lesoir.be urls
    """Fetch a geeko.lesoir.be article and return its body paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(paragraphs, text)`` tuple. ``paragraphs`` holds the ``<p>`` tags
        found inside ``div.post-content-area``, without the last one when it
        contains "Suivez Geeko sur Facebook"; it is empty when the container
        is missing. ``text`` is always the empty string for this selector.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    div = soup.find('div', attrs={'class': 'post-content-area'})
    # A page without the expected container yields no paragraphs rather than
    # an AttributeError on None.
    paragraphs = div.find_all('p') if div is not None else []
    # Check for emptiness before peeking at the last paragraph (IndexError otherwise).
    if paragraphs and 'Suivez Geeko sur Facebook' in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    text = ''
    return paragraphs, text

In [286]:
def sosoir_selector(url: str, timeout: float = 30.0) -> tuple:  # selector for sosoir.lesoir.be urls
    """Fetch a sosoir.lesoir.be article and return its lead and body paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(paragraphs, text)`` tuple. ``paragraphs`` holds the ``<p>`` tags
        found inside ``div#artBody``, without the last one when it contains
        "sosoir.lesoir.be"; it is empty when the container is missing.
        ``text`` is the stripped content of ``h2.chapeau``, or the empty
        string when that heading is missing.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    chapeau = soup.find('h2', attrs={'class': 'chapeau'})
    # Missing lead heading -> empty lead instead of an AttributeError on None.
    text = chapeau.text.strip() if chapeau is not None else ''
    div = soup.find('div', attrs={'id': 'artBody'})
    # Missing body container -> no paragraphs instead of an AttributeError on None.
    paragraphs = div.find_all('p') if div is not None else []
    # Check for emptiness before peeking at the last paragraph (IndexError otherwise).
    if paragraphs and 'sosoir.lesoir.be' in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    return paragraphs, text

In [287]:
def lesoirmag_selector(url: str, timeout: float = 30.0) -> tuple:  # selector for www.lesoir.be and soirmag.lesoir.be urls
    """Fetch a www.lesoir.be / soirmag.lesoir.be article and return its paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(paragraphs, text)`` tuple. ``paragraphs`` holds the ``<p>`` tags
        found inside ``article.r-article``, without the last one when it
        contains "www.soirmag.be"; it is empty when the article element is
        missing. ``text`` is always the empty string for this selector.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    article = soup.find('article', attrs={'class': 'r-article'})
    # A page without the expected <article> yields no paragraphs rather than
    # an AttributeError on None.
    paragraphs = article.find_all('p') if article is not None else []
    # Check for emptiness before peeking at the last paragraph (IndexError otherwise).
    if paragraphs and 'www.soirmag.be' in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    text = ''
    return paragraphs, text

In [288]:
def find_article_title(url: str, timeout: float = 30.0) -> str:
    """Download an article page and return the text of its first ``<h1>``.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<h1>`` element, or the empty string when the
        page has no ``<h1>``.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    heading = soup.find("h1")
    # soup.find returns None when nothing matches; avoid AttributeError on .text.
    if heading is None:
        return ''
    return heading.text

In [289]:
def find_article_text(url: str) -> str:
    """Return the article body as one string, using the selector for its site.

    The selector is chosen from the URL's host name (not the whole URL), so an
    article whose path merely contains "sosoir" or "geeko" is not misrouted.

    Args:
        url: Address of the article page.

    Returns:
        The lead text supplied by the selector (if any) followed by every
        non-empty paragraph, NFKD-normalized and stripped, joined by single
        spaces so that words of adjacent paragraphs do not run together.
    """
    host = urlparse(url).hostname or ''
    if 'sosoir' in host:
        paragraphs, lead_text = sosoir_selector(url)
    elif 'geeko' in host:
        paragraphs, lead_text = geeko_selector(url)
    else:
        paragraphs, lead_text = lesoirmag_selector(url)
    parts = [lead_text] if lead_text else []
    for p in paragraphs:
        p_text = unicodedata.normalize("NFKD", p.text.strip())
        # Skip paragraphs that are empty after stripping to avoid double spaces.
        if p_text:
            parts.append(p_text)
    return ' '.join(parts)
    

In [290]:
def find_published_date(url: str, timeout: float = 30.0) -> str:
    """Download an article page and read its publication date from JSON-LD.

    The first ``<script type="application/ld+json">`` element is parsed; the
    date is taken from ``data['@graph'][0]['datePublished']`` and, when that
    path does not exist, from the top-level ``data['datePublished']``.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``datePublished`` value, or the empty string when the page has no
        JSON-LD script element.

    Raises:
        KeyError: If neither location contains ``datePublished``.
        json.JSONDecodeError: If the script content is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    script = soup.find('script', {"type": "application/ld+json"})
    # soup.find returns None when nothing matches; avoid AttributeError on .text.
    if script is None:
        return ''
    # strict=False lets json accept control characters inside strings.
    data = json.loads(script.text, strict=False)
    try:
        published_date = data['@graph'][0]['datePublished']
    except (KeyError, IndexError, TypeError):
        # Only a missing or differently shaped '@graph' triggers the fallback;
        # unrelated errors such as KeyboardInterrupt are no longer swallowed.
        published_date = data['datePublished']
    return published_date

In [291]:
# Fetch the title of every article; find_article_title issues one HTTP request per URL.
df['article_title'] = df['source_url'].apply(find_article_title)

In [292]:
# Fetch the body text of every article; the site-specific selector called by
# find_article_text downloads each page again.
df['article_text'] = df['source_url'].apply(find_article_text)

In [293]:
# Fetch the publication date of every article; find_published_date issues
# another HTTP request per URL.
df['date'] = df['source_url'].apply(find_published_date)

In [294]:
# Tag every row with the constant language code 'fr'.
df['language'] = 'fr'

In [295]:
# Preview the enriched frame (at most the first 20 rows).
df.head(20)

Unnamed: 0,source_url,article_title,article_text,date,language
0,https://www.lesoir.be/529252/article/2023-08-0...,« Je ne leur pardonnerai jamais » : Herman de ...,Suite de l’affaire des « surpensions » et inde...,2023-08-03T15:55:06+02:00,fr
1,https://www.lesoir.be/529248/article/2023-08-0...,La reine Paola et le ministre italien Antonio ...,Les commémorations de la catastrophe du Bois ...,2023-08-03T15:44:22+02:00,fr
2,https://www.lesoir.be/529241/article/2023-08-0...,Un bébé abandonné dans une poussette : la poli...,"Le samedi 29 juillet 2023 vers 16h50, une femm...",2023-08-03T15:33:05+02:00,fr
3,http://soirmag.lesoir.be/529239/article/2023-0...,Johnny Hallyday chante des génériques grâce à ...,Les internautes se sont amusés à mixer la vo...,2023-08-03T15:28:18+02:00,fr
4,https://www.lesoir.be/529235/article/2023-08-0...,Bagarre mortelle au chapiteau de Péronnes : la...,La police a lancé jeudi un appel à témoins ...,2023-08-03T15:18:58+02:00,fr
5,https://www.lesoir.be/529233/article/2023-08-0...,Coup d’Etat au Niger : 70 Belges et ayants dro...,Environ 70 Belges et leurs proches qui voulaie...,2023-08-03T15:05:40+02:00,fr
6,https://www.lesoir.be/529231/article/2023-08-0...,Détresse extrême dans les refuges animaliers :...,Aboiements sous pluie battante annoncent notre...,2023-08-03T14:58:55+02:00,fr
7,https://geeko.lesoir.be/2023/08/03/netflix-va-...,Netflix va diffuser sa première comédie romant...,Netflix continue de miser sur des productions ...,2023-08-03T12:13:48+02:00,fr
8,http://soirmag.lesoir.be/529228/article/2023-0...,Lizzo répond aux accusations «scandaleuses» la...,La star a été accusée par trois de ses anci...,2023-08-03T14:46:19+02:00,fr
9,https://geeko.lesoir.be/2023/08/03/avec-lintro...,"Avec l’introduction d’une taxe streaming, Deez...",Si Spotify est le numéro un mondial du stream...,2023-08-03T12:14:34+02:00,fr


In [296]:
# df.to_csv('data/lesoir_articles.csv')