In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import ssl
import re
# SECURITY NOTE: this replaces the ssl module's default HTTPS context factory
# with the unverified one, so code relying on that default no longer checks
# TLS certificates for the rest of the kernel session.
# NOTE(review): presumably added so pd.read_xml can fetch the sitemap over
# HTTPS — confirm it is still required and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
# Download and parse the sudinfo news sitemap into a DataFrame.
SITEMAP_URL = 'https://www.sudinfo.be/sites/default/files/sitemaps/sitemapnews-0.xml'
sitemap = pd.read_xml(SITEMAP_URL)
sitemap.head()

Unnamed: 0,loc,news,image
0,https://www.sudinfo.be/id693556/article/2023-0...,,
1,https://www.sudinfo.be/id693580/article/2023-0...,,
2,https://www.sudinfo.be/id693578/article/2023-0...,,
3,https://www.sudinfo.be/id693577/article/2023-0...,,
4,https://www.sudinfo.be/id693570/article/2023-0...,,


In [4]:
# Keep only the column holding the article addresses.
url = sitemap.loc[:, "loc"]
url

0      https://www.sudinfo.be/id693556/article/2023-0...
1      https://www.sudinfo.be/id693580/article/2023-0...
2      https://www.sudinfo.be/id693578/article/2023-0...
3      https://www.sudinfo.be/id693577/article/2023-0...
4      https://www.sudinfo.be/id693570/article/2023-0...
                             ...                        
142    https://www.sudinfo.be/id692888/article/2023-0...
143    https://www.sudinfo.be/id692887/article/2023-0...
144    https://www.sudinfo.be/id692884/article/2023-0...
145    https://www.sudinfo.be/id692838/article/2023-0...
146    https://www.sudinfo.be/shopping/bons-plans/id6...
Name: loc, Length: 147, dtype: object

In [27]:
# Pick a single article (index label 1) to try the extraction helpers on.
article_one = url.loc[1]
article_one

'https://www.sudinfo.be/id693548/article/2023-08-03/futsal-premiere-revue-des-troupes-ce-vendredi-au-fmc-charleroi-qui-dispute-son'

# Title

In [28]:
def find_article_title(url: str, timeout: float = 10.0) -> str:
    """Download an article page and return its headline.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The text of the first ``<h1>`` element with surrounding whitespace
        stripped, or an empty string when the page contains no ``<h1>``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    heading = soup.find("h1")
    if heading is None:
        # A page without a headline should not abort a whole DataFrame.apply
        # run with an opaque AttributeError; report "no title" instead.
        return ""
    return heading.text.strip()

print(find_article_title(article_one))

Futsal | Première revue des troupes ce vendredi au FMC Charleroi, qui dispute son premier amical de la saison contre Courcelles


# Date

In [29]:
def find_published_date(url: str, timeout: float = 10.0) -> str:
    """Download an article page and return its publication date.

    The date is read from the text of the first ``<time>`` element, where it
    is expected in ``DD/MM/YYYY`` form, and is converted to ``YYYY-MM-DD``.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The date as ``YYYY-MM-DD``, or an empty string when the page has no
        ``<time>`` element or its text contains no ``DD/MM/YYYY`` date.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    time_tag = soup.find("time")
    if time_tag is None:
        # No <time> element on the page: nothing to parse.
        return ""
    # Capture day, month and year directly instead of re-splitting the match.
    date_match = re.search(r"(\d{2})/(\d{2})/(\d{4})", time_tag.text)
    if date_match is None:
        # The <time> text is not in the expected DD/MM/YYYY layout.
        return ""
    day, month, year = date_match.groups()
    return f"{year}-{month}-{day}"

print(find_published_date(article_one))

2023-08-03


# Text

In [30]:
def find_article_text(url: str, timeout: float = 10.0) -> str:
    """Download an article page and return the text of its ``r-article--chapo`` elements.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        The text of every matching element joined with single spaces, or an
        empty string when nothing matches.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = bs(response.content, "html.parser")
    # The selector targets tags literally named "r-article--chapo";
    # attrs={"class": None} is intended to keep only those without a class
    # attribute.
    # NOTE(review): "chapo" suggests this is the article's intro rather than
    # its full body — confirm that is the text wanted here.
    matches = soup.find_all("r-article--chapo", attrs={"class": None})
    return ' '.join(element.text for element in matches)

print(find_article_text(article_one))

Après avoir repris l’entraînement mardi, le FMC Charleroi joue en amical ce vendredi (21h15) contre l’AT Courcelles. La reprise s’est bien passée également pour le FT Charleroi qui jouera son premier match de préparation la semaine prochaine.


In [31]:
# Start the result table from the sitemap: discard the unused columns and
# give the address column its final name.
df = (
    sitemap
    .drop(columns=["news", "image"])
    .rename(columns={"loc": "source_url"})
)
df.head()

Unnamed: 0,source_url
0,https://www.sudinfo.be/id693551/article/2023-0...
1,https://www.sudinfo.be/id693548/article/2023-0...
2,https://www.sudinfo.be/id693534/article/2023-0...
3,https://www.sudinfo.be/id693530/article/2023-0...
4,https://www.sudinfo.be/id693518/article/2023-0...


In [32]:
# Scrape the publication date for every article address in the table.
df = df.assign(
    published_date=lambda frame: frame["source_url"].apply(find_published_date)
)
df.head()

Unnamed: 0,source_url,published_date
0,https://www.sudinfo.be/id693551/article/2023-0...,2023-08-03
1,https://www.sudinfo.be/id693548/article/2023-0...,2023-08-03
2,https://www.sudinfo.be/id693534/article/2023-0...,2023-08-03
3,https://www.sudinfo.be/id693530/article/2023-0...,2023-08-03
4,https://www.sudinfo.be/id693518/article/2023-0...,2023-08-03


In [33]:
# Scrape the headline for every article address in the table.
df = df.assign(
    article_title=lambda frame: frame["source_url"].apply(find_article_title)
)
df.head()

Unnamed: 0,source_url,published_date,article_title
0,https://www.sudinfo.be/id693551/article/2023-0...,2023-08-03,Rugby (D1) | Soignies en pleine reconstruction...
1,https://www.sudinfo.be/id693548/article/2023-0...,2023-08-03,Futsal | Première revue des troupes ce vendred...
2,https://www.sudinfo.be/id693534/article/2023-0...,2023-08-03,Basket - Messieurs | Nouveau départ pour Hanef...
3,https://www.sudinfo.be/id693530/article/2023-0...,2023-08-03,Basket - Dames | Huy sera très ambitieux (tous...
4,https://www.sudinfo.be/id693518/article/2023-0...,2023-08-03,Basket | Sélectionné pour l’Euro U16 en Macédo...


In [34]:
# Scrape the article text, put the columns in their final order and export
# the table to CSV.
column_order = ["source_url", "article_title", "article_text", "published_date"]
df = df.assign(
    article_text=lambda frame: frame["source_url"].apply(find_article_text)
)
df = df[column_order]
df.to_csv("sudinfo_articles.csv")
df.head()

Unnamed: 0,source_url,article_title,article_text,published_date
0,https://www.sudinfo.be/id693551/article/2023-0...,Rugby (D1) | Soignies en pleine reconstruction...,Boris Montoisy a pris du recul et c’est donc B...,2023-08-03
1,https://www.sudinfo.be/id693548/article/2023-0...,Futsal | Première revue des troupes ce vendred...,"Après avoir repris l’entraînement mardi, le FM...",2023-08-03
2,https://www.sudinfo.be/id693534/article/2023-0...,Basket - Messieurs | Nouveau départ pour Hanef...,Cette semaine marque la reprise du basket. Déc...,2023-08-03
3,https://www.sudinfo.be/id693530/article/2023-0...,Basket - Dames | Huy sera très ambitieux (tous...,Cette semaine marque la reprise du basket. Déc...,2023-08-03
4,https://www.sudinfo.be/id693518/article/2023-0...,Basket | Sélectionné pour l’Euro U16 en Macédo...,Le basketteur formé au Spirou s’est envolé ce ...,2023-08-03
