In [77]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import json
import unicodedata

# Use the below import if you get a Certificate error in Mac
# NOTE(review): the assignment below replaces the ssl module's default HTTPS
# context factory with the unverified one, so code that relies on that factory
# stops verifying server certificates for the rest of the kernel session.
# Presumably `requests` does not consult this hook - TODO confirm whether the
# override is still needed, and remove it if not.
import ssl

ssl._create_default_https_context = ssl._create_unverified_context



def get_soup(url: str, timeout: float = 30.0):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled server
            would block the notebook indefinitely.

    Returns:
        The parsed document, built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status code is not checked here: an error page is parsed like
    # any other page and callers see whatever markup it contains.
    soup = bs(response.content, "html.parser")
    return soup

def find_article_title(url: str) -> str:
    """Fetch the page at ``url`` and return the text of its first ``<h1>`` tag."""
    heading = get_soup(url).find("h1")
    return heading.text


# selector for geeko.lesoir.be urls
def geeko_selector(url: str) -> tuple:
    """Collect the body paragraphs of a geeko.lesoir.be article.

    Args:
        url: Address of the article page.

    Returns:
        A ``(paragraphs, text)`` tuple: the ``<p>`` tags found inside the
        ``post-content-area`` div (without a trailing paragraph that contains
        "Suivez Geeko sur Facebook", if there is one) and an empty string used
        as the initial article text.

    Raises:
        AttributeError: If the page has no ``post-content-area`` div.
    """
    soup = get_soup(url)
    div = soup.find("div", attrs={"class": "post-content-area"})
    paragraphs = div.find_all("p")
    # Only inspect the last paragraph when there is one; indexing [-1] on an
    # empty result would raise IndexError.
    if paragraphs and "Suivez Geeko sur Facebook" in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    text = ""
    return paragraphs, text


# selector for sosoir.lesoir.be urls
def sosoir_selector(url: str) -> tuple:
    """Collect the lead and body paragraphs of a sosoir.lesoir.be article.

    Args:
        url: Address of the article page.

    Returns:
        A ``(paragraphs, text)`` tuple: the ``<p>`` tags found inside the
        ``artBody`` div (without a trailing paragraph that mentions
        "sosoir.lesoir.be", if there is one) and the stripped text of the
        ``h2.chapeau`` element used as the initial article text.

    Raises:
        AttributeError: If the page has no ``h2.chapeau`` element or no
            ``artBody`` div.
    """
    soup = get_soup(url)
    h2 = soup.find("h2", attrs={"class": "chapeau"}).text.strip()
    div = soup.find("div", attrs={"id": "artBody"})
    paragraphs = div.find_all("p")
    # Only inspect the last paragraph when there is one; indexing [-1] on an
    # empty result would raise IndexError.
    if paragraphs and "sosoir.lesoir.be" in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    text = h2
    return paragraphs, text


# selector for www.lesoir.be and soirmag.lesoir.be urls
def lesoirmag_selector(url: str) -> tuple:
    """Collect the body paragraphs of a www.lesoir.be or soirmag.lesoir.be article.

    Args:
        url: Address of the article page.

    Returns:
        A ``(paragraphs, text)`` tuple: the ``<p>`` tags found inside the
        ``article.r-article`` element (without a trailing paragraph that
        mentions "www.soirmag.be", if there is one) and an empty string used
        as the initial article text.

    Raises:
        AttributeError: If the page has no ``article.r-article`` element.
    """
    soup = get_soup(url)
    article = soup.find("article", attrs={"class": "r-article"})
    paragraphs = article.find_all("p")
    # Only inspect the last paragraph when there is one; indexing [-1] on an
    # empty result would raise IndexError.
    if paragraphs and "www.soirmag.be" in paragraphs[-1].text:
        paragraphs = paragraphs[:-1]
    text = ""
    return paragraphs, text


def find_article_text(url: str) -> str:
    """Return the full text of the article at ``url``.

    The selector is picked from substrings of the URL ("sosoir", then "geeko",
    otherwise the lesoir/soirmag selector). The text of each returned paragraph
    is stripped, NFKD-normalised and appended, without any separator, to the
    initial text supplied by the selector.
    """
    if "sosoir" in url:
        selector = sosoir_selector
    elif "geeko" in url:
        selector = geeko_selector
    else:
        selector = lesoirmag_selector
    paragraphs, lead = selector(url)
    body = "".join(
        unicodedata.normalize("NFKD", paragraph.text.strip())
        for paragraph in paragraphs
    )
    return lead + body


def find_published_date(url: str) -> str:
    """Return the publication date declared in the page's JSON-LD metadata.

    The first ``<script type="application/ld+json">`` tag is parsed as JSON.
    The date is read from ``data["@graph"][0]["datePublished"]`` and, when that
    path is not available, from the top-level ``data["datePublished"]``.

    Args:
        url: Address of the article page.

    Returns:
        The ``datePublished`` value exactly as it appears in the metadata.

    Raises:
        AttributeError: If the page has no JSON-LD script tag.
        KeyError: If neither location holds a ``datePublished`` entry.
    """
    soup = get_soup(url)
    script = soup.find("script", {"type": "application/ld+json"})
    data = json.loads(script.text, strict=False)
    try:
        published_date = data["@graph"][0]["datePublished"]
    except (KeyError, IndexError, TypeError):
        # Catch lookup failures only: a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        published_date = data["datePublished"]
    return published_date


# Scrape the "le direct" section page, collect the article links and build a
# dataframe with one row per article (url, text, date, title, language).
print("Creating list of urls of news page ...")
soup = get_soup("https://www.lesoir.be/18/sections/le-direct")
urls = []
base_url = "https://www.lesoir.be"
links = soup.select("h3 > a")
for link in links:
    url = link.get("href")
    if not url:
        # An anchor without an href has nothing to scrape; without this guard
        # the membership test below would raise TypeError on None.
        continue
    # Links without "//" are site-relative and need the base URL prepended.
    # The record is not named `dict`, so the builtin is no longer shadowed for
    # the rest of the kernel session.
    record = {"url": url if "//" in url else base_url + url}
    urls.append(record)

print("Creating dataframe ...")
# Naming the column explicitly keeps df["url"] available even when no link
# was found on the page.
df = pd.DataFrame(urls, columns=["url"])

print("Adding article_text column to dataframe ...")
df["text"] = df["url"].apply(find_article_text)

print("Adding date column to dataframe ...")
df["date"] = df["url"].apply(find_published_date)

print("Adding article_title column to dataframe ...")
df["title"] = df["url"].apply(find_article_title)


print("Adding language column to dataframe ...")
df["language"] = "fr"

Creating list of urls of news page ...
Creating dataframe ...
Adding article_text column to dataframe ...
Adding date column to dataframe ...
Adding article_title column to dataframe ...
Adding language column to dataframe ...


In [78]:
# Preview the first row of the scraped dataframe.
df.head(1)

Unnamed: 0,url,text,date,title,language
0,https://www.lesoir.be/529861/article/2023-08-0...,"Rien n’évolue dans le dialogue social, de sou...",2023-08-07T16:10:51+02:00,Nouvelle grève des pilotes Ryanair les 14 et 1...,fr


In [79]:
# Parse the scraped date strings into pandas datetime values.
df["date"] = df["date"].pipe(pd.to_datetime)

In [80]:
# One dict per dataframe row, keyed by column name.
articles = df.to_dict("records")

In [81]:
# List the URL of every scraped article, one per line.
for record in articles:
    print(record["url"])

https://www.lesoir.be/529861/article/2023-08-07/nouvelle-greve-des-pilotes-ryanair-les-14-et-15-aout-la-cne-en-appelle-au
https://sosoir.lesoir.be/interdire-aux-clients-de-manger-seul-au-restaurant-la-tendance-qui-fait-debat
http://soirmag.lesoir.be/529857/article/2023-08-07/fou-rire-sur-le-plateau-de-bfmtv-apres-un-bruit-surprenant-en-plein-direct-video
https://www.lesoir.be/529854/article/2023-08-07/pascal-godefroit-30-ans-au-service-des-dinosaures
https://www.lesoir.be/529852/article/2023-08-07/incendies-chypre-lile-en-etat-dalerte-cause-de-vents-violents
http://soirmag.lesoir.be/529846/article/2023-08-07/un-italien-meurt-ecrase-sous-des-meules-de-fromages
https://www.lesoir.be/529844/article/2023-08-07/ryanair-nouvelle-greve-charleroi-les-14-et-15-aout
https://www.lesoir.be/529839/article/2023-08-07/lavocat-franco-espagnol-juan-branco-en-passe-detre-expulse-du-senegal-vers-la
https://www.lesoir.be/529837/article/2023-08-07/france-une-femme-sequestree-depuis-2011-ete-retrouvee-son-m

In [82]:
import os

from pymongo import MongoClient

# Create a connection with MongoDB.
# The connection string (which embeds the username and password) is read from
# the MONGODB_URI environment variable so that credentials are never stored in
# the notebook; a KeyError is raised when the variable is not set.
# NOTE(review): the password that was previously hardcoded in this cell should
# be treated as compromised and rotated.
# The earlier argument-less MongoClient() call was dropped: the client it
# created was overwritten straight away and never closed.
client = MongoClient(os.environ["MONGODB_URI"])


In [83]:
# Display the names of the databases visible through this client.
client.list_database_names()

['bouman_datatank', 'media_analysis']

In [84]:
# Select the 'bouman_datatank' database on the connected client.
db = client['bouman_datatank']


In [85]:
# Print the names of the collections in the selected database.
print(db.list_collection_names())

['articles']


In [86]:
# Get a handle on the 'articles' collection of the selected database and
# print its repr.
collection = db['articles']
print(collection)

Collection(Database(MongoClient(host=['ec2-15-188-255-64.eu-west-3.compute.amazonaws.com:27017'], document_class=dict, tz_aware=False, connect=True), 'bouman_datatank'), 'articles')


In [87]:
# Report, for every scraped article, whether a document with the same URL is
# already stored in the collection. Nothing is written to the database here.
for article in articles:
    stored = collection.find_one({'url': {'$eq': article['url']}})
    if not stored:
        print("add article")
        continue
    print(article['url'])
    print("article exists")
   

add article
https://sosoir.lesoir.be/interdire-aux-clients-de-manger-seul-au-restaurant-la-tendance-qui-fait-debat
article exists
http://soirmag.lesoir.be/529857/article/2023-08-07/fou-rire-sur-le-plateau-de-bfmtv-apres-un-bruit-surprenant-en-plein-direct-video
article exists
https://www.lesoir.be/529854/article/2023-08-07/pascal-godefroit-30-ans-au-service-des-dinosaures
article exists
https://www.lesoir.be/529852/article/2023-08-07/incendies-chypre-lile-en-etat-dalerte-cause-de-vents-violents
article exists
http://soirmag.lesoir.be/529846/article/2023-08-07/un-italien-meurt-ecrase-sous-des-meules-de-fromages
article exists
https://www.lesoir.be/529844/article/2023-08-07/ryanair-nouvelle-greve-charleroi-les-14-et-15-aout
article exists
https://www.lesoir.be/529839/article/2023-08-07/lavocat-franco-espagnol-juan-branco-en-passe-detre-expulse-du-senegal-vers-la
article exists
https://www.lesoir.be/529837/article/2023-08-07/france-une-femme-sequestree-depuis-2011-ete-retrouvee-son-mari-i

In [88]:
# Close the MongoDB client opened earlier in the notebook.
client.close()