# Scraping DCG.media

%pip install bs4 pandas numpy ydata-profiling plotly tqdm ipywidgets

In [1]:
### SCRAPING
import requests as rq
from bs4 import BeautifulSoup

### DATABASES
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

### VISUALISATION
#import plotly.express as px

### DIVERS
from tqdm.notebook import tqdm

### FORMAT
from datetime import datetime, timezone
import time
import json

## Définition des variables

In [2]:
website = 'spotemploi'
abbr = 'spe'
tld = '.com'

In [3]:
page_number = 1
top_url = f'https://www.{website+tld}'
top_search_url = f'{top_url}/page/{page_number}/?s'

## Définition des fonctions

### Obtenir le nombre de pages de résultats de recherche

In [4]:
def get_max_page_results(top_search_url):
    r = rq.get(top_search_url)
    soup = BeautifulSoup(r.content)
    page_final = soup.find('a', class_ = 'last').text
    return int(page_final)
get_max_page_results(top_search_url)

10

### Obtenir la liste des pages de résultats de recherche

In [5]:
def get_all_results_pages(page_final):
    urls = []
    for i in range(page_final):
        i = f'{top_url}/page/{i+1}/?s'
        urls.append(i)
    return urls
get_all_results_pages(get_max_page_results(top_search_url))

['https://www.spotemploi.com/page/1/?s',
 'https://www.spotemploi.com/page/2/?s',
 'https://www.spotemploi.com/page/3/?s',
 'https://www.spotemploi.com/page/4/?s',
 'https://www.spotemploi.com/page/5/?s',
 'https://www.spotemploi.com/page/6/?s',
 'https://www.spotemploi.com/page/7/?s',
 'https://www.spotemploi.com/page/8/?s',
 'https://www.spotemploi.com/page/9/?s',
 'https://www.spotemploi.com/page/10/?s']

### Obtenir tous les liens des articles sur une page de recherche

In [6]:
def get_articles_on_page(search_results):
    urls_articles = []
    s = rq.Session()
    for p in tqdm(search_results):
        r = s.get(p)
        soup = BeautifulSoup(r.content)
        articles = soup.find_all('h2', class_ = 'entry-title')
        for article in articles:
            url = article.find('a', href=True)['href']
            urls_articles.append(url)
    return urls_articles

In [7]:
get_articles_on_page(get_all_results_pages(get_max_page_results(top_search_url)))

  0%|          | 0/10 [00:00<?, ?it/s]

['https://www.spotemploi.com/pourquoi-la-formation-est-la-cle-dune-prevention-efficace-des-risques-professionnels/',
 'https://www.spotemploi.com/ent-tours-se-connecter-en-ligne/',
 'https://www.spotemploi.com/mycampus-connexion-a-mycampus-eduservices/',
 'https://www.spotemploi.com/les-cles-pour-reussir-son-test-psychotechnique-sncf/',
 'https://www.spotemploi.com/cet-apres-midi-ou-cette-apres-midi-une-guide-sur-lorthographe-correcte/',
 'https://www.spotemploi.com/bts-decouvrez-lage-de-chaque-membre-de-ce-groupe-de-k-pop-sensationnel/',
 'https://www.spotemploi.com/bts-decouvrez-lage-de-chaque-membre-de-ce-groupe-de-k-pop-sensationnel-2/',
 'https://www.spotemploi.com/leo-uga-decouvrez-les-programmes-et-ressources-de-luniversite-de-georgie/',
 'https://www.spotemploi.com/optimiser-lutilisation-de-la-messagerie-academique-nancy-metz/',
 'https://www.spotemploi.com/ent-u-bordeaux-portail-universitaire/',
 'https://www.spotemploi.com/optimisez-votre-experience-dapprentissage-avec-mycamp

### Récupérer les infos qu'on souhaite sur chaque article

In [8]:
def get_article_info(r, url_article):
    # Liste de variables
    noms_variables = [
        'url',  # URL scrapée
        'canonical_url', 'slug', 'meta_title', 'meta_desc',  # Infos issues de la balise meta
        'date_published', 'date_modified',  # Infos dates
        'author',  # Parfois en meta
        'title', 'category', 'views', 'reading_time',  # Metadonnées contenues ailleurs que dans la balise meta
        'content', 'raw_content'  # Contenu de la page
    ]
    
    # Initialisation du dictionnaire
    data = {nom: np.nan for nom in noms_variables}
    
    data['url'] = url_article
    
    soup = BeautifulSoup(r.content, 'lxml')
    
    try:
        data['canonical_url'] = soup.find('link', {'rel': 'canonical'})['href']
    except:
        pass
    
    try:
        data['slug'] = data['canonical_url'].split('/')[-2] if data['canonical_url'] else np.nan
    except:
        pass

    try:
        data['meta_title'] = soup.find('meta', {'property': 'og:title'})['content']
    except:
        pass

    try:
        data['meta_desc'] = soup.find('meta', {'property': 'og:description'})['content']
    except:
        pass

    try:
        data['date_published'] = pd.to_datetime(soup.find('meta', {'property': 'article:published_time'})['content'][:-6], utc=True)
    except:
        pass

    try:
        data['date_modified'] = pd.to_datetime(soup.find('meta', {'property': 'article:modified_time'})['content'][:-6], utc=True)
    except:
        pass

    try:
        data['author'] = soup.find("meta", {'name': 'author'})['content']
    except:
        try:
            data['author'] = soup.find(class_="author").text.split('Publié par ')[-1]
        except:
            pass

    try:
        data['title'] = soup.find('h1').text
    except:
        pass

    try:
        data['category'] = [x.text for x in soup.find_all(class_="tdb-entry-category")]
    except:
        pass

    try:
        data['views'] = int(soup.find('span', class_=lambda x: x and x.startswith('td-nr-views-')).text)
    except:
        pass

    try:
        data['reading_time'] = int(soup.find('meta', {'name': 'twitter:data2'})['content'].split(' ')[0]) if soup.find('meta', {'name': 'twitter:data2'}) and 'minutes' in soup.find('meta', {'name': 'twitter:data2'})['content'] else np.nan
    except:
        pass

    try:
        data['content'] = BeautifulSoup(str(soup.find("div", class_="td-post-content")), 'lxml').get_text()
    except:
        pass

    try:
        data['raw_content'] = str(soup.find("div", class_="td-post-content"))
    except:
        pass

    # Ajout de features
    data['length'] = len(str(data['content']).split())
    
    if pd.notnull(data['date_published']):
        data['days_since_published'] = int((pd.Timestamp.now(tz='UTC') - data['date_published']).days)
    else:
        data['days_since_published'] = np.nan
    
    # Création de la série
    series = pd.Series(data)
    series.name = url_article
    
    # Ajout de features supplémentaires
    if pd.notnull(data.get('views')) and pd.notnull(data.get('days_since_published')):
        series['views_daily'] = series['views'] / series['days_since_published']
        series['views_monthly'] = series['views_daily'] * 30
    else:
        series['views_daily'] = np.nan
        series['views_monthly'] = np.nan
    
    series['website'] = top_url
    
    return series

## Fonction finale

In [9]:
article_list = get_articles_on_page(get_all_results_pages(get_max_page_results(top_search_url)))

  0%|          | 0/10 [00:00<?, ?it/s]

### Fonction finale

In [10]:
# Import des packages
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Initialisation du compteur
start_time = time.time()

# Fonction principale qui sauvegarde le DataFrame dans un fichier CSV et le retourne
def scrape_all_articles(urls, batch_size=250, max_workers=20):
    # Initialisation d'une session réutilisable
    session = rq.Session()

    # Création d'une fonction qui traite chaque URL
    def process_url(url):
            try:
                r = session.get(url)
                return get_article_info(r, url)
            except Exception as e:
                print(f"Erreur pour {url}: {str(e)}")
                return pd.Series(name=url)

    # Utilise ThreadPoolExecutor pour paralléliser le traitement
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Parcourt la liste d'URLs par lots de taille 'batch_size'
        for i in range(0, len(urls), batch_size):
            # Extrait un lot d'URLs
            batch = urls[i:i+batch_size]
            # Crée et soumet des tâches pour chaque URL du lot
            futures = [executor.submit(process_url, url) for url in batch]
            
            # Traite les résultats au fur et à mesure qu'ils sont terminés
            for future in tqdm(as_completed(futures), total=len(batch), desc=f"Batch {i//batch_size + 1} / {len(urls)//batch_size+1}"):
                # Récupère le résultat de la tâche
                result = future.result()
                # Si le résultat n'est pas None (pas d'erreur), l'ajoute aux résultats
                if result is not None:
                    results.append(result)
    
    # Création du DataFrame final
    df_final = pd.DataFrame(results)
    
    # Ajout de colonnes calculées
    df_final['website'] = top_url
    df_final['scraping_date'] = pd.Timestamp.now(tz='UTC')

    df_final.columns = ['article_url', 'article_canonical_url', 'article_slug',
       'article_meta_title', 'article_meta_desc', 'article_date_published',
       'article_date_modified', 'article_author', 'article_title',
       'article_category', 'article_views', 'article_reading_time',
       'article_content', 'article_raw_content', 'article_length',
       'days_since_published', 'article_views_daily', 'article_views_monthly',
       'website', 'scraping_date']

    # Sauvegarde en CSV
    df_final.to_csv(f'scraping_{abbr}.csv', sep='|', index=True)
    print(f"Scraping terminé. Total d'articles : {len(df_final)}")
    return df_final

df = scrape_all_articles(article_list)

end_time = time.time()
print(end_time - start_time)

Batch 1 / 3:   0%|          | 0/250 [00:00<?, ?it/s]

Batch 2 / 3:   0%|          | 0/250 [00:00<?, ?it/s]

Batch 3 / 3:   0%|          | 0/32 [00:00<?, ?it/s]

Scraping terminé. Total d'articles : 532
67.06396150588989


In [11]:
# Création d'un rapport de profil avec pandas_profiling
profile = ProfileReport(df, title=f"{abbr.title()} Scraping Report", explorative=True)

# Génération du rapport au format HTML
profile.to_file(f"scraping_report_{abbr}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot reindex on an axis with duplicate labels')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]