# Scraping DCG.media

%pip install bs4 pandas numpy ydata-profiling plotly tqdm ipywidgets

In [1]:
### SCRAPING
import requests as rq
from bs4 import BeautifulSoup

### DATABASES
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

### VISUALISATION
#import plotly.express as px

### DIVERS
from tqdm.notebook import tqdm

### FORMAT
from datetime import datetime, timezone
import time
import json

## Définition des variables

In [2]:
website = 'le-managemental'
abbr = 'lmg'
tld = '.fr'

In [3]:
page_number = 1
top_url = f'https://www.{website+tld}'
top_search_url = f'{top_url}/page/{page_number}/?s'

## Définition des fonctions

### Obtenir le nombre de pages de résultats de recherche

In [4]:
def get_max_page_results(top_search_url):
    r = rq.get(top_search_url)
    soup = BeautifulSoup(r.content)
    page_final = soup.find('a', class_ = 'last').text
    return int(page_final)
get_max_page_results(top_search_url)

16

### Obtenir la liste des pages de résultats de recherche

In [5]:
def get_all_results_pages(page_final):
    urls = []
    for i in range(page_final):
        i = f'{top_url}/page/{i+1}/?s'
        urls.append(i)
    return urls
get_all_results_pages(get_max_page_results(top_search_url))

['https://www.le-managemental.fr/page/1/?s',
 'https://www.le-managemental.fr/page/2/?s',
 'https://www.le-managemental.fr/page/3/?s',
 'https://www.le-managemental.fr/page/4/?s',
 'https://www.le-managemental.fr/page/5/?s',
 'https://www.le-managemental.fr/page/6/?s',
 'https://www.le-managemental.fr/page/7/?s',
 'https://www.le-managemental.fr/page/8/?s',
 'https://www.le-managemental.fr/page/9/?s',
 'https://www.le-managemental.fr/page/10/?s',
 'https://www.le-managemental.fr/page/11/?s',
 'https://www.le-managemental.fr/page/12/?s',
 'https://www.le-managemental.fr/page/13/?s',
 'https://www.le-managemental.fr/page/14/?s',
 'https://www.le-managemental.fr/page/15/?s',
 'https://www.le-managemental.fr/page/16/?s']

### Obtenir tous les liens des articles sur une page de recherche

In [6]:
def get_articles_on_page(search_results):
    urls_articles = []
    s = rq.Session()
    for p in tqdm(search_results):
        r = s.get(p)
        soup = BeautifulSoup(r.content)
        articles = soup.find_all('h2', class_ = 'entry-title')
        for article in articles:
            url = article.find('a', href=True)['href']
            urls_articles.append(url)
    return urls_articles

In [7]:
get_articles_on_page(get_all_results_pages(get_max_page_results(top_search_url)))

  0%|          | 0/16 [00:00<?, ?it/s]

['https://www.le-managemental.fr/gestion-du-poste-client-pourquoi-former-ses-collaborateurs/',
 'https://www.le-managemental.fr/comment-optimiser-vos-chances-de-trouver-un-emploi-dans-le-secteur-entrepreneurial/',
 'https://www.le-managemental.fr/comment-faire-une-bonne-analyse-de-lexistant/',
 'https://www.le-managemental.fr/comment-faire-une-analyse-critique-dun-sujet/',
 'https://www.le-managemental.fr/code-barre-png-generer-un-code-barre-gratuitement/',
 'https://www.le-managemental.fr/parcelsapp-connexion-pour-le-suivi-de-son-colis/',
 'https://www.le-managemental.fr/ma-vie-rh-la-poste-decouvrez-les-services-rh-essentiels/',
 'https://www.le-managemental.fr/agendis-62-votre-assistant-de-planification-en-ligne/',
 'https://www.le-managemental.fr/webcsat-58-outil-pour-levaluation-de-la-satisfaction-client/',
 'https://www.le-managemental.fr/guide-didentification-au-webmail-urban-group-ratp-connexion-astuces-et-conseils/',
 'https://www.le-managemental.fr/optimisez-la-performance-de-

### Récupérer les infos qu'on souhaite sur chaque article

In [8]:
def get_article_info(r, url_article):
    # Liste de variables
    noms_variables = [
        'url',  # URL scrapée
        'canonical_url', 'slug', 'meta_title', 'meta_desc',  # Infos issues de la balise meta
        'date_published', 'date_modified',  # Infos dates
        'author',  # Parfois en meta
        'title', 'category', 'views', 'reading_time',  # Metadonnées contenues ailleurs que dans la balise meta
        'content', 'raw_content'  # Contenu de la page
    ]
    
    # Initialisation du dictionnaire
    data = {nom: np.nan for nom in noms_variables}
    
    data['url'] = url_article
    
    soup = BeautifulSoup(r.content, 'lxml')

    try:
        data['canonical_url'] = soup.find('link', {'rel': 'canonical'})['href']
    except:
        pass
    
    try:
        data['slug'] = data['canonical_url'].split('/')[-2] if data['canonical_url'] else np.nan
    except:
        pass

    try:
        data['meta_title'] = soup.find('meta', {'property': 'og:title'})['content']
    except:
        pass

    try:
        data['meta_desc'] = soup.find('meta', {'property': 'og:description'})['content']
    except:
        pass

    try:
        data['date_published'] = pd.to_datetime(soup.find('meta', {'property': 'article:published_time'})['content'][:-6], utc=True)
    except:
        pass

    try:
        data['date_modified'] = pd.to_datetime(soup.find('meta', {'property': 'article:modified_time'})['content'][:-6], utc=True)
    except:
        pass

    try:
        data['author'] = soup.find("meta", {'name': 'author'})['content']
    except:
        try:
            data['author'] = soup.find(class_="author").text.split('Publié par ')[-1]
        except:
            pass

    try:
        data['title'] = soup.find('h1').text
    except:
        pass

    try:
        data['category'] = soup.find('div', class_="entry-category").text
    except:
        pass

    try:
        data['views'] = int(soup.find('span', class_=lambda x: x and x.startswith('td-nr-views-')).text)
        print(data['views'])
    except:
        pass

    try:
        data['reading_time'] = int(soup.find('meta', {'name': 'twitter:data2'})['content'].split(' ')[0]) if soup.find('meta', {'name': 'twitter:data2'}) and 'minutes' in soup.find('meta', {'name': 'twitter:data2'})['content'] else np.nan
    except:
        pass

    try:
        data['content'] = BeautifulSoup(str(soup.find("div", class_="td-post-content")), 'lxml').get_text()
    except:
        pass

    try:
        data['raw_content'] = str(soup.find("div", class_="td-post-content"))
    except:
        pass

    # Ajout de features
    data['length'] = len(str(data['content']).split())
    
    if pd.notnull(data['date_published']):
        data['days_since_published'] = int((pd.Timestamp.now(tz='UTC') - data['date_published']).days)
    else:
        data['days_since_published'] = np.nan
    
    # Création de la série
    series = pd.Series(data)
    series.name = url_article
    
    # Ajout de features supplémentaires
    if pd.notnull(data.get('views')) and pd.notnull(data.get('days_since_published')):
        series['views_daily'] = series['views'] / series['days_since_published']
        series['views_monthly'] = series['views_daily'] * 30
    else:
        series['views_daily'] = np.nan
        series['views_monthly'] = np.nan
    
    series['website'] = top_url
    
    return series

## Fonction finale

In [9]:
article_list = get_articles_on_page(get_all_results_pages(get_max_page_results(top_search_url)))

  0%|          | 0/16 [00:00<?, ?it/s]

### Fonction finale

In [10]:
# Import des packages
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Initialisation du compteur
start_time = time.time()

# Fonction principale qui sauvegarde le DataFrame dans un fichier CSV et le retourne
def scrape_all_articles(urls, batch_size=250, max_workers=20):
    # Initialisation d'une session réutilisable
    session = rq.Session()

    # Création d'une fonction qui traite chaque URL
    def process_url(url):
            try:
                r = session.get(url)
                return get_article_info(r, url)
            except Exception as e:
                print(f"Erreur pour {url}: {str(e)}")
                return pd.Series(name=url)

    # Utilise ThreadPoolExecutor pour paralléliser le traitement
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Parcourt la liste d'URLs par lots de taille 'batch_size'
        for i in range(0, len(urls), batch_size):
            # Extrait un lot d'URLs
            batch = urls[i:i+batch_size]
            # Crée et soumet des tâches pour chaque URL du lot
            futures = [executor.submit(process_url, url) for url in batch]
            
            # Traite les résultats au fur et à mesure qu'ils sont terminés
            for future in tqdm(as_completed(futures), total=len(batch), desc=f"Batch {i//batch_size + 1} / {len(urls)//batch_size+1}"):
                # Récupère le résultat de la tâche
                result = future.result()
                # Si le résultat n'est pas None (pas d'erreur), l'ajoute aux résultats
                if result is not None:
                    results.append(result)
    
    # Création du DataFrame final
    df_final = pd.DataFrame(results)
    
    # Ajout de colonnes calculées
    df_final['website'] = top_url
    df_final['scraping_date'] = pd.Timestamp.now(tz='UTC')

    df_final.columns = ['article_url', 'article_canonical_url', 'article_slug',
       'article_meta_title', 'article_meta_desc', 'article_date_published',
       'article_date_modified', 'article_author', 'article_title',
       'article_category', 'article_views', 'article_reading_time',
       'article_content', 'article_raw_content', 'article_length',
       'days_since_published', 'article_views_daily', 'article_views_monthly',
       'website', 'scraping_date']

    # Sauvegarde en CSV
    df_final.to_csv(f'scraping_{abbr}.csv', sep='|', index=True)
    print(f"Scraping terminé. Total d'articles : {len(df_final)}")
    return df_final

df = scrape_all_articles(article_list)

end_time = time.time()
print(end_time - start_time)

Batch 1 / 4:   0%|          | 0/250 [00:00<?, ?it/s]

18
3067
988
37
2974
6787
4377
1280
707
1150
148
176
1090
921
236
736
179
487
467
1047
102
42
66
39
2933
62
2526
2378
3131
57
2892
2568
2657
2516
2309
2906
2811
2467
2953
1772
1203
896
109
63
64
300
70
7757
6787
113
144
78
2859
19846
2878
88
80
2813
112
4583
3016
3017
2871
2547
1852
2264
1449
2571
1432
1584
330
228
118
2737
1516
2890
2837
2912
2662
2903
2789
3033
2668
2654
2767
1456
1557
1119
1026
695
269
638
139
156
382
490
136
134
3977
2910
2348
2793
4080
2577
2040
1734
6787
7757
19846
1592
2878
2859
1632
2813
1393
538
3016
2941
464
503
207
145
158
170
7757
195
2365
171
213
177
381
182
179
192
182
204
219
196
215
3081
3004
2469
2768
2432
3489
3056
2457
2420
2024
2141
1605
1649
1392
1353
270
2988
1046
465
231
253
213
247
6787
1479
7757
257
19846
2859
2878
2813
3319
2117
2626
3016
1895
1395
1210
444
524
1155
941
544
3556
2532
1032
2236
3044
2739
3082
297
1989
1920
1777
414
362
313
342
1096
348
359
493
348
461
430
440
2051
409
1399
408
1296
3359
416
1821
2016
1606
2785
2519
2621
1245
900

Batch 2 / 4:   0%|          | 0/250 [00:00<?, ?it/s]

601
1889
2380
610
2464
3041
669
654668

2531
641
1189
666
628
619
663
635
627
1625
641
2157
677
2073
698
6787
722
2636
719
1798
19846
7757
2859
713
2878
2813
3016
2512
723
827
2042
722
1279
784
782
2579
2487
785
775
2381
796
802
643
734
597
2351
649
2337
2549
869
888
880
973
2387
1073
2655
813
968
2550
878
961
956
3130
976
1002
983
1822
992
3021
972
989
976
2716
2423
19846
6787
7757
976
986
2859
2878
2813
984
985
975
1021
3016
1021
994
1012
1048
1023
1532
970
1006
1480
1239
1047
1058
1020
986
1027
1142
1156
1139
2383
1173
1182
1148
1213
1177
3236
1183
1269
1261
1185
1204
1254
1249
1148
1617
1117
1206
1288
1119
1244
1225
1260
1391
1200
1285
1190
19846
7757
6787
1328
2878
2859
1223
1106
2813
1319
1360
3016
1351
1333
1444
1406
1321
1205
1403
1323
1390
1342
2640
1460
1481
1343
1516
1320
1346
1368
1275
2908
2751
1457
1377
2555
2928
1408
1439
2580
2639
1331
1365
1365
1388
2674
1476
1822
1923
2550
1422
1397
1431
1438
2691
6787
19846
2546
2386
2878
7757
1481
2859
2813
1424
3078
1446
2461
3016


Batch 3 / 4:   0%|          | 0/250 [00:00<?, ?it/s]

2960
2419
6787
1757
1548
1582
2859
7757
19846
2884
1522
2878
1974
1523
1535
1944
1516
2813
1545
3016
1954
1970
1892
2574
2000
1971
2034
2033
1691
1788
1950
2005
2006
1575
1661
2493
1968
1967
1602
1564
2506
2678
2373
1738
1715
1765
1601
2041
2707
2094
1778
1584
1535
1634
1652
1614
1655
1614
1626
2102
2126
2099
2082
6787
7757
19846
2859
2172
1619
2878
1832
2813
1855
1680
1615
1687
1629
1614
1681
3016
1639
1652
2281
2326
1693
2286
2245
2159
2343
1698
2922
1651
1674
1650
1686
2371
2922
3463
2351
2710
19846
2314
1639
2504
1675
1734
1714
1703
1707
1659
1703
1699
1721
1616
1597
2311
2510
2317
1719
2271
19846
6787
7757
2859
2878
2813
2280
2307
1694
2433
2769
2117
3131
1696
3016
2640
1724
2365
2347
2922
2380
2338
2335
1712
1500
1659
2392
1682
1704
2310
1573
1705
2536
2306
2368
2696
2532
2360
2387
1702
2330
1750
1734
2453
2772
1793
1840
2565
2440
2366
1768
2418
2420
2381
1826
7757
2593
19846
2485
6787
2859
2878
2395
3016
2813
2605
2508
2431
1874
1819
1690
1787
1836
1832
2441
2576
2415
2453
2497


Batch 4 / 4:   0%|          | 0/141 [00:00<?, ?it/s]

1784
1940
1835
1755
1825
1800
2047
2101
1846
1752
1823
1766
2434
1763
2327
1840
2703
1854
2569
1806
1824
1765
2471
2071
2004
1874
2010
2132
1940
2448
2165
2482
2078
2194
2050
2445
2360
2173
2358
2786
6787
19846
7757
2214
2203
2789
2878
2859
2202
2278
2418
2372
2328
2337
2813
2545
3016
2309
2800
2306
2360
2416
2695
2334
2348
2368
2423
2356
2337
2414
2450
2422
2407
2395
2725
2472
2379
2366
2381
2404
2461
2510
2487
2449
2376
2417
2608
2478
2566
2524
2534
2430
2542
2591
19846
2507
2385
6787
7757
2859
2615
2878
2813
2474
2513
2403
2584
2715
3016
2468
2558
2575
2533
2492
2484
2578
2523
2538
2614
2513
2633
2606
2685
2871
2816
6787
2878
2859
7757
19846
2755
2813
2859
3016
2878
2813
3016
Scraping terminé. Total d'articles : 891
74.108562707901


In [11]:
# Création d'un rapport de profil avec pandas_profiling
profile = ProfileReport(df, title=f"{abbr.title()} Scraping Report", explorative=True)

# Génération du rapport au format HTML
profile.to_file(f"scraping_report_{abbr}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot reindex on an axis with duplicate labels')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]