In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib import request
import time
import random
import os

#### **Note** :  Le code étant le même pour chaque maladie à part l'équation de recherche, le même code sera utilisé plusieurs fois en changeant juste l'équation de recherche

#### Scraping de la première page de résultat avec ajout d'un test pour assurer la bonne récupération des mots-clés

In [2]:
# URL de recherche sur PubMed avec l'équation de recherche
query = "(sleep habits OR sleep patterns OR sleep quality) AND (Alzheimer's disease OR cognitive decline OR dementia)"  #Alzheimer
url = f"https://pubmed.ncbi.nlm.nih.gov/?term={query.replace(' ', '+')}"

# Ajout d'un User-Agent pour éviter le blocage
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Envoi de la requête
response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []

    # Recherche des articles dans la page
    for result in soup.find_all("article", class_="full-docsum"):
        title_tag = result.find("a", class_="docsum-title")
        title = title_tag.text.strip() if title_tag else "Titre inconnu"
        link = "https://pubmed.ncbi.nlm.nih.gov" + title_tag["href"] if title_tag else "Lien non disponible"

        # Initialisation des valeurs par défaut
        publication_date = "Date inconnue"
        summary = "Résumé non disponible"
        keywords = "Mots-clés non disponibles"
        authors = "Auteurs non disponibles"
        affiliations = "Affiliations non disponibles"

        if title_tag and title_tag["href"]:
            article_url = link
            article_response = requests.get(article_url, headers=headers)

            if article_response.status_code == 200:
                article_soup = BeautifulSoup(article_response.text, "html.parser")

                # Extraction de l'abstract et des mots-clés dans la div "abstract"
                abstract_div = article_soup.find("div", class_="abstract")
                if abstract_div:
                    # Extraction du résumé
                    summary_tag = abstract_div.find("div", class_="abstract-content")
                    if summary_tag:
                        paragraphs = summary_tag.find_all("p")
                        summary = " ".join([p.text.strip() for p in paragraphs]) if paragraphs else summary_tag.text.strip()

                    # Extraction des mots-clés avec une approche plus flexible
                    for p in abstract_div.find_all("p"):
                        strong_tag = p.find("strong", class_="sub-title")
                        if strong_tag and "Keywords" in strong_tag.text:  # Vérifie si "Keywords" est présent dans le texte
                            keywords = p.text.replace("Keywords:", "").strip()
                            print(f" Mots-clés extraits: {keywords}")
                            break  # On arrête dès qu'on trouve les mots-clés


                # Extraction de la date de publication
                date_tag = article_soup.find("span", class_="cit") or article_soup.find("time", class_="history-date")
                if date_tag:
                    publication_date = date_tag.text.strip()

                # Extraction des auteurs
                authors_tag = article_soup.find("div", class_="authors-list")
                if authors_tag:
                    authors = ", ".join([a.text.strip() for a in authors_tag.find_all("a")])

                # Extraction des affiliations
                affiliations_tag = article_soup.find("div", class_="affiliations")
                if affiliations_tag:
                    affiliations = "; ".join([aff.text.strip() for aff in affiliations_tag.find_all("li")])

            
            time.sleep(2)   # Pause pour éviter le blocage

        articles.append([title, publication_date, link, summary, keywords, authors, affiliations])

    # Création du dataFrame
    df = pd.DataFrame(articles, columns=["Titre", "Date de publication", "Lien", "Résumé", "Mots-clés", "Auteurs", "Affiliations"])

    # Sauvegarde en CSV
    df.to_csv("alzheimer1.csv", index=False, encoding="utf-8")
else:
    print(f"Erreur lors de la requête : {response.status_code}")


 Mots-clés extraits: Alzheimer; aging; healthy elderly; mild cognitive impairment; older; sleep quality.
 Mots-clés extraits: AD; Alzheimer's dementia; Alzheimer's disease; MCI; mild cognitive impairment; sleep.
 Mots-clés extraits: Alzheimer's disease; Memory; Sleep disturbance.
 Mots-clés extraits: Alzheimer’s disease; cognitive decline; dementia.; sleep disorders; sleep parameters; sleep problems.
 Mots-clés extraits: AD risk factors; Alzheimer’s disease; brain stimulation; cognitive decline; diet; exercise; herbs; multi-therapeutic program; neurodegeneration; sleep; stress; supplements; therapeutics.
 Mots-clés extraits: Alzheimer's disease; Amyloid; Biomarkers; Cognition; Middle aged; Mild cognitive impairment; Obstructive sleep apnea; Older adults; Phosphorylated tau.


In [4]:
# verification
alzheimer = pd.read_csv("alzheimer1.csv")
alzheimer


Unnamed: 0,Titre,Date de publication,Lien,Résumé,Mots-clés,Auteurs,Affiliations
0,Sleep Quality and Aging: A Systematic Review o...,2022 Jul 11;19(14):8457.,https://pubmed.ncbi.nlm.nih.gov/35886309/,Aging is characterized by changes in the struc...,Alzheimer; aging; healthy elderly; mild cognit...,"Maria Casagrande, 1, Giuseppe Forte, 1, 2, Fra...",1 Department of Dynamic and Clinical Psycholog...
1,Pharmacological and non-pharmacological interv...,2021 Aug;30(4):e13229.,https://pubmed.ncbi.nlm.nih.gov/33289311/,Suboptimal sleep causes cognitive decline and ...,AD; Alzheimer's dementia; Alzheimer's disease;...,"Jonathan Blackman, 1, 2, Marta Swirski, 1, Jam...","1 North Bristol NHS Trust, Bristol, UK.; 2 Bri..."
2,Implications of sleep disturbance and inflamma...,2019 Mar;18(3):296-306.,https://pubmed.ncbi.nlm.nih.gov/30661858/,Nearly half of all adults older than 60 years ...,Mots-clés non disponibles,"Michael R Irwin, 1, Michael V Vitiello, 2","1 Cousins Center for Psychoneuroimmunology, Ja..."
3,Sleep and Alzheimer's disease.,2015 Feb:19:29-38.,https://pubmed.ncbi.nlm.nih.gov/24846773/,Sleep disorders are frequent in Alzheimer's di...,Alzheimer's disease; Memory; Sleep disturbance.,"Laure Peter-Derex, 1, Pierre Yammine, 2, Hélèn...","1 Service de Neurologie-sommeil, Centre Hospit..."
4,"Sleep, Cognitive impairment, and Alzheimer's d...",2017 Jan 1;40(1).,https://pubmed.ncbi.nlm.nih.gov/28364458/,Study objectives:\n \n \n Mou...,Alzheimer’s disease; cognitive decline; dement...,"Omonigho M Bubu, 1, Michael Brannick, 2, James...","1 Department of Epidemiology & Biostatistics, ..."
5,Alzheimer's disease and sleep disturbances: a ...,2019 Nov;77(11):815-824.,https://pubmed.ncbi.nlm.nih.gov/31826138/,The association between Alzheimer's disease (A...,Mots-clés non disponibles,"Conrado Regis Borges, 1, Dalva Poyares, 2, Ron...","1 Universidade de São Paulo, Faculdade de Medi..."
6,"Sleep Disturbance, Cognitive Decline, and Deme...",2017 Aug;37(4):395-406.,https://pubmed.ncbi.nlm.nih.gov/28837986/,Approximately half of older people report slee...,Mots-clés non disponibles,"Alexandra M V Wennberg, 1, Mark N Wu, 2, 3, Pa...","1 Department of Health Sciences Research, Mayo..."
7,Rationale for a Multi-Factorial Approach for t...,2023 Jan 14;24(2):1659.,https://pubmed.ncbi.nlm.nih.gov/36675177/,"Alzheimer's disease (AD) is a multifactorial, ...",AD risk factors; Alzheimer’s disease; brain st...,"Rammohan V Rao, 1, Kaavya G Subramaniam, 2, Ju...","1 Apollo Health, Burlingame, CA 94011, USA.; 2..."
8,"Obstructive sleep apnea, cognition and Alzheim...",2020 Apr:50:101250.,https://pubmed.ncbi.nlm.nih.gov/31881487/,Increasing evidence links cognitive-decline an...,Alzheimer's disease; Amyloid; Biomarkers; Cogn...,"Omonigho M Bubu, 1, Andreia G Andrade, 2, Ogie...","1 Center for Brain Health, Department of Psych..."
9,Sleep in Alzheimer's disease: a systematic rev...,2022 Apr 1;12(1):136.,https://pubmed.ncbi.nlm.nih.gov/35365609/,Polysomnography (PSG) studies of sleep changes...,Mots-clés non disponibles,"Ye Zhang, 1, Rong Ren, 2, Linghui Yang, 1, Hai...","1 Sleep Medicine Center, Department of Respira..."
