# Scraping PubMed for paper titles and abstracts containing the word "HSPA4"

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

def fetch_pmids_from_page(url):
    """Fetch PMIDs from a single PubMed search results page."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pmids = soup.find_all('span', {'class': 'docsum-pmid'})
    pmid_list = [p.get_text() for p in pmids]
    return pmid_list

def retrieve_abstract(pmid):
    pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
    response = requests.get(pubmed_url)
    if response.status_code != 200:
        print(f"Failed to retrieve PubMed page for PMID {pmid}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    abstract_elem = soup.find('div', class_='abstract-content selected')
    if abstract_elem is None:
        print(f"Abstract not found for PMID {pmid}")
        return None

    abstract_text = abstract_elem.get_text(separator='\n').strip()
    return abstract_text

def scrape_pubmed_details(pmid):
    url = f'https://pubmed.ncbi.nlm.nih.gov/{pmid}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    title = soup.select('h1.heading-title')[0].text.strip() if soup.select('h1.heading-title') else "Title not found"
    abstract = retrieve_abstract(pmid)
    
    data = {'title': title, 'pmid': pmid, 'url': url, 'abstract': abstract}
    return data

# Initialize variables
all_pmids = []
out = []

# Calculate the number of pages based on the total number of articles (201) and articles per page (10)
total_pages = 201// 10 + (1 if 201 % 10 else 0)  # Adding 1 if there's a remainder to account for the last partial page

for page_number in range(total_pages):
    print(f"Processing page {page_number + 1} of {total_pages}")
    # Construct the URL for the current page
    url = f'https://pubmed.ncbi.nlm.nih.gov/?term=hspa4&page={page_number + 1}'
    
    # Fetch PMIDs from the current page
    pmids_on_page = fetch_pmids_from_page(url)
    all_pmids.extend(pmids_on_page)
    
    # Respectful scraping: pause between requests
    time.sleep(1)

# Scrape details for each PMID
for pmid in all_pmids:
    print(f"Scraping details for PMID: {pmid}")
    details = scrape_pubmed_details(pmid)
    out.append(details)
    
    # Respectful scraping: pause between requests
    time.sleep(1)


Processing page 1 of 21
Processing page 2 of 21
Processing page 3 of 21
Processing page 4 of 21
Processing page 5 of 21
Processing page 6 of 21
Processing page 7 of 21
Processing page 8 of 21
Processing page 9 of 21
Processing page 10 of 21
Processing page 11 of 21
Processing page 12 of 21
Processing page 13 of 21
Processing page 14 of 21
Processing page 15 of 21
Processing page 16 of 21
Processing page 17 of 21
Processing page 18 of 21
Processing page 19 of 21
Processing page 20 of 21
Processing page 21 of 21
Scraping details for PMID: 30643287
Scraping details for PMID: 38589927
Scraping details for PMID: 34163243
Scraping details for PMID: 25732714
Abstract not found for PMID 25732714
Scraping details for PMID: 34754620
Scraping details for PMID: 37339521
Scraping details for PMID: 35628491
Scraping details for PMID: 35158021
Scraping details for PMID: 23980576
Scraping details for PMID: 21487003
Scraping details for PMID: 22884543
Scraping details for PMID: 12005543
Scraping detail

In [4]:
from datetime import datetime
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# save the results to a CSV file with the current time stamp
filename = f"zero_shot_classification_results_{current_time}.csv"

# Convert to DataFrame
df = pd.DataFrame(out)

# Save to Excel
df.to_csv('pubmed_scraped_data_hspa4.csv', index=False)

df.head()


                                               title      pmid  \
0  Tumor-educated B cells selectively promote bre...  30643287   
1  HSPA4 upregulation induces immune evasion via ...  38589927   
2  HSPA4 Knockdown Retarded Progression and Devel...  34163243   
3  HSPA4, the "Evil Chaperone" of the HSP Family,...  25732714   
4  Significant correlation between HSPA4 and prog...  34754620   

                                        url  \
0  https://pubmed.ncbi.nlm.nih.gov/30643287   
1  https://pubmed.ncbi.nlm.nih.gov/38589927   
2  https://pubmed.ncbi.nlm.nih.gov/34163243   
3  https://pubmed.ncbi.nlm.nih.gov/25732714   
4  https://pubmed.ncbi.nlm.nih.gov/34754620   

                                            abstract  
0  Primary tumors may create the premetastatic ni...  
1  Introduction:\n        \n\n      \n      Gastr...  
2  Purpose:\n        \n\n      \n      Colorectal...  
3                                               None  
4  Background:\n        \n\n      \n      Hep