In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET
import os

In [2]:
def fetch_doi(pm_id):
    """Look up the DOI of a PubMed record through NCBI Entrez.

    Args:
        pm_id: PubMed identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        str: the whitespace-stripped DOI, when one is found and the record's
            first ``<Language>`` element is ``"eng"`` (or no language is given).
        None: when the record has no DOI or is tagged with another language.
        False: when the Entrez request itself failed (the error is printed).

    Raises:
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
    """
    Entrez.email = "ae22b024@smail.iitm.ac.in"
    try:
        handle = Entrez.efetch(db="pubmed", id=pm_id, rettype="full", retmode="xml")
        try:
            xml_data = handle.read()
        finally:
            # Release the handle even when read() fails part-way through.
            handle.close()
    except Exception as e:
        print(f"Error fetching data for PM ID {pm_id}: {e}")
        return False

    root = ET.fromstring(xml_data)

    def language_check():
        """Return True when the first <Language> tag is "eng" or is missing/empty."""
        language_tag = root.find('.//Language')
        if language_tag is None or language_tag.text is None:
            return True    # no language information: treated as English
        return language_tag.text == "eng"

    def first_doi(tag, id_attribute):
        """Return the first non-empty DOI held by a <tag> element, else None."""
        for element in root.findall(f'.//{tag}'):
            if element.get(id_attribute) == 'doi':
                # element.text is None for an empty element; skip it rather than crash.
                value = (element.text or "").strip()
                if value:
                    return value
        return None

    # The check must be *called*: the bare function object is always truthy.
    if not language_check():
        return None

    # Prefer ELocationID; fall back to the ArticleId list.
    return first_doi('ELocationID', 'EIdType') or first_doi('ArticleId', 'IdType')

In [3]:
pm_ids = [38861610, 38838020, 38771875, 38814864]
full_text_url = []

# Resolve each PubMed ID to its doi.org landing URL.
for pm_id in pm_ids:
    doi = fetch_doi(pm_id)
    # fetch_doi returns None/False when no usable DOI exists. Formatting that
    # into the f-string would yield the truthy string "https://doi.org/None",
    # so store None instead. The placeholder keeps positions aligned with pm_ids.
    url = f"https://doi.org/{doi}" if doi else None
    if url:
        print(url)
    else:
        print(f"No DOI resolved for PM ID {pm_id}")
    full_text_url.append(url)

https://doi.org/10.1073/pnas.2311865121
https://doi.org/10.1073/pnas.2311241121
https://doi.org/10.1073/pnas.2317563121
https://doi.org/10.1073/pnas.2407437121


In [12]:
def _extract_page_text(html):
    """Join the text of every heading (h1-h6) and role="paragraph" div in html."""
    soup = BeautifulSoup(html, 'html.parser')
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    text_elements = [
        element.get_text()
        for element in soup.find_all(['div'] + heading_tags)
        if element.name in heading_tags or element.get('role') == 'paragraph'
    ]
    return ' '.join(text_elements)


def _clean_full_text(full_text):
    """Drop the fixed leading boilerplate and everything from "Citation statements" on."""
    prefix = "Featured Topics Articles By Topic Featured Topics Articles By Topic Featured Topic Articles By Topic ARTICLES AUTHORS Featured Topics Articles By Topic Featured Topics Articles By Topic Featured Topic Articles By Topic "
    if full_text.startswith(prefix):
        full_text = full_text[len(prefix):]
    truncation_point = full_text.find("Citation statements")
    if truncation_point != -1:
        full_text = full_text[:truncation_point]
    return full_text


def get_full_text_table(full_url, j, output_dir=None):
    """Open full_url in Chrome, scrape its text and save it to a numbered file.

    The page's headings and ``role="paragraph"`` divs are joined with spaces,
    cleaned of known boilerplate, and written to ``extracted_text_{j}.txt``.
    Nothing is written when no text is found or when the page contains
    "DOI Not Found"; a message is printed instead.

    Args:
        full_url: URL to load in the browser.
        j: number used in the output file name.
        output_dir: directory for the output file. Defaults to the original
            hard-coded location; it is created if it does not exist.

    Returns:
        None.
    """
    if output_dir is None:
        directory = r'C:\Users\Ajay Kanna\Desktop\UGRC\extract abstract\Proceedings of the National Academy of Sciences of the United States of America\full_text_stored'
    else:
        directory = output_dir

    driver = webdriver.Chrome()
    try:
        driver.get(full_url)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if navigation fails, so that no
        # Chrome/chromedriver process is left behind.
        driver.quit()

    full_text = _extract_page_text(html)

    # removing unwanted text
    unwanted_texts = [
            "DOI Not Found"
        ]
    if any(unwanted in full_text for unwanted in unwanted_texts):
        full_text = ""

    if not full_text.strip():
        print(f"No text extracted from {full_url}")
        return

    full_text = _clean_full_text(full_text)

    os.makedirs(directory, exist_ok=True)
    output_file_path = os.path.join(directory, f'extracted_text_{j}.txt')
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(full_text)
    # Report success only once the file has actually been written.
    print("Text extracted.")

# Scrape every collected URL; j (1-based) becomes the output file number.
for j, url in enumerate(full_text_url, start=1):
    if not url:
        # Falsy entry: there is no URL to visit for this position.
        print("can't extract: maybe not written in english")
        continue
    get_full_text_table(url, j)

Text extracted.
Text extracted.
Text extracted.
No text extracted from https://doi.org/10.1073/pnas.2407437121
