In [31]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
from Bio import Entrez
import xml.etree.ElementTree as ET

In [32]:
def fetch_from_doi(pm_id):
    """Look up the DOI recorded for a PubMed ID via Entrez ``efetch``.

    Despite the name, the input is a PubMed ID and the output is a DOI.

    Parameters
    ----------
    pm_id : int or str
        PubMed identifier, passed as ``id`` to ``Entrez.efetch``.

    Returns
    -------
    str or None or bool
        * the DOI string (whitespace-stripped) when the record's first
          ``<Language>`` element is ``"eng"`` -- or carries no language at
          all -- and a DOI is present;
        * ``None`` when the record is not in English or has no DOI;
        * ``False`` when the efetch request itself fails (kept for
          backward compatibility with existing callers).

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the payload returned by efetch is not well-formed XML.
    """
    Entrez.email = "ae22b024@smail.iitm.ac.in"
    try:
        handle = Entrez.efetch(db="pubmed", id=pm_id, rettype="full", retmode="xml")
        try:
            xml_data = handle.read()
        finally:
            # Release the handle even when read() fails.
            handle.close()
    except Exception as e:
        print(f"Error fetching data for PM ID {pm_id}: {e}")
        return False

    root = ET.fromstring(xml_data)

    def language_check():
        """Return False only when the first <Language> element is explicitly non-English."""
        language_tag = root.find('.//Language')
        if language_tag is not None and language_tag.text is not None:
            return language_tag.text == "eng"
        return True    # might become a problem in future

    def first_doi(tag, id_attr):
        """Return the first non-empty DOI stored in ``tag`` elements, or None."""
        for node in root.findall(f'.//{tag}'):
            # An element such as <ELocationID EIdType="doi"/> has text None;
            # skip it instead of crashing on .strip().
            if node.get(id_attr) == 'doi' and node.text and node.text.strip():
                return node.text.strip()
        return None

    # The check must be *called*: a bare function object is always truthy,
    # which silently disabled the language filter.
    if not language_check():
        return None

    # Prefer the article-level ELocationID and fall back to the ArticleId list.
    return first_doi('ELocationID', 'EIdType') or first_doi('ArticleId', 'IdType')

In [33]:
# Extract the full-text link from the bottom of the PubMed page.
# Open question: if a link is present at the bottom of the page, can we conclude it is also present in the sidebar?
import requests
from bs4 import BeautifulSoup

def language_check_for_bottom_link(pm_id):
    """Report whether the PubMed record for ``pm_id`` is flagged as English.

    The record is fetched as XML through ``Entrez.efetch`` and only the first
    ``<Language>`` element is inspected.

    Returns
    -------
    bool
        ``True`` when that element reads ``"eng"`` or when no language text is
        present; ``False`` when another language is given or the fetch fails.
    """
    Entrez.email = "ae22b024@smail.iitm.ac.in"
    try:
        response = Entrez.efetch(db="pubmed", id=pm_id, rettype="full", retmode="xml")
        payload = response.read()
        response.close()
    except Exception as err:
        print(f"Error fetching data for PM ID {pm_id}: {err}")
        return False

    first_language = ET.fromstring(payload).find('.//Language')

    # No language information: treated as English (might become a problem in future).
    if first_language is None or first_language.text is None:
        return True

    return first_language.text == "eng"


def extract_full_text_url(url, pm_id, timeout=30):
    """Resolve the publisher full-text URL for a PubMed article page.

    The page at ``url`` is downloaded and searched for the first
    ``a.link-item`` inside ``div.full-text-links-list``. When the page cannot
    be retrieved, or it contains no such link, the DOI returned by
    ``fetch_from_doi`` is used to build a ``https://doi.org/...`` URL instead.

    Parameters
    ----------
    url : str
        PubMed article page URL.
    pm_id : int or str
        PubMed ID of the same article; used for the language check and for
        the DOI fallback.
    timeout : float, optional
        Seconds to wait for the HTTP request (default 30) so that a stalled
        connection cannot hang the notebook.

    Returns
    -------
    str or None
        The full-text URL, or ``None`` when the article is not in English or
        no link/DOI could be determined.
    """
    def doi_fallback():
        """Build a doi.org URL, or return None when no DOI is available."""
        doi = fetch_from_doi(pm_id)
        # fetch_from_doi yields None/False when it has nothing usable;
        # formatting those would produce a bogus, truthy URL such as
        # "https://doi.org/None".
        return f"https://doi.org/{doi}" if doi else None

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve the webpage: {e}")
        return doi_fallback()

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return doi_fallback()

    soup = BeautifulSoup(response.content, 'html.parser')
    div_element = soup.find('div', class_='full-text-links-list')
    # Either lookup can come back empty; guard both instead of crashing on None.
    anchor_element = div_element.find('a', class_='link-item') if div_element is not None else None
    href = anchor_element.get('href') if anchor_element is not None else None

    if not href:
        print("None")
        return doi_fallback()

    if language_check_for_bottom_link(pm_id):
        print(href, pm_id)
        return href

    print("None")
    return None


# PubMed IDs to process. Earlier test sets are kept below for reference:
#l = [38801081, 38830938, 38801746, 38800010]
#l = [38848579]   # the link redirects to a PDF / DOI
#l = [38848750]   # article is entirely in German
#l = [38857300, 38843221, 37967080, 38011087] # PLOS ONE
l = [38861610, 38838020]
# Resolved full-text URL for each ID, in the same order as `l`;
# an entry can be None when extract_full_text_url finds nothing usable.
full_text_url = []
for i in l:
  # Build the PubMed article page URL from the ID and resolve its full-text link.
  full_text_url.append(extract_full_text_url('https://pubmed.ncbi.nlm.nih.gov/' + f'{i}/', i))

None
https://www.pnas.org/doi/abs/10.1073/pnas.2311241121?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub  0pubmed 38838020


In [34]:
def get_full_text_table(full_url, j):
    """Render an article page in Chrome, write its text to a file and print its tables.

    Parameters
    ----------
    full_url : str
        URL of the full-text page to open.
    j : int or str
        Suffix for the output file name ``extracted_text_{j}.txt``.

    Side effects
    ------------
    * Starts a Chrome WebDriver session and always closes it again.
    * Prints every ``<table>`` on the page as a DataFrame.
    * Writes the text of all headings (h1-h6) and of every
      ``<div role="paragraph">``, joined by single spaces, to
      ``extracted_text_{j}.txt`` (UTF-8) in the working directory.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(full_url)
        html = driver.page_source
    finally:
        # Release the browser even when navigation fails; otherwise every
        # failing URL leaves an orphaned Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    # Collect headings and <div role="paragraph"> blocks in document order.
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    text_elements = [
        element.get_text()
        for element in soup.find_all(['div'] + heading_tags)
        if element.name in heading_tags or element.get('role') == 'paragraph'
    ]

    # Combine all text elements into a single string
    full_text = ' '.join(text_elements)

    # Extract and print tables
    for table in soup.find_all('table'):
        headers = [th.text.strip() for th in table.find_all('th')]
        rows = []
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if cells:
                rows.append([cell.text.strip() for cell in cells])

        # Only label the columns when the header count matches every data row;
        # otherwise pandas raises ValueError (e.g. tables that also use <th>
        # for row labels or multi-row headers).
        headers_fit = bool(headers) and all(len(row) == len(headers) for row in rows)
        df = pd.DataFrame(rows, columns=headers if headers_fit else None)
        print(df)
        # NOTE(review): the table is only printed here, nothing is persisted.
        print("Table saved.")

    print("Text extracted.")
    with open(f'extracted_text_{j}.txt', 'w', encoding='utf-8') as file:
        file.write(full_text)

# Scrape every resolved URL; the 1-based position doubles as the output file suffix.
for j, url in enumerate(full_text_url, start=1):
    if not url:
        # No usable URL was resolved for this article.
        print("can't extract: maybe not written in english")
        continue
    get_full_text_table(url, j)

Empty DataFrame
Columns: []
Index: []
Table saved.
Text extracted.
Empty DataFrame
Columns: []
Index: []
Table saved.
Text extracted.


In [35]:
# Superseded draft of get_full_text_table, kept only as a string literal so it
# is never executed. It differs from the active version by collecting text
# from <p> tags (plus headings) rather than from <div role="paragraph">.
'''def get_full_text_table(full_url, j):
    driver = webdriver.Chrome()
    driver.get(full_url)

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    p_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    text = ' '.join([p.get_text() for p in p_tags])

    tables = soup.find_all('table')
    for table in tables:
        headers = [th.text.strip() for th in table.find_all('th')]
        rows = []
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue
            row = [cell.text.strip() for cell in cells]
            rows.append(row)

        df = pd.DataFrame(rows, columns=headers)
        print(df)
        print("Table saved.")

    print("Text extracted.")
    #print(text)
    with open(f'extracted_text_{j}.txt', 'w', encoding='utf-8') as file:
        #file.write("\n\n\n\n")
        file.write(text)

    driver.quit()

for j, url in enumerate(full_text_url, start=1):
    if url:
        get_full_text_table(url, j)
    else:
        print("can't extract: maybe not written in english")'''

'def get_full_text_table(full_url, j):\n    driver = webdriver.Chrome()\n    driver.get(full_url)\n\n    html = driver.page_source\n    soup = BeautifulSoup(html, \'html.parser\')\n\n    p_tags = soup.find_all([\'p\', \'h1\', \'h2\', \'h3\', \'h4\', \'h5\', \'h6\'])\n    text = \' \'.join([p.get_text() for p in p_tags])\n\n    tables = soup.find_all(\'table\')\n    for table in tables:\n        headers = [th.text.strip() for th in table.find_all(\'th\')]\n        rows = []\n        for tr in table.find_all(\'tr\'):\n            cells = tr.find_all(\'td\')\n            if not cells:\n                continue\n            row = [cell.text.strip() for cell in cells]\n            rows.append(row)\n\n        df = pd.DataFrame(rows, columns=headers)\n        print(df)\n        print("Table saved.")\n\n    print("Text extracted.")\n    #print(text)\n    with open(f\'extracted_text_{j}.txt\', \'w\', encoding=\'utf-8\') as file:\n        #file.write("\n\n\n\n")\n        file.write(text)\n\n    