# Fetches articles from PubMed Central (PMC) and processes them so they can be chunked, vectorized, and stored in Qdrant

In [2]:
from Bio import Entrez
from lxml import etree
from io import BytesIO
import re
import spacy
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from collections import defaultdict

In [42]:
# Contact address that Biopython attaches to Entrez requests
Entrez.email = "charlie.kotula@gmail.com"

# Search query for getting relevant research articles
# NOTE(review): the first group mixes AND and OR without inner parentheses;
# confirm the grouping the search engine applies is the one intended.
query = """
(
  rehabilitation AND "physical therapy" OR "return to sport" OR "return to play"
) AND (
  injury OR surgery OR postoperative OR musculoskeletal
) AND (
  exercise OR "therapeutic exercise" OR training
) AND (
  review[pt] OR systematic review[pt] OR meta-analysis[pt]
)
"""

# Filled below: UID -> {'title': normalized title, 'pmcid': PMC accession}
metadata = {}

# Search PMC for the UIDs of matching articles
handle = Entrez.esearch(
    db='pmc',
    term=query,
    retmax=10,  # TODO: placeholder cap on the number of hits (original note: "change")
)
try:
    uids = Entrez.read(handle)['IdList']
finally:
    # Release the connection even if parsing the response fails
    handle.close()

# Get summaries for metadata
summary = Entrez.esummary(
    db='pmc',
    id=','.join(uids)
)
try:
    records = Entrez.read(summary)
finally:
    # The summary handle was previously left open
    summary.close()

# Map UIDs to titles and pmcids
for rec in records:
    # Normalize the title into a lowercase, underscore-separated label
    title = rec['Title'].lower()
    title = re.sub(r'[^a-z0-9]+', '_', title)

    metadata[rec['Id']] = {
        'title': title,
        'pmcid': rec['ArticleIds']['pmcid']
    }

######################################################
# Functions to extract, clean, and chunk text from PMC
def get_xml(pmc_id):
    """
    Returns the xml tree representation of the PMC article corresponding to
    the input UID.

    The article is requested from the 'pmc' database with retmode='xml' and
    the response is parsed with lxml. The network handle is always closed,
    including when reading the response raises.
    """
    handle = Entrez.efetch(
        db='pmc',
        id=pmc_id,
        retmode='xml',
    )

    try:
        xml_dat = handle.read()
    finally:
        # Previously the handle was never closed, leaking the connection
        handle.close()

    # Converts xml bytes to tree
    return etree.parse(BytesIO(xml_dat))

def get_text(root):
    """
    Returns a list of (section title, content) tuples for the xml tree root.

    Every <xref> element is removed from the tree first (the tree is modified
    in place), together with an opening bracket/parenthesis directly before it
    and closing brackets/parentheses directly after it. The text that follows
    each removed <xref> is kept.

    Sections are the <sec> elements under <body>. A section is skipped when it
    has no title text or no direct <p> children. Titles are lowercased and the
    paragraphs of a section are joined with single spaces.
    """
    text = []

    # Remove references
    for xref in root.xpath('.//xref'):
        parent = xref.getparent()
        if parent is None:
            continue

        # The text directly before the xref lives in the previous sibling's
        # tail, or in parent.text when the xref is the first child.
        prev = xref.getprevious()

        # Handles punctuation before ref
        if prev is not None:
            if prev.tail:
                prev.tail = re.sub(r'[\[\(]\s*$', ' ', prev.tail)
        elif parent.text:
            # xref is the first child → clean parent.text
            parent.text = re.sub(r'[\[\(]\s*$', ' ', parent.text)

        # Handles punctuation after ref
        tail = xref.tail
        if tail:
            tail = re.sub(r'^\s*[\]\)]*', ' ', tail)

            # Removing an element also removes its tail, so move the cleaned
            # tail onto the preceding text node before dropping the xref;
            # otherwise the prose after every citation would be lost.
            if prev is not None:
                prev.tail = (prev.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail
            xref.tail = None

        parent.remove(xref)

    for sec in root.xpath('.//body//sec'):
        title = sec.findtext('title')
        if not title:
            continue
        title = title.lower()

        # Gets paragraphs from each section (direct <p> children only)
        paragraphs = [
            ''.join(p.itertext()) for p in sec.findall('p')
        ]

        # Add sections to sections list
        if paragraphs:  # ignores empty sections
            text.append((title, ' '.join(paragraphs)))

    return text

def clean_text(text):
    """
    Normalizes the whitespace of every section in the article text.

    Takes an iterable of (section, words) pairs and returns a new list of
    pairs in the same order. Section names are passed through untouched; in
    the words, each run of whitespace is collapsed to one space, non-breaking
    spaces are replaced by plain spaces, and both ends are stripped.
    """
    return [
        (heading, re.sub(r'\s+', ' ', body).replace('\xa0', ' ').strip())
        for heading, body in text
    ]

def semantic_chunking(text, pmc_id, title):
    """
    Performs chunking based on meaningful breakpoints, (newlines, periods, etc.)

    text is an iterable of (section, words) pairs, pmc_id is the article
    identifier and title is the article title. Returns a list of langchain
    Documents, one per chunk, in section order. Each Document's metadata
    holds:
        "uid":      pmc_id as given
        "article":  title as given
        "section":  the section name lowercased, with each run of characters
                    other than a-z and 0-9 replaced by "_"
        "chunk_id": "<pmc_id>::<section>::<index of the chunk in its section>"

    Previously this function returned an undefined name and raised NameError
    on every call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=['\n\n', '\n', '. ', ' ', '']
    )

    chunks = []
    for section, words in text:
        # Create section label for metadata
        section_label = re.sub(r'[^a-z0-9]+', '_', section.lower())

        for i, chunk in enumerate(splitter.split_text(words)):
            chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "uid": pmc_id,
                        "article": title,
                        "section": section_label,
                        "chunk_id": f'{pmc_id}::{section_label}::{i}'
                    }
                )
            )

    return chunks

In [43]:
# Extract sections: fetch the XML of the second search hit (uids[1]) and
# collect its (section title, text) pairs.
tree = get_xml(uids[1])
root = tree.getroot()
text = get_text(root)

In [44]:
# Normalize whitespace in every extracted section
cleaned_text = clean_text(text)

In [45]:
# Chunk every cleaned section of the second search hit into langchain Documents
documents = []

uid = uids[1]
pmcid = metadata[uid]['pmcid']
title = metadata[uid]['title']

# Built in this cell so it runs on a fresh kernel: no module-level `splitter`
# is defined anywhere else in the notebook.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=['\n\n', '\n', '. ', ' ', '']
)

# The loop variable is deliberately not named `text`, so the section list
# produced by the extraction cell is not overwritten.
for section, section_text in cleaned_text:
    # Create section label for metadata
    section = section.lower()
    section = re.sub(r'[^a-z0-9]+', '_', section)
    print(section)

    # Chunk text
    chunks = splitter.split_text(section_text)

    # Create langchain Documents
    for i, chunk in enumerate(chunks):
        doc = Document(
            page_content=chunk,
            metadata={
                "uid": uid,
                "pmcid": pmcid,
                "article": title,
                "section": section,
                "chunk_id": f'{uid}::{section}::{i}'
            }
        )

        documents.append(doc)


introduction
case_report
surgical_technique
post_operative_rehabilitation_and_clinical_outcomes
discussion
conclusion


In [46]:
# Display the generated Document objects for inspection
documents

[Document(metadata={'uid': '12778911', 'pmcid': 'PMC12778911', 'article': 'meniscal_transplant_using_autologous_semitendinosus_tendon_leads_to_clinically_meaningful_quality_of_life_improvement_a_case_report', 'section': 'introduction', 'chunk_id': '12778911::introduction::0'}, page_content='The meniscus is a fibrocartilaginous structure essential to the normal biomechanics of the knee joint'),
 Document(metadata={'uid': '12778911', 'pmcid': 'PMC12778911', 'article': 'meniscal_transplant_using_autologous_semitendinosus_tendon_leads_to_clinically_meaningful_quality_of_life_improvement_a_case_report', 'section': 'introduction', 'chunk_id': '12778911::introduction::1'}, page_content='. Its primary functions include the distribution and dissipation of axial loads across the tibiofemoral joint through the integrated architecture of radial and circumferential collagen fibers; enhancement of joint stability and congruity by increasing the articular contact area and limiting excessive movement;