# Gets from Pub Med and process it so it can be chunked, vectorized, and stored in Qdrant

In [1]:
from Bio import Entrez
from lxml import etree
from io import BytesIO
import re
import spacy
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import time

In [2]:
Entrez.email = "charlie.kotula@gmail.com"

# Search query for getting relevant research articles
query = """
(
  rehabilitation AND "physical therapy" OR "return to sport" OR "return to play"
) AND (
  injury OR surgery OR postoperative OR musculoskeletal
) AND (
  exercise OR "therapeutic exercise" OR training
) AND (
  review[pt] OR systematic review[pt] OR meta-analysis[pt]
)
"""

def get_ids_with_metadata(query):
    """
    Gets PMC article UIDs, PMCIDs, and titles based on search query

    Args: query (str) - search query used to retrieve PMC articles
    Returns: metadata (dict) - dictionary containing PMCID and titles corresponding
        to the articles UID
    """
    # Get relevant UIDs and titles
    metadata = []
    handle = Entrez.esearch(
        db='pmc',
        term=query,
        retmax=10, # CHANGE
    )
    
    # Get relevant articles
    uids = Entrez.read(handle)['IdList']
    handle.close()
    
    # Get summaries for metadata
    summary = Entrez.esummary(
        db='pmc',
        id=','.join(uids)
    )
    records = Entrez.read(summary)
    
    # Map UIDs to titles and pmids
    for rec in records:
        title = rec['Title'].lower()
        title = re.sub(r'[^a-z0-9]+', '_', title)
    
        metadata.append(
            {
                'uid': rec['Id'],
                'pmcid': rec['ArticleIds']['pmcid'],
                'title': title
            }
        )

    return metadata

# Create metadata list to be used in multiprocessing
metadata = get_ids_with_metadata(query)

In [3]:
from data_preprocessing import process_article

if __name__ == "__main__":
    #### Multiprocessing of articles ####
    documents = []
    
    ### Processes articles one at a time
    # for article in tqdm(metadata):
    #     # process article (extract text, clean, chunk)d
    #     docs = process_article(article)
    #     documents.append(docs)
    
    start_time = time.time()

    ### Processes multiple articles in parallel
    with ProcessPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_article, article) for article in metadata]
    
        for future in as_completed(futures):
            documents.extend(future.result())
        
        end_time = time.time()
        print(f'processed {len(documents)} documents in {end_time - start_time} seconds')
        

processed 534 documents in 2.889578104019165 seconds


# Embedding using OpenAI and Qdrant

### To run the Qdrant docker:

`docker run -p 6333:6333 -p 6334:6334 \
    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
    qdrant/qdrant`

In [4]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_openai.embeddings import OpenAIEmbeddings
import uuid
from data_preprocessing import embed_and_upsert

In [5]:
def batched(iterable, batch_size):
    for i in range(0, len(iterable), batch_size):
        yield iterable[i : i + batch_size]

In [6]:
batched_docs = [batch for batch in batched(documents, 50)]

In [7]:
client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="rehab_collection",
    vectors_config=VectorParams(
        size=3072,
        distance=Distance.COSINE
    )
)

True

In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [10]:
### Multiprocessing for embedding and storage of chunks
if __name__ == "__main__":
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(
                embed_and_upsert, 
                document_batch=batch,
                client=client,
                embed_model=embeddings
            )
            for batch in batched_docs
        ]
        
        try:
            future.result()
        except Exception as e:
            print("batch failed: ", e)

    end_time = time.time()

    print(f'processed {len(documents)} embedded and upserted in {end_time - start_time} seconds')


processed 534 embedded and upserted in 1.7183849811553955 seconds


In [11]:
# checking db
client.count(collection_name='rehab_collection', exact=True)

CountResult(count=534)

In [12]:
# wipe db
client.delete_collection('rehab_collection')

True

# Testing Search
