In [5]:
!pip install qdrant-client sentence-transformers beautifulsoup4 requests numpy




In [6]:
import json
import os
import uuid
import xml.etree.ElementTree as ET
from getpass import getpass
from urllib.parse import urljoin

import numpy as np
import qdrant_client
import requests
from bs4 import BeautifulSoup
from qdrant_client.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer

In [7]:
# Sentence-embedding model, loaded once and reused by encode_and_store() for every document.
# NOTE(review): create_collection() hard-codes a vector size of 768 — confirm this model
# emits 768-dimensional embeddings before swapping in a different checkpoint.
bioBERT_model = SentenceTransformer("pritamdeka/S-BioBert-snli-multinli-stsb")


In [68]:
# --- Qdrant connection settings -------------------------------------------------
# SECURITY: the API key must never be committed with the notebook. It is read from
# the QDRANT_API_KEY environment variable, falling back to an interactive prompt.
# Any key that was previously stored in this cell should be considered leaked and
# rotated in the Qdrant Cloud console.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")

# The cluster URL is not a secret; it can be overridden through QDRANT_URL.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://8108fa10-87c0-489a-a138-e5742baa513d.europe-west3-0.gcp.cloud.qdrant.io:6333",
)
client = qdrant_client.QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Name of the collection that create_collection() resets and encode_and_store() writes to.
COLLECTION_NAME = "medical_documents2"

# Browser-like request headers used by the scraping helpers that pass headers=HEADERS.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

def create_collection():
    """Drop and re-create the Qdrant collection named by COLLECTION_NAME.

    Any existing collection with that name is deleted first, so every run of the
    pipeline starts from an empty collection. The new collection stores
    768-dimensional vectors compared with cosine distance.
    """
    # Start from a clean slate: remove whatever was stored under this name before.
    client.delete_collection(collection_name=COLLECTION_NAME)

    # 768 is the embedding width assumed for the BioBERT sentence model.
    vector_settings = VectorParams(size=768, distance=Distance.COSINE)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_settings,
    )

    print(f"✅ Collection '{COLLECTION_NAME}' created.")


In [69]:
# Endpoints of the external data sources queried by the fetch_* / scrape_* helpers.
PUBMED_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
ARXIV_BASE_URL = "http://export.arxiv.org/api/query"
FDA_BASE_URL = "https://api.fda.gov/drug/label.json"

# fetch_biorxiv_papers() and scrape_who_guidelines() read these two names, so they
# must stay defined even while save_data() does not call those helpers; otherwise
# calling either helper raises NameError.
BIORXIV_BASE_URL = "https://api.biorxiv.org/details/biorxiv/"
WHO_URL = "https://www.who.int/publications/guidelines"

In [70]:
def fetch_pubmed_articles(query, max_results=10, timeout=30):
    """Search PubMed for `query` and return details of the matching articles.

    Args:
        query: Search term sent to the PubMed esearch endpoint.
        max_results: Maximum number of article ids to request.
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        Whatever fetch_pubmed_details() returns for the matching ids, or an empty
        list when the search request fails or its response cannot be decoded.
    """
    params = {"db": "pubmed", "term": query, "retmode": "json", "retmax": max_results}

    # Without a timeout a stalled connection would hang the whole notebook run.
    try:
        response = requests.get(PUBMED_BASE_URL, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ PubMed request failed: {e}")
        return []

    if response.status_code != 200:
        print(f"⚠️ PubMed request failed: {response.status_code}")
        return []

    try:
        article_ids = response.json().get("esearchresult", {}).get("idlist", [])
    except ValueError as e:  # body was not valid JSON
        print(f"⚠️ PubMed returned an unreadable response: {e}")
        return []

    return fetch_pubmed_details(article_ids)

In [71]:
def _element_text(element):
    """Return all text inside an XML element (including nested inline markup), or ''."""
    if element is None:
        return ""
    return "".join(element.itertext()).strip()


def fetch_pubmed_details(article_ids, timeout=30):
    """Fetch title and abstract for the given PubMed article ids.

    Args:
        article_ids: Iterable of PubMed id strings; an empty value short-circuits.
        timeout: Seconds to wait for the efetch request before giving up.

    Returns:
        A list of dicts with the keys "title", "abstract" and "source" (always
        "PubMed"). Returns an empty list when there are no ids, the request
        fails, or the response is not well-formed XML.
    """
    if not article_ids:
        return []

    params = {"db": "pubmed", "id": ",".join(article_ids), "retmode": "xml"}
    try:
        response = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except (requests.exceptions.RequestException, ET.ParseError) as e:
        print(f"⚠️ PubMed fetch failed: {e}")
        return []

    articles = []
    for article in root.findall(".//PubmedArticle"):
        # Titles may contain inline tags such as <i>; `.text` alone would cut the
        # title at the first tag (or be None), so gather all nested text instead.
        title = _element_text(article.find(".//ArticleTitle"))

        # Structured abstracts are split into several <AbstractText> sections;
        # keep all of them rather than only the first one.
        sections = article.findall(".//Abstract/AbstractText") or article.findall(".//AbstractText")
        section_texts = [_element_text(section) for section in sections]
        abstract_text = " ".join(text for text in section_texts if text)

        articles.append({"title": title, "abstract": abstract_text, "source": "PubMed"})

    return articles

In [72]:
def scrape_who_guidelines(timeout=30):
    """Scrape guideline titles and links from the WHO publications page.

    Args:
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A list of dicts with the keys "title", "abstract" (always empty),
        "source" (always "WHO") and "url" (absolute). Returns an empty list when
        the page cannot be fetched.
    """
    try:
        response = requests.get(WHO_URL, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ WHO request failed: {e}")
        return []

    if response.status_code != 200:
        print("⚠️ WHO request failed.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    guidelines = []
    for link in soup.find_all("a", class_="sf-list-vertical__item"):
        href = link.get("href")
        if not href:
            # Anchors without a target cannot be stored as a document URL.
            continue
        guidelines.append({
            "title": link.text.strip(),
            "abstract": "",
            "source": "WHO",
            # urljoin leaves absolute links untouched and resolves both
            # root-relative and page-relative links against the page URL.
            "url": urljoin(WHO_URL, href),
        })

    return guidelines

In [73]:
def fetch_arxiv_papers(query, max_results=10, timeout=30):
    """Fetch papers matching `query` from the arXiv Atom API.

    Args:
        query: Value sent as the API's `search_query` parameter.
        max_results: Maximum number of entries to request.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        A list of dicts with the keys "title", "abstract" and "source" (always
        "ArXiv"). Returns an empty list when the request fails or the response
        is not well-formed XML.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    params = {"search_query": query, "start": 0, "max_results": max_results}

    try:
        response = requests.get(ARXIV_BASE_URL, params=params, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ ArXiv request failed: {e}")
        return []

    if response.status_code != 200:
        print(f"⚠️ ArXiv request failed: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"⚠️ ArXiv returned malformed XML: {e}")
        return []

    papers = []
    for entry in root.findall(f"{atom}entry"):
        # findtext() yields "" instead of raising when an element is absent or empty.
        title = entry.findtext(f"{atom}title", default="")
        summary = entry.findtext(f"{atom}summary", default="")
        papers.append({"title": title, "abstract": summary, "source": "ArXiv"})

    return papers

In [74]:
def fetch_biorxiv_papers(query, timeout=30):
    """Fetch up to ten papers from the bioRxiv details endpoint.

    The request sends HEADERS (a browser-like User-Agent) to avoid 403 responses.

    Args:
        query: Path segment appended to BIORXIV_BASE_URL.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        A list of dicts with the keys "title", "abstract", "source" (always
        "bioRxiv"), "doi", "date" and "authors"; missing fields become "".
        Returns an empty list when the request fails, the body is not JSON, or
        the response has no "collection" entry.
    """
    try:
        response = requests.get(f"{BIORXIV_BASE_URL}{query}", headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # Raise error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on Requests versions whose decode
        # error is not a RequestException subclass.
        print(f"⚠️ Error fetching bioRxiv data: {e}")
        return []

    if not isinstance(data, dict) or "collection" not in data:
        return []

    # Use .get() for every field so one incomplete record cannot abort the batch.
    return [
        {
            "title": item.get("title", ""),
            "abstract": item.get("abstract", ""),
            "source": "bioRxiv",
            "doi": item.get("doi", ""),
            "date": item.get("date", ""),
            "authors": item.get("authors", ""),
        }
        for item in data["collection"][:10]
    ]

In [75]:
def _fda_text(value, separator, default=""):
    """Flatten an openFDA field (usually a list of strings) into one string."""
    if value is None or value == []:
        return default
    if isinstance(value, (list, tuple)):
        return separator.join(str(part) for part in value)
    return str(value)


def fetch_fda_drug_info(limit=10, timeout=30):
    """Fetch drug label records from the openFDA drug label API.

    Args:
        limit: Number of label records to request.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        A list of dicts with the keys "title" (brand name(s), or "Unknown"),
        "abstract" (label description, or "") and "source" (always "FDA").
        Both text fields are plain strings. Returns an empty list on any error.
    """
    try:
        params = {"limit": limit}
        response = requests.get(FDA_BASE_URL, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return [
            {
                # openFDA delivers these fields as arrays of strings; join them so
                # downstream code embeds real text instead of a list's repr.
                "title": _fda_text(item.get("openfda", {}).get("brand_name"), ", ", default="Unknown"),
                "abstract": _fda_text(item.get("description"), " "),
                "source": "FDA",
            }
            for item in data.get("results", [])
        ]
    except Exception as e:  # deliberate best-effort: never let FDA break the pipeline
        print(f"⚠️ Error fetching FDA data: {e}")
        return []

In [76]:
def encode_and_store(documents):
    """Embed documents with the BioBERT sentence model and upsert them into Qdrant.

    Args:
        documents: List of dicts that may carry "title", "abstract", "source"
            and "url". Missing keys are replaced by placeholder strings in the
            stored payload.

    Each document is embedded from its title and abstract and stored under a
    fresh random UUID, which is also written to the payload as "id". Nothing is
    sent to Qdrant when `documents` is empty.
    """
    if not documents:
        print("⚠️ No documents to store.")
        return

    # Treat None like a missing value so the literal text "None" is never embedded.
    texts = [
        f"{doc.get('title') or ''} {doc.get('abstract') or ''}".strip()
        for doc in documents
    ]

    # Encode all texts in one call so the model can batch them.
    vectors = bioBERT_model.encode(texts)

    points = []
    for doc, vector in zip(documents, vectors):
        point_id = str(uuid.uuid4())  # Unique identifier
        payload = {
            "id": point_id,
            "title": doc.get("title", "Unknown Title"),
            "abstract": doc.get("abstract", "No Abstract Available"),
            "source": doc.get("source", "Unknown Source"),
            "url": doc.get("url", "No URL Available")
        }
        points.append(PointStruct(id=point_id, vector=vector.tolist(), payload=payload))

    # Store in Qdrant
    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print("✅ Documents stored in Qdrant")


In [77]:
def save_data(diseases=None):
    """Fetch documents for each disease, embed them, and store them in Qdrant.

    Args:
        diseases: Optional list of search terms. Defaults to the built-in list
            of six conditions.

    PubMed and arXiv are queried once per disease. The FDA label fetch takes no
    search term, so it is performed once for the whole run rather than once per
    disease (which would store the same records repeatedly). bioRxiv and WHO are
    not part of the pipeline.
    """
    if diseases is None:
        diseases = ["diabetes", "cancer", "Alzheimer's", "hypertension", "stroke", "kidney disease"]

    all_data = []
    for disease in diseases:
        print(f"🔍 Fetching data for: {disease}")
        all_data.extend(fetch_pubmed_articles(disease, max_results=10))
        all_data.extend(fetch_arxiv_papers(disease, max_results=10))

    # Not disease-specific: fetch a single time to avoid duplicate points.
    all_data.extend(fetch_fda_drug_info())

    encode_and_store(all_data)
    print("✅ Data ingestion complete!")

# Run the full pipeline: reset the collection, then fetch, embed, and store documents.
# Note: create_collection() deletes any existing collection named COLLECTION_NAME
# before re-creating it, so every run replaces previously stored points.
if __name__ == "__main__":
    create_collection()
    save_data()

✅ Collection 'medical_documents2' created.
🔍 Fetching data for: diabetes
🔍 Fetching data for: cancer
🔍 Fetching data for: Alzheimer's
🔍 Fetching data for: hypertension
🔍 Fetching data for: stroke
🔍 Fetching data for: kidney disease
✅ Documents stored in Qdrant
✅ Data ingestion complete!
