In [None]:
!pip install qdrant-client sentence-transformers beautifulsoup4 requests numpy




In [None]:
import getpass
import json
import os
import uuid
import xml.etree.ElementTree as ET
from urllib.parse import quote, urljoin

import numpy as np
import qdrant_client
import requests
from bs4 import BeautifulSoup
from qdrant_client.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model used by the encode-and-store functions in this notebook.
# The checkpoint name is kept in a constant so it is easy to spot and change.
BIOBERT_MODEL_NAME = "pritamdeka/S-BioBert-snli-multinli-stsb"
bioBERT_model = SentenceTransformer(BIOBERT_MODEL_NAME)


In [None]:
# --- Qdrant connection settings -------------------------------------------
# The API key is a secret and must never be stored in the notebook itself.
# It is read from the QDRANT_API_KEY environment variable; when that is not
# set, the user is prompted for it (the value is not echoed or saved).
# NOTE: any key that was previously committed in this cell must be treated as
# leaked and rotated in the Qdrant console.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass.getpass("Qdrant API key: ")

# The cluster URL is not a secret, so the previous value is kept as a default
# and can be overridden with the QDRANT_URL environment variable.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://8108fa10-87c0-489a-a138-e5742baa513d.europe-west3-0.gcp.cloud.qdrant.io:6333",
)
client = qdrant_client.QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Name of the collection that every store/create call in this notebook targets.
COLLECTION_NAME = "medical_documents"
# COLLECTION_NAME = "medical_documents2"

# Browser-like request headers sent to sites that reject the default
# python-requests User-Agent.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

def create_collection():
    """Drop the collection named by COLLECTION_NAME and create it again, empty.

    The collection is configured for 768-dimensional vectors compared with
    cosine distance. Any points already stored in it are discarded.
    """
    name = COLLECTION_NAME

    # Remove whatever is there first so every run starts from an empty collection.
    client.delete_collection(collection_name=name)

    # 768 is the embedding width the notebook's BioBERT model is expected to produce.
    vector_settings = VectorParams(size=768, distance=Distance.COSINE)
    client.create_collection(collection_name=name, vectors_config=vector_settings)

    print(f"✅ Collection '{name}' created.")


In [None]:
# Endpoints for the external data sources used by the fetch_*/scrape_* functions.

# PubMed: the search endpoint (fetch_pubmed_articles) and the record-fetch
# endpoint (fetch_pubmed_details).
PUBMED_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
# arXiv query API (fetch_arxiv_papers).
# NOTE(review): this is the only plain-http endpoint in the list — confirm whether https can be used.
ARXIV_BASE_URL = "http://export.arxiv.org/api/query"
# bioRxiv details API; fetch_biorxiv_papers appends its query directly to this prefix.
BIORXIV_BASE_URL = "https://api.biorxiv.org/details/biorxiv/"
# openFDA drug label API (fetch_fda_drug_info).
FDA_BASE_URL = "https://api.fda.gov/drug/label.json"
# WHO guidelines listing page, scraped as HTML (scrape_who_guidelines).
WHO_URL = "https://www.who.int/publications/guidelines"
# RxNav drugs endpoint (fetch_rxnorm_drug_info).
RXNORM_BASE_URL = "https://rxnav.nlm.nih.gov/REST/drugs"
# WHO essential medicines page, scraped as HTML (fetch_who_essential_medicines).
# NOTE(review): this looks like a legacy who.int path — verify that it still resolves.
WHO_MEDICINES_URL = "https://www.who.int/medicines/publications/essentialmedicines/en/"



In [None]:
def fetch_pubmed_articles(query, max_results=100, timeout=30):
    """Search PubMed for ``query`` and return the matching articles.

    Args:
        query: Search term sent to the PubMed search endpoint.
        max_results: Maximum number of article ids to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list produced by ``fetch_pubmed_details`` for the ids found, or an
        empty list when the request fails or the search has no hits.
    """
    params = {"db": "pubmed", "term": query, "retmode": "json", "retmax": max_results}
    try:
        response = requests.get(PUBMED_BASE_URL, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # A network problem for one query must not abort a long ingestion run.
        print(f"⚠️ PubMed request failed: {exc}")
        return []

    if response.status_code != 200:
        print(f"⚠️ PubMed request failed: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        print(f"⚠️ PubMed request failed: {exc}")
        return []

    article_ids = payload.get("esearchresult", {}).get("idlist", [])
    if not article_ids:
        # Nothing matched: skip the detail request instead of sending an empty id list.
        return []
    return fetch_pubmed_details(article_ids)

In [None]:
def fetch_pubmed_details(article_ids, timeout=30):
    """Download title and abstract for the given PubMed article ids.

    Args:
        article_ids: Iterable of PubMed ids (strings or ints).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"title", "abstract", "source"}`` dicts, one per
        ``PubmedArticle`` element in the response. Returns an empty list when
        no ids are given, the request fails, or the XML cannot be parsed.
    """
    ids = [str(article_id) for article_id in article_ids]
    if not ids:
        return []

    params = {"db": "pubmed", "id": ",".join(ids), "retmode": "xml"}
    try:
        response = requests.get(PUBMED_FETCH_URL, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"⚠️ PubMed fetch failed: {exc}")
        return []
    if response.status_code != 200:
        print(f"⚠️ PubMed fetch failed: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"⚠️ PubMed fetch failed: {exc}")
        return []

    def _full_text(node):
        # `.text` stops at the first child tag (e.g. <i>, <sub>); itertext()
        # collects the text of the element and all of its descendants.
        return "".join(node.itertext()).strip() if node is not None else ""

    articles = []
    for article in root.findall(".//PubmedArticle"):
        title = _full_text(article.find(".//ArticleTitle"))
        # An abstract may be split into several AbstractText sections; keep them all.
        sections = [_full_text(node) for node in article.findall(".//AbstractText")]
        abstract_text = " ".join(section for section in sections if section)
        articles.append({"title": title, "abstract": abstract_text, "source": "PubMed"})
    return articles

In [None]:
def scrape_who_guidelines(timeout=30):
    """Scrape guideline links from the WHO guidelines listing page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"title", "abstract", "source", "url"}`` dicts, one per
        ``<a class="sf-list-vertical__item">`` element that has an ``href``.
        ``abstract`` is always empty. Returns an empty list when the request fails.
    """
    try:
        response = requests.get(WHO_URL, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"⚠️ WHO request failed: {exc}")
        return []
    if response.status_code != 200:
        print("⚠️ WHO request failed.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    guidelines = []
    for link in soup.find_all("a", class_="sf-list-vertical__item"):
        href = link.get("href")
        if not href:
            # Anchors without a target cannot be turned into a usable URL.
            continue
        guidelines.append({
            "title": link.text.strip(),
            "abstract": "",
            "source": "WHO",
            # urljoin resolves relative links against the page URL and leaves
            # absolute links untouched.
            "url": urljoin(WHO_URL, href),
        })

    return guidelines

In [None]:
def fetch_arxiv_papers(query, max_results=100, timeout=30):
    """Query the arXiv API and return the matching papers.

    Args:
        query: Value sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"title", "abstract", "source"}`` dicts, one per Atom
        ``entry`` in the response. Runs of whitespace (including the line
        breaks inside wrapped titles and summaries) are collapsed to single
        spaces. Returns an empty list when the request fails or the response
        is not valid XML.
    """
    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
    params = {"search_query": query, "start": 0, "max_results": max_results}

    try:
        response = requests.get(ARXIV_BASE_URL, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # One failed query must not abort the surrounding ingestion loop.
        print(f"⚠️ ArXiv request failed: {exc}")
        return []

    if response.status_code != 200:
        print(f"⚠️ ArXiv request failed: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"⚠️ ArXiv request failed: {exc}")
        return []

    def _clean(entry, tag):
        # findtext tolerates a missing element; split/join normalises whitespace.
        return " ".join(entry.findtext(tag, default="", namespaces=atom_ns).split())

    papers = []
    for entry in root.findall("atom:entry", atom_ns):
        papers.append({
            "title": _clean(entry, "atom:title"),
            "abstract": _clean(entry, "atom:summary"),
            "source": "ArXiv",
        })

    return papers

In [None]:
def fetch_biorxiv_papers(query, max_results=10, timeout=30):
    """Fetch papers from the bioRxiv details API.

    Browser-like HEADERS are sent to prevent 403 responses.

    NOTE(review): ``query`` is appended to the ``details/biorxiv/`` path as-is;
    confirm against the bioRxiv API documentation that this endpoint accepts a
    free-text term there.

    Args:
        query: Value appended (URL-escaped) to BIORXIV_BASE_URL.
        max_results: Maximum number of items taken from the response.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with ``title``, ``abstract``, ``source``, ``doi``,
        ``date`` and ``authors``; missing fields become empty strings. Returns
        an empty list on any request/decoding error or when the response has
        no ``collection``.
    """
    # Escape everything (including "/") so the term cannot alter the URL path.
    url = f"{BIORXIV_BASE_URL}{quote(str(query), safe='')}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # Raise error for HTTP issues
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"⚠️ Error fetching bioRxiv data: {e}")
        return []

    collection = data.get("collection") if isinstance(data, dict) else None
    if not collection:
        return []

    return [
        {
            "title": item.get("title", ""),
            "abstract": item.get("abstract", ""),
            "source": "bioRxiv",
            "doi": item.get("doi", ""),
            "date": item.get("date", ""),
            "authors": item.get("authors", ""),
        }
        for item in collection[:max_results]
    ]

In [None]:
def fetch_fda_drug_info(limit=10, timeout=30):
    """Fetch drug label records from the openFDA drug label API.

    Args:
        limit: Number of label records to request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"title", "abstract", "source"}`` dicts. ``title`` is the
        brand name(s) joined with ", " (``"Unknown"`` when absent) and
        ``abstract`` is the description text (empty when absent). Returns an
        empty list on any request or decoding error.
    """
    def _as_text(value, separator, default):
        # openFDA label fields are delivered as lists of strings; flatten them
        # so downstream code embeds real text instead of a Python list repr.
        if isinstance(value, (list, tuple)):
            value = separator.join(str(part) for part in value if part)
        elif value is not None:
            value = str(value)
        return value or default

    try:
        response = requests.get(FDA_BASE_URL, params={"limit": limit}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"⚠️ Error fetching FDA data: {e}")
        return []

    return [
        {
            "title": _as_text((item.get("openfda") or {}).get("brand_name"), ", ", "Unknown"),
            "abstract": _as_text(item.get("description"), " ", ""),
            "source": "FDA",
        }
        for item in data.get("results", [])
    ]

In [None]:
# ✅ RxNorm Drug Retrieval
def fetch_rxnorm_drug_info(disease, timeout=30):
    """Look up drug concepts in RxNorm for the given name.

    NOTE(review): callers pass a disease name, but the value is sent as the
    ``name`` parameter of the RxNav ``drugs`` endpoint — confirm that this
    endpoint is meant to be queried with disease names rather than drug names.

    Args:
        disease: Value sent as the ``name`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with ``title`` (concept name), ``abstract``
        (``"RxNorm ID: <rxcui>"``), ``source``, ``url`` and ``category``.
        Concepts without a name are skipped. Returns an empty list when the
        request fails or the response is not valid XML.
    """
    try:
        # Let requests build the query string so spaces, "&", "(" etc. are escaped.
        response = requests.get(RXNORM_BASE_URL, params={"name": disease}, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"⚠️ RxNorm request failed: {exc}")
        return []
    if response.status_code != 200:
        print(f"⚠️ RxNorm request failed: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"⚠️ RxNorm request failed: {exc}")
        return []

    drugs = []
    for concept in root.findall(".//conceptGroup/conceptProperties"):
        name = concept.findtext("name", default="")
        if not name:
            continue
        drugs.append({
            "title": name,
            "abstract": f"RxNorm ID: {concept.findtext('rxcui', default='')}",
            "source": "RxNorm", "url": "https://rxnav.nlm.nih.gov",
            "category": "Medicine"
        })

    return drugs

In [None]:
# ✅ WHO Essential Medicines Retrieval
def fetch_who_essential_medicines(timeout=30):
    """Scrape links mentioning "essentialmedicines" from the WHO medicines page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with ``title`` (link text), ``abstract``, ``source``,
        ``url`` (absolute) and ``category``, one per anchor whose ``href``
        contains ``"essentialmedicines"``. Returns an empty list when the
        request fails.
    """
    try:
        # Same browser-like headers as the other WHO scraper in this notebook.
        response = requests.get(WHO_MEDICINES_URL, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"⚠️ WHO request failed: {exc}")
        return []
    if response.status_code != 200:
        print(f"⚠️ WHO request failed: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    medicines = []

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if "essentialmedicines" not in href:
            continue
        medicines.append({
            "title": link.text.strip(), "abstract": "WHO essential medicine",
            # urljoin keeps absolute links intact instead of prefixing the
            # host a second time, and resolves relative ones against the page.
            "source": "WHO", "url": urljoin(WHO_MEDICINES_URL, href),
            "category": "Medicine"
        })

    return medicines

In [None]:
def save_data(diseases=None):
    """Fetch arXiv papers for each disease and store them in Qdrant in batches.

    Only arXiv results are ingested. Earlier revisions also called the FDA,
    bioRxiv, WHO and RxNorm fetchers inside the loop but never used their
    results, which cost several extra HTTP requests per disease (some of them
    not even disease-specific). Those calls were removed; to ingest another
    source, extend ``all_data`` with its results explicitly.

    Args:
        diseases: Optional iterable of search terms. Defaults to the built-in
            list of conditions below.
    """
    if diseases is None:
        diseases = [
            "diabetes", "cancer", "Alzheimer's", "hypertension", "stroke", "kidney disease", "Common Cold", "Pneumonia",
            "Tuberculosis (TB)", "COVID-19", "Migraine", "HIV", "AIDS", "Malaria", "Parkinson’s Disease", "Heart Disease",
            "Asthma", "Depression", "Anxiety", "Schizophrenia", "Bipolar Disorder", "Osteoporosis", "Arthritis",
            "Dementia", "Multiple Sclerosis", "Epilepsy", "Chronic Obstructive Pulmonary Disease (COPD)", "Chronic Kidney Disease (CKD)",
            "Liver Disease", "Thyroid Disease", "Obesity", "Glaucoma", "Macular Degeneration", "Hearing Loss",
            "Amyotrophic Lateral Sclerosis (ALS)", "Huntington's Disease", "Cerebral Palsy", "Autism Spectrum Disorder",
            "Down Syndrome", "Spina Bifida", "Cystic Fibrosis", "Sickle Cell Anemia", "Hemophilia", "Rheumatoid Arthritis",
            "Lupus", "Crohn's Disease", "Ulcerative Colitis", "Irritable Bowel Syndrome (IBS)", "Eczema", "Psoriasis",
            "Sleep Apnea", "Gout", "Endometriosis", "Polycystic Ovary Syndrome (PCOS)", "Cataracts", "Vertigo",
            "Anemia", "Gallstones", "Kidney Stones", "Appendicitis", "Hepatitis", "Meningitis", "Encephalitis",
            "Sepsis", "Preeclampsia", "Erectile Dysfunction", "Infertility", "Benign Prostatic Hyperplasia (BPH)",
            "Urinary Tract Infection (UTI)", "Gastroesophageal Reflux Disease (GERD)"
        ]

    all_data = []

    for disease in diseases:
        print(f"🔍 Fetching data for: {disease}")

        # ✅ Fetch data from sources (limit to 50 results per query)
        all_data.extend(fetch_arxiv_papers(disease, max_results=50))

    # ✅ Store data in batches
    encode_and_store_in_batches(all_data, batch_size=250)
    print("✅ Data ingestion complete!")

# ✅ Run pipeline
if __name__ == "__main__":
    # save_data() hands its results to encode_and_store_in_batches(), which is
    # defined in a later cell. Check for it up front so that a fresh
    # "Restart & Run All" fails immediately, instead of first deleting the
    # collection and downloading every source only to end in a NameError.
    if "encode_and_store_in_batches" not in globals():
        raise RuntimeError(
            "encode_and_store_in_batches is not defined yet; run the cell that "
            "defines it before running the ingestion pipeline."
        )
    create_collection()
    save_data()

✅ Collection 'medical_documents' created.
🔍 Fetching data for: diabetes
🔍 Fetching data for: cancer
🔍 Fetching data for: Alzheimer's
🔍 Fetching data for: hypertension
🔍 Fetching data for: stroke
🔍 Fetching data for: kidney disease
🔍 Fetching data for: Common Cold
🔍 Fetching data for: Pneumonia
🔍 Fetching data for: Tuberculosis (TB)
🔍 Fetching data for: COVID-19
🔍 Fetching data for: Migraine
🔍 Fetching data for: HIV
🔍 Fetching data for: AIDS
🔍 Fetching data for: Malaria
🔍 Fetching data for: Parkinson’s Disease
🔍 Fetching data for: Heart Disease
🔍 Fetching data for: Asthma
🔍 Fetching data for: Depression
🔍 Fetching data for: Anxiety
🔍 Fetching data for: Schizophrenia
🔍 Fetching data for: Bipolar Disorder
🔍 Fetching data for: Osteoporosis
🔍 Fetching data for: Arthritis
🔍 Fetching data for: Dementia
🔍 Fetching data for: Multiple Sclerosis
🔍 Fetching data for: Epilepsy
🔍 Fetching data for: Chronic Obstructive Pulmonary Disease (COPD)
🔍 Fetching data for: Chronic Kidney Disease (CKD)
🔍 Fetch

In [None]:
import numpy as np
import uuid
import time

In [None]:
def encode_and_store_in_batches(documents, batch_size=250):
    """Embed documents and upsert them into Qdrant in batches.

    Each document's title and abstract are joined with a space, encoded with
    ``bioBERT_model`` and stored under a freshly generated UUID with the whole
    document dict as payload. Batches go to ``COLLECTION_NAME`` so this
    function always writes to the collection ``create_collection`` prepared.

    Args:
        documents: Sequence of dicts; ``title`` and ``abstract`` are read from
            each one. Missing or ``None`` values are treated as empty text.
        batch_size: Number of documents per upsert call. Must be positive.

    Raises:
        ValueError: If ``batch_size`` is not a positive number.

    A failed upsert is reported and skipped so that one bad batch does not
    abort the rest of the run.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    total = len(documents)
    num_batches = (total + batch_size - 1) // batch_size  # ceiling division

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        batch_data = documents[start:start + batch_size]
        points = []

        for doc in batch_data:
            # Skip empty parts so a missing abstract is not embedded as "None".
            parts = (doc.get("title"), doc.get("abstract"))
            text = " ".join(str(part) for part in parts if part)
            vector = bioBERT_model.encode(text).tolist()
            unique_id = str(uuid.uuid4())  # Generate unique UUID

            points.append(PointStruct(id=unique_id, vector=vector, payload=doc))

        try:
            client.upsert(collection_name=COLLECTION_NAME, points=points)  # Store batch
            print(f"✅ Stored batch {batch_no}/{num_batches} ({len(batch_data)} records)")
        except Exception as e:
            print(f"⚠️ Error storing batch {batch_no}: {e}")
        time.sleep(1)  # Prevent API rate limiting

In [None]:
import pandas
!pip install datasets --quiet


In [None]:
from datasets import load_dataset
import pandas as pd

# Download the MedRAG textbooks corpus from the Hugging Face hub.
dataset = load_dataset("MedRAG/textbooks")

# Keep only the first 30,000 rows of the train split as a DataFrame
# (change the split name here if a different one is needed).
df = pd.DataFrame(dataset["train"]).iloc[:30000]

# Remove the 'contents' column when the dataset provides one.
df = df.drop(columns=["contents"], errors="ignore")

print("📊 Dataset Preview:\n", df.head())

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

📊 Dataset Preview:
                id         title  \
0  Anatomy_Gray_0  Anatomy_Gray   
1  Anatomy_Gray_1  Anatomy_Gray   
2  Anatomy_Gray_2  Anatomy_Gray   
3  Anatomy_Gray_3  Anatomy_Gray   
4  Anatomy_Gray_4  Anatomy_Gray   

                                             content  
0  What is anatomy? Anatomy includes those struct...  
1  Observation and visualization are the primary ...  
2  How can gross anatomy be studied? The term ana...  
3  This includes the vasculature, the nerves, the...  
4  Each of these approaches has benefits and defi...  


In [None]:
def preprocess_data(df):
    """Return a copy of ``df`` in the id/title/abstract/source layout.

    ``content`` is renamed to ``abstract``, ``source`` is a copy of ``title``,
    and ``id`` is replaced by the row index rendered as a string. The input
    frame is left unchanged; only the four listed columns are returned.
    """
    renamed = df.rename(columns={"content": "abstract"})
    enriched = renamed.assign(
        source=renamed["title"],           # the textbook title doubles as the source
        id=renamed.index.astype(str),      # string ids derived from the row index
    )
    return enriched[["id", "title", "abstract", "source"]]

# Reshape the loaded textbook frame into the id/title/abstract/source layout.
df_cleaned = preprocess_data(df)

In [2]:
def encode_and_store(df, batch_size=500):
    """Embed every row of ``df`` with BioBERT and upsert the rows into Qdrant.

    Rows are processed ``batch_size`` at a time. For each row the title and
    abstract are joined with a space and encoded; the point is stored under a
    new random UUID with the full row as payload, in ``COLLECTION_NAME``.
    """
    def build_point(record):
        # One Qdrant point per row: embedding first, then a fresh id.
        embedding = bioBERT_model.encode(f"{record['title']} {record['abstract']}").tolist()
        point_id = str(uuid.uuid4())
        return PointStruct(id=point_id, vector=embedding, payload=record.to_dict())

    total_batches = int(np.ceil(len(df) / batch_size))

    for batch_index in range(total_batches):
        start = batch_index * batch_size
        chunk = df[start: start + batch_size]
        batch_points = [build_point(record) for _, record in chunk.iterrows()]

        client.upsert(collection_name=COLLECTION_NAME, points=batch_points)
        print(f"✅ Stored batch {batch_index+1}/{total_batches} ({len(chunk)} records)")


In [3]:
# Needs `df_cleaned` from the preprocessing cell. The NameError recorded as this
# cell's output shows it was last run on a kernel where that cell had not been
# executed — run the notebook top to bottom before this step.
encode_and_store(df_cleaned, batch_size=500)

NameError: name 'df_cleaned' is not defined