In [15]:
# === 1. IMPORTS ===
from PyPDF2 import PdfReader
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
import ollama

# === 2. LOAD & CHUNK PDF ===
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return its text as one normalised string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        str: The text of all pages in page order, with every run of
        whitespace (newlines included) collapsed to a single space and
        leading/trailing whitespace stripped. Empty string when no page
        yields any text.
    """
    reader = PdfReader(pdf_path)
    # `or ""` keeps a page whose extraction yields None (or an empty result)
    # from raising TypeError when the page texts are combined.
    page_texts = [(page.extract_text() or "") for page in reader.pages]
    # Join once instead of growing a string page by page; the newline
    # separator is folded into a single space by the regex below.
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()

def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping windows of whitespace-separated words.

    Consecutive windows start ``chunk_size - overlap`` words apart, so each
    window repeats the last ``overlap`` words of the previous one.

    Args:
        text: The string to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        list[str]: The chunks in order, each with its words re-joined by a
        single space. Empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if
            ``overlap >= chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A non-positive step would make the windowing loop run forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this one.
        if start + chunk_size >= len(words):
            break
    return chunks

# Load and chunk PDF
# The path is relative, so the PDF must be in the kernel's working directory.
pdf_path = "9241544228_eng.pdf"
full_text = extract_text_from_pdf(pdf_path)
# Uses chunk_text's defaults (300-word windows, 50-word overlap).
chunks = chunk_text(full_text)

# === 3. EMBEDDING + FAISS SETUP ===
# `embed_model` is kept as a global because retrieve_context() reuses it to
# embed incoming queries.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding per chunk, cast to float32 before being handed to FAISS.
# NOTE(review): assumes encode() returns a 2-D array (rows = chunks) - confirm.
embeddings = embed_model.encode(chunks, show_progress_bar=True).astype("float32")

# Store in FAISS index
# The index is sized from the second dimension of the embedding matrix; row i
# of the index corresponds to chunks[i] because both are built in the same order.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Optional: save if you want to reuse later (index and chunk list must be
# saved together so row ids keep pointing at the right text).
# faiss.write_index(index, "index.faiss")
# with open("chunks.pkl", "wb") as f: pickle.dump(chunks, f)

# === 4. RAG FUNCTIONS ===
def retrieve_context(query, k=5):
    """Return the text chunks whose embeddings are closest to the query.

    Embeds ``query`` with the module-level ``embed_model``, searches the
    module-level FAISS ``index`` and maps the returned row ids back to the
    module-level ``chunks`` list.

    Args:
        query: The question or search string to embed.
        k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``k`` chunks, in the order returned by the index
        search. Fewer than ``k`` when the index holds fewer vectors.
    """
    query_embedding = embed_model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, k)
    # FAISS pads the id row with -1 when it cannot find k neighbours; without
    # this filter chunks[-1] would silently return the last chunk as a hit.
    return [chunks[i] for i in indices[0] if i >= 0]

def build_prompt(context_chunks, question):
    """Assemble the text prompt sent to the language model.

    Args:
        context_chunks: Iterable of strings; they are placed under the
            ``Context:`` heading, one per line.
        question: The user's question, placed after ``Question: ``.

    Returns:
        str: A role preamble, the context block, the question, and a
        trailing ``Answer:`` cue, separated by blank lines.
    """
    joined_context = "\n".join(context_chunks)
    preamble = "You are a helpful clinical assistant.\n\n"
    context_section = f"Context:\n{joined_context}\n\n"
    question_section = f"Question: {question}\n\nAnswer:"
    return preamble + context_section + question_section

def query_ollama(prompt, model="mistral"):
    """Send a prompt to an Ollama chat model and return the reply text.

    The request consists of a fixed system message followed by ``prompt``
    as the user message.

    Args:
        prompt: Full user-message content.
        model: Name of the model passed to ``ollama.chat``.

    Returns:
        The ``['message']['content']`` field of the chat response.
    """
    system_message = {"role": "system", "content": "You are a helpful clinical assistant."}
    user_message = {"role": "user", "content": prompt}
    reply = ollama.chat(model=model, messages=[system_message, user_message])
    return reply['message']['content']

# === 5. EXAMPLE QA ===
# End-to-end run of the pipeline: retrieve chunks -> build prompt -> ask the model.
question = "Give me the correct coded classification for the following diagnosis: ‘Recurrent depressive disorder, currently in remission’"

# Fetch up to 5 chunks for the question and wrap them into a single prompt.
top_chunks = retrieve_context(question, k=5)
prompt = build_prompt(top_chunks, question)
answer = query_ollama(prompt, model="mistral")  # Or llama2/gemma/etc.

# The answer is model-generated text; nothing here checks it against the PDF.
print("💡 FINAL ANSWER:\n")
print(answer)


Batches: 100%|██████████| 12/12 [00:37<00:00,  3.09s/it]


💡 FINAL ANSWER:

 The correct coded classification for 'Recurrent depressive disorder, currently in remission' is F33.4 according to ICD-10.


In [16]:
# Second example question. Relies on retrieve_context, build_prompt and
# query_ollama (and the index they use) having been defined by the previous cell.
question2 = "What are the diagnostic criteria for Obsessive-Compulsive Disorder (OCD)?"
top_chunks2 = retrieve_context(question2, k=5)
prompt2 = build_prompt(top_chunks2, question2)
answer2 = query_ollama(prompt2, model="mistral")

# The answer is model-generated text; nothing here checks it against the PDF.
print("💡 Answer (OCD Criteria):\n")
print(answer2)


💡 Answer (OCD Criteria):

 The diagnostic criteria for Obsessive-Compulsive Disorder (OCD) include the following:
1. Obsessions or compulsions, or both, must be present on most days for at least 2 successive weeks.
2. The obsessions or compulsions must cause distress or interfere with activities.
3. The obsessions must be recognized as the individual's own thoughts or impulses.
4. There must be at least one thought or act that is still resisted unsuccessfully, even though others may be present which the sufferer no longer resists.
5. The thought of carrying out the act must not in itself be pleasurable (simple relief of tension or anxiety is not regarded as pleasure in this sense).
6. The thoughts, images, or impulses must be unpleasant and repetitive.
7. For a definite diagnosis, the symptoms should persist for at least 2 successive weeks without interruption.
