In [1]:
# Show the kernel's current working directory before changing it.
%pwd


'/Users/brejesh/portfolio/Developer/healthcare-agent-system/notebooks'

In [2]:
import os

# Step up from the notebooks/ folder to its parent so the rest of the notebook
# runs from the project root. Guarded on the folder name so that re-running this
# cell does not climb one more directory level each time.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [3]:
# Confirm the working directory after the os.chdir call.
%pwd

'/Users/brejesh/portfolio/Developer/healthcare-agent-system'

In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
#Extract data from the PDF
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to DirectoryLoader; files are selected
            with the glob pattern ``*.pdf`` and read with PyPDFLoader.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Build the policy folder path from the current working directory (the project
# root after the os.chdir cell) instead of hardcoding one machine's home path.
# The result is still an absolute path.
extracted_data = load_pdf(data=os.path.join(os.getcwd(), "data", "raw_policies"))

In [8]:
# extracted_data

In [9]:
#Create text chunks
def text_split(extracted_data, chunk_size=1000, chunk_overlap=300):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, e.g. the value returned by load_pdf.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``. Defaults to 1000, the value previously hard-coded.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``. Defaults to 300, the value previously hard-coded.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [10]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print("length of my chunk:", chunk_count)

length of my chunk: 24927


In [12]:
# text_chunks 

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [14]:
#download embedding model
def download_hugging_face_embeddings():
    """Create the HuggingFaceEmbeddings object used by this notebook.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        'sentence-transformers/all-MiniLM-L6-v2' model name.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_name)

In [15]:
# Reuse the helper defined in the previous cell so the model name is declared
# in exactly one place instead of being repeated here.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [16]:
# Display the embeddings object's repr to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [17]:
# Embed a short probe string and report the length of the resulting vector.
probe_text = "Hello world"
query_result = embeddings.embed_query(probe_text)
print("Length", len(query_result))

Length 384


In [19]:
# query_result

In [20]:
import os
from dotenv import load_dotenv, find_dotenv

# 1. Try to find .env file automatically (walks up directories)
dotenv_path = find_dotenv()

if dotenv_path == "":
    print("❌ Error: Still cannot find .env file anywhere!")
else:
    print(f"✅ Found .env file at: {dotenv_path}")
    load_dotenv(dotenv_path)

# 2. Read the key and clean it (remove spaces/quotes) BEFORE validating, so a
#    value that is only quotes or whitespace is not reported as a success.
api_key = os.getenv("PINECONE_API_KEY")
clean_key = (api_key or "").strip().replace('"', '').replace("'", "")

# 3. Report the outcome. No part of the key is printed, because cell output is
#    saved with the notebook.
if clean_key:
    os.environ["PINECONE_API_KEY"] = clean_key
    print("✅ Success! PINECONE_API_KEY loaded.")
elif dotenv_path == "":
    # No file was found above, so do not claim that one was.
    print("❌ 'PINECONE_API_KEY' is not set and no .env file was found.")
else:
    print("❌ File found, but 'PINECONE_API_KEY' is missing inside it.")

✅ Found .env file at: /Users/brejesh/portfolio/Developer/healthcare-agent-system/.env
✅ Success! Key loaded: pcsk_...


In [21]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# 1. Load variables from .env file
load_dotenv()

# 2. Get the key and validate it BEFORE using it. Slicing a missing key (None)
#    would raise TypeError and mask the intended ValueError below.
api_key = os.getenv("PINECONE_API_KEY")

if not api_key:
    raise ValueError("API Key is missing! Check your .env file.")

# Confirm the key is present without echoing any of its characters into the
# saved cell output.
print("API Key loaded.")

# 3. Initialize
pc = Pinecone(api_key=api_key)

# 4. Create the index only when it does not exist yet, so this cell can be
#    re-run without failing on the already-created index.
index_name = "healthcare-agent-system"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must match the embedding vector length; the embed_query check earlier
        # in this notebook printed "Length 384".
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description as the cell output, whether it was just created
# or already existed.
pc.describe_index(index_name)

API Key loaded: pcsk...


{
    "name": "healthcare-agent-system",
    "metric": "cosine",
    "host": "healthcare-agent-system-gsxyak1.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [22]:
from langchain_pinecone import PineconeVectorStore

index_name = "healthcare-agent-system"

# from_documents embeds and upserts every chunk each time it runs. The stored
# document IDs look like random UUIDs, so a re-run would presumably add a second
# copy of every vector rather than overwrite — TODO confirm. To keep this cell
# safe to re-run, upload only when the index is still empty.
# NOTE(review): a partially failed upload leaves the index non-empty and would
# be skipped here; clear the index before retrying in that case.
existing_vectors = pc.Index(index_name).describe_index_stats().total_vector_count

if existing_vectors == 0:
    # Create the Vector Store
    # This sends your 'text_chunks' and 'embeddings' to Pinecone
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name=index_name
    )
    print("✅ Data successfully uploaded to Pinecone!")
else:
    # Index already has data: connect to it instead of uploading again.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )
    print(f"ℹ️ Index already holds {existing_vectors} vectors; skipping upload.")

✅ Data successfully uploaded to Pinecone!


In [23]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store handle from the index named by index_name, using the
# same embeddings object for queries.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [24]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x139c46d50>

In [25]:
# Similarity retriever over the vector store, with k set to 5.
top_k_search = {"k": 5}
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs=top_k_search)

In [26]:
# Run one sample coverage question through the retriever.
coverage_question = "Is Ablative Treatment for Spinal Pain covered for patients with sacroiliac joint pain?"
retrieved_docs = retriever.invoke(coverage_question)

In [27]:
# Display the documents returned for the sample question.
retrieved_docs

[Document(id='91a75898-78bb-4e30-9e11-5d4000628d7e', metadata={'creationdate': '', 'creator': 'PyPDF', 'moddate': '2025-12-23T04:42:11+00:00', 'page': 7.0, 'page_label': '8', 'producer': 'iLovePDF', 'source': '/Users/brejesh/portfolio/Developer/healthcare-agent-system/data/raw_policies/UHC_Policies_removed.pdf', 'total_pages': 3614.0}, page_content='Ablative Treatment for Spinal Pain Page 8 of 13 \nUnitedHealthcare Commercial and Individual Exchange Medical Policy Effective 05/01/2025 \nProprietary Information of UnitedHealthcare. Copyright 2025 United HealthCare Services, Inc. \n \nmonths, and six months follow-up. The ODI, VAS, and SF-36 Pain Component Summary showed improvements above \nminimal clinically important differences at one month, three months, and six months (all p values <  0.05). Change in ODI \npain impact declined 13.1 points [95% CI: 0.01,27.2] at one month from baseline, 16.5 points [95% CI: 2.5,30.6] at three \nmonths from baseline, and 21.1 points [95% CI: 7.0,35.