In [None]:
import torch
import torchvision.models as models



In [None]:
import langchain
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_data(data):
    """Load the PDF files matching ``*.pdf`` in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``; each matching file is read
        with ``PyPDFLoader``. (Presumably a list of LangChain ``Document``
        objects, one per PDF page -- confirm against the loader docs.)
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read the PDFs from the local "book" directory (path is relative to the
# notebook's working directory).
my_data = load_data("book")
# Show only how many documents were loaded: echoing the list itself would dump
# the full text of every loaded document into the cell output.
len(my_data)



In [None]:
from typing import List
from langchain.schema import Document
from dotenv import load_dotenv
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings  # or HuggingFaceEmbeddings

# Pull settings from a local .env file (python-dotenv) before anything below
# reads them; PINECONE_API_KEY is fetched with os.getenv later in this cell.
load_dotenv()

def filter_docs(docs: List[Document]) -> List[Document]:
    """Rebuild each document keeping its text and only the ``source`` metadata.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list, in the same order, of fresh ``Document`` objects whose
        metadata is exactly ``{"source": <value>}``. The value is ``None``
        when the input document has no ``"source"`` entry.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

# --- Prepare the documents ----------------------------------------------------
# Reduce every loaded document to its text plus a single "source" metadata key.
medical_new_docs = filter_docs(my_data)

# First pass: chunks of at most 500 characters with a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
my_chunks = text_splitter.split_documents(medical_new_docs)

# --- Connect to Pinecone ------------------------------------------------------
pc_api_key = os.getenv("PINECONE_API_KEY")
if not pc_api_key:
    # Fail here with an actionable message instead of deep inside the client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or the .env file."
    )
pc = Pinecone(api_key=pc_api_key)

index_name = "hemophilia-care-ai"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # Must equal the size of the vectors produced by `embedding` below
        # (1536 for text-embedding-3-small at its default setting).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

embedding = OpenAIEmbeddings(model="text-embedding-3-small")  # or HuggingFaceEmbeddings if preferred

# Second pass over the already-split chunks (same size, 50-character overlap).
# NOTE(review): the inputs are already <= 500 characters, so this pass should
# leave them essentially unchanged -- confirm whether it is still wanted.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
small_chunks = text_splitter.split_documents(my_chunks)


def _chunk_metadata(chunk):
    """Build the Pinecone metadata dict for one chunk.

    "text" holds the chunk body (the key the vector store reads the document
    text from). "source" is added only when present, because the chunk may
    carry ``None`` there and a null metadata value would be rejected on upsert.
    """
    metadata = {"text": chunk.page_content}
    source = chunk.metadata.get("source")
    if source is not None:
        metadata["source"] = source
    return metadata


# --- Embed and upsert ---------------------------------------------------------
vectors = []  # stays defined even when there is nothing to upload
batch_size = 50
for i in range(0, len(small_chunks), batch_size):
    batch = small_chunks[i:i+batch_size]
    # One embedding request per batch instead of one request per chunk.
    batch_embeddings = embedding.embed_documents(
        [chunk.page_content for chunk in batch]
    )
    # IDs are positional ("doc-<global index>"), so re-running this cell
    # overwrites the same records rather than adding duplicates.
    vectors = [
        {"id": f"doc-{i+j}", "values": values, "metadata": _chunk_metadata(chunk)}
        for j, (chunk, values) in enumerate(zip(batch, batch_embeddings))
    ]
    index.upsert(vectors=vectors)


# --- Retriever ----------------------------------------------------------------
# Wrap the populated index as a LangChain vector store and expose a retriever
# that returns the 3 most similar chunks for a query.
docsearch_client = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embedding)
retriever = docsearch_client.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Chat model that writes the final answer.
chatModel = ChatOpenAI(model="gpt-4o")

# System instructions; "{context}" is filled with the retrieved chunks by the
# stuff-documents chain created below.
system_prompt = (
    "You are a knowledgeable and cautious medical assistant. "
    "Answer patient or clinician questions using ONLY the retrieved context below. "
    "If the answer is not in the context, say clearly that you don't know. "
    "Keep answers concise, factual, and medically accurate. "
    "Limit responses to a maximum of three sentences and avoid speculation."
    "\n\nContext:\n{context}"
)

# "{input}" is the user's question supplied at invoke time.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


# Retrieval-augmented pipeline: fetch chunks with `retriever`, insert them into
# the prompt, then ask the chat model.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
response = rag_chain.invoke({"input" : "What is hemophilia?"})
# `response` is a dict that also carries the question and the retrieved
# documents; print just the model's answer. Inspect response["context"] to see
# which chunks were used.
print(response["answer"])
