In [None]:
# Scratch cell: prints a fixed string (presumably a check that the kernel runs).
print("ok!")

: 

In [None]:
# Scratch cell: prints a fixed greeting.
# NOTE(review): "hellow" looks like a typo for "hello" — confirm before changing the output.
print("hellow world")

In [None]:
# IPython magic: display the kernel's current working directory.
%pwd


In [None]:
from langchain.document_loaders import TextLoader,PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
import os

# Step up one directory from where the kernel started, presumably so that the
# relative paths used later in the notebook (e.g. "data") resolve from the
# parent folder.
# The guard makes this cell idempotent: without it, every re-run in the same
# kernel would climb one more level and silently break those relative paths.
if "_CWD_AFTER_CHDIR" not in globals():
    os.chdir("../")
    _CWD_AFTER_CHDIR = os.getcwd()


In [None]:
# IPython magic: display the working directory again, after the os.chdir cell.
%pwd

In [None]:
# Extract text from PDF files
def load_pdf_files(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the files matching
        the ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the relative "data" directory.
extracted_data = load_pdf_files("data")
# Bare expression: the notebook displays the complete loaded result.
extracted_data

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build slimmed-down copies of the given documents.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict holding only ``"source"`` (``None`` when the original
    metadata has no such key). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Reduce every loaded document to its page content plus the "source" metadata.
minimal_docs = filter_to_minimal_docs(extracted_data)
# Bare expression: the notebook displays the complete filtered list.
minimal_docs

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this function previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the filtered documents with the splitter's settings from text_split.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")
# Bare expression: the notebook displays every chunk.
texts_chunk

In [None]:
# we have to do embeddings of this data to vectors
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embeddings object used by this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured with the
    "sentence-transformers/all-MiniLM-L6-v2" model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


# Module-level embeddings instance shared by the cells that follow.
embedding = download_embeddings()

In [None]:
# Bare expression: display the embeddings object.
embedding

In [None]:
# Embed a sample query to inspect what the model returns for one string.
vector = embedding.embed_query("Hello world")
# Bare expression: display the returned embedding.
vector

In [None]:
from dotenv import load_dotenv
import os
# Load variables from the .env file so the os.getenv calls in the next cell can see them.
load_dotenv()

In [None]:
# Read the three API keys from the process environment.
PINECONE_API_KEY, OPENAI_API_KEY, GEMINI_API_KEY = (
    os.getenv(env_name)
    for env_name in ("PINECONE_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY")
)

# Report presence only; the key values themselves are never printed.
print("🔑 API Key Status:")
print(f"PINECONE_API_KEY: {'✅ Found' if PINECONE_API_KEY else '❌ Not found'}")
print(f"OPENAI_API_KEY: {'✅ Found' if OPENAI_API_KEY else '❌ Not found'}")
print(f"GEMINI_API_KEY: {'✅ Found' if GEMINI_API_KEY else '❌ Not found'}")

# A missing (or empty) key only produces a message; a present key is written
# back to os.environ.
if not PINECONE_API_KEY:
    print("⚠️ Warning: PINECONE_API_KEY not found")
else:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

if not OPENAI_API_KEY:
    print("⚠️ Warning: OPENAI_API_KEY not found")
else:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env file")
    print("Please add it to your .env file:")
    print("GEMINI_API_KEY=your_actual_gemini_api_key_here")
else:
    os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY
    os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY  # Some libraries use this name
    print("✅ Gemini API key set successfully")

In [None]:
from pinecone import Pinecone 
# Reuse the key read from the environment in the API-key cell.
pinecone_api_key = PINECONE_API_KEY

# Pinecone client used by the index cells that follow.
pc = Pinecone(api_key=pinecone_api_key)
# Bare expression: display the client object.
pc

In [None]:
from pinecone import ServerlessSpec 

# Name of the Pinecone index used throughout the rest of the notebook.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell
# leaves an existing index untouched.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings — presumably that of the model chosen in download_embeddings; TODO confirm
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


# Handle to the index, used by the stats and vector-store cells.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Ask Pinecone how many vectors the index currently holds.
index_stats = index.describe_index_stats()
total_vector_count = index_stats.get('total_vector_count', 0)

print(f"📊 Vectors in index: {total_vector_count}")

# Non-empty index: attach to it as-is.
# Empty index: upload the chunks through from_documents.
if total_vector_count != 0:
    print("✅ Data exists - loading index...")
    docsearch = PineconeVectorStore.from_existing_index(
        embedding=embedding,
        index_name=index_name,
    )
else:
    print("📤 First run - uploading data...")
    docsearch = PineconeVectorStore.from_documents(
        index_name=index_name,
        embedding=embedding,
        documents=texts_chunk,
    )

In [None]:
# Attach to the existing index.

from langchain_pinecone import PineconeVectorStore
# from_existing_index connects to an index that already holds vectors; the
# upload itself is done by from_documents in the previous cell.
# NOTE(review): this repeats the import and the else-branch of the previous
# cell, so `docsearch` is simply rebound here.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [None]:
# Add more data to the existing Pinecone index.
# A dummy document is created and added to the index.

dswith = Document(
    page_content="dswithbappy is a youtube channel that provides tutorials on various topics.",
    metadata={"source": "Youtube"}
)
# A fixed id makes this cell idempotent: re-running it overwrites the same
# vector instead of inserting one more duplicate under a fresh random id.
docsearch.add_documents(documents=[dswith], ids=["dswithbappy-demo-doc"])

# The next cell builds a retriever that searches the vector database for relevant documents.

In [None]:
# Similarity-search retriever over the vector store; k=3 is passed as the search setting.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})
# Try the retriever on a sample question.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs
# The output is the raw retrieved documents, not a composed answer.
# The following cells build a chain that hands them to an LLM to produce a readable answer.


In [None]:
# Working solution with OpenAI - Replace your Gemini cell with this
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
import os

# Read the OpenAI key from the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

try:
    chatModel = ChatOpenAI(
        model="gpt-3.5-turbo",  # Cheaper and faster than gpt-4
        api_key=OPENAI_API_KEY,
        temperature=0.3,
        max_tokens=150
    )
except Exception as e:
    print(f"❌ OpenAI setup failed: {e}")
    # Re-raise instead of swallowing: otherwise `chatModel` stays undefined and
    # the chain-building cell fails later with an unrelated-looking NameError.
    raise
else:
    print("✅ Successfully initialized OpenAI GPT-3.5-turbo")

In [None]:
# System message for the QA chain: the instructions, a blank line, then the
# "{context}" placeholder.
# NOTE(review): "an Medical" is kept verbatim; editing it would change the
# prompt text sent to the model.
system_prompt = "\n\n".join(
    (
        " ".join(
            (
                "You are an Medical assistant for question-answering tasks.",
                "Use the following pieces of retrieved context to answer the question.",
                "If you don't know the answer, say that you don't know.",
                "Use three sentences maximum and keep the answer concise.",
            )
        ),
        "{context}",
    )
)


# Two-message chat template: the system prompt first, then a human message
# containing the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Combine the chat model and the prompt into the answering chain.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Put the retriever in front of it to form the RAG chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
# The RAG chain combines the retriever (whose raw documents are not a readable
# answer on their own) with question_answer_chain, which pairs the chat model
# with the system prompt.
# The chat model configured in this notebook is gpt-3.5-turbo, not gpt-4.

In [None]:
# Run the RAG chain on a sample question, supplied under the "input" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
# Print only the "answer" entry of the response.
print(response["answer"])