In [1]:
# import Libraries

import openai
import langchain
import pinecone
import pypdf
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI


In [None]:
from dotenv import load_dotenv
load_dotenv()

In [3]:
import os


In [4]:
## Lets Read the document
def read_doc(directory):
    loader =PyPDFLoader(directory)
    docs = loader.load()
    return docs

In [None]:
doc = read_doc('budget_speech.pdf')

# Extract only text from the document
doc_texts = [d.page_content for d in doc]

# Now print only text (optional)
print(doc_texts)


In [None]:
import json
import IPython.display as display

# Function to save metadata
def save_metadata(data, filename="LANGCHAINPROJECT1metadata.json"):
    """Save metadata to a JSON file."""
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)

    # Clickable button to open metadata file
    button_html = f'''
    <a href="{filename}" target="_blank">
        <button style="padding:10px 20px; font-size:16px; background-color:#007bff; color:white; border:none; border-radius:5px; cursor:pointer;">
            📂 Open Metadata
        </button>
    </a>
    '''
    
    display.display(display.HTML(button_html))

# ✅ Extract full metadata from all pages
metadata = [d.metadata for d in doc]  # Extracts metadata from each document

# ✅ Debugging: Print extracted metadata to check if it's complete
print("Extracted Metadata:", metadata)

# ✅ Save full metadata to JSON
metadata_dict = {"metadata": metadata}
save_metadata(metadata_dict)


In [7]:
## Divide the docs into chunks
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs


In [8]:
documents=chunk_data(docs=doc)
len(documents)


58

In [None]:
## Embedding Technique Of OPENAI as asked by Naresh Sir 
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'], model="text-embedding-3-small")
embeddings

In [None]:
vectors = embeddings.embed_query("How are you?")
vector_length = len(vectors)  

print("Vector length:", vector_length)


In [None]:
import os
from pinecone import Pinecone, ServerlessSpec

# Load API key from environment variable
api_key = os.getenv("PINECONE_API_KEY")

if not api_key:
    raise ValueError("Pinecone API key not found. Set it as an environment variable.")

print(f"API Key loaded successfully: {api_key[:10]}...")  # Display first 10 characters to verify

# Create a Pinecone instance
pc = Pinecone(api_key=api_key, environment="gcp-starter")

# Verify connection by listing available indexes
print("Available indexes:", pc.list_indexes().names())

# Example: Create an index (if necessary)
index_name = "langchainvector"
if index_name not in pc.list_indexes().names():
    print(f"Creating index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=1536,  # Replace with your desired dimension size
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-west-2")  # Adjust as necessary
    )

print("Pinecone initialized successfully!")


In [12]:
from langchain_pinecone import PineconeVectorStore
import pinecone
index = PineconeVectorStore.from_documents(
    documents=doc,
    embedding=embeddings,
    index_name=index_name
)

In [13]:
## Cosine Similarity Retreive Results from VectorDB
def retrieve_query(query,k=2):
    matching_results=index.similarity_search(query,k=k)
    return matching_results

In [14]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [None]:
llm=OpenAI(temperature=0.5)
chain=load_qa_chain(llm,chain_type="stuff")


In [16]:
## Search answers from VectorDB
def retrieve_answers(query):
    doc_search=retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [17]:
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    
    # Debugging: Check if doc_search has any documents
    print("Documents Retrieved:", doc_search)

    # If no documents are retrieved, return a default message
    if not doc_search:
        print("Error: No documents found for the query.")
        return "No relevant information found for your query."

    # Run the chain only if documents are available
    response = chain.run(input_documents=doc_search, question=query)
    return response


In [18]:
def retrieve_query(query, k=5):
    # Ensure the index is defined
    global index
    if 'index' not in globals():
        print("Error: Vector database (index) is not initialized.")
        return []
    
    # Perform similarity search
    doc_search = index.similarity_search(query, k=k)

    return doc_search


In [19]:
def retrieve_answers(query):
    # embed answer from docu_search
    doc_search = retrieve_query(query)

    # Debugging: Check if documents are retrieved
    if not doc_search:
        print("Error: No documents found for the query.")
        return "No relevant information found."

    # Run the LLM chain only if documents are found
    response = chain.run(input_documents=doc_search, question=query)
    return response


In [None]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI()
llm.invoke("Hello, world!")


In [None]:
our_query = "Any budget for public transport, if yes tell about it in detail?"
answer = retrieve_answers(our_query)
answer
