In [5]:
import os
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [6]:
def read_doc(directory):
    """Load the PDF documents found under ``directory``.

    Args:
        directory: Path handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns (the
        recorded output further down shows a list of ``Document`` objects).
    """
    return PyPDFDirectoryLoader(directory).load()

In [11]:
# Load everything read_doc finds under ./documents and print how many items were loaded.
documents = read_doc("./documents")
print(len(documents))


58


In [15]:
def chunk_text(text, chunk_size=800, chunk_overlap=50):
    """Split already-loaded documents into smaller chunks.

    Args:
        text: Despite the name, this is passed to ``split_documents`` and so
            is the collection of loaded documents, not a raw string.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(text)

In [28]:
# Split the loaded documents into chunks using chunk_text's default chunk_size and chunk_overlap.
docs = chunk_text(documents)

In [35]:
# Display the chunks for a visual check.
# NOTE(review): this renders the entire list; a slice such as docs[:3] would keep the saved output small.
docs

[Document(metadata={'source': 'documents/budget_speech.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'),
 Document(metadata={'source': 'documents/budget_speech.pdf', 'page': 2}, page_content='CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7

In [24]:
# Load environment variables via python-dotenv, then build the LangChain
# embeddings client with the OpenAI key taken from the environment.
from dotenv import load_dotenv
load_dotenv()
# Indexing os.environ raises KeyError right here if OPENAI_API_KEY is not set.
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

  embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])


In [25]:
# Sanity check: embed a sample string and show the length of the returned vector.
vectors = embeddings.embed_query("Hello, world!")
len(vectors)

1536

In [26]:
from pinecone import Pinecone

# NOTE: this import rebinds the name `Pinecone`, which the first cell imported
# from langchain.vectorstores; from here on `Pinecone` is the Pinecone SDK client.

# Take the API key from the environment instead of pasting it into the notebook,
# so the secret never ends up in a saved or shared copy of this file.
# Indexing os.environ raises KeyError immediately if PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Handle to the index named "chatbot"; the helper functions below use this
# module-level `index` for upserts and queries.
index = pc.Index("chatbot")

In [56]:
# Function to generate OpenAI embeddings
def get_openai_embeddings(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: Value passed as ``input`` to ``openai.embeddings.create``.
        model: Embedding model name. Defaults to "text-embedding-ada-002",
            the model that used to be hard-coded here, so existing callers
            are unaffected. A different model may return vectors of a
            different length; ``query_pinecone`` in this notebook rejects
            anything that is not 1536 long.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    response = openai.embeddings.create(input=text, model=model)
    # A single input yields a single entry in `data`; take its embedding.
    return response.data[0].embedding



# Turn chunked documents into (id, embedding, metadata) tuples for upserting
def prepare_data_for_pinecone(documents):
    """Build one ``(id, embedding, metadata)`` tuple per document.

    Args:
        documents: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping with 'source' and 'page' keys (a missing
            key raises ``KeyError``).

    Returns:
        A list of tuples. The id is ``doc_<position>`` (position in
        ``documents``), the embedding comes from ``get_openai_embeddings``,
        and the metadata dict carries the source, the page and the chunk
        text itself under 'content'.
    """
    def _as_record(position, chunk):
        # One embedding request per chunk, computed from the chunk's text.
        vector = get_openai_embeddings(chunk.page_content)
        payload = {
            'source': chunk.metadata['source'],
            'page': chunk.metadata['page'],
            'content': chunk.page_content,
        }
        return (f"doc_{position}", vector, payload)

    return [_as_record(position, chunk) for position, chunk in enumerate(documents)]



In [57]:
# Insert the embeddings into Pinecone
def insert_into_pinecone(data, batch_size=100):
    """Upsert ``data`` into the module-level ``index`` in batches.

    Sending every vector in one request can exceed Pinecone's per-request
    limits once the corpus grows, so the records are sent ``batch_size`` at
    a time. Empty input results in no request at all.

    Args:
        data: Iterable of records accepted by ``index.upsert`` (this
            notebook passes ``(id, embedding, metadata)`` tuples).
        batch_size: Maximum number of records per upsert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so generators and other one-shot iterables can be sliced.
    records = list(data)
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size])

# Prepare and insert all documents into Pinecone.
# prepare_data_for_pinecone requests one embedding per chunk in `docs`, so
# re-running this cell repeats every embedding call before upserting.
data_for_pinecone = prepare_data_for_pinecone(docs)
insert_into_pinecone(data_for_pinecone)


In [58]:
# Check if the documents are stored by querying the index
def query_pinecone(query_text, top_k=5):
    """Embed ``query_text`` and fetch the closest matches from ``index``.

    This is a best-effort helper: any exception (including the dimension
    check below) is caught, reported with ``print`` and turned into ``None``.

    Args:
        query_text: Text to embed with ``get_openai_embeddings``.
        top_k: Number of matches to request. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        The raw result of ``index.query`` when it contains at least one
        match; ``None`` when there are no matches or an error occurred.
    """
    # Vector length this notebook expects (the embedding check earlier printed 1536).
    expected_dim = 1536

    try:
        query_embedding = get_openai_embeddings(query_text)

        # Debugging aid: shows the embedding length on every call.
        print(f"Query embedding length: {len(query_embedding)}")

        # Fail before querying if the embedding has an unexpected length.
        if len(query_embedding) != expected_dim:
            raise ValueError(f"Embedding dimension mismatch: Expected {expected_dim}, got {len(query_embedding)}")

        query_result = index.query(
            vector=query_embedding,
            top_k=top_k,
            include_values=False,   # do not return the stored vectors
            include_metadata=True,  # do return each match's metadata
        )

        if query_result['matches']:
            return query_result

        print("No matches found.")
        return None

    except Exception as e:
        # Deliberately broad: report the problem and let the caller see None.
        print(f"Error querying Pinecone: {str(e)}")
        return None


In [59]:
# Smoke-test retrieval: the cell output is the raw query result when matches are found.
query_pinecone("Budget priorities for 2023-24")


Query embedding length: 1536


{'matches': [{'id': 'doc_3',
              'metadata': {'content': 'Budget 2023-2024 \n'
                                      ' \n'
                                      'Speech of  \n'
                                      'Nirmala Sitharaman \n'
                                      'Minister of Finance \n'
                                      'February 1, 2023 \n'
                                      'Hon’ble Speaker,  \n'
                                      ' I present the Budget for 2023-24. This '
                                      'is the first Budget in Amrit \n'
                                      'Kaal . \n'
                                      'Introduction \n'
                                      '1. This Budget hopes to build on the '
                                      'foundation laid in the previous \n'
                                      'Budget, and the blueprint drawn for '
                                      'India@100. We envision a prosperous \n'

In [73]:
from openai import OpenAI

def get_answer_from_openai(context, question, model="gpt-4o-mini", max_tokens=150, temperature=0.2):
    """Ask an OpenAI chat model to answer ``question`` given ``context``.

    Args:
        context: Text placed in the user message as the relevant context.
        question: The question appended after the context.
        model: Chat model name; defaults to the previously hard-coded
            "gpt-4o-mini".
        max_tokens: Upper bound on the length of the generated answer;
            defaults to the previously hard-coded 150.
        temperature: Sampling temperature; defaults to the previously
            hard-coded 0.2.

    Returns:
        The message content of the first choice, or ``None`` if any
        exception occurs (the error is printed).
    """
    try:
        # A fresh client per call, constructed with no arguments.
        client = OpenAI()

        # System prompt plus a single user turn holding both context and question.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The following is relevant context:\n{context}\n\nQuestion: {question}"}
        ]

        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        return completion.choices[0].message.content

    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print(f"Error generating answer from OpenAI: {str(e)}")
        return None

In [69]:

# Now combine the two functions (query and answer generation)
def answer_question_using_context(question):
    """Answer ``question`` using chunks retrieved from Pinecone as context.

    Calls ``query_pinecone`` for matches, renders one line per match and
    passes the combined text to ``get_answer_from_openai``.

    Args:
        question: The user's question; used both as the retrieval query and
            as the question sent to the chat model.

    Returns:
        The value returned by ``get_answer_from_openai``; the string
        "No relevant context found in Pinecone." when the query result is
        falsy; ``None`` if an exception is raised here (the error is printed).
    """
    try:
        # Step 1: Query Pinecone for relevant context
        query_result = query_pinecone(question)
        if not query_result:
            return "No relevant context found in Pinecone."

        # Step 2: Render each match as one context line
        context_lines = []
        for match in query_result['matches']:
            metadata = match['metadata']
            # metadata['content'] holds the chunk text. Leave it out of the
            # metadata dump so each chunk is sent to the model once, not twice.
            descriptive = {key: value for key, value in metadata.items() if key != 'content'}
            context_lines.append(
                f"ID: {match['id']}, Metadata: {descriptive}, Content: {metadata['content']}\n"
            )
        context = "".join(context_lines)

        # Step 3: Get the answer from OpenAI based on the context
        return get_answer_from_openai(context, question)

    except Exception as e:
        print(f"Error answering the question: {str(e)}")
        return None





In [74]:
# Example usage: run the full retrieve-then-answer flow for one question.
# `answer` may also be None or the "No relevant context found in Pinecone."
# message, depending on what answer_question_using_context returns.
question = "What are the main priorities of the 2023-24 Indian Budget?"
answer = answer_question_using_context(question)
print(f"Answer: {answer}")

Query embedding length: 1536
Answer: The main priorities of the 2023-24 Indian Budget, as outlined in the speech, are referred to as the 'Saptarishi' guiding principles. They are:

1. Inclusive Development
2. Reaching the Last Mile
3. Infrastructure and Investment
4. Unleashing the Potential
5. Green Growth
6. Youth Power
7. Financial Sector
