In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI  # or any other LLM from LangChain

# Load environment variables (e.g. from a local .env file) so that later
# os.getenv lookups such as PINECONE_API_KEY can find their values
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
# Build the sentence-embedding model that is used both when uploading
# documents and when embedding the query text
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
print("Hugging Face Embedding Model Loaded.")

Hugging Face Embedding Model Loaded.


In [3]:
# Read the Pinecone credential from the environment and fail fast when it is absent or empty
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("Pinecone API key is missing. Ensure it's set in the environment variables.")

# Create the client object that the index-management cells rely on
pc = Pinecone(api_key=pinecone_api_key)
print("Pinecone Client Initialized.")

Pinecone Client Initialized.


In [4]:
# Define document loader
def read_doc(directory):
    """Load the PDF documents found under ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` produces
        (a list of loaded documents).
    """
    return PyPDFDirectoryLoader(directory).load()

In [5]:
# Read everything under the source folder into memory via read_doc
doc_directory = "documents/"  # Change to your directory path
docs = read_doc(directory=doc_directory)
print(f"{len(docs)} documents loaded.")

0 documents loaded.


In [6]:
# Split documents into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into overlapping chunks and make sure each has a source.

    Args:
        docs: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks forwarded to the splitter.

    Returns:
        The list of chunks produced by the splitter. Every chunk's metadata
        has a ``"source"`` entry; chunks that lacked one get ``"Unknown"``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(docs)
    for piece in pieces:
        # Keep an existing source untouched; fill in a placeholder otherwise
        piece.metadata.setdefault("source", "Unknown")
    return pieces

In [7]:
# Split the loaded documents using chunk_data's default size and overlap
documents = chunk_data(docs=docs)
print(f"Documents split into {len(documents)} chunks.")

Documents split into 0 chunks.


In [8]:
# Make sure the target index exists, creating it only on first use
index_name = "vector"
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # dimension is fixed at 768 here — it must agree with the vector size
    # produced by the embedding model configured earlier
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=serverless_spec,
    )
print(f"Pinecone index '{index_name}' is ready.")

# Handle used for direct queries against the index
index = pc.Index(index_name)

Pinecone index 'vector' is ready.


In [9]:
# Embed the chunks and store them in the Pinecone index named above
# NOTE(review): re-running this cell presumably uploads the same chunks again — confirm
vector_store = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)
print("Documents uploaded to Pinecone.")

Documents uploaded to Pinecone.


In [10]:
# Define retrieval function
def retrieve_query(query, vector_store, k=2):
    """Return the ``k`` stored entries most similar to ``query``.

    Args:
        query: Text to search for.
        vector_store: Object exposing ``similarity_search(query=..., k=...)``.
        k: Number of matches to request (defaults to 2).

    Returns:
        The value returned by ``vector_store.similarity_search``.
    """
    return vector_store.similarity_search(query=query, k=k)

In [11]:
# Define full query-answering pipeline using LangChain
def answer_query(query, retriever, llm):
    """Answer ``query`` with a retrieval-augmented QA chain.

    Args:
        query: The question to answer.
        retriever: Retriever that supplies context documents to the chain.
        llm: Language model used to generate the answer.

    Returns:
        The answer produced by the chain (the ``"result"`` field of its output).
    """
    # RetrievalQA cannot be built directly from an LLM: its constructor expects
    # an already-assembled combine-documents chain. The from_chain_type factory
    # builds that chain from the LLM and wires in the retriever.
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    # invoke() supersedes the deprecated run(); the chain reads the question
    # from the "query" key and writes the answer under "result".
    response = qa_chain.invoke({"query": query})
    return response["result"]

In [12]:
query = "What is the purpose of the Power BI REST API?"

# Generation model (e.g., GPT-4 via OpenAI)
# NOTE(review): "gpt-4" is a chat model — confirm this completion-style OpenAI
# wrapper can actually call it before using `llm` for generation
llm = OpenAI(model="gpt-4")  # Replace with your model and API key

# Retriever view over the vector store
# NOTE(review): `llm` and `retriever` are created here but not used in this cell;
# the lookup below goes straight to the Pinecone index instead
retriever = vector_store.as_retriever()

# Embed the question, then fetch the 3 closest entries with their metadata
# (vector values are left out of the response)
query_embedding = embeddings.embed_query(query)
results = index.query(
    top_k=3,
    vector=query_embedding,
    include_metadata=True,
    include_values=False,
)

print(results)

  llm = OpenAI(model="gpt-4")  # Replace with your model and API key


{'matches': [{'id': '6c2bd2b0-c37f-4980-a9d1-d68bb5912fef',
              'metadata': {'page': 0.0,
                           'source': 'Power bi qn.pdf',
                           'text': 'Q 1. What is Power BI? \n'
                                   'ANS :-  Power BI is a business analytics '
                                   'tool by Microsoft\n'
                                   'that provides interactive visualizations '
                                   'and business\n'
                                   'intelligence capabilities with an '
                                   'interface simple enough\n'
                                   'for end users to create their own reports '
                                   'and dashboards.\n'
                                   'Top 50 Power-BI interview\n'
                                   'questions and answers\n'
                                   'Q 2. What are the key components of Power '
                                   'BI