In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os
from pdf2image import convert_from_path
import pytesseract
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import PyPDFDirectoryLoader
load_dotenv()



# Initialize the Hugging Face embedding model; the same `embeddings` object is
# later used both to index the chunks and to embed queries.
# NOTE(review): the Pinecone index further down is created with dimension=768,
# so this model's output size presumably has to stay 768 — confirm before
# swapping in a different model name.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


In [3]:
# Read the Pinecone API key from the process environment (load_dotenv() was
# called in the import cell). os.getenv returns None when the variable is unset.
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [4]:
# Define document loader
def read_doc(directory):
    """Load the PDF documents found under ``directory``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call; in the recorded run this is
        a list of ``Document`` objects, one per PDF page.
    """
    return PyPDFDirectoryLoader(directory).load()

In [5]:
# Load every PDF under documents/ (read_doc takes a directory, not a single
# file); the recorded run picked up Power_bi_qn.pdf.
doc = read_doc('documents/')
# Prints the full list of loaded Documents, so the output is very long.
print(doc)

[Document(metadata={'source': 'documents\\Power_bi_qn.pdf', 'page': 0}, page_content=' \nQ 1. What is Power BI? \nANS :-  Power BI is a business analytics tool by Microsoft\nthat provides interactive visualizations and business\nintelligence capabilities with an interface simple enough\nfor end users to create their own reports and dashboards.\nTop 50 Power-BI interview\nquestions and answers\nQ 2. What are the key components of Power BI?\nANS :-   The key components are Power BI Desktop, Power\nBI Service, and Power BI Mobile.\nQ 3. What is Power BI Desktop?\nANS :-   Power BI Desktop is a desktop application used for\ncreating and publishing reports.\nQ 4. What is the Power BI Service? \nANS :-   Power BI Service is an online service for sharing\nand collaborating on Power BI reports.'), Document(metadata={'source': 'documents\\Power_bi_qn.pdf', 'page': 1}, page_content=' Q 5. What are Power BI dashboards?\nANS :-  Dashboards are single-page, often called\ncanvases, that use visualiz

In [6]:
# Convert documents into smaller chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50, source_name="Power bi qn.pdf"):
    """Split documents into overlapping chunks and label each chunk's source.

    Args:
        docs: Documents to split (for example the output of ``read_doc``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
        source_name: Label written to ``metadata["source"]`` of every chunk.
            Pass ``None`` to keep whatever source each chunk already carries,
            which preserves per-file provenance when ``docs`` comes from
            several PDFs.

    Returns:
        The list of chunks produced by the splitter, with ``metadata["source"]``
        overwritten unless ``source_name`` is ``None``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(docs)
    # Only relabel when a label was actually supplied; a single hard-coded
    # name would otherwise erase the real origin of every chunk.
    if source_name is not None:
        for chunk in chunks:
            chunk.metadata["source"] = source_name
    return chunks

In [7]:
# Split the loaded pages with chunk_data's defaults (chunk_size=800,
# chunk_overlap=50, source label "Power bi qn.pdf").
documents = chunk_data(doc)
# Prints every chunk in full, so the output is very long.
print("Chunked Documents:", documents)

Chunked Documents: [Document(metadata={'source': 'Power bi qn.pdf', 'page': 0}, page_content='Q 1. What is Power BI? \nANS :-  Power BI is a business analytics tool by Microsoft\nthat provides interactive visualizations and business\nintelligence capabilities with an interface simple enough\nfor end users to create their own reports and dashboards.\nTop 50 Power-BI interview\nquestions and answers\nQ 2. What are the key components of Power BI?\nANS :-   The key components are Power BI Desktop, Power\nBI Service, and Power BI Mobile.\nQ 3. What is Power BI Desktop?\nANS :-   Power BI Desktop is a desktop application used for\ncreating and publishing reports.\nQ 4. What is the Power BI Service? \nANS :-   Power BI Service is an online service for sharing\nand collaborating on Power BI reports.'), Document(metadata={'source': 'Power bi qn.pdf', 'page': 1}, page_content='Q 5. What are Power BI dashboards?\nANS :-  Dashboards are single-page, often called\ncanvases, that use visualizations 

In [8]:
# Load the Hugging Face Embedding Model
# NOTE(review): this repeats the import and the HuggingFaceEmbeddings
# construction from the first cell with the same model name; `embeddings` is
# simply rebound to a second instance here.
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")  # Hugging Face model
print("Embedding Model Loaded")


Embedding Model Loaded


In [9]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

# Connect to Pinecone with the key read from the environment earlier.
pc = Pinecone(api_key=pinecone_api_key)
index_name = "vector"

# Create the serverless index only when it does not exist yet, so re-running
# this cell is safe. dimension=768 has to agree with the length of the vectors
# produced by the embedding model (presumably 768 for all-mpnet-base-v2 —
# confirm if the model is changed).
index_missing = index_name not in pc.list_indexes().names()
if index_missing:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Handle used by the query cells below.
index = pc.Index(index_name)


In [10]:
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and store it in the Pinecone index.
# The target is taken from `index_name` (defined in the index-setup cell)
# rather than a second hard-coded "vector" literal, so the index that was
# created and the index that is written to cannot drift apart.
# NOTE(review): re-running this cell presumably stores the same chunks again
# under fresh IDs — confirm, and clear the index first if duplicates matter.
vector_store = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
)

In [11]:
# Query Pinecone for similar documents
def retrieve_query(query, embeddings, k=2):
    """Embed ``query`` and return the ``k`` closest matches from the index.

    Args:
        query: Question text to search for.
        embeddings: Object exposing ``embed_query(text)`` that returns the
            query vector.
        k: Number of matches to request (``top_k``).

    Returns:
        The response of ``index.query`` on the module-level ``index``, with
        metadata included.
    """
    vector = embeddings.embed_query(query)
    # Report the vector length as a quick check on the embedding size.
    print("Query Embedding Shape:", len(vector))
    return index.query(vector=vector, top_k=k, include_metadata=True)

In [12]:
# Retrieve answers from the query
def retrieve_answers(query):
    """Print the top Pinecone matches for ``query``.

    Uses the module-level ``embeddings`` model and delegates the search to
    ``retrieve_query`` with its default number of matches.

    Args:
        query: Question text to search for.

    Returns:
        None. One line is printed per match with its score, source label and
        stored chunk text.
    """
    # retrieve_query's third parameter is `k`, and it reads the module-level
    # `index` on its own, so only the query and the embedding model are passed.
    results = retrieve_query(query, embeddings)
    print(f"Top Matches for Query: '{query}':")
    for match in results["matches"]:
        metadata = match["metadata"]
        # The chunk text is stored under "text"; "source" is only the file label.
        print(f"Score: {match['score']}, Source: {metadata['source']}, Content: {metadata.get('text', '')}")

In [13]:
# Ad-hoc query issued directly against `index` (the same embed-then-search
# steps as retrieve_query, here with top_k=3).
query = "What is the purpose of the Power BI RESTAPI?"

# Embed the question with the same model used when storing the chunks.
query_embedding = embeddings.embed_query(query)

# Request the 3 nearest matches; return their metadata but not the vectors.
results = index.query(
    vector=query_embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'a2246b33-7f41-4518-beed-752f96c236ec',
              'metadata': {'page': 6.0,
                           'source': 'Power bi qn.pdf',
                           'text': 'Q 30. What is Power BI Embedded?\n'
                                   'ANS :- Power BI Embedded is a service that '
                                   'allows you to\n'
                                   'embed Power BI reports and dashboards into '
                                   'your custom\n'
                                   'applications.\n'
                                   'Q 31. What is the purpose of the Power BI '
                                   'RESTAPI?\n'
                                   'ANS :-  The REST API allows developers to '
                                   'programmatically\n'
                                   'interact with Power BI service, automating '
                                   'tasks like\n'
                                   'embedding reports, cre