In [1]:
from langchain_community.document_loaders import PyPDFLoader
# from config import PDF_PATH
import os
from langchain_core.tools import tool

def load_pdf(file_path: str) -> list:
    """Load pdf at a particular path(only once per file)."""

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file '{file_path}' not found.")

    loader = PyPDFLoader(file_path)
    docs = loader.load()
    return docs

In [4]:
docs = load_pdf("sp.pdf")

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_pdf(docs:list) -> list:
    """
        Splits the pdf loaded list into chunks
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
    chunks = splitter.split_documents(docs)
    return chunks

In [42]:
splitted_text = split_pdf(docs)

In [43]:
splitted_text

[Document(metadata={'producer': 'iLovePDF', 'creator': 'Decktape', 'creationdate': '2024-09-05T04:52:38+00:00', 'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'total_pages': 231, 'page': 0, 'page_label': '1'}, page_content='Master Spring & Spring Boot\nwith Hibernate & React\n1'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'Decktape', 'creationdate': '2024-09-05T04:52:38+00:00', 'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'total_pages': 231, 'page': 1, 'page_label': '2'}, page_content='Top frameworks in the Java world today\nSpring Framework\nSpring Boot'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'Decktape', 'creationdate': '2024-09-05T04:52:38+00:00', 'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'total_pages': 231, 'page': 1, 'page_label': '2'}, page_content='Spring Boot\nBeginners find the first steps very diﬀicult:'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'Decktape', 'creationdate': '2024-09-05T0

In [45]:
whole_pdf_page_contents = list(map(lambda x:x.page_content, splitted_text))

In [9]:
def text_extract(docs:list) -> str:
    """
        Extracts the full text of loaded pdf
    """
    full_text = "\n".join([doc.page_content for doc in docs])
    return full_text

In [39]:
extracted_text = text_extract(docs)

In [12]:
from langchain_community.vectorstores import Chroma
import os
from langchain_huggingface import HuggingFaceEmbeddings

os.environ['HF_HOME'] = 'C:/Users/Aditya/Desktop/Langchain/Langchain_Models/LOCALINSTALLEDMODELS'

embedding_model =  HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
PERSIST_DIR = "./chroma_persist"
CHROMA_COLLECTION_NAME = "pdf_docs"

def embed_pdf(chunks:list)->Chroma:
    """
        Embedds the chunks into vectors and store in vector space
        Returns Vector Embeddings
    """
     
    if os.path.exists(PERSIST_DIR) and os.listdir(PERSIST_DIR):
        vectordb = Chroma(
            persist_directory=PERSIST_DIR,
            embedding_function=embedding_model,
            collection_name=CHROMA_COLLECTION_NAME
        )
    else:
        vectordb = Chroma.from_documents(
            chunks,
            embedding_model,
            persist_directory=PERSIST_DIR,
            collection_name=CHROMA_COLLECTION_NAME
        )
        vectordb.persist()

    return vectordb


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
vectordb = embed_pdf(splitted_text)

  vectordb = Chroma(


In [22]:
vectordb_retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 5})

In [23]:
vectordb_retriever.invoke("What is springboot")

[Document(metadata={'source': 'sp.pdf', 'total_pages': 231, 'producer': 'iLovePDF', 'moddate': '2024-09-11T16:38:45+00:00', 'page': 40, 'page_label': '41', 'creationdate': '2024-09-05T04:52:38+00:00', 'creator': 'Decktape'}, page_content='Boot\nWhat is the need for Spring Boot?\nWHAT are the goals of Spring Boot?'),
 Document(metadata={'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'total_pages': 231, 'page_label': '54', 'creationdate': '2024-09-05T04:52:38+00:00', 'page': 53, 'creator': 'Decktape', 'producer': 'iLovePDF'}, page_content='1: @SpringBootConfiguration: Indicates that a class provides Spring Boot'),
 Document(metadata={'creationdate': '2024-09-05T04:52:38+00:00', 'page_label': '72', 'moddate': '2024-09-11T16:38:45+00:00', 'producer': 'iLovePDF', 'creator': 'Decktape', 'total_pages': 231, 'page': 71, 'source': 'sp.pdf'}, page_content='My favorite place on the internet\nEasiest way to create Spring Boot\nProjects\nRemember:'),
 Document(metadata={'moddate': '202

In [25]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from dotenv import load_dotenv
from langchain_groq import ChatGroq

load_dotenv()
chat_model=ChatGroq(model="llama3-8b-8192")

multiquery_retriever = MultiQueryRetriever.from_llm(retriever=vectordb_retriever, llm=chat_model) 

In [26]:
multiquery_retriever.invoke("What is springboot")

[Document(metadata={'total_pages': 231, 'page_label': '113', 'page': 112, 'producer': 'iLovePDF', 'moddate': '2024-09-11T16:38:45+00:00', 'creator': 'Decktape', 'source': 'sp.pdf', 'creationdate': '2024-09-05T04:52:38+00:00'}, page_content='Goal: Get a 10,000 feet overview of folder structure\nREADME.md: Documentation'),
 Document(metadata={'page': 135, 'creator': 'Decktape', 'total_pages': 231, 'producer': 'iLovePDF', 'creationdate': '2024-09-05T04:52:38+00:00', 'page_label': '136', 'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf'}, page_content='Key Questions:\nHow to identify users?'),
 Document(metadata={'producer': 'iLovePDF', 'page': 216, 'creator': 'Decktape', 'moddate': '2024-09-11T16:38:45+00:00', 'total_pages': 231, 'page_label': '217', 'source': 'sp.pdf', 'creationdate': '2024-09-05T04:52:38+00:00'}, page_content='Engine\nDatabases: Relational & NoSQL (Amazon RDS, Google Cloud\nSQL, Azure SQL Database etc)'),
 Document(metadata={'page': 73, 'creationdate': '2024-09

In [31]:
similar_pages = vectordb_retriever.get_relevant_documents("What is hibernate")
whole_context = "\n\n".join([d.page_content for d in similar_pages])

In [32]:
similar_pages

[Document(metadata={'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'page_label': '69', 'producer': 'iLovePDF', 'creationdate': '2024-09-05T04:52:38+00:00', 'total_pages': 231, 'page': 68, 'creator': 'Decktape'}, page_content='Who manages the entities?\nHibernate is one of the popular\nimplementations of JPA'),
 Document(metadata={'creationdate': '2024-09-05T04:52:38+00:00', 'moddate': '2024-09-11T16:38:45+00:00', 'source': 'sp.pdf', 'producer': 'iLovePDF', 'total_pages': 231, 'creator': 'Decktape', 'page': 68, 'page_label': '69'}, page_content='Using Hibernate directly would result in a\nlock in to Hibernate'),
 Document(metadata={'moddate': '2024-09-11T16:38:45+00:00', 'producer': 'iLovePDF', 'creationdate': '2024-09-05T04:52:38+00:00', 'source': 'sp.pdf', 'creator': 'Decktape', 'total_pages': 231, 'page': 63, 'page_label': '64'}, page_content='COURSE table\n04: Use JPA and Hibernate to play\nwith COURSE table\n05: Use Spring Data JPA to play'),
 Document(metadata={'page_

In [33]:
whole_context

'Who manages the entities?\nHibernate is one of the popular\nimplementations of JPA\n\nUsing Hibernate directly would result in a\nlock in to Hibernate\n\nCOURSE table\n04: Use JPA and Hibernate to play\nwith COURSE table\n05: Use Spring Data JPA to play\n\nMaster Spring & Spring Boot\nwith Hibernate & React\n1\n\n@Bean Indicates that a method produces a bean to be managed by the Spring container'

In [36]:
from langchain_core.tools import tool
from langchain_community.vectorstores import Chroma

def answer_question_on_query(vectordb:Chroma,user_query:str):
    """
    Answer a question based on input text from the vector store.
    Retrievers similar pages from store and gives answer
    """
    retriever = multiquery_retriever
    docs = retriever.get_relevant_documents(user_query)
    context = "\n\n".join([d.page_content for d in docs])
    question = user_query

    prompt = f"Answer the question using the following context:\n\n{context}\n\nQuestion: {question}"
    return chat_model.invoke(prompt).content


In [37]:
answer_question_on_query(vectordb,"What is hibernate")

'Based on the provided context, it appears that Hibernate is an implementation of the Java Persistence API (JPA), which is a standard for accessing, persisting, and managing data between Java objects/classes and a relational database. In other words, Hibernate is a tool that allows developers to interact with databases using Java, without having to write raw SQL queries.\n\nTo identify users, it seems that Hibernate is not directly related, and the context does not provide information on how to identify users. The main focus is on understanding the core features of Spring Framework and Hibernate as an implementation of JPA.'

In [49]:
def summarize_text(input_text:str) -> str:
    """
    Summarize text tool
    Returns Summary of provided Chunks
    """
    prompt = f"Summarize the following content in detail:\n\n{input_text}"
    return chat_model.invoke(prompt).content

In [48]:
summarize_text(extracted_text)

APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama3-8b-8192` in organization `org_01k2p5brc5f7dr0d9tf84e1cak` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 23644, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [50]:
def generate_quiz(full_text: str) -> str:
    """
    Generate a 10-question quiz from text 
    Returns a set of 10 questions with answers
    """
    prompt = (
        "Based on the following content, create a quiz of 10 questions "
        "with answers at the end:\n\n"
        f"{full_text}"
    )
    return chat_model.invoke(prompt).content

In [51]:
generate_quiz("Mapping in Hibernate")

'Here are 10 questions based on "Mapping in Hibernate":\n\n**Quiz: Mapping in Hibernate**\n\n**1. What is the primary purpose of mapping in Hibernate?**\na) To create a database schema\nb) To map Java objects to database tables\nc) To optimize database queries\nd) To handle transactions\n\n**2. What is the default naming strategy in Hibernate for mapping Java classes to database tables?**\na) Lowercase with underscores\nb) PascalCase\nc) CamelCase\nd) Uppercase\n\n**3. Which annotation is used to map a Java class to a database table in Hibernate?**\na) @Entity\nb) @Table\nc) @Mapping\nd) @DBTable\n\n**4. What is the role of the `<hibernate-mapping>` element in a Hibernate configuration file?**\na) To define a Java class\nb) To map a Java class to a database table\nc) To configure Hibernate properties\nd) To define a database connection\n\n**5. How do you map a Java class attribute to a database column in Hibernate?**\na) Using the @Column annotation\nb) Using the @Table annotation\nc) 

In [52]:
generate_quiz(extracted_text)

APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama3-8b-8192` in organization `org_01k2p5brc5f7dr0d9tf84e1cak` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 23655, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}