In [14]:
# Basic Modules
import os
from dotenv import load_dotenv

# Data Ingestion Modules
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader

# Text Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Text Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Vector Database
from langchain_astradb import AstraDBVectorStore

# Model
from langchain_google_genai import  ChatGoogleGenerativeAI


In [2]:
# Connect with the LLM
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")

## Langsmith Tracking And Tracing
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_TRACING_V2"]="true"

os.environ["ASTRA_DB_API_ENDPOINT"]=os.getenv("ASTRA_DB_API_ENDPOINT")
os.environ["ASTRA_DB_APPLICATION_TOKEN"]=os.getenv("ASTRA_DB_APPLICATION_TOKEN")

In [3]:
# Fecth Document
docs_loader = PyPDFLoader('Mastering Machine Learning with Python in Six Steps.pdf')
docs = docs_loader.load()
docs_page_length = len(docs)

In [4]:
# Text Splitter
textsplitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
splitted_docs= textsplitter.split_documents(docs)

In [5]:
# Create and use  google embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [6]:
vector_store = AstraDBVectorStore(  
    collection_name="python_book",
    embedding=embeddings,   
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"]
)

In [7]:
# Add processed documents TO Vector Database
vector_store.add_documents(documents=splitted_docs)


['0105954f628e4f84b3a4b86e8c54c1a4',
 '91a6cb62182a4d3c9d5d303c1e69e375',
 'd96359e2f02246c786b2f721944b01cc',
 '926dcf4037854f56b7adc6799846c4ba',
 '8eef09d000f24def9185217936d4a72c',
 'c33bcaf78f8f4151ba3c7660d23d70b4',
 'd1b8da19945c4a8db4cc028b7c45538d',
 '14035c3948844d4fb8074113a1c515e2',
 '6419a4523fbc41ee8b8f2b701f819b5d',
 '1002453ec50042818813317388b81f59',
 '9c5bae7bbcc14bf48240113003224e02',
 '0e6f7ce1ff30431e8ad50e5d2960414a',
 '86c7da3a3efc4c07b46272fd6dbb09a1',
 'dca0beb7a6c24623ab6c5ae1c099fe03',
 '7bc2b7f1f6104b74af66fb565f82c7cf',
 '1c5c735cfe2b4bf4937bad2fa616fcc4',
 '195188fbef9a41018dd36137d6a82988',
 'd3bab749bac54283ae9f8ccaaa4f8e34',
 'c82af3028ae94c53903ece0015eefd1e',
 '79f8a990c09146fab89c87f48e65cbf4',
 '06aabb93e5d043eab2847cfd9d214ba2',
 'd5fd4bf2201f4c528f62d3fd1f60d1d1',
 '9705d6810d344b53a3c0241191a32231',
 '33deb49843304e508016a66089bc81f2',
 'ddf0c8db8cb44822a298a1d73843c671',
 'ea52c30ef881409f9125c0ff9d3addc7',
 '9d8110ab81794b9f9b3d7c4f90337325',
 

In [15]:
vector_store.similarity_search_with_score("what is Machine Learning ?", k=3)

[(Document(id='507912d596474f309261d5167176a2ae', metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2017-06-02T15:23:23+05:30', 'moddate': '2017-06-02T15:53:30+05:30', 'source': 'Mastering Machine Learning with Python in Six Steps.pdf', 'total_pages': 374, 'page': 70, 'page_label': '53'}, page_content='data that humans would be unlikely to find.\n•\t Machine Learning explores the study and construction of \nalgorithms that can learn from and make predictions on data. \nSuch algorithms operate by building a model from example \ninputs in order to make data driven predictions or decisions, \nrather than following strictly static program instructions.\nAll the above definitions are correct; in short, “Machine Learning is a collection of \nalgorithms and techniques used to create computational systems that learn from data in \norder to make predictions and inferences. ”\nMachine learning application area is abounding. Let’s look a

In [None]:
# Builing Retreiver Pipeline
def retriever_pipeline(search_type, size):
    retriever = vector_store.as_retriever(
        search_type = search_type,
        search_kwargs={"k" : size} #hyperparamter
    )
    return retriever




In [10]:
retriever = retriever_pipeline(search_type="mmr", size=3)

In [12]:
retriever.invoke("what is Machine Learning ?")

[Document(id='507912d596474f309261d5167176a2ae', metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2017-06-02T15:23:23+05:30', 'moddate': '2017-06-02T15:53:30+05:30', 'source': 'Mastering Machine Learning with Python in Six Steps.pdf', 'total_pages': 374, 'page': 70, 'page_label': '53'}, page_content='data that humans would be unlikely to find.\n•\t Machine Learning explores the study and construction of \nalgorithms that can learn from and make predictions on data. \nSuch algorithms operate by building a model from example \ninputs in order to make data driven predictions or decisions, \nrather than following strictly static program instructions.\nAll the above definitions are correct; in short, “Machine Learning is a collection of \nalgorithms and techniques used to create computational systems that learn from data in \norder to make predictions and inferences. ”\nMachine learning application area is abounding. Let’s look at

In [None]:
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [19]:
from langchain.schema.retriever import BaseRetriever
from langchain.schema import Document
from typing import List, Tuple, Any

class ScoredSearchRetriever(BaseRetriever):
    def __init__(self, vectorstore, search_type: str = "similarity", search_kwargs: dict = None):
        self.vectorstore = vectorstore
        self.search_type = search_type
        self.search_kwargs = search_kwargs or {"k": 4}

    def get_relevant_documents(self, query: str) -> List[Document]:
        # Standard interface: return only docs
        return [
            doc for doc, _ in self.vectorstore.similarity_search_with_score(
                query, **self.search_kwargs
            )
        ]

    def get_documents_with_scores(self, query: str) -> List[Tuple[Document, float]]:
        # Internal logic based on search_type
        if self.search_type == "similarity":
            return self.vectorstore.similarity_search_with_score(query, **self.search_kwargs)

        elif self.search_type == "mmr":
            return [
                (doc, None) for doc in self.vectorstore.max_marginal_relevance_search(
                    query, **self.search_kwargs
                )
            ]

        elif self.search_type == "similarity_score_threshold":
            return self.vectorstore.similarity_search_with_score(query, **self.search_kwargs)

        else:
            raise ValueError(f"Unsupported search_type: {self.search_type}")

  class ScoredSearchRetriever(BaseRetriever):


In [20]:
retriever = ScoredSearchRetriever(
    vectorstore=vector_store,
    search_type="similarity",  # or "mmr", "similarity_score_threshold"
    search_kwargs={"k": 5}
)

ValueError: "ScoredSearchRetriever" object has no field "vectorstore"

In [None]:
# Get documents only
docs = retriever.get_relevant_documents("What is LangChain used for?")

# Or get documents with scores
docs_with_scores = retriever.get_documents_with_scores("What is LangChain used for?")
for doc, score in docs_with_scores:
    print(f"Doc: {doc.page_content}")
    print(f"Score: {score}\n")