In [2]:
#pip install langchain-community pypdf

In [3]:
from langchain_core.documents import Document

documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [4]:
print(documents)

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'), Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')]


In [5]:
for doc in documents:
    print(doc.page_content)


Dogs are great companions, known for their loyalty and friendliness.
Cats are independent pets that often enjoy their own space.


In [6]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "Chapter 01.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))

  from .autonotebook import tqdm as notebook_tqdm


19


In [7]:
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

IN2311:Operating systems
Roshani Wijesuriya

{'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 0, 'page_label': '1'}


In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

20

In [None]:
#pip install -qU langchain-google-genai
#%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [18]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get API key from environment
google_api_key = os.getenv("GOOGLE_API_KEY")

if not google_api_key:
    raise ValueError("Google API key not found. Please set it in the .env file.")

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [19]:
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 3072

[0.01898874156177044, -0.008987678214907646, 0.009013982489705086, -0.03462599590420723, -0.01431077066808939, 0.009711828082799911, -0.006494375877082348, -0.011864026077091694, -0.008294306695461273, 0.010139936581254005]


In [12]:
# pip install -qU langchain-core

In [13]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [20]:
ids = vector_store.add_documents(documents=all_splits)

In [24]:
results = vector_store.similarity_search(
    "What is an Operating System?"
)

print(results[0])

page_content='What is an Operating System?
• A program that acts as an intermediary between a user of a computer and the computer 
hardware.
• Operating system goals:
➢Execute user programs and make solving user problems easier
➢Make the computer system convenient to use
➢Use the computer hardware in an efficient manner' metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 6, 'page_label': '7', 'start_index': 0}


In [30]:
results = await vector_store.asimilarity_search("What are the types of Operating Systems?")

print(results[0])

page_content='Types of Operating Systems
1. Batch Operating System' metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 12, 'page_label': '13', 'start_index': 0}


In [None]:
# Note that providers implement different scores; the score here
# is a distance metric that varies inversely with similarity.

results = vector_store.similarity_search_with_score("What is an Operating System?")
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Score: 0.939678698155014

page_content='Types of Operating Systems
1. Batch Operating System' metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 12, 'page_label': '13', 'start_index': 0}


In [28]:
embedding = embeddings.embed_query("What are the types of Operating Systems?")

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

page_content='Types of Operating Systems
1. Batch Operating System' metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 12, 'page_label': '13', 'start_index': 0}


In [31]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    return vector_store.similarity_search(query, k=1)


retriever.batch(
    [
        "What is an Operating System?",
        "What is the History of the Operating System",
    ],
)

[[Document(id='46a4bb54-e2ca-4a51-8139-e30a24feb1af', metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 6, 'page_label': '7', 'start_index': 0}, page_content='What is an Operating System?\n• A program that acts as an intermediary between a user of a computer and the computer \nhardware.\n• Operating system goals:\n➢Execute user programs and make solving user problems easier\n➢Make the computer system convenient to use\n➢Use the computer hardware in an efficient manner')],
 [Document(id='8f990782-c70d-4cd2-90c1-69163a13578a', metadata={'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Cha

In [36]:
for i, docs in enumerate(results):
    print(f"\n🔹 Query {i+1}:")
    for item in docs:
        # Case 1: item is a tuple (Document, score)
        if isinstance(item, tuple):
            doc, score = item
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
        
        # Case 2: item is a Document
        elif hasattr(item, "page_content"):
            text = item.page_content
            score = None
        
        # Case 3: item is a plain string
        else:
            text = str(item)
            score = None

        print("Content:", text)
        if score is not None:
            print("Similarity Score:", score)




🔹 Query 1:
Content: id
Similarity Score: 46391cb9-d4ab-47dd-8689-99f010379321
Content: metadata
Similarity Score: {'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 12, 'page_label': '13', 'start_index': 0}
Content: page_content
Similarity Score: Types of Operating Systems
1. Batch Operating System
Content: type
Similarity Score: Document

🔹 Query 2:
Content: id
Similarity Score: 3916b5ec-3a03-4a04-b64d-5dfada163b72
Content: metadata
Similarity Score: {'producer': 'Microsoft® PowerPoint® LTSC', 'creator': 'Microsoft® PowerPoint® LTSC', 'creationdate': '2024-10-12T10:35:42+05:30', 'title': '', 'author': 'Roshani Wijesuriya', 'moddate': '2024-10-12T10:35:42+05:30', 'source': 'Chapter 01.pdf', 'total_pages': 19, 'page': 8, 'page_label': '9', 'start_index': 0}
Content: pag