In [2]:
import os
# Read the key stored in the LANGCHAIN_API environment variable; indexing os.environ
# raises KeyError right here if the variable is missing.
# NOTE(review): `api_key` is not referenced by any later cell visible in this notebook — confirm it is still needed.
api_key = os.environ["LANGCHAIN_API"] 

In [3]:
# Read the Gemini key from the GEMINI_API environment variable (KeyError if it is unset).
# NOTE(review): the embeddings cell reads the same variable again into `gemini_key`;
# `gemini_api` itself is not used by the visible cells.
gemini_api = os.environ["GEMINI_API"]

Part 1 - OVERVIEW

In [4]:
import bs4
import getpass
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
from langchain_chroma import Chroma

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
# Documents: load the source PDF that the rest of the notebook indexes and queries.

# The PDF is addressed by URL rather than by a local file path.
pdf_url = "https://education.github.com/git-cheat-sheet-education.pdf" 
print(f"Loading document from URL: {pdf_url}...")

# Imported here, next to its only use in this cell.
from langchain_community.document_loaders import PyPDFLoader
# The URL is handed to the loader as its file_path argument.
loader = PyPDFLoader(file_path= pdf_url)
# `docs` holds the loaded documents; the splitting cell consumes it.
docs = loader.load()
print("succesfully loaded")

Loading document from URL: https://education.github.com/git-cheat-sheet-education.pdf...


Ignoring wrong pointing object 11 0 (offset 0)


succesfully loaded


In [6]:
# Splitter configured with chunk_size=1000 and chunk_overlap=200, so consecutive chunks
# share some text and context is not cut off at chunk borders.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

In [7]:
# Apply the splitter to the loaded PDF documents; `splits` is what gets embedded and indexed.
splits = text_splitter.split_documents(docs)

In [8]:
# Print every chunk (metadata and page_content) for inspection.
# NOTE(review): this dumps the whole list; a slice such as splits[:1] would keep the output short.
print(splits)

[Document(metadata={'producer': 'Mac OS X 10.9.1 Quartz PDFContext', 'creator': 'Adobe Illustrator CC (Macintosh)', 'creationdate': "D:20140224195805Z00'00'", 'title': 'git-cheat-sheet-education', 'moddate': "D:20140224195805Z00'00'", 'source': 'https://education.github.com/git-cheat-sheet-education.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='GIT CHEAT SHEET\nSTAGE & SNAPSHOT\nWorking with snapshots and the Git staging area\ngit status\nshow modiﬁed ﬁles in working directory, staged for your next commit\ngit add [file]\nadd a ﬁle as it looks now to your next commit (stage)\ngit reset [file]\nunstage a ﬁle while retaining the changes in working directory\ngit diff\ndiﬀ of what is changed but not staged\ngit diff --staged\ndiﬀ of what is staged but not yet committed\ngit commit -m “[descriptive message]”\ncommit your staged content as a new commit snapshot\nSETUP\nConﬁguring user information used across all local repositories\ngit config --global user.name “[firs

In [9]:
import os

# Fail fast with a clear message when the Gemini key is missing *or empty*: an empty
# string would otherwise slip through and only fail later inside an API call.
gemini_key = os.environ.get("GEMINI_API")
if not gemini_key:
    raise EnvironmentError(
        "The environment variable 'GEMINI_API' is not set. Please set it to your API key."
    )

# Mirror the validated key into GOOGLE_API_KEY so components created without an
# explicit key (the `llm` cell passes none) can resolve it from the environment.
# Reusing `gemini_key` avoids re-reading the variable with a silent "" fallback.
os.environ["GOOGLE_API_KEY"] = gemini_key

# Embedding model used both for indexing the chunks and for embedding queries.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    api_key=gemini_key,
    credentials=None,
)

# Embed every chunk in `splits` and store the vectors in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)

In [10]:
# Expose the vector store as a retriever; no arguments are passed, so its default search settings apply.
retriever = vectorstore.as_retriever()

In [11]:
from langsmith import Client
# Client() is created without explicit credentials, so anything it needs must come from the environment.
# NOTE(review): the first cell reads a variable named LANGCHAIN_API, which differs from the
# names mentioned on the next line — confirm which variables this langsmith version reads.
client = Client()  # presumably reads LANGCHAIN_API_KEY/LANGCHAIN_USERNAME from env if needed — TODO confirm
# Fetch the shared "rlm/rag-prompt" prompt; the printed repr shows it takes `context` and `question`.
prompt = client.pull_prompt("rlm/rag-prompt")
print(prompt)


input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [12]:
# Gemini model used for answer generation.
# No API key is passed here — presumably it is resolved from GOOGLE_API_KEY, which the
# embeddings cell sets; confirm.
llm = GoogleGenerativeAI(
    model="gemini-2.5-flash-lite"
)


In [13]:
def format_docs(docs):
    """Concatenate the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        str: Every ``page_content`` in order, separated by one blank line
        (``"\\n\\n"``); an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [14]:
# RAG pipeline, composed left to right with `|`:
#   1. the input is routed twice: through the retriever and format_docs to become
#      "context", and unchanged (RunnablePassthrough) to become "question";
#   2. both values fill the prompt;
#   3. the prompt goes to the LLM;
#   4. StrOutputParser turns the model output into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [15]:
# Run one question end to end through the chain; the returned string is the cell's output.
rag_chain.invoke("what is git - why is it used - what docs  used")

"Git is a free and open-source distributed version control system used for managing changes locally on your computer. It's essential for tracking revisions and synchronizing repositories, especially with platforms like GitHub. The provided context is a Git cheat sheet featuring commonly used commands."

Part - 2 - INDEXING

In [16]:
# Two short sample texts that are embedded and compared in the cells below.
question = "what kind of git commands are available here ?"
document = "git is a nice framework for version control"

In [24]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in a text string.

    The count is delegated to the notebook's ``llm`` object through its
    ``get_num_tokens`` method.

    Args:
        string: Text whose tokens should be counted.
        encoding_name: Kept so existing callers keep working; it is not used,
            because the count comes from the model rather than a named encoding.

    Returns:
        int: Token count reported by ``llm.get_num_tokens``.
    """
    # An embedding vector (what embeddings.embed_query returns) is a list of floats and
    # cannot tokenize text, so the count has to come from the language model instead.
    num_tokens = llm.get_num_tokens(string)
    return num_tokens

In [21]:
# Embed the sample question and the sample document with the same embedding model
# so the two vectors can be compared.
query_res = embeddings.embed_query(question)
docu_res = embeddings.embed_query(document)
# Number of components in the returned embedding vector.
len(query_res)

3072

In [19]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (sequence or array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned in that case
        instead of dividing by zero.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    denominator = norm_vec1 * norm_vec2
    # Guard the division: an all-zero vector would otherwise yield NaN plus a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

In [22]:
# Compare the question embedding with the document embedding and show the score.
similarity = cosine_similarity(query_res, docu_res)
print(f"cosine similarity {similarity}")

cosine similarity 0.8772521279888006


Cosine similarity measures the angle between two embeddings: 1 means they point in the same direction (effectively identical meaning), 0 means they are orthogonal (no similarity), and -1 means they point in opposite directions.

indexing

Indexing uses the same code as in Part 1: load the document, split it into chunks, embed each chunk, and add the embeddings to the vector database.

###  Part - 3 - Retrieval

Reuse the vector store built earlier and call its `as_retriever()` method to obtain a retriever.

In [27]:
# Rebind `retriever` with k=1 passed through search_kwargs.
# Chains built before this cell keep the retriever object they were constructed with.
retriever = vectorstore.as_retriever(search_kwargs = {"k":1})

Passing `search_kwargs={"k": 1}` makes the retriever return only the single most similar chunk; setting `k` to 5 would return the five most similar chunks.

In [32]:
# Query the retriever directly to see what it returns for a question.
# NOTE: this rebinds `docs`, which previously held the documents loaded from the PDF.
docs = retriever.invoke("what is git ?")
print(docs)

[Document(id='586c1b69-d929-4a5a-a1f2-98c5ba85ae6b', metadata={'producer': 'Mac OS X 10.9.1 Quartz PDFContext', 'moddate': "D:20140224195805Z00'00'", 'creationdate': "D:20140224195805Z00'00'", 'total_pages': 2, 'creator': 'Adobe Illustrator CC (Macintosh)', 'page': 0, 'title': 'git-cheat-sheet-education', 'page_label': '1', 'source': 'https://education.github.com/git-cheat-sheet-education.pdf'}, page_content='GIT CHEAT SHEET\nSTAGE & SNAPSHOT\nWorking with snapshots and the Git staging area\ngit status\nshow modiﬁed ﬁles in working directory, staged for your next commit\ngit add [file]\nadd a ﬁle as it looks now to your next commit (stage)\ngit reset [file]\nunstage a ﬁle while retaining the changes in working directory\ngit diff\ndiﬀ of what is changed but not staged\ngit diff --staged\ndiﬀ of what is staged but not yet committed\ngit commit -m “[descriptive message]”\ncommit your staged content as a new commit snapshot\nSETUP\nConﬁguring user information used across all local reposit

### Part - 4 Generation

In [37]:
from langchain_core.prompts import ChatPromptTemplate

In [39]:
# Hand-written prompt text with two placeholders, {context} and {question}.
# The indentation inside the triple-quoted string is part of the prompt text.
template = """answer this question based on the following context :
            {context}
            
            Question : {question}"""

In [40]:
# Build a chat prompt from the template string.
# NOTE: this rebinds `prompt`, which until now held the prompt pulled with client.pull_prompt.
prompt = ChatPromptTemplate.from_template(template)
# Bare expression: display the prompt object as the cell output.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='answer this question based on the following context :\n            {context}\n            \n            Question : {question}'), additional_kwargs={})])

In [41]:
# Display the LLM object created earlier to check its configuration.
llm 

GoogleGenerativeAI(model='models/gemini-2.5-flash-lite', google_api_key=SecretStr('**********'), client=ChatGoogleGenerativeAI(model='models/gemini-2.5-flash-lite', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x00000276301D2A10>, default_metadata=(), model_kwargs={}))

In [43]:
# Minimal generation chain: the filled-in prompt is piped straight into the LLM (no retriever, no output parser).
chain = prompt |llm

In [48]:
# Render the retrieved Documents to plain text with format_docs (as the RAG chain does),
# so the prompt receives only the page content instead of the repr of Document objects
# with their ids and metadata.
chain.invoke({"context": format_docs(docs), "question": "how to stage git?"})

'To stage a file in Git, you use the command: `git add [file]`. This command adds the file as it looks at that moment to your next commit, also known as staging.'

In [49]:
from langsmith import Client

# Same client construction and prompt pull as in the overview section.
client = Client()

# Rebinds `prompt`, replacing the hand-written ChatPromptTemplate with the "rlm/rag-prompt" prompt.
prompt = client.pull_prompt("rlm/rag-prompt")

In [50]:
# Display the pulled prompt to confirm which template is now bound to `prompt`.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [52]:
# Rebuild the RAG chain so that it uses the current bindings of `retriever`
# (re-created with k=1 in the retrieval section) and `prompt` (pulled again just above).
# Same shape as the first chain: retrieve and format the context, pass the question
# through, fill the prompt, call the LLM, parse the output to a string.
rag_chain = (
    {"context" : retriever | format_docs, "question" : RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [53]:
# Ask a question through the rebuilt chain; the returned string is the cell's output.
rag_chain.invoke("what is github - how is it different from git")

"Git is a version control system that tracks changes locally on your computer. GitHub is a platform that provides a graphical user interface for day-to-day interaction, review, and repository synchronization, building upon Git's functionality. GitHub also offers installers for Windows and Mac to help users stay up-to-date with the command-line tool."