# Document loader

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
### Read the list of PDFs in the dir

from langchain_community.document_loaders import PyMuPDFLoader
import os

# Recursively collect every PDF under the lesson-plan folder.
# - The extension check is case-insensitive so files named "*.PDF" are not
#   silently skipped.
# - The result is sorted because os.walk yields entries in filesystem order,
#   and later cells index into docs/chunks by position; a stable order keeps
#   those outputs reproducible.
pdfs = sorted(
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk("Lesson plan")
    for filename in filenames
    if filename.lower().endswith(".pdf")
)

In [3]:
pdfs

['Lesson plan\\Basic pronouns in Vietnam.pdf',
 'Lesson plan\\Basic pronouns in Vietnam_1.pdf']

In [4]:
# Load each PDF collected in `pdfs` and flatten the per-file results into a
# single list, preserving the order of `pdfs`.
docs = [
    loaded_doc
    for pdf_path in pdfs
    for loaded_doc in PyMuPDFLoader(pdf_path).load()
]

In [5]:
len(docs)

3

# Document chunking

In [6]:
from langchain_text_splitters import  RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping windows.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function), not tokens -- confirm if a token budget matters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(docs)

In [7]:
import tiktoken

# Token counts, using the gpt-4o-mini tokenizer, for the first two chunks and
# for the second loaded document. The tuple is the cell's displayed output.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
sample_texts = (
    chunks[0].page_content,
    chunks[1].page_content,
    docs[1].page_content,
)
tuple(len(encoding.encode(text)) for text in sample_texts)

(256, 77, 103)

# Document vector embedding

In [8]:
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings

import faiss
from langchain_community.vectorstores import FAISS 
from langchain_community.docstore.in_memory import InMemoryDocstore

In [9]:
# Embedding backend used by every vector store in this notebook.
# The API key comes from the environment (populated by load_dotenv() above).
# Local alternative:
# embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url='http://localhost:11434')
embeddings = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

In [10]:
vector = embeddings.embed_query("Hello World")

In [11]:
# Create an empty flat L2 index whose dimensionality equals the length of the
# probe embedding, then display (number of stored vectors, dimension).
# (A bare `len(vector)` statement used to precede this; only the last
# expression of a cell is displayed, so it had no effect and was dropped.)
index = faiss.IndexFlatL2(len(vector))
index.ntotal, index.d

(0, 1536)

In [12]:
# Assemble the vector store by hand from its parts: the embedding model, the
# (still empty) FAISS index, a fresh in-memory docstore, and an empty mapping
# from index positions to docstore ids.
empty_docstore = InMemoryDocstore()
position_to_doc_id = {}

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=empty_docstore,
    index_to_docstore_id=position_to_doc_id,
)

In [13]:
ids = vector_store.add_documents(documents=chunks)

In [14]:
len(ids), vector_store.index.ntotal

(4, 4)

In [15]:
### Retrieval
# Fetch the single nearest chunk for a question.
# NOTE(review): the question is unrelated to the lesson PDFs, so whatever comes
# back is only the "least distant" chunk, not a relevant one.
# NOTE(review): this rebinds `docs` (previously the loaded PDF documents) to
# the search result -- re-running the chunking cell afterwards would split the
# search result instead of the PDFs.
question = "How to gain muscle mass"
docs = vector_store.similarity_search(question, k=1)

In [16]:
docs

[Document(id='a3b39cc4-fcb3-4bbe-abf3-bb81fce73ca4', metadata={'producer': 'Skia/PDF m142 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': 'Lesson plan\\Basic pronouns in Vietnam.pdf', 'file_path': 'Lesson plan\\Basic pronouns in Vietnam.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': 'Basic pronouns in Vietnam', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 1}, page_content='[Test knowledge] \nTell me in the sentence below, are you younger or older than the person you are talking \nto? \n \n1.\u200b Anh chào em. Correct answer is older \n2.\u200b Em dẫn anh đi ăn tối. Correct answer is younger \n[Advanced] \n3.\u200b Hôm nay lau nhà, em lau cửa sổ còn anh lau bàn ghế. Correct answer is younger \n4.\u200b Năm mới con chúc ông mạnh khỏe. Correct answer is younger \n[End of Advanced]')]

In [17]:
db_name = "vietnamese lesson plan"

vector_store.save_local(db_name)

# Store data in a vector store

In [18]:
# Import from the maintained packages; the `langchain.embeddings`,
# `langchain.vectorstores` and `langchain.schema` paths are deprecated shims
# and would shadow the classes imported earlier in the notebook.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader

# Read the lesson text straight from the PDF and join everything the loader
# returns into one string.
pdf = 'Lesson plan/Basic pronouns in Vietnam_1.pdf'
loader = PyMuPDFLoader(pdf)
temp = loader.load()
pdf_text = "\n".join([doc.page_content for doc in temp])

# The whole lesson is indexed as a single document; the `lesson_type` metadata
# is the key the query cells filter on.
doc = Document(page_content=pdf_text, metadata={"lesson_type": "basic_pronouns_in_vietnam"})

# `embeddings` is the embedding model instantiated in the
# "Document vector embedding" section.
vector_db = FAISS.from_documents([doc], embeddings)
vector_db.save_local("faiss_pronoun_store")


In [19]:
# Import from the maintained packages instead of the deprecated
# `langchain.embeddings` / `langchain.vectorstores` shims (the old path is what
# triggers the deprecation warning when OpenAIEmbeddings is instantiated).
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url='http://localhost:11434')
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ.get("OPENAI_API_KEY"))

# SECURITY: load_local unpickles the stored docstore, hence the explicit
# opt-in flag. Only load stores this notebook wrote itself -- never a
# "faiss_pronoun_store" folder obtained from an untrusted source.
vector_db = FAISS.load_local("faiss_pronoun_store", embeddings, allow_dangerous_deserialization=True)

# Example query for pronouns in Vietnamese, restricted to the lesson's metadata tag.
query = "Vietnamese basic pronouns"
results = vector_db.similarity_search_with_score(query, k=3, filter={"lesson_type": "basic_pronouns_in_vietnam"})

for doc, score in results:
    print("Score:", score)
    print("Content snippet:", doc.page_content)  # prints the full page_content (no truncation)
    print("Metadata:", doc.metadata)


  embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ.get("OPENAI_API_KEY"))


Score: 0.21859744
Content snippet: The Vietnamese sentence is organized as you - verb - other person. 
In Vietnamese, addressing another person uses different basic pronouns depending on 
age, gender, and relationship. Below are basic pronouns used to address people: 
●​ Em - denotes someone who is younger 
●​ Anh - denotes an older male/older male  
●​ Chị - denotes an older female 
Besides from basic pronouns, these are the advanced pronouns that provides more 
variety 
●​ Chú - denotes an older male but younger than your parents 
●​ Bác - denotes an older male older than your parents 
●​ Ông - denotes an older male as old as grandpa 
●​ Cô - denotes an older female but younger than your parents 
●​ Dì - denotes an older female older than your parents 
●​ Bà - denotes an older female old as your grandparents 
To address yourself, use these pronouns 
●​ Anh - you are the older person in the relationship 
●​ Em - you are the younger person in the relationship 
●​ Tôi - yourself, ignori

In [20]:
# 1) Direct scored lookup of the single best match, restricted by metadata.
query = "Vietnamese basic pronouns basic"
lesson_filter = {"lesson_type": "basic_pronouns_in_vietnam"}
results = vector_db.similarity_search_with_score(query, k=1, filter=lesson_filter)

for doc, score in results:
    print("Score:", score)
    print("Content snippet:", doc.page_content)  # full page_content, not truncated
    print("Metadata:", doc.metadata)

# 2) Retriever used by the RAG chain further down.
# Unlike the lookup above, no metadata filter is passed in search_kwargs.
retriever = vector_db.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 3, 'fetch_k': 20, 'lambda_mult': 1},
)

Score: 0.22699887
Content snippet: The Vietnamese sentence is organized as you - verb - other person. 
In Vietnamese, addressing another person uses different basic pronouns depending on 
age, gender, and relationship. Below are basic pronouns used to address people: 
●​ Em - denotes someone who is younger 
●​ Anh - denotes an older male/older male  
●​ Chị - denotes an older female 
Besides from basic pronouns, these are the advanced pronouns that provides more 
variety 
●​ Chú - denotes an older male but younger than your parents 
●​ Bác - denotes an older male older than your parents 
●​ Ông - denotes an older male as old as grandpa 
●​ Cô - denotes an older female but younger than your parents 
●​ Dì - denotes an older female older than your parents 
●​ Bà - denotes an older female old as your grandparents 
To address yourself, use these pronouns 
●​ Anh - you are the older person in the relationship 
●​ Em - you are the younger person in the relationship 
●​ Tôi - yourself, ignori

In [21]:
results = retriever.invoke("Basic pronuns")
results[0].page_content

'The Vietnamese sentence is organized as you - verb - other person. \nIn Vietnamese, addressing another person uses different basic pronouns depending on \nage, gender, and relationship. Below are basic pronouns used to address people: \n●\u200b Em - denotes someone who is younger \n●\u200b Anh - denotes an older male/older male  \n●\u200b Chị - denotes an older female \nBesides from basic pronouns, these are the advanced pronouns that provides more \nvariety \n●\u200b Chú - denotes an older male but younger than your parents \n●\u200b Bác - denotes an older male older than your parents \n●\u200b Ông - denotes an older male as old as grandpa \n●\u200b Cô - denotes an older female but younger than your parents \n●\u200b Dì - denotes an older female older than your parents \n●\u200b Bà - denotes an older female old as your grandparents \nTo address yourself, use these pronouns \n●\u200b Anh - you are the older person in the relationship \n●\u200b Em - you are the younger person in the re

# RAG with OpenAI (Ollama alternative commented out)

In [22]:
from langchain_ollama import ChatOllama 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.prompts import ChatPromptTemplate

from langchain import hub

In [23]:
from langchain_ollama import ChatOllama 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.prompts import ChatPromptTemplate

from langchain import hub

In [24]:
prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [25]:
# Use the maintained `langchain_core.prompts` module (already used elsewhere in
# this notebook) rather than the legacy `langchain.prompts` re-export.
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

# Local variant of the hub prompt "rlm/rag-prompt" with the sentence
# "Use three sentences maximum and keep the answer concise." removed.
template_str = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

# Inner string template; the input variables ({context}, {question}) are
# inferred from the placeholders.
prompt_template = PromptTemplate.from_template(template_str)

# Wrap it as a single human message.
human_message_prompt = HumanMessagePromptTemplate(prompt=prompt_template)

# Final chat prompt. No hub owner/repo/commit metadata is attached: this
# template is a local modification and no longer matches that hub commit.
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])


In [26]:
from langchain_openai import OpenAI

# SECURITY: the API key must never be written into the notebook. It is read
# from the environment, which load_dotenv() in the first cell populates from
# .env. Any key that was ever committed in this cell must be treated as leaked:
# revoke it in the OpenAI dashboard and issue a new one.
# Indexing os.environ (rather than .get) fails immediately with a KeyError if
# the variable is missing.
llm = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# llm = ChatOllama(model='llama3.2:3b', base_url='http://localhost:11434')

# Smoke test that the credentials work.
# NOTE(review): `OpenAI` is presumably the text-completion wrapper rather than
# a chat model, which would explain a rambling continuation for a bare 'hi' --
# confirm whether a chat model is intended here.
llm.invoke('hi')

'\n\nI think it depends on the person and their personal beliefs and values. Some people may see it as an opportunity to learn about a different culture and broaden their perspectives, while others may see it as a chance to experience new and exciting things. Some may also see it as a way to escape their everyday lives and have an adventure. Ultimately, it is up to the individual to decide what they make of their trip and how they choose to perceive it.'

In [27]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    separator = '\n\n'
    texts = (item.page_content for item in docs)
    return separator.join(texts)


In [29]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001995F4665D0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 20, 'lambda_mult': 1})

In [30]:
# RAG pipeline:
#   1. the incoming question is sent to the retriever and the hits are merged
#      into one context string by format_docs, while RunnablePassthrough
#      forwards the question itself unchanged;
#   2. both values fill the prompt;
#   3. the LLM answers and StrOutputParser turns the result into a plain string.
# The chain uses `chat_prompt` (the local template without the three-sentence
# limit). It previously used the hub `prompt`, which left the custom template
# defined but never used.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | chat_prompt
    | llm
    | StrOutputParser()
)

In [31]:
question = "Give me only basic pronouns in Vietnamese , explain what are their purpose. Doesn't provide advanced pronouns"
response = rag_chain.invoke(question)

print(response)

 The basic pronouns used in Vietnamese for addressing people are "em" for younger individuals, "anh" for older males, and "chị" for older females. Their purpose is to denote age, gender, and relationship when addressing another person. These pronouns can also be used to address oneself, with "anh" representing the older person in the relationship, "em" representing the younger person, and "tôi" representing oneself without considering age difference.


# Use chunking instead

In [32]:
# Hand-written lesson sections as (lesson_type, text) pairs. Each text keeps
# its leading and trailing newline.
_LESSON_SECTIONS = [
    ("basic_pronouns", """
In Vietnamese, addressing another person uses different basic pronouns depending on age, gender, and relationship.
Below are basic pronouns used to address people:
- Em: denotes someone who is younger
- Anh: denotes an older male
- Chị: denotes an older female
"""),
    ("advanced_pronouns", """
Besides from basic pronouns, these are advanced pronouns that provide more variety:
- Chú: denotes an older male but younger than your parents
- Bác: denotes an older male older than your parents
- Ông: denotes an older male as old as grandpa
- Cô: denotes an older female but younger than your parents
- Dì: denotes an older female older than your parents
- Bà: denotes an older female as old as your grandparents
"""),
    ("self_pronouns", """
To address yourself, use these pronouns:
- Anh: you are the older person in the relationship
- Em: you are the younger person in the relationship
- Tôi: yourself, ignoring age difference
"""),
]

# One dict per section, in the shape the next cell converts to Documents.
# NOTE(review): this rebinds `chunks`, which earlier held the splitter output.
chunks = [
    {"content": section_text, "metadata": {"lesson_type": section_type}}
    for section_type, section_text in _LESSON_SECTIONS
]


In [34]:
# Import from the maintained packages instead of the deprecated
# `langchain.embeddings` / `langchain.vectorstores` / `langchain.schema` shims.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Wrap each hand-written chunk in a Document, carrying over its metadata.
docs = [Document(page_content=chunk["content"], metadata=chunk["metadata"]) for chunk in chunks]

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ.get("OPENAI_API_KEY"))
vector_db = FAISS.from_documents(docs, embeddings)
# NOTE: this writes to the same "faiss_pronoun_store" folder as the
# single-document store saved earlier in the notebook, replacing it on disk.
vector_db.save_local("faiss_pronoun_store")


In [36]:
# Import from the maintained packages instead of the deprecated
# `langchain.embeddings` / `langchain.vectorstores` shims.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ.get("OPENAI_API_KEY"))

# SECURITY: load_local unpickles the stored docstore, hence the explicit
# opt-in flag. Only load a store this notebook wrote itself.
vector_db = FAISS.load_local("faiss_pronoun_store", embeddings, allow_dangerous_deserialization=True)

# Example: search for basic pronouns, restricted to the "basic_pronouns" chunk
# via its metadata tag.
query = "What are some basic pronouns in Vietnamese?"
results = vector_db.similarity_search_with_score(query, k=1, filter={"lesson_type": "basic_pronouns"})

for doc, score in results:
    print("Score:", score)
    print("Relevant excerpt:", doc.page_content)


Score: 0.21002021
Relevant excerpt: 
In Vietnamese, addressing another person uses different basic pronouns depending on age, gender, and relationship.
Below are basic pronouns used to address people:
- Em: denotes someone who is younger
- Anh: denotes an older male
- Chị: denotes an older female

