In [83]:
import os
from langchain.prompts import PromptTemplate
# from langchain_community.llms import Ollama
from langchain_ollama import OllamaLLM
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Prompt

In [84]:
# The prompt is assembled line by line. Every line after the first carries the
# four-space indent the template has always had, and the first line keeps its
# trailing space, so the rendered template text is unchanged.
prompt_template = "\n    ".join(
    [
        "You are an assistant that provides useful information about Bidhan. Use the following pieces of retrieved context to answer the question. ",
        "If the question is about Bidhan's education, summarize his degrees, institutions, and fields of study.",
        "If you don't know the answer, just say that you don't know. Keep the answer concise.",
        "{context}",
        "Question: {question}",
        "Answer:",
    ]
)

# `{context}` and `{question}` are the two placeholders filled in at query time.
PROMPT = PromptTemplate(template=prompt_template)
PROMPT

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant that provides useful information about Bidhan. Use the following pieces of retrieved context to answer the question. \n    If the question is about Bidhan's education, summarize his degrees, institutions, and fields of study.\n    If you don't know the answer, just say that you don't know. Keep the answer concise.\n    {context}\n    Question: {question}\n    Answer:")

# Model

In [85]:
# Generator model: the `llama3.2` model accessed through the Ollama integration.
llm = OllamaLLM(model="llama3.2")
# Echo the object so the configured model name is visible in the cell output.
llm

OllamaLLM(model='llama3.2')

In [86]:
# Smoke test: send a trivial prompt to confirm the model is reachable and replies.
llm.invoke("Hello!")

'How can I assist you today?'

# Document Loaders
The context data was drawn from my resume and from notes on my personal information and beliefs, stored as two text files in the `data` folder.

In [None]:
# Load every source file into a single flat list of LangChain documents.
all_documents = []

files = ['./data/aboutme.txt', './data/cv.txt']
for file in files:
    # The data files contain non-ASCII punctuation (curly apostrophes, en
    # dashes). Pin the encoding so loading does not depend on the platform's
    # default locale encoding.
    loader = TextLoader(file, encoding="utf-8")
    documents = loader.load()
    all_documents.extend(documents)

In [88]:
# Inspect the loaded documents to confirm both source files were read.
all_documents

[Document(metadata={'source': './data/aboutme.txt'}, page_content='My Personal Information:\n\nName: Bidhan Bajracharya\nAge: 23 years (as of 2025)\nBirthdate: September 14, 2001\nGender: Male\nNationality: Nepalese\nHeight: 5\'10"\nSkin tone: Brown\nYear(s) of work experience: 1 year\nCurrent role or job responsibility: Student (no jobs)\nMajor or field I pursued during my education: Data science\nHighest level of education: Masters\n\nEducation Background:\n- Master’s Degree: Data Science and Artificial Intelligence (DSAI)\n    - Institution: Asian Institute of Technology (AIT)\n\n- Bachelor’s Degree: BSc (Hons) in Computing\n    - Institution: London Metropolitan University\n\nProfessional Experience:\n- Software Engineer (1 year)\n    - Industry: Education\n- Currently not employed\n\nTechnical Skills:\n- Programming Languages: Python, Java, JavaScript/TypeScript, C#, SQL\n- Frameworks & Libraries: React, Node.js, Pandas, NumPy, Matplotlib, Scikit-Learn\n\nBeliefs & Philosophy:\n- 

## Document Transformers

In [None]:
# Split the loaded documents into chunks of at most 800 units with a 200-unit
# overlap between neighbouring chunks, so details that straddle a boundary
# appear in both chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)

# `doc` holds the resulting list of chunks that are later embedded.
doc = text_splitter.split_documents(all_documents)

In [90]:
# Inspect the chunks produced by the splitter.
doc 

[Document(metadata={'source': './data/aboutme.txt'}, page_content='My Personal Information:\n\nName: Bidhan Bajracharya\nAge: 23 years (as of 2025)\nBirthdate: September 14, 2001\nGender: Male\nNationality: Nepalese\nHeight: 5\'10"\nSkin tone: Brown\nYear(s) of work experience: 1 year\nCurrent role or job responsibility: Student (no jobs)\nMajor or field I pursued during my education: Data science\nHighest level of education: Masters'),
 Document(metadata={'source': './data/aboutme.txt'}, page_content='Education Background:\n- Master’s Degree: Data Science and Artificial Intelligence (DSAI)\n    - Institution: Asian Institute of Technology (AIT)\n\n- Bachelor’s Degree: BSc (Hons) in Computing\n    - Institution: London Metropolitan University\n\nProfessional Experience:\n- Software Engineer (1 year)\n    - Industry: Education\n- Currently not employed'),
 Document(metadata={'source': './data/aboutme.txt'}, page_content='Professional Experience:\n- Software Engineer (1 year)\n    - Indu

## Text Embedding Models
Source: https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [91]:
# Sentence-embedding model used both to index the chunks and to embed queries.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

## Vector Stores

In [92]:
# Root folder under which the persisted vector store is kept.
vector_path = './vector-store'

# Create the folder on first run only; the message is printed just once, when
# the folder is actually created.
vector_path_exists = os.path.exists(vector_path)
if not vector_path_exists:
    os.makedirs(vector_path)
    print('create path done')

In [93]:
db_file_name = 'personal_info'
db_folder = os.path.join(vector_path, db_file_name)

# Reuse the persisted index when one exists; otherwise embed the chunks and
# persist the result. `FAISS.save_local` writes `index.faiss` (plus `index.pkl`)
# into the folder, so test for that file rather than for the folder alone: an
# empty or half-written folder would otherwise make `load_local` fail.
#
# NOTE: the cached index is not refreshed automatically. Delete the folder to
# rebuild it after editing the data files or the chunking parameters.
if os.path.isfile(os.path.join(db_folder, 'index.faiss')):
    vectordb = FAISS.load_local(
        folder_path=db_folder,
        embeddings=embedding_model,
        # Loading unpickles the stored docstore. This is acceptable only
        # because the store was written by this notebook.
        allow_dangerous_deserialization=True,
    )
else:
    vectordb = FAISS.from_documents(
        documents=doc,
        embedding=embedding_model,
    )

    vectordb.save_local(folder_path=db_folder)

## Retrievers

In [94]:
# Wrap the vector store as a retriever; `k` is set to 5 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [95]:
# Sanity-check retrieval with a sample query. `invoke` replaces the deprecated
# `get_relevant_documents` and returns the same list of documents.
retriever.invoke("What is your name?")

[Document(id='5dbec3bd-e895-45de-89a5-3fcd9f257314', metadata={'source': './data/aboutme.txt'}, page_content='My Personal Information:\n\nName: Bidhan Bajracharya\nAge: 23 years (as of 2025)\nBirthdate: September 14, 2001\nGender: Male\nNationality: Nepalese\nHeight: 5\'10"\nSkin tone: Brown\nYear(s) of work experience: 1 year\nCurrent role or job responsibility: Student (no jobs)\nMajor or field I pursued during my education: Data science\nHighest level of education: Masters'),
 Document(id='8f5dee91-2543-471d-a6b6-0810f83a8244', metadata={'source': './data/cv.txt'}, page_content='Level 9 Tech | Front-end Developer Intern (Aug 2022 – Dec 2022)\n    - Collaborated with a team of developers to build and maintain web applications.\n    - Utilized Git and GitHub to manage project workflows.\n    - Developed new features and improved existing functionalities to enhance user experience.'),
 Document(id='ffc13378-926e-4c92-8bfb-fe81e0b7efe9', metadata={'source': './data/cv.txt'}, page_conten

# Memory

In [96]:
from langchain.memory import ChatMessageHistory

In [97]:
# Create an empty chat-message history and display it.
# NOTE(review): `history` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
history = ChatMessageHistory()
history

InMemoryChatMessageHistory(messages=[])

In [100]:
from langchain.memory import ConversationBufferMemory

In [None]:
# Conversation buffer keyed on "question" (input) and "answer" (output), exposed
# to prompts under "chat_history".
# NOTE(review): this object is never passed to the RetrievalQA chain built in
# this notebook, and that chain is called with "query" and read via "result",
# which do not match the keys configured here — confirm the intended wiring.
memory = ConversationBufferMemory(memory_key="chat_history", input_key="question", output_key="answer")

# Chain

In [104]:
from langchain.chains import RetrievalQA

In [105]:
# Assemble the retrieval-augmented QA pipeline: the retriever supplies chunks,
# the custom PROMPT formats them together with the question, and the LLM
# produces the answer. Source documents are returned alongside the answer so
# they can be shown to the user.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

In [106]:
# Run one question through the chain. `invoke` replaces the deprecated
# direct-call form `qa_chain({...})` and returns the same output dict.
result = qa_chain.invoke({"query": "What is my birthdate?"})

# The generated text lives under "result"; fall back to an empty string if absent.
answer = result.get('result', '')

In [107]:
# Summarise each retrieved document as a short preview plus its origin file.
source_docs = result.get('source_documents', [])

# A comprehension keeps its loop variable local. The previous `for doc in ...`
# loop overwrote the notebook-level `doc` (the list of chunks fed to FAISS), so
# re-running the vector-store cell afterwards would have used a single document.
sources = [
    {
        # First 150 characters of the chunk, followed by an ellipsis marker.
        "content": source_doc.page_content[:150] + "...",
        "source": source_doc.metadata.get('source', 'Unknown Source'),
    }
    for source_doc in source_docs
]

In [108]:
# Show the preview/origin pairs for the documents retrieved for the query above.
sources

[{'content': 'My Personal Information:\n\nName: Bidhan Bajracharya\nAge: 23 years (as of 2025)\nBirthdate: September 14, 2001\nGender: Male\nNationality: Nepalese\nHeight:...',
  'source': './data/aboutme.txt'},
 {'content': 'NAME: Bidhan Bajracharya\n\nEDUCATION BACKGROUND:\n- Asian Institute Of Technology\n    - Master of Science (M.S) - Data Science and AI | August 2024 - Ma...',
  'source': './data/cv.txt'},
 {'content': '- Uddum (Vue.js, Nuxt.js)\n    - Developed an online airplane ticket booking platform.\n    - Worked with experienced developers to optimize and resolve...',
  'source': './data/cv.txt'},
 {'content': 'Level 9 Tech | Front-end Developer Intern (Aug 2022 – Dec 2022)\n    - Collaborated with a team of developers to build and maintain web applications.\n ...',
  'source': './data/cv.txt'},
 {'content': 'AWARDS AND ACHIEVEMENT:\n- AAA Scholarship Award (2022)\n    - Recognized for exceptional Attitude, Attendance, and Academic performance.\n\nTECHNICAL SKI...',
  'sou

In [109]:
# Show the text generated for the birthdate question.
answer

'Your birthdate is September 14, 2001.'

## Queries

In [110]:
# Evaluation questions put to the chatbot, in the order they are asked.
questions = [
    # Personal details and education
    "How old are you?",
    "What is your highest level of education?",
    "What major or field of study did you pursue during your education?",
    # Work experience
    "How many years of work experience do you have?",
    "What type of work or industry have you been involved in?",
    "Can you describe your current role or job responsibilities?",
    # Beliefs
    "What are your core beliefs regarding the role of technology in shaping society?",
    "How do you think cultural values should influence technological advancements?",
    # Master's studies
    "As a master's student, what is the most challenging aspect of your studies so far?",
    "What specific research interests or academic goals do you hope to achieve during your time as a master's student?",
]

In [111]:
# Ask every evaluation question, print each exchange, and collect the
# question/answer pairs for export.
qa_pairs = []

for idx, question in enumerate(questions, start=1):
    # `invoke` replaces the deprecated direct-call form `qa_chain({...})`.
    response = qa_chain.invoke({"query": question})
    answer_text = response["result"]
    qa_pairs.append({"question": question, "answer": answer_text})
    print(f"{idx}) {question}")
    print(f"Answer: {answer_text}\n")

1) How old are you?
Answer: You are 23 years old (as of 2025).

2) What is your highest level of education?
Answer: Your highest level of education is a Master's degree, specifically in Data Science and Artificial Intelligence (DSAI) from Asian Institute Of Technology.

3) What major or field of study did you pursue during your education?
Answer: Bidhan pursued a BSc (Hons) in Computing from Islington College (Affiliated to London Metropolitan University), and later completed his Master's degree in Data Science and AI from the Asian Institute Of Technology.

4) How many years of work experience do you have?
Answer: You have 1 year of work experience as a Software Engineer in the education industry.

5) What type of work or industry have you been involved in?
Answer: Bidhan has been involved in the following types of work and industries:

1. Education (as a Software Engineer for 1 year)
2. IT and software development (as a Junior Software Engineer at Vurilo Pvt. Ltd., which is likely an

In [112]:
import json

# Persist the collected question/answer pairs as indented UTF-8 JSON.
# `ensure_ascii=False` keeps non-ASCII characters readable in the file.
with open("qa_pairs.json", mode="w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(qa_pairs, ensure_ascii=False, indent=4))

In [113]:
# Display the question/answer pairs that were written to qa_pairs.json.
qa_pairs

[{'question': 'How old are you?',
  'answer': 'You are 23 years old (as of 2025).'},
 {'question': 'What is your highest level of education?',
  'answer': "Your highest level of education is a Master's degree, specifically in Data Science and Artificial Intelligence (DSAI) from Asian Institute Of Technology."},
 {'question': 'What major or field of study did you pursue during your education?',
  'answer': "Bidhan pursued a BSc (Hons) in Computing from Islington College (Affiliated to London Metropolitan University), and later completed his Master's degree in Data Science and AI from the Asian Institute Of Technology."},
 {'question': 'How many years of work experience do you have?',
  'answer': 'You have 1 year of work experience as a Software Engineer in the education industry.'},
 {'question': 'What type of work or industry have you been involved in?',
  'answer': 'Bidhan has been involved in the following types of work and industries:\n\n1. Education (as a Software Engineer for 1 ye

# Analysis and Problem Solving

For this assignment we had to create a chatbot applying RAG (Retrieval-Augmented Generation) techniques in the LangChain framework. The chatbot specializes in answering questions about me.

### List of the retriever and generator models used

####  Retriever model
**Embedding Model** <br>
For embeddings I used `sentence-transformers/all-mpnet-base-v2` from HuggingFace.

**Vector Store**<br>
For vector store I used `FAISS` which stands for Facebook AI Similarity Search, from langchain.

#### Generator model
For the llm model I went ahead and chose `llama3.2` from Ollama

### Analysis of issues related to the models providing unrelated information.

**Embedding Model Limitations**

The embedding model `(sentence-transformers/all-mpnet-base-v2)` converts text into vector representations for similarity searches. However, issues may arise if:
- The embeddings do not accurately capture the semantic meaning of a query, leading to retrieval of irrelevant documents.
- The model struggles with understanding domain-specific terminology or names.
- Short or ambiguous queries produce vague embeddings, retrieving unrelated results.

**For example**: User may ask, "What awards has bidhan achieved?" but it gets information about project achievements.

Possible fixes for this issue would be:
- Use hybrid retrieval (combining FAISS with BM25 or keyword search).

**Vector Store (FAISS) Issues**

FAISS is efficient for storing and retrieving embeddings, but it has its own challenges:

- If the database is small or lacks diversity, FAISS may return the closest match, even if it is not contextually relevant.
- Improper indexing or incorrect similarity metrics can result in documents that are mathematically similar but contextually irrelevant.
- If the retriever fetches too few documents, important context may be missed.

**For example**: A user may ask, "Tell me about his job experiences." but the retrieved documents contain details of projects from his jobs instead.

Possible fixes for this issue would be:
- Break documents into overlapping chunks so relevant details remain intact.
- Instead of large paragraphs, create smaller, context-rich sections.
- Apply reranking methods.

**LLM (LLaMA 3.2) Hallucinations**

The generator model (LLaMA 3.2) may produce unrelated or incorrect information due to:

- Over-reliance on context: If retrieved documents are not strongly relevant, the model may fill gaps with plausible but incorrect content.
- Lack of source verification: Since LLaMA is a general-purpose model, it might blend retrieved information with its own prior knowledge, causing inconsistencies.
- Bias in generation: The model may generate responses that favor more commonly seen patterns, even if they are not the best fit for the query.

**For example**: A user may ask, "Tell me about his hobbies." but the retriever returns information about his technical skills, so the LLM might make up information.

Possible fixes for this issue would be:
- If the retriever finds no relevant documents, the chatbot should ask for clarification instead of hallucinating.
- Reply with "I have no information on this." or something similar by implementing a confidence threshold.
- Finetune the model further.