# LLM for the project (Gemini 2.0 with added RAG)
The workflow here was influenced by a popular article on RAG pipeline best practices (https://masteringllm.medium.com/best-practices-for-rag-pipeline-8c12a8096453).

In [2]:
import os
import re
import pypdf

from dotenv import load_dotenv
from pathlib import Path
from uuid import uuid4

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

load_dotenv(dotenv_path=Path('.env'))

True

In [4]:
# Merge pdfs if not done yet
# Directory that holds the source PDFs.
path = './pdfs'
# Every entry in that directory; the listing is not filtered by extension.
pdfs = os.listdir(path)
# Flipped to True further down when the merged file is found in `pdfs`.
is_pdf_merged = False
# Module-level placeholder. merge_pdf assigns a local of the same name
# (no `global` statement), so this value is not updated by it.
merged_pdf_path = ""

def merge_pdf(path, pdfs):
    """Merge every PDF whose file name contains 'Air' into a single file.

    The merged document is written to ``<path>/Air-Quality-Factors.pdf`` and
    the individual source files are deleted afterwards.

    Args:
        path: Directory that contains the PDFs.
        pdfs: File names (not full paths) found in ``path``.

    Returns:
        The path of the merged PDF.
    """
    merged_name = "Air-Quality-Factors.pdf"
    merged_pdf_path = os.path.join(path, merged_name)

    # Never treat the output file as one of its own inputs, otherwise it
    # would be merged into itself and then deleted.
    pdf_paths = [os.path.join(path, pdf) for pdf in pdfs
                 if re.search('Air', pdf) and pdf != merged_name]

    merged_pdf = pypdf.PdfWriter()
    try:
        for pdf in pdf_paths:
            merged_pdf.append(pdf)
        merged_pdf.write(merged_pdf_path)
    finally:
        merged_pdf.close()

    # Delete the sources only after the merged file has been written, so a
    # failed merge or write cannot lose the originals.
    for pdf in pdf_paths:
        os.remove(pdf)

    return merged_pdf_path


# Merge only when the combined file is absent and the folder is not empty.
if "Air-Quality-Factors.pdf" in pdfs:
    is_pdf_merged = True
elif pdfs:
    merge_pdf(path, pdfs)

### Load and Chunking

In [5]:
# Collects the chunks of PDFs that still have to be embedded
new_documents = []

# Re-read the folder, since the merge step may have removed the unmerged files
pdfs = os.listdir(path)
pdfs_name = list(pdfs)
pdfs_path = [os.path.join(path, file_name) for file_name in pdfs_name]
print(pdfs_name)

# The chat model and the embedding model use the same key from the environment
api_key = os.getenv("GEMINI_API_KEY")
output_parser = StrOutputParser()
llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', api_key=api_key)
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=api_key,
    model='models/embedding-001',
)

# Chroma collection configured with ./chroma_db as its persist directory
vector_store = Chroma(
    persist_directory="./chroma_db",
    collection_name="bombatronic_collection",
    embedding_function=embeddings,
)

def load_pdfs(pdf_file, pdf_name, i):
    """Chunk one PDF and queue its chunks for embedding if it is new.

    The document title is the file name without its trailing ``.pdf``. When
    the vector store returns no metadata for that title, the chunks are
    appended to the module-level ``new_documents`` list; otherwise nothing
    is queued and the PDF is not parsed at all.

    Args:
        pdf_file: Path of the PDF to load.
        pdf_name: File name of the PDF, used to derive the title.
        i: Numeric id stored in every chunk's metadata.
    """
    # Strip only a literal, trailing ".pdf". The earlier pattern '.pdf' had an
    # unescaped dot and no anchor, so it also removed sequences such as
    # "-pdf" or " pdf" from the middle of a name.
    title = re.sub(r'\.pdf$', '', pdf_name)

    # Look the title up first so an already-indexed PDF is never re-parsed.
    existing_metadata = vector_store.get(where={"title": title})
    if existing_metadata.get("metadatas") != []:
        return

    loader = PyPDFLoader(pdf_file)
    pages = loader.load_and_split()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )
    doc_split = text_splitter.split_documents(pages)

    # Replace the loader's metadata with just the id and the title; the title
    # is what later queries filter on.
    for doc in doc_split:
        doc.metadata = {"id": i, "title": title}

    new_documents.extend(doc_split)

# Process every PDF, using its position in the listing as the document id
for i, (pdf, file_name) in enumerate(zip(pdfs_path, pdfs_name)):
    load_pdfs(pdf, file_name, i)

# print(new_documents)

['Air-Quality-Factors.pdf', 'Bombatronic - Dataset.pdf', 'How Fire Incidents Happen.pdf']


  vector_store = Chroma(
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 86 0 (offset 0)


### Embeddings & Vector Store

In [6]:
def store_embeddings(documents):
    """Add ``documents`` to the vector store, each under a fresh random UUID."""
    fresh_ids = [str(uuid4()) for _ in documents]
    vector_store.add_documents(ids=fresh_ids, documents=documents)

# Embed and store only when at least one new chunk was queued
if new_documents:
    store_embeddings(new_documents)

### Query

In [61]:
def is_rag_needed(input_text):
    """Ask the model whether retrieved context would help answer the question.

    Args:
        input_text: The user's question.

    Returns:
        ``"yes"`` when the model's reply starts with "yes" (ignoring case and
        surrounding whitespace), otherwise ``"no"``.
    """
    # Adjacent string literals are concatenated as-is, so every fragment that
    # is followed by another one must end with a space; without it the words
    # at the seams are fused together in the prompt sent to the model.
    prompt = ChatPromptTemplate([
        ("system", "You are an LLM that would specialize in answering "
        "questions related to Air Quality Index and its parameters, whilst also "
        "posessing a strong knowledge on Fire Cases worldwide, how to mitigate them, and "
        "the solution behind it if the user were to have a question surrounding it"),
        ("user", "I posses additional information about Air Quality, Fire Incidents, and "
        "a team called Bombatronic, do you think based on my question here can it help "
        "you to get a better understanding of my question, which is, {input} , also JUST ANSWER "
        "AS IN YES OR NO, do not answer anything besides YES or NO")
    ])
    chain = prompt | llm | output_parser
    response = chain.invoke({"input": input_text})

    # Callers compare the result to exactly "yes", so trailing newlines or
    # punctuation in the reply ("Yes.", "YES\n") must not leak through.
    answer = response.strip().lower()
    return "yes" if answer.startswith("yes") else "no"

# Read the user's question, then ask the model whether retrieval would help
input_text = input()
rag_state = is_rag_needed(input_text=input_text)
print(rag_state)

yes


### Chatbot Invoking

In [73]:
def chatbot_response(input_text, rag_state, input_metadata):
    """Answer ``input_text``, optionally grounding it in retrieved chunks.

    Args:
        input_text: The user's question.
        rag_state: ``"yes"`` to retrieve context from the vector store; any
            other value answers with an empty context.
        input_metadata: Document titles matched for the question. The first
            title, if any, restricts retrieval to that document; an empty
            list searches without a filter.

    Returns:
        The model's answer as a string.
    """
    # Adjacent string literals are concatenated as-is, so every fragment that
    # is followed by another one must end with a space.
    prompt = ChatPromptTemplate([
        ("system", "You are an LLM that would specialize in answering "
        "questions related to Air Quality Index and its parameters, whilst also "
        "posessing a strong knowledge on Fire Cases worldwide, how to mitigate them, and "
        "the solution behind it if the user were to have a question surrounding it. Also you need "
        "to possess an additional information about the team BOMBATRONIC when askes about it thats "
        "given from the context by the user. Keep your answer to the question in a paragaph like "
        "format and make it sweet and intuitive"),
        ("user", "{input}\n\n: {context}")
    ])

    chain = prompt | llm | output_parser

    # Start with an empty context so the prompt can still be rendered when
    # retrieval is skipped; leaving it unset raised UnboundLocalError.
    context = ""
    if rag_state == "yes":
        # An empty title list is valid: search the whole collection instead
        # of indexing into the list and raising IndexError.
        title = input_metadata[0] if input_metadata else None
        if title is not None:
            print(title, type(title))
        retrieved_docs = vector_store.similarity_search(
            query=input_text,
            k=3,
            filter={"title": title} if title is not None else None
        )
        print(retrieved_docs)
        context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    response = chain.invoke({"input": input_text, "context": context})

    return response

# Keyword looked for in the question -> title of the document to search
key_and_metadata = {
    "air": "Air-Quality-Factors",
    "fire": "How Fire Incidents Happen",
    "bombatronic": "Bombatronic - Dataset"
}

# Keep the titles whose keyword occurs anywhere in the lower-cased question
lowered_question = input_text.lower()
input_metadata = [
    title
    for keyword, title in key_and_metadata.items()
    if keyword in lowered_question
]

response = chatbot_response(input_text, rag_state, input_metadata)
print(response)

Air-Quality-Factors <class 'str'>
[Document(metadata={'id': 0, 'title': 'Air-Quality-Factors'}, page_content='Carbon  monoxide  and  carbon  dioxide  significantly  impact  air  quality,  each  through'), Document(metadata={'id': 0, 'title': 'Air-Quality-Factors'}, page_content='Carbon  Monoxide  (CO)  and  Carbon  Dioxide  (CO2)  effects  on  Air  Quality   1.  Definition'), Document(metadata={'id': 0, 'title': 'Air-Quality-Factors'}, page_content='influences\n \nair\n \nquality.\n \nRising\n \nCO\n₂\n \nlevels\n \ncontribute\n \nto\n \nglobal\n \nwarming,')]
Carbon monoxide (CO) significantly impacts air quality due to its toxic nature. It's a colorless, odorless gas produced by the incomplete combustion of fossil fuels and other carbon-containing materials. When inhaled, CO interferes with the blood's ability to carry oxygen, leading to various health problems, especially for individuals with heart disease, pregnant women, and young children. High concentrations of CO in the air can

### Evaluation

In [7]:
# Multiple evaluation tests
import pandas as pd
import numpy as np

# Evaluation set read from ./eval.csv. The code below reads its `question`,
# `answer_keywords` and `metadata` columns.
df_eval = pd.read_csv('./eval.csv')
# Preview of the first rows (rendered as the cell output in a notebook).
df_eval.head()


Unnamed: 0,question,answer_keywords,metadata
0,what carbon monoxide?,"Carbon monoxide, colorless, odorless, incomple...",Air-Quality-Factors
1,what carbon dioxide?,"Carbon dioxide, greenhouse gas, natural proces...",Air-Quality-Factors
2,What factors behind fire incident?,"Electrical malfunctions, faulty wiring, overlo...",How Fire Incidents Happen
3,How mitigate indoor fires public places?,"Regular inspections, maintenance, electrical s...",How Fire Incidents Happen
4,Apa bombatronic?,"Bombatronic, alat pemadam api otomatis, AI, co...",Bombatronic - Dataset


In [8]:
# Per-question Jaccard scores collected by the evaluation loop
similarity = []


def jaccard_similarity(true, pred):
    """Return the Jaccard similarity of two collections of tokens.

    Both inputs are reduced to sets, so duplicates and ordering are ignored.
    The score is ``|true & pred| / |true | pred|``.

    Args:
        true: Iterable of reference tokens.
        pred: Iterable of predicted tokens.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty collections are treated as
        identical and score ``1.0`` instead of dividing by zero.
    """
    true_set = set(true)
    pred_set = set(pred)

    union = true_set | pred_set
    if not union:
        # Both sides are empty: there is nothing to divide by.
        return 1.0

    return len(true_set & pred_set) / len(union)

def evaluation_invoking(df):
    """Answer every question in ``df`` using retrieved context.

    For each row, the single most similar chunk from the document named in
    the row's ``metadata`` column is retrieved and passed to the model
    together with the row's ``question``.

    Args:
        df: DataFrame with ``question`` and ``metadata`` columns.

    Returns:
        A list with the model's answer for each row, in row order.
    """
    # Adjacent string literals are concatenated as-is, so every fragment that
    # is followed by another one must end with a space.
    prompt = ChatPromptTemplate([
        ("system", "You are an LLM that would specialize in answering "
        "questions related to Air Quality Index and its parameters, whilst also "
        "posessing a strong knowledge on Fire Cases worldwide, how to mitigate them, and "
        "the solution behind it if the user were to have a question surrounding it. Also you need "
        "to possess an additional information about the team BOMBATRONIC when asked about it thats "
        "given from the context by the user. Keep your answer to the question in a paragaph like "
        "format and make it sweet and intuitive."),
        ("user", "{input}\n\n: {context}")
    ])

    chain = prompt | llm | output_parser

    answers = []
    # Only these two columns are needed, so iterate them directly.
    for question, metadata in zip(df["question"], df["metadata"]):
        retrieved_docs = vector_store.similarity_search(
            query=question,
            k=1,
            filter={"title": metadata}
        )
        context = "\n\n".join(doc.page_content for doc in retrieved_docs)
        answers.append(chain.invoke({"input": question, "context": context}))

    return answers

answers = evaluation_invoking(df_eval)


def _normalise_words(text):
    """Split on single spaces, lower-case each piece, drop commas and hyphens."""
    return [re.sub(r'\,|\-', '', word.lower()) for word in text.split(" ")]


# Score each generated answer against the expected keywords of its question
for keywords, answer in zip(df_eval["answer_keywords"], answers):
    similarity.append(
        jaccard_similarity(_normalise_words(keywords), _normalise_words(answer))
    )

results = np.mean(similarity)
print(f"Mean similiarity between all answers (based on keywords): {(results * 100):.2f}%")

Mean similiarity between all answers (based on keywords): 11.55%
