# RAG

In [245]:
import os
from dotenv import load_dotenv

# load the environment variables
load_dotenv() 

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_ENDPOINT"] = os.getenv("LANGCHAIN_ENDPOINT")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

## Retrieve PDFs (target)

get the pdf paths

In [246]:
from pathlib import Path

pdf_paths = list(Path("pdfs").glob("*.pdf"))
pdf_paths = [pdf_path.as_posix() for pdf_path in pdf_paths]

In [247]:
pdf_contents = {idx:path for idx, path in enumerate(pdf_paths)}

In [248]:
pdf_contents

{0: 'pdfs/MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf',
 1: 'pdfs/PACKAGE_TOUR.pdf',
 2: 'pdfs/VISITING RELATIVES.pdf',
 3: 'pdfs/VISITING US MIILITALY PERSONNEL.pdf',
 4: 'pdfs/TRANSIT.pdf',
 5: 'pdfs/OFFICIAL.pdf',
 6: 'pdfs/MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR FOR BUSINESS.pdf',
 7: 'pdfs/STUDENT, WORKER AND DEPENDENT.pdf',
 8: 'pdfs/HOUSEKEEPER OF DIPLOMAT.pdf',
 9: 'pdfs/MULTIPLE-ENTRY TEMPORARY VISITOR VISA (PHILIPPINE NATIONALS WITH.pdf',
 10: 'pdfs/ATTIC TOURS SM FAIRVIEW (1).pdf',
 11: 'pdfs/TOURISM.pdf',
 12: 'pdfs/BUSINESS.pdf',
 13: 'pdfs/SPOUSE OR CHLID OF JAPANESE NATIONAL RESIDING IN THE PHILIPPINES.pdf',
 14: 'pdfs/VISITING FRIENDS OR DISTANT RELATIVES.pdf',
 15: 'pdfs/NIKKEI-JIN (JAPANESE DESCENDANT).pdf'}

**PER PDF LEVEL CHUNCKING**

In [249]:
from langchain_community.document_loaders import PyPDFLoader


documents = []
count = 0
for pdf_path in pdf_paths:
    loader = PyPDFLoader(pdf_path)
    for document in loader.lazy_load():
        documents.append(document)

In [250]:
documents

[Document(metadata={'source': 'pdfs/MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf', 'page': 0}, page_content='MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR  \nMay 2024  \n  \nA. PURPOSE  \nVisit Japan several times as temporary visitor for tourism, business or visiting relatives, etc.  \n☞ Applicants need to satisfy one of the following conditions from I to IV.  \n☞ The period of each stay in Japan must be within 30 days.  \n  \nB. Requirements（Details→https://www.ph.emb-japan.go.jp/itpr_ja/11_000001_00898.html）  \n  ※ Downloadable from this website   \n   COMMON REQUIREMENTS  \n(1) Passport（Holder’s signature required）  \n(2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）  \n(3) Request for Multiple-Entry Visa ※  \n(4) PSA issued Birth Certificate and Marriage Certificate (for married applicants), issued within 1 year \n☞ Unnecessary if there is used Japan Visa on passport.  \n【ADDITIONAL REQUIREMENTS】  \n- If (4) is unreadable, submit Birth/Marriage certificate issued b

In [251]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def splitDocuments(documents, chunk_size=100, chunk_overlap=20):
    """Documents further splitting

    Args:
        documents (Document): This Document is too large because it is per PDF pages.
    
    Return:
        additional_documents: documents that are splitted more
    """
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    
    additional_documents = []
    for document in documents:
        # Split the content of each document using the splitter
        current_documents = splitter.create_documents([document.page_content])
        # Add metadata to each new chunk (preserving original document metadata)
        for chunk in current_documents:
            chunk.metadata = document.metadata
            
        additional_documents.append(current_documents)
    
    return additional_documents

In [252]:
additional_documents = splitDocuments(documents)

combine

In [253]:
documents = documents + additional_documents[0]

In [254]:
documents

[Document(metadata={'source': 'pdfs/MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf', 'page': 0}, page_content='MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR  \nMay 2024  \n  \nA. PURPOSE  \nVisit Japan several times as temporary visitor for tourism, business or visiting relatives, etc.  \n☞ Applicants need to satisfy one of the following conditions from I to IV.  \n☞ The period of each stay in Japan must be within 30 days.  \n  \nB. Requirements（Details→https://www.ph.emb-japan.go.jp/itpr_ja/11_000001_00898.html）  \n  ※ Downloadable from this website   \n   COMMON REQUIREMENTS  \n(1) Passport（Holder’s signature required）  \n(2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）  \n(3) Request for Multiple-Entry Visa ※  \n(4) PSA issued Birth Certificate and Marriage Certificate (for married applicants), issued within 1 year \n☞ Unnecessary if there is used Japan Visa on passport.  \n【ADDITIONAL REQUIREMENTS】  \n- If (4) is unreadable, submit Birth/Marriage certificate issued b

## Embeddings

In [255]:
from langchain_openai import OpenAIEmbeddings


embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

## Vector Database (HNSW)

## HNSW

In [256]:
import hnswlib
from langchain_core.vectorstores import VectorStore
import numpy as np


class HNSWLibVectorStore(VectorStore):
    def __init__(self, index, embedding_model, documents):
        self.index = index
        self.embedding_model = embedding_model
        self.documents = [doc.page_content for doc in documents]

    def similarity_search(self, query, k=3):
        # Get embedding for the query
        query_embedding = self.embedding_model.embed_documents([query])
        query_embedding = np.array(query_embedding, dtype=np.float32)

        # Perform the knn query
        labels, distances = self.index.knn_query(query_embedding, k=k)

        # Return the closest documents and their distances
        results = [(self.documents[label], distances[0][i]) for i, label in enumerate(labels[0])]
        return results
    
    @classmethod
    def from_texts(cls, texts, embedding_model, **kwargs):
        # Step 1: Get the embeddings for the texts
        embeddings = embedding_model.embed(texts)
        
        # Step 2: Initialize the HNSW index (dimension should match embedding size)
        dim = len(embeddings[0])
        index = hnswlib.Index(space='cosine', dim=dim)  # Use cosine distance for similarity
        index.init_index(max_elements=len(texts), ef_construction=200, M=16)
        
        # Step 3: Add embeddings to the index
        embeddings = np.array(embeddings, dtype=np.float32)
        index.add_items(embeddings)

        # Step 4: Return an instance of the custom vector store
        return cls(index=index, embedding_model=embedding_model, documents=texts)

## test hnsw

In [257]:
dim = 3072  # Dimensionality of the embedding vectors (adjust to match your model's output)
num_elements = len(documents)  # Number of elements in your dataset

# Initialize the HNSW index with 'cosine' similarity space
index = hnswlib.Index(space='cosine', dim=dim)


# PARAMETERS to tweak
ef = 200 # 200 # Controls the quality of the graph construction
M = 16  # 16   # Controls the number of neighbors for each node in the graph

# Initialize the index with the number of elements
index.init_index(max_elements=num_elements, ef_construction=ef, M=M)

# Embed the documents
texts = [doc.page_content for doc in documents]
document_embeddings = embeddings.embed_documents(texts)


# Convert document embeddings to numpy array and add them to the index
document_embeddings = np.array(document_embeddings, dtype=np.float32)
index.add_items(document_embeddings)

# Create the custom vector store with HNSWLib
vector_store = HNSWLibVectorStore(index=index, embedding_model=embeddings, documents=documents)

In [258]:
# Perform a similarity search
query = "how much is the processing fee"
results = vector_store.similarity_search(query, k=10)

# Print the results
for doc, score in results:
    print(f"Document: {doc}, Similarity score: {score}")

Document: (7) Applicant’s Tax Payment Certificate, Similarity score: 0.7448184490203857
Document: be submitted to prove transactions within the last six months., Similarity score: 0.7824033498764038
Document: * For business owners, proof of actual payment of tax must be submitted in addition to the BIR Tax, Similarity score: 0.7859665155410767
Document: submitted), Similarity score: 0.7996006011962891
Document: Registration” and Mayor’s Permit from the City Hall must be submitted., Similarity score: 0.8013794422149658
Document: Payment Certificate Form (copy acceptable), Similarity score: 0.8068599700927734
Document: issued within 1 year, Similarity score: 0.8082274198532104
Document: (6) Applicant’s Bank Certificate (balance within the last six months must be shown), Similarity score: 0.8103916049003601
Document: (2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）, Similarity score: 0.8142542243003845
Document: ※ Downloadable from this website   
   COMMON REQUIREMENT

evaluation

In [259]:
import pandas as pd


def evaluate(queries, vector_store=vector_store, k=3):
    
    _queries = []
    _documents = []
    _scores = []
    for query in queries:
        results = vector_store.similarity_search(query, k=k)
        
        for doc, score in results:
            _documents.append(doc)
            _scores.append(score)
        
        _queries = _queries + [query] * k
    
    evaluation = pd.DataFrame({
        "query": _queries,
        "document": _documents,
        "score": _scores
    })
    
    
    
    return evaluation
        
    

In [260]:
temp_queries = [
    "What are the requirements for tourists visa?",
    "What are the difference between tourists and visit relative or friend visa?",
    "How can avail the visa for Japan as tourists?",
    "What the tourists mean visa",
    "When your office open ?"
]

In [261]:
result = evaluate(temp_queries)

## RAG

In [305]:
from langchain_core.prompts import ChatPromptTemplate


class RAG:
    
    def __init__(self, documents, llm, embeddings, ef=200, M=16, chunk_size=100, chunk_overlap=20):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.documents = documents
        self.llm = llm 
        self.embeddings = embeddings
        self.vector_store = self.__configureVectorStore(ef=ef, M=M)
        self.prompt_template = self.__configurePromptTemplate()
        
    
    def __splitDocuments(self):
        """Documents further splitting

        Args:
            documents (Document): This Document is too large because it is per PDF pages.
        
        Return:
            additional_documents: documents that are splitted more
        """
        
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        
        additional_documents = []
        for document in documents:
            # Split the content of each document using the splitter
            current_documents = splitter.create_documents([document.page_content])
            # Add metadata to each new chunk (preserving original document metadata)
            for chunk in current_documents:
                chunk.metadata = document.metadata
                
            additional_documents.append(current_documents)
        
        return additional_documents

        
    def __configureVectorStore(self, ef, M):
        
        # SPLITTING TEXT
        self.documents = self.documents + self.__splitDocuments()[0]
        
        # VECTORIZING
        texts = [doc.page_content for doc in self.documents]
        document_embeddings = self.embeddings.embed_documents(texts)
        document_embeddings = np.array(document_embeddings, dtype=np.float32)
        
        dim = len(document_embeddings[0])
        num_elements = len(self.documents) 
        
        # INDEXING
        index = hnswlib.Index(space='cosine', dim=dim)
        index.init_index(max_elements=num_elements, ef_construction=ef, M=M)
        index.add_items(document_embeddings)
        
        return HNSWLibVectorStore(index=index, embedding_model=self.embeddings, documents=self.documents)
    
    
    def __configurePromptTemplate(self):
        system_template = """
                            Answer the following based on this {context}, 
                            otherwise just give this url (https://www.ph.emb-japan.go.jp/itpr_en/00_000035.html) 
                            for more information about Japan Visa
                        """
        return ChatPromptTemplate.from_messages([("system", system_template), ("user", "{query}")])
    
        
    def response(self, query):
        """"return the response as str"""
        
        query = query + " attic tours"
        
        # ------------RETRIEVE------------
        # SIMILARITY SEARCH
        context = self.vector_store.similarity_search(query, k=10)
        
        # ------------AUGMENTED------------
        prompt = self.prompt_template.invoke({
            "context": context,
            "query": query 
        })
        
        # ------------GENERATION------------
        response = self.llm.invoke(prompt)
        
        return response.content

In [306]:
from langchain_openai import ChatOpenAI


llm = ChatOpenAI(model="gpt-4o-mini")

rag = RAG(documents=documents, llm=llm, embeddings=embeddings)

In [309]:
response = rag.response("how much to process japan visa?")

In [310]:
print(response)

The processing fee for a Japan visa at Attic Tours is 1,680 pesos per person. If you want to apply for multiple visas, you need to add 500 pesos, making it a total of 2,180 pesos per person.
