# RAG (main)

## Dependencies

In [241]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [242]:
# load the environment variables
load_dotenv() 

True

In [243]:
# setup

# LangChain
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_ENDPOINT"] = os.getenv("LANGCHAIN_ENDPOINT")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")

# OpenAI
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

## Documents

Things to Consider
- chunking level
- data cleaning
- meta data

In [244]:
from langchain_community.document_loaders import PyPDFLoader

pdf_paths = [
    "https://www.ph.emb-japan.go.jp/files/100508281.pdf", # TOURISM
    "https://www.ph.emb-japan.go.jp/files/100412012.pdf", # PACKAGE TOUR
    "https://www.ph.emb-japan.go.jp/files/100508282.pdf", # BUSINESS, CONFERENCE or CULTURAL EXCHANGE, etc.
    "https://www.ph.emb-japan.go.jp/files/100508283.pdf", # VISITING RELATIVES
    "https://www.ph.emb-japan.go.jp/files/100508284.pdf", # VISITING FRIENDS OR DISTANT RELATIVES
    "https://www.ph.emb-japan.go.jp/files/100508285.pdf", # VISITING US MILITARY PERSONNEL
    "https://www.ph.emb-japan.go.jp/files/100585068.pdf", # SPOUSE OR CHLID OF JAPANESE NATIONAL RESIDING IN THE PHILIPPINES
    "https://www.ph.emb-japan.go.jp/files/100508287.pdf", # TRANSIT
    "https://www.ph.emb-japan.go.jp/files/100508288.pdf", # MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
    "https://www.ph.emb-japan.go.jp/files/100674192.pdf", # MULTIPLE-ENTRY TEMPORARY VISITOR VISA (PHILIPPINE NATIONALS WITH CONSIDERABLE FINANCIAL CAPACITY) 
    "https://www.ph.emb-japan.go.jp/files/100404404.pdf", # MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
    "https://www.ph.emb-japan.go.jp/files/100479463.pdf", # STUDENT, WORKER AND DEPENDENT
    "https://www.ph.emb-japan.go.jp/files/100415046.pdf", # OFFICIAL 
    "https://www.ph.emb-japan.go.jp/files/100415047.pdf", # HOUSEKEEPER OF DIPLOMAT/OFFICIAL
    "https://www.ph.emb-japan.go.jp/files/100415048.pdf", # NIKKEI-JIN (JAPANESE DESCENDANT)
    "https://www.ph.emb-japan.go.jp/files/100508289.pdf", # FILIPINO PARENTS TRAVELLING TO JAPAN WITH JAPANESE-FILIPINO CHILDREN
]
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36"
    }

pages = []
for pdf_path in pdf_paths:
    loader = PyPDFLoader(pdf_path, headers=headers)
    async for page in loader.alazy_load():
        pages.append(page)

In [245]:
pages[0]

Document(metadata={'source': 'https://www.ph.emb-japan.go.jp/files/100508281.pdf', 'page': 0}, page_content='TOURISM   \nMay 2024  \n  \nA. PURPOSE  \nVisit Japan for tourism.  \n  \nB. Requirements（Details→https://www.ph.emb-japan.go.jp/itpr_ja/11_000001_00898.html）  \n    \n※ Downloadable from this website   \n      \n(1) Passport（Holder’s signature required）  \n(2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）  \n(3) PSA issued Birth Certificate and Marriage Certificate (for married applicants), issued within 1 year \n☞ Unnecessary if there is used Japan Visa on passport. \n【ADDITIONAL REQUIREMENTS】  \n- If (3) is unreadable, submit Birth/Marriage certificate issued by Local Civil Registrar.  \n- If Birth Certificate is “LATE REGISTRATION”, submit Baptismal Certificate and School Record (Form 137).  \n-If there is no record of Birth/Marriage in PSA, submit Birth Certificate issued by Local Civil Registrar and Negative  \nCertificate issued by PSA.  \n(4) Itinerary

In [246]:
print(pages[0])

page_content='TOURISM   
May 2024  
  
A. PURPOSE  
Visit Japan for tourism.  
  
B. Requirements（Details→https://www.ph.emb-japan.go.jp/itpr_ja/11_000001_00898.html）  
    
※ Downloadable from this website   
      
(1) Passport（Holder’s signature required）  
(2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）  
(3) PSA issued Birth Certificate and Marriage Certificate (for married applicants), issued within 1 year 
☞ Unnecessary if there is used Japan Visa on passport. 
【ADDITIONAL REQUIREMENTS】  
- If (3) is unreadable, submit Birth/Marriage certificate issued by Local Civil Registrar.  
- If Birth Certificate is “LATE REGISTRATION”, submit Baptismal Certificate and School Record (Form 137).  
-If there is no record of Birth/Marriage in PSA, submit Birth Certificate issued by Local Civil Registrar and Negative  
Certificate issued by PSA.  
(4) Itinerary in Japan  
 
【In case that applicant will shoulder part/all of travel expense】   
 
(5) Applicant’s Bank Certific

## Embeddings

Things to Consider
- Evaluation Metric
- dataset
- model

In [247]:
openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

## Custom embeddings

In [248]:
from langchain_core.embeddings import Embeddings
import torch
from transformers import RobertaTokenizer, RobertaModel

# Load pre-trained RoBERTa model
model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load the model and tokenizer from the saved directory (CUSTOM)
# custom_model = RobertaModel.from_pretrained('./fine_tuned_roberta')
# tokenizer = RobertaTokenizer.from_pretrained('./fine_tuned_roberta')


class RoBERTaEmbedding(Embeddings):
    def __init__(self, model_name='roberta-base'):
        # Load pre-trained RoBERTa model and tokenizer
        self.model = model
        self.tokenizer = tokenizer
    
    def embed_documents(self, documents):
        embeddings = []
        for doc in documents:
            inputs = self.tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
            with torch.no_grad():
                outputs = self.model(**inputs)
                # Using the mean of the last hidden state as the document embedding
                embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
                embeddings.append(embedding)
        return embeddings

    def embed_query(self, query):
        inputs = self.tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Using the mean of the last hidden state as the query embedding
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        return embedding

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [249]:
# Initialize the custom embedding class with your fine-tuned model and tokenizer
custom_embeddings = RoBERTaEmbedding()

## Vector Database

Things to Consider
- Evaluation Metric
- HNSW parameters

In [250]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore.from_documents(pages, embeddings)

In [251]:
# check
docs = vector_store.similarity_search("What is the requirements for tourism?", k=7)
for doc in docs:
    print(f'Page {doc.metadata["page"]}: {doc.page_content}\n')

Page 0: TOURISM   
May 2024  
  
A. PURPOSE  
Visit Japan for tourism.  
  
B. Requirements（Details→https://www.ph.emb-japan.go.jp/itpr_ja/11_000001_00898.html）  
    
※ Downloadable from this website   
      
(1) Passport（Holder’s signature required）  
(2) Application Form ※（A facial Photo (4.5×3.5cm) must be attached.）  
(3) PSA issued Birth Certificate and Marriage Certificate (for married applicants), issued within 1 year 
☞ Unnecessary if there is used Japan Visa on passport. 
【ADDITIONAL REQUIREMENTS】  
- If (3) is unreadable, submit Birth/Marriage certificate issued by Local Civil Registrar.  
- If Birth Certificate is “LATE REGISTRATION”, submit Baptismal Certificate and School Record (Form 137).  
-If there is no record of Birth/Marriage in PSA, submit Birth Certificate issued by Local Civil Registrar and Negative  
Certificate issued by PSA.  
(4) Itinerary in Japan  
 
【In case that applicant will shoulder part/all of travel expense】   
 
(5) Applicant’s Bank Certificate (b

## Prompt Template

In [252]:
system_template = """
Answer the following based on this {context}, 
otherwise just give this url (https://www.ph.emb-japan.go.jp/itpr_en/00_000035.html) 
for more information about Japan Visa"""

prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", "{question}")]
)

## LLM

In [253]:
llm = ChatOpenAI(model="gpt-4o-mini")

## RAG

In [258]:
class RAG:
    
    def __init__(self, docs, embeddings):
        self.llm = ChatOpenAI(model="gpt-4o-mini")
        self.vector_db = InMemoryVectorStore.from_documents(docs, embeddings)
        system_template = """
                            Answer the following based on this {context}, 
                            otherwise just give this url (https://www.ph.emb-japan.go.jp/itpr_en/00_000035.html) 
                            for more information about Japan Visa
                            """
        self.prompt_template = ChatPromptTemplate.from_messages([("system", system_template), ("user", "{question}")]
)
        
    
    def response(self, question):
        context = self.vector_db.similarity_search(question, k=1)
        
        prompt = self.prompt_template.invoke({
            "context": context,
            "question": question 
        })
        
        # Generate response from the LLM
        response = self.llm.invoke(prompt)
        
        return response.content

## Evaluation (BLUE SCORE)

In [259]:
# read Q and A
import pandas as pd


QA_PATH = "./dataset/qa/QA.xlsx"
qa = pd.read_excel(QA_PATH)
qa

Unnamed: 0,question,answer 1,answer 2,answer 3
0,What is the requirements for tourism?,"The requirements for tourism to Japan, as outl...",Here’s a simplified version of the requirement...,Here’s an even simpler version of the requirem...
1,What is the requirements for visiting relatives?,The requirements for visiting relatives in Jap...,Here’s a simplified version of the requirement...,Here’s an even simpler version of the requirem...
2,what is the requirements for business puposes?,The requirements for visiting Japan for **busi...,Here’s a simplified version of the requirement...,Here’s an even simpler version:\n\n### **For t...
3,what is the requirements for visiting friends?,The requirements for visiting Japan for **frie...,Here’s a simpler version:\n\n### **For the App...,Here’s an even simpler version:\n\n### **For t...
4,what is the requirements for visiting US milit...,The requirements for visiting **US military pe...,Here’s a simplified version:\n\n### **For the ...,Here’s the simplest version:\n\n### **For the ...
5,what is the requirements for spouse or child o...,The requirements for the **spouse or child of ...,Here’s a simpler version:\n\n### **For the App...,Here’s a more simplified version:\n\n### **For...
6,what is the requirements for transit?,The requirements for **transit** to Japan are ...,Here’s a simplified version for **transit** to...,Here’s the simplest version for **transit** to...


In [260]:
# tokenizer
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to tokenize words using spaCy
def tokenize_text(text):
    # Process the text with spaCy NLP pipeline
    doc = nlp(text)
    # Extract words (tokens)
    tokens = [token.text for token in doc]
    return tokens

In [262]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

embeddings_list = [
    ("openai embeddings", openai_embeddings),
    ("custom embeddings", custom_embeddings)
]

for name, embeddings in embeddings_list:
    rag = RAG(pages, embeddings)
    bleu_scores = []
    # Using SmoothingFunction() to prevent zero BLEU score
    smooth = SmoothingFunction().method1
    for row in qa.itertuples():
        question, answer1, answer2, answer3 = row[1], row[2], row[3], row[4]
        references = [tokenize_text(answer1), tokenize_text(answer2), tokenize_text(answer3)]
        # do something
        # Calculate BLEU score (use sentence_bleu for a single sentence)
        response = rag.response(question)
        candidate = tokenize_text(response)

        bleu_score = sentence_bleu(references, candidate, smoothing_function=smooth)
        bleu_scores.append(bleu_score)

    print(f"average bleu score ({name}): {sum(bleu_scores)/len(bleu_scores)}")
    

average bleu score (openai embeddings): 0.5051976801057746
average bleu score (custom embeddings): 0.0003528231294055397
