# <b>Ingest Dev Server Setup</b>

https://medium.com/@ajayverma23/the-art-and-science-of-rag-mastering-prompt-templates-and-contextual-understanding-a47961a57e27

In [2]:
# %pip install -r "C:\Users\dangq\OneDrive\Máy tính\USTH\ICT\Internship\RAG Remake\RAG\requirements.txt"

In [3]:
# %%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG\src\main.py"
# import logging
# from fastapi import FastAPI
# import inngest
# import inngest.fast_api
# from inngest.experimental import ai 
# from dotenv import load_dotenv
# import uvicorn
# import uuid
# import os
# import datetime

# load_dotenv()

# inngest_client = inngest.Inngest(
#     app_id = "rag_remake_app",
#     logger = logging.getLogger("uvicorn"), ## Possible values: DEBUG, INFO, WARNING, ERROR, CRITICAL
#     is_production = False,
#     serializer = inngest.PydanticSerializer()
# )


# @inngest_client.create_function(
#     fn_id = "RAG: Ingest PDF Document",
#     trigger=inngest.TriggerEvent(event="rag/pdf_document/ingest")
# )

# async def ingest_pdf_document(ctx: inngest.Context):
#     return {'hello': 'world'}

# app = FastAPI()
# inngest.fast_api.serve(app, inngest_client, [ingest_pdf_document])


# <b>Vector Store Database</b>

In [14]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\qdrant_database.py"
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct

class QdrantStore:
    """Small wrapper around one Qdrant collection used as a vector store.

    The constructor connects to the Qdrant server at ``url`` and creates the
    collection (cosine distance, ``dim``-sized vectors) when it does not
    exist yet.
    """

    def __init__(self, url: str = "http://localhost:6333", collection_name: str = "PDFdoc", dim: int = 1024) -> None:
        """Connect to Qdrant and make sure ``collection_name`` exists.

        Args:
            url: Address of the Qdrant server.
            collection_name: Collection every method of this object works on.
            dim: Vector size used only when the collection has to be created.
        """
        self.client: QdrantClient = QdrantClient(url=url)
        self.collection_name: str = collection_name

        if not self.client.collection_exists(collection_name=self.collection_name):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=dim,
                    distance=Distance.COSINE,
                ),
            )

    def count_vectors(self) -> int:
        """Return the count reported by the client for this collection."""
        return self.client.count(
            collection_name=self.collection_name
        ).count

    def upsert(self, ids, vectors, payloads):
        """Insert or update one point per ``(id, vector, payload)`` triple.

        Args:
            ids: Sequence of point ids.
            vectors: Sequence of vectors (or a 2-D array) aligned with ``ids``.
            payloads: Sequence of payload dicts aligned with ``ids``.

        Raises:
            ValueError: If the three inputs do not have the same length.
        """
        if not (len(ids) == len(vectors) == len(payloads)):
            raise ValueError(
                "ids, vectors and payloads must have the same length "
                f"(got {len(ids)}, {len(vectors)}, {len(payloads)})"
            )

        # Nothing to send; avoid issuing an empty update request.
        if len(ids) == 0:
            return

        points = [
            PointStruct(
                id=point_id,
                # Same conversion as similarity_search: array rows become plain lists.
                vector=vector.tolist() if hasattr(vector, "tolist") else vector,
                payload=payload,
            )
            for point_id, vector, payload in zip(ids, vectors, payloads)
        ]

        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )

    def similarity_search(self, query_vector, top_k: int = 5):
        """Return the texts and sources of the ``top_k`` closest points.

        Args:
            query_vector: Query embedding; objects exposing ``tolist()`` are
                converted to a plain list first.
            top_k: Maximum number of points requested from Qdrant.

        Returns:
            A dict with ``'contexts'`` (payload ``'text'`` values in result
            order, empty texts skipped) and ``'sources'`` (distinct non-empty
            payload ``'source'`` values, in first-seen order).
        """
        if hasattr(query_vector, 'tolist'):
            query_vector = query_vector.tolist()

        # 'query_points' is used here in place of the older '.search' call.
        results = self.client.query_points(
            collection_name=self.collection_name,
            query=query_vector,
            limit=top_k,
        ).points

        contexts = []
        # A dict is used as an ordered set so 'sources' is deterministic.
        sources = {}

        for hit in results:
            payload = hit.payload or {}
            text = payload.get('text', '')
            if not text:
                continue
            contexts.append(text)
            source = payload.get('source', '')
            if source:
                sources[source] = None

        return {
            'contexts': contexts,
            'sources': list(sources)
        }

Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\qdrant_database.py


# <b>PDF Loader</b>

In [None]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\pdf_loader.py"
from llama_parse import LlamaParse  
from llama_index.core import SimpleDirectoryReader
import os
import pickle
from dotenv import load_dotenv

# Runs at import time, before the environment variable is read below.
load_dotenv()
# NOTE(review): PDFLoader.load_docs reads LLAMA_CLOUD_API_KEY via os.getenv
# itself, so this module-level copy appears unused in this file — confirm
# before removing.
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

class PDFLoader:
    """Parse a PDF into markdown documents with LlamaParse, with a disk cache.

    Parsed documents are pickled under ``cache/<file name>.pkl`` and their
    text is also written to ``markdown/<file name>.md``. Both directories are
    relative to the current working directory and are created on construction.
    """

    def __init__(self, file_path: str):
        """Record the input path and derive the cache / markdown output paths.

        Args:
            file_path: Path of the file to parse; only its base name is used
                to name the cache and markdown files.
        """
        self.file_path = file_path
        file_name = os.path.basename(file_path)

        self.cache_path = os.path.join("cache", f"{file_name}.pkl")
        self.markdown_path = os.path.join("markdown", f"{file_name}.md")

        os.makedirs("cache", exist_ok=True)
        os.makedirs("markdown", exist_ok=True)

    def load_docs(self):
        """Return the parsed documents, using the pickle cache when possible.

        A readable cache file is returned as-is. Otherwise the file is parsed,
        and the result is written to the cache and to the markdown file.
        An unreadable cache file is treated as a cache miss and overwritten.
        """
        cache_hit, parsed_data = self._read_cache()
        if cache_hit:
            print(f"Loaded parsed data from cache: {self.cache_path}")
            return parsed_data

        documents = self._parse()
        self._write_outputs(documents)
        return documents

    def _read_cache(self):
        """Return ``(True, data)`` for a readable cache file, else ``(False, None)``."""
        if not os.path.exists(self.cache_path):
            return False, None
        try:
            # NOTE: pickle.load executes whatever the file describes, so the
            # cache directory must only ever contain files this loader wrote.
            with open(self.cache_path, "rb") as f:
                return True, pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as e:
            # A truncated or stale cache should not block loading forever.
            print(f"Ignoring unreadable cache {self.cache_path}: {e}")
            return False, None

    def _parse(self):
        """Run LlamaParse (markdown output) on ``self.file_path``."""
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
            result_type="markdown",
        )
        return SimpleDirectoryReader(
            input_files=[self.file_path],
            file_extractor={".pdf": parser},
        ).load_data()

    def _write_outputs(self, documents):
        """Pickle ``documents`` to the cache and dump their text as markdown."""
        print(f"Saving parsed data to cache: {self.cache_path}")
        with open(self.cache_path, "wb") as f:
            pickle.dump(documents, f)

        print(f"Saving markdown content to: {self.markdown_path}")
        with open(self.markdown_path, "w", encoding="utf-8") as f:
            # The parser may return several document parts; write them in order.
            f.writelines(doc.text + "\n\n" for doc in documents)


Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\pdf_loader.py


# <b>Document Processor</b>

In [16]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\document_processor.py"
from typing import Optional
from transformers import AutoTokenizer
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from sentence_transformers import SentenceTransformer
import uuid


class DocumentProcessor:
    """Split text into chunks whose size is measured in tokenizer tokens."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200, model_name: str = "BAAI/bge-m3"):
        """Store the chunking parameters and load the tokenizer.

        Args:
            chunk_size: Maximum number of tokens per chunk.
            chunk_overlap: Number of tokens shared by consecutive chunks.
            model_name: Name passed to ``AutoTokenizer.from_pretrained``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.separators = ["\n\n", "\n", " ", ""]

    def chunk_document(self, text: str) -> list[str]:
        """Chunk ``text`` with a sliding window over its tokens.

        Each window holds at most ``chunk_size`` tokens and starts
        ``chunk_size - chunk_overlap`` tokens after the previous one.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (the window could never
                advance).
        """
        step = self.chunk_size - self.chunk_overlap
        if self.chunk_size <= 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and chunk_overlap must be "
                f"smaller than chunk_size (got chunk_size={self.chunk_size}, "
                f"chunk_overlap={self.chunk_overlap})"
            )

        # Special tokens are left out so they do not end up in the chunk text.
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        total_tokens = len(tokens)
        chunks = []

        for start in range(0, total_tokens, step):
            end = min(start + self.chunk_size, total_tokens)
            chunks.append(
                self.tokenizer.decode(tokens[start:end], skip_special_tokens=True)
            )
            # The window reached the end; a further window would only repeat
            # tokens already covered by this chunk.
            if end == total_tokens:
                break

        return chunks

    def preprocess_documents(self, documents: list[str]) -> list[str]:
        """Chunk every document with ``chunk_document`` and flatten the result."""
        return [chunk for doc in documents for chunk in self.chunk_document(doc)]

    def chunk_Recursive_char(self, texts: "str | list[str]") -> list[str]:
        """Chunk text with ``RecursiveCharacterTextSplitter``.

        Args:
            texts: A single string or a list of strings. Each string is split
                independently and the chunks are concatenated in input order.

        Returns:
            The chunks produced by the splitter, with lengths measured by
            ``count_tokens``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.count_tokens,
            separators=self.separators,
            keep_separator=False,
            strip_whitespace=True
        )

        # Accept a single string as well as a list of strings.
        input_list = [texts] if isinstance(texts, str) else texts

        final_chunks = []
        for text in input_list:
            final_chunks.extend(text_splitter.split_text(text))

        return final_chunks

    def chunk_by_markdown(self, markdown_txt: str):
        """Split markdown on ``#``, ``##`` and ``###`` headers.

        Returns whatever ``MarkdownHeaderTextSplitter.split_text`` returns for
        ``markdown_txt``; the header levels are labelled ``Title``,
        ``Section`` and ``Subsection``.
        """
        headers_to_split_on = [
            ("#", "Title"),
            ("##", "Section"),
            ("###", "Subsection"),
        ]

        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        return markdown_splitter.split_text(markdown_txt)

    def count_tokens(self, text: str) -> int:
        """Return the length of ``tokenizer.encode(text)``.

        The tokenizer's default settings are used, so the count includes any
        tokens it adds on its own.
        """
        return len(self.tokenizer.encode(text))
    
# if __name__ == "__main__":

    # """
    # ---------------------------------------------------------
    # 1. Load and Chunk Documents
    # ---------------------------------------------------------
    # """
    # loader = PDFLoader("C:\\Users\\dangq\\OneDrive\\M√°y t√≠nh\\USTH\\ICT\\Internship\\RAG Remake\\RAG\\data\\raw\\2405.17247v1.pdf")
    # docs = loader.load_docs()
    # # print(docs[10].text)

    # docs_texts = [doc.text for doc in docs]
    # processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
    # chunks = processor.chunk_Recursive_char(docs_texts)
    # print("====="*50)
    # print(f"\nCreated {len(chunks)} chunks.")

    # # Uncomment to see some sample chunks
    # # for i, chunk in enumerate(chunks[5:10]):  # Print first 5 chunks
    # #     print(f"Chunk {i+1} (Length: {processor.count_tokens(chunk)} tokens): {chunk}\n")
    

    # """
    # ---------------------------------------------------------
    # 2. Generate Embeddings (The missing step)
    # ---------------------------------------------------------
    # """
    # # embed_model = AutoTokenizer.from_pretrained("BAAI/bge-m3")
    # embed_model = SentenceTransformer("BAAI/bge-m3")
    # vectors = embed_model.encode(chunks, show_progress_bar=True)


    # """
    # ---------------------------------------------------------
    # 3. Prepare Data for Qdrant
    # ---------------------------------------------------------
    # """
    # ids = [str(uuid.uuid4()) for _ in range(len(chunks))]
    # payloads = [
    #     {"text": chunk, "source": "2405.17247v1.pdf", "chunk_index": i} 
    #     for i, chunk in enumerate(chunks)
    # ]


    # """
    # ---------------------------------------------------------
    # 4. Upsert to Vector Database
    # ---------------------------------------------------------
    # """
    # vector_store = QdrantStore(dim=1024, collection_name="PDFdoc_test")
    # vector_store.upsert(ids, vectors, payloads)
    # print("Upsert completed.")

    # """
    # ---------------------------------------------------------
    # 5. Test Search Functionality
    # ---------------------------------------------------------
    # """
    # print("====="*50)
    # print("Testing search functionality...")
    # query = "What is the core idea of Contrastive-based VLMs?"

    # # We must embed the query using the SAME model
    # query_vector = embed_model.encode(query).tolist()

    # search_results = vector_store.similarity_search(query_vector, top_k=3)
    # print(f"Query: {query}\n")
    # for context in search_results['contexts']:
    #     print(f"--- Result ---\n{context[:200]}...\n")

Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\document_processor.py


# <b>LLM Wrapper Class</b>


In [7]:
# import os

# # Define the content
# content = """
# ===================================================================
# RAG EVALUATION: HARD QUESTIONS (STATISTICAL & ARCHITECTURAL SPECS)
# Source Document: 2405.17247v1.pdf (An Introduction to VLM)
# ===================================================================

# [Test Case 1: Zero-Shot Performance]
# Question: What was the specific zero-shot classification accuracy attained by the ResNet-101 CLIP model, and which supervised model did it match?
# Answer: The ResNet-101 CLIP model attained 76.2% zero-shot classification accuracy, matching the performance of a supervised ResNet model.

# [Test Case 2: Training Compute Resources]
# Question: How many GPUs and how much time were required for the first round of training for MiniGPT-4?
# Answer: The first round of training for MiniGPT-4 required only four A100 GPUs for around ten hours.

# [Test Case 3: Technical Specifications - Tokenizer]
# Question: What are the specific tokenization parameters (image size, token count, and vocabulary size) used by the CM3Leon image tokenizer borrowed from Gafni et al.?
# Answer: It encodes a 256x256 image into 1024 tokens from a vocabulary of 8192.

# [Test Case 4: Fine-Tuning Efficiency]
# Question: When using the weight-sharing technique with VL-adapter, what specific percentage of total parameters requires updating for video-text tasks?
# Answer: For video-text tasks, only 3.39% of the total parameters require updating.

# [Test Case 5: Benchmark Improvement]
# Question: On the MMHAL-BENCH, by what percentage does LLAVA-RLHF outperform baselines, and what is the specific focus of this benchmark?
# Answer: LLAVA-RLHF outperforms baselines by 60% on MMHAL-BENCH, which has a special focus on penalizing hallucinations.
# """

# # Save to file
# file_path = "C:\\Users\\dangq\\OneDrive\\M√°y t√≠nh\\USTH\\ICT\\Internship\\RAG Remake\\RAG\\data\\raw\\rag_hard_questions.txt"
# with open(file_path, "w", encoding="utf-8") as f:
#     f.write(content.strip())

# print(f"File successfully created at: {os.path.abspath(file_path)}")

In [8]:
# import lmstudio as lms

# # model = lms.llm("meta-llama-3.1-8b-instruct")

# client = lms.Client(api_host="127.0.0.1:1234")
# model = client.llm.model("microsoft_-_phi-3.5-mini-instruct")
# result = model.respond("Hello")

# print(result)

In [17]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\llm\custom_model.py"
import lmstudio as lms
class CustomModel:
    """Wrapper around a chat model reached through the ``lmstudio`` client.

    The connection is attempted once in the constructor. If it fails, the
    error is printed and ``generate`` returns an error string instead of
    raising.
    """

    def __init__(self, model_name: str = "microsoft_-_phi-3.5-mini-instruct", api_host: str = "127.0.0.1:1234"):
        """Store the connection settings and try to load the model.

        Args:
            model_name: Model identifier passed to ``client.llm.model``.
            api_host: ``host:port`` passed to ``lms.Client``.
        """
        self.model_name = model_name
        self.api_host = api_host
        self.client = None
        self.model = None
        self._load_model()

    def _load_model(self):
        """Create the client and model handle; print a hint on failure."""
        try:
            print(f"Connecting to: {self.model_name}... ")
            self.client = lms.Client(api_host=self.api_host)
            self.model = self.client.llm.model(self.model_name)
        except Exception as e:
            # Best-effort by design: the object stays usable and generate()
            # reports the missing model.
            print(f"Connection Error: {e}")
            print("Tip: Ensure LM Studio is open and 'Start Server' is green.")

    def generate(self, user_query: str, context_input) -> str:
        """Build a context + question prompt and return the model's answer.

        Args:
            user_query: The question to answer.
            context_input: Either a list/tuple of context items, joined with
                blank lines, or any other object, which is converted with
                ``str``.

        Returns:
            ``"Generative Answer: \\n"`` followed by the response text, or an
            error string when the model is not loaded or the call fails.
        """
        if not self.model:
            return "Error: Model not loaded."

        if isinstance(context_input, (list, tuple)):
            # Items are stringified so non-string entries cannot break join().
            context_str = "\n\n".join(str(part) for part in context_input)
        else:
            context_str = str(context_input)

        system_instruction = f"""
            Answer the question below by first outlining the main points of context relevant to the question,
            then use that outline to generate the final answer. 

            Context: 
            {context_str}

            Question: 
            {user_query}
        
        """
        print("System Instruction: \n")
        print(system_instruction)
        print("==="*50)

        try:
            print("==="*50)
            response = self.model.respond(system_instruction)
            return "Generative Answer: \n" + str(response)
        except Exception as e:
            return f"Generate Error: {e}"

# if __name__ == "__main__":
#     embed_model = SentenceTransformer("BAAI/bge-m3")
#     vector_store = QdrantStore(dim=1024, collection_name="PDFdoc_test")
#     query = "What is the core idea of Contrastive-based VLMs?"

#     # We must embed the query using the SAME model
#     query_vector = embed_model.encode(query).tolist()

#     search_results = vector_store.similarity_search(query_vector, top_k=3)
#     # print(f"Query: {query}\n")
#     # for context in search_results['contexts']:
#     #     print(f"--- Result ---\n{context[:200]}...\n")

#     llm = CustomModel()
#     answer = llm.generate(query, search_results['contexts'])
#     print(answer)
    
    

Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\llm\custom_model.py


# <b>MAIN</b>

In [2]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\examples\chat_session.py"
import sys
import os

# Locate this script's directory from its own path, so the import of `src`
# below works no matter which directory the script is launched from.
# Fall back to the working directory when __file__ is not defined
# (e.g. when the code is pasted into an interactive session).
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()

# The parent of the script directory is the project root ('RAG_Basic')
project_root = os.path.dirname(current_dir)
print(project_root)

# Add the project root to the system path
if project_root not in sys.path:
    sys.path.append(project_root)
    
from src.llm import CustomModel
from src.utils import DocumentProcessor, PDFLoader, QdrantStore
from sentence_transformers import SentenceTransformer
import uuid

# ---------------------------------------------------------
# 1. SET UP
# ---------------------------------------------------------
pdf_path = "C:\\Users\\dangq\\OneDrive\\M√°y t√≠nh\\USTH\\ICT\\Internship\\RAG Remake\\RAG\\data\\raw\\2405.17247v1.pdf"
# The payload 'source' is derived from the path so the two cannot drift apart
# when a different document is ingested.
source_name = os.path.basename(pdf_path)

llm = CustomModel()
embed_model = SentenceTransformer("BAAI/bge-m3")
processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
vector_store = QdrantStore(dim=1024, collection_name="TEST")
loader = PDFLoader(pdf_path)


# ---------------------------------------------------------
# 2. Read and chunk documents (preprocess):
#     - Chunking
#     - Create a vector embedding for each chunk
#     - Construct metadata (to be improved in the future)
#     - Upsert the data to the vector database
# ---------------------------------------------------------

# Ingestion is skipped when the collection already holds data.
current_count = vector_store.count_vectors()

if current_count == 0:
    print("==="*50)
    print("Collection is empty. Starting processing and ingestion...")

    docs = loader.load_docs()
    docs_texts = [doc.text for doc in docs]

    chunks = processor.chunk_Recursive_char(docs_texts)
    print(f"\nCreated {len(chunks)} chunks.")

    # Without chunks there is nothing to embed, store or search.
    if not chunks:
        raise SystemExit(f"No text chunks were produced from {source_name}; nothing to ingest.")

    # Generate Embeddings
    print("Generating embeddings...")
    vectors = embed_model.encode(chunks, show_progress_bar=True)

    # Prepare Metadata for Qdrant
    ids = [str(uuid.uuid4()) for _ in range(len(chunks))]
    payloads = [
        {"text": chunk, "source": source_name, "chunk_index": i}
        for i, chunk in enumerate(chunks)
    ]

    # Upsert into vector database
    vector_store.upsert(ids, vectors, payloads)
    print("Upsert completed. Your Collection is ready to use!")
    print("==="*50)

else:
    print("==="*50)
    print(f"Collection '{vector_store.collection_name}' already contains {current_count} vectors.")
    print("Skipping Step 2 (Ingestion).")
    print("==="*50)



# ---------------------------------------------------------
# 3. Test Similarity Search
# ---------------------------------------------------------

# The query must be embedded with the SAME model used for the chunks.

query = " On the MMHAL-BENCH, by what percentage does LLAVA-RLHF outperform baselines, and what is the specific focus of this benchmark?"
query_vector = embed_model.encode(query).tolist()
search_results = vector_store.similarity_search(query_vector, top_k=3)


# ---------------------------------------------------------
# 4. Run Inference
# ---------------------------------------------------------
print("==="*100)
answer = llm.generate(query, search_results['contexts'])
print(answer)


Overwriting C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\examples\chat_session.py


In [11]:
# import pickle
# import os

# # 1. Define your input and output filenames
# input_pkl_path = r"C:\\Users\\dangq\\OneDrive\\M√°y t√≠nh\\USTH\\ICT\\Internship\\RAG Remake\\RAG\\data\\raw\\2405.17247v1.pdf.pkl"
# output_md_path = r"C:\\Users\\dangq\\OneDrive\\M√°y t√≠nh\\USTH\\ICT\\Internship\\RAG Remake\\RAG\\data\\raw\\2405.17247v1.md"  # We save as .md to keep formatting

# def convert_pickle_to_markdown():
#     # Check if file exists
#     if not os.path.exists(input_pkl_path):
#         print(f"❌ Error: File not found at {input_pkl_path}")
#         return

#     print(f"📖 Loading {input_pkl_path}...")
    
#     # 2. Load the data from pickle
#     with open(input_pkl_path, "rb") as f:
#         documents = pickle.load(f)
    
#     # Verify it's what we expect (List of Documents)
#     print(f"✅ Loaded {len(documents)} document chunk(s).")
    
#     # 3. Extract and combine text
#     # LlamaParse often returns one large document, or split pages. 
#     # We join them just in case.
#     full_markdown_text = ""
    
#     for i, doc in enumerate(documents):
#         # The '.text' attribute contains the parsed markdown
#         content = doc.text
        
#         # Optional: Add a separator if you have multiple chunks
#         if i > 0:
#             full_markdown_text += "\n\n--- SEGMENT BREAK ---\n\n"
            
#         full_markdown_text += content

#     # 4. Save to a new file
#     with open(output_md_path, "w", encoding="utf-8") as f:
#         f.write(full_markdown_text)
        
#     print(f"🎉 Success! Converted text saved to: {output_md_path}")

# if __name__ == "__main__":
#     convert_pickle_to_markdown()

In [5]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\llm\__init__.py"
# Package entry point: re-exports CustomModel so callers can write
# `from src.llm import CustomModel`.
from .custom_model import CustomModel

# Names exported by `from src.llm import *`.
__all__ = ["CustomModel"]

Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\llm\__init__.py


In [6]:
%%writefile "C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\__init__.py"
# Package entry point: re-exports the helper classes so callers can write
# `from src.utils import DocumentProcessor, PDFLoader, QdrantStore`.
from .document_processor import DocumentProcessor
from .pdf_loader import PDFLoader
from .qdrant_database import QdrantStore

# Names exported by `from src.utils import *`.
__all__ = ["DocumentProcessor", "PDFLoader", "QdrantStore"]

Writing C:\Users\dangq\OneDrive\M√°y t√≠nh\USTH\ICT\Internship\RAG Remake\RAG_Basic\src\utils\__init__.py
