In [46]:
import os
import re
import glob
import pymupdf4llm  # pip install pymupdf4llm
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
from dotenv import load_dotenv
from openai import OpenAI
from typing import List, Dict, Tuple
import pdfplumber
import pytesseract
from pdf2image import convert_from_path


In [47]:
# --- Sub-function 1: Table Formatter ---
def _table_to_markdown(table: List[List[str]]) -> str:
    """Converts a raw list-of-lists table into a Markdown string."""
    if not table or len(table) < 2:
        return ""

    # Clean cell content (remove newlines within cells)
    clean_table = [[str(cell).replace('\n', ' ').strip() if cell is not None else "" for cell in row] for row in table]

    # Construct Markdown parts
    header = "| " + " | ".join(clean_table[0]) + " |"
    separator = "| " + " | ".join(["---"] * len(clean_table[0])) + " |"

    body_rows = []
    for row in clean_table[1:]:
        body_rows.append("| " + " | ".join(row) + " |")

    return f"\n{header}\n{separator}\n" + "\n".join(body_rows) + "\n"

# --- Sub-function 2: Hybrid Extractor (OCR + Digital) ---
def _extract_pdf_content(pdf_path: str) -> List[Dict]:
    """
    Extracts text and tables from a single PDF.
    Uses OCR fallback if page text is insufficient (< 100 chars).
    """
    extracted_pages = []
    print(f"Processing: {os.path.basename(pdf_path)}")

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                page_content = []

                # 1. Attempt Standard Extraction
                tables = page.extract_tables()
                text = page.extract_text() or ""
                clean_text = text.strip()

                # 2. Check if OCR is needed (Hybrid Approach)
                if len(clean_text) < 100:
                    # Render page to image
                    # Note: Linux/WSL requires poppler-utils installed
                    images = convert_from_path(pdf_path, first_page=i+1, last_page=i+1, dpi=200)
                    if images:
                        ocr_text = pytesseract.image_to_string(images[0])
                        clean_text = ocr_text.strip()

                if clean_text:
                    page_content.append(clean_text)

                # 3. Format Tables (if found digitally)
                if tables:
                    page_content.append("\n\n### Data Tables:\n")
                    for table in tables:
                        md_table = _table_to_markdown(table)
                        page_content.append(md_table)

                # 4. Compile Page Result
                final_content = "\n".join(page_content)

                # Only add pages that actually have content
                if len(final_content.strip()) > 0:
                    extracted_pages.append({
                        "content": final_content,
                        "metadata": {
                            "source": os.path.basename(pdf_path),
                            "page": i + 1
                        }
                    })

    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")

    return extracted_pages

# --- Sub-function 3: Cleaner and Splitter ---
def _clean_and_split(raw_docs: List[Dict], chunk_size: int = 1000, chunk_overlap: int = 200) -> Tuple[List[str], List[Dict]]:
    """
    Cleans the extracted text and splits it into chunks while preserving metadata.
    """
    # Initialize the splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )

    all_splits = []
    all_metadatas = []

    for doc in raw_docs:
        text = doc['content'] # Note: key matches extraction output
        metadata = doc['metadata']

        # 1. Normalization: Replace 3+ newlines with 2 to fix spacing issues
        text = re.sub(r'\n{3,}', '\n\n', text)

        # 2. Split
        chunks = text_splitter.split_text(text)

        # 3. Align Metadata
        for chunk in chunks:
            all_splits.append(chunk)
            all_metadatas.append(metadata.copy())

    return all_splits, all_metadatas

# --- Main Entry Point ---
def process_directory_for_rag(directory_path: str) -> Tuple[List[str], List[Dict]]:
    """
    Main function to process all PDFs in a directory.

    Returns:
        tuple: (List of text chunks, List of corresponding metadata dicts)
    """
    pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {directory_path}")

    raw_documents = []

    # Step 1: Extraction
    for pdf_file in pdf_files:
        pages = _extract_pdf_content(pdf_file)
        raw_documents.extend(pages)

    print(f"Total pages extracted: {len(raw_documents)}")

    # Step 2: Cleaning & Splitting
    final_splits, final_metadatas = _clean_and_split(raw_documents)

    print(f"Total chunks created: {len(final_splits)}")

    return final_splits, final_metadatas

In [48]:
# rag_data_dir = os.path.join(os.path.dirname(__file__), "rag_data")
rag_data_dir = "./rag_data"
splits, metadata = process_directory_for_rag(rag_data_dir)

Found 2 PDF files in ./rag_data
Processing: STCW_guide_english.pdf
Processing: training-manual-vol-i-pre-sea.pdf
Total pages extracted: 121
Total chunks created: 422


## Testing

In [49]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print(f"Split into {len(splits)} chunks.")
print("Creating Vector DB...")
# Create ChromaDB instance in memory
vector_db = Chroma.from_texts(
    texts=splits,
    metadatas=metadata,
    embedding=embedding_model,
    collection_name="secret_project"
)

print("Data ingested!")

Split into 422 chunks.
Creating Vector DB...
Data ingested!


In [50]:
def search(query: str, top_k_retrieval=20, top_k_rerank=5):

    # Stage 1: Semantic Retrieval (Bio-Encoder)
    results = vector_db.similarity_search(
        query=query,
        k=top_k_retrieval
    )
    # print(results)
    # Stage 2: Re-ranking (Cross-Encoder)
    # Prepare pairs: (Query, Document_Context)
    pairs = [[query, doc.page_content] for doc in results]

    # Predict scores
    scores = cross_encoder.predict(pairs)

    # Sort by score (descending)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # Stage 3: Return top-k results
    retrieved = []
    for rank, idx in enumerate(ranked_indices[:top_k_rerank]):
        retrieved.append({
            "rank": rank+1,
            "score": scores[idx],
            "content": results[idx].page_content,
            "page": results[idx].metadata['page'],
            "source": results[idx].metadata['source']
        })

    return retrieved


In [51]:
query = "What is the classification of shipboard tasks?"
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
docs = search(query)
print("--- Retrieved Context ---")
for doc in docs:
    print(f"Content \n: {doc}")

--- Retrieved Context ---
Content 
: {'rank': 1, 'score': np.float32(2.8034935), 'content': 'INTERNATIONAL TRANSPORT WORKERS’ FEDERATION\n\n### Glossary of terms\n\nYou need to know the meaning of some basic terms that will be used in this guide:\n\nAdministration:\n\nThe government of the Party (country) whose flag a ship is entitled to fly. An\n\nadministration deals, among other things, with regulating the training, education and\n\ncertification of seafarers in accordance with the requirements of the convention.\n\nApproved:\n\nThis means that a national maritime administration has approved a training\n\nprogramme or seagoing service because it meets the requirements of the amended\n\nSTCW Convention.\n\nFunction:\n\nThis is a way of classifying shipboard tasks by functions and levels of responsibility.\n\nThere are seven functional areas, at three different levels of responsibility. The levels\n\nof responsibility are: **management** level (applies to senior officers); **operation

In [52]:
from typing import List

load_dotenv()

class RAGGenerator:
    def __init__(self, api_key: str = None, base_url: str = None, model_name: str = "gpt-5-nano:free"):
        """
        Initialize the OpenAI-compatible client.

        Args:
            api_key: Your API Key (use "dummy" for local models like Ollama)
            base_url: The API endpoint (e.g., "http://localhost:11434/v1" for Ollama)
            model_name: The specific model to target (e.g., "llama3", "gpt-4o")
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            base_url=base_url or os.getenv("OPENAI_BASE_URL")
        )
        self.model_name = model_name

    def construct_prompt(self, query: str, context_chunks: List[str]) -> str:
        """
        Builds the prompt by combining the user query with retrieved context.
        """
        # Join chunks with a clear separator
        context_str = "\n\n---\n\n".join(context_chunks)

        prompt = f"""You are a helpful assistant for maritime regulations.
Answer the user's question based ONLY on the following context.
If the answer is not in the context, say "I don't know."

The context may contain Markdown tables. Please interpret the rows and columns accurately.

### CONTEXT:
{context_str}

### USER QUESTION:
{query}

### ANSWER:
"""
        return prompt

    def generate_answer(self, query: str, context_chunks: List[str]) -> str:
        """
        Sends the prompt to the LLM and returns the response.
        """
        prompt = self.construct_prompt(query, context_chunks)

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a precise technical assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1, # Keep strict for RAG to avoid hallucinations
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error during inference: {e}"


In [53]:
rag = RAGGenerator()

In [58]:
# 1. SIMULATE RETRIEVAL
query = "What are the specific carpet area requirements for classrooms in a Maritime Training Institute based on student intake capacity?"
docs = search(query)
context = ["-> ".join([doc['source'], doc['content']]) for doc in docs]
# 2. GENERATE RESPONSE
print(f"Query: {query}\n")
answer = rag.generate_answer(query, context)
print(f"Response:\n{answer}")

['training-manual-vol-i-pre-sea.pdf-> 2.9. Faculty room\n2.9.1. A separate room not less than 8 m2 shall be provided for the Principal/head of\nInstitute. A carpet area of not less than 4 m2for each full-time faculty member shall be\nprovided. Modular separation between each faculty space is recommended.\n2.9.2. The faculty shall also be provided with separate chair, table and cupboard.\nAdditional space, table and chairs shall be provided for visiting faculty. Ventilation and\nlighting arrangement shall be same as for classrooms.\n2.10. Classroom requirements\n2.10.1. Class-room: The carpet area requirement of the class rooms and tutorial rooms\ndepends upon the number of students and type of seating arrangement. The size (carpet\narea) of the classroom shall be 30 m2, 36 m2, and 50 m2 for intake capacity of 20, 24 and\n40 candidates respectively. Institutes approved prior to 1st November, 2016 may continue\nwith the prevalent classroom size. However, if they apply for increase in cap

In [60]:
query = "Summarize the mandatory minimum rest hour requirements for watch-keeping personnel under the STCW Convention, including how these periods can be divided and any permissible exceptions."
docs = search(query)
context = ["-> ".join([doc['source'], doc['content']]) for doc in docs]
# 2. GENERATE RESPONSE
print(f"Query: {query}\n")
answer = rag.generate_answer(query, context)
print(f"Response:\n{answer}")

Query: Summarize the mandatory minimum rest hour requirements for watch-keeping personnel under the STCW Convention, including how these periods can be divided and any permissible exceptions.

Response:
- 10 hours of rest in any 24-hour period for watch-keeping personnel (calculated from the time the watch starts; not from 00:00).

- The 10 hours may be divided into up to two rest periods, provided:
  - one period is at least 6 hours long, and
  - no period is shorter than 1 hour.

- In any seven-day period, the total rest must not be less than 77 hours.

- The minimum rest period is not obligatory in emergencies, drills, or overriding operational conditions.

- There is an exceptional provision in the 2010 STCW: administrations may allow an exception where rest is not less than 70 hours in any seven-day period and the exception lasts up to two weeks. Within those two weeks, rest can be divided into three periods, with the interval between two exemption periods at least twice the durat