In [None]:
!pip install PyMuPDF pymupdf nltk spacy sentence-transformers faiss-cpu transformers accelerate
!python -m spacy download en_core_web_sm

Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m115.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Res

In [None]:
!pip install faiss-cpu
!pip install transformers accelerate



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the PDF input and JSON output paths used
# by later cells live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extract text page-by-page from a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one dict per page, in document order:
        ``{"page_number": <1-based page number>, "text": <stripped page text>}``.
        Pages without extractable text are kept, with ``"text"`` set to ``""``.

    Raises:
        Whatever ``fitz.open`` or page loading raises; the document is closed
        before the exception propagates.
    """
    # The context manager closes the document even when loading or reading a
    # page fails part-way through (previously it was closed only on success).
    with fitz.open(pdf_path) as doc:
        return [
            {
                "page_number": page_index + 1,  # human-friendly, 1-based
                "text": doc.load_page(page_index).get_text().strip(),
            }
            for page_index in range(len(doc))
        ]

In [None]:
# Location of the source manual on the mounted Google Drive.
pdf_path = "/content/drive/MyDrive/Colab Notebooks/RAG_Agent/Input/RTCU_Manual.pdf"
# One dict per page: {"page_number": <1-based>, "text": <stripped page text>}.
pages = extract_text_from_pdf(pdf_path)

for p in pages[:2]:  # Preview first 2 pages
    print(f"\n--- Page {p['page_number']} ---")
    print(p['text'][:500])  # Preview first 500 characters



--- Page 1 ---
RTCU IDE Users Manual
© 2025 Logic IO, www.logicio.com
Version 9.98

--- Page 2 ---
* * * * THIS PAGE IS INTENTIONALLY LEFT BLANK * * *


In [None]:
import json

# Persist the extracted pages as UTF-8 JSON in the Input folder on Drive.
# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
with open("/content/drive/MyDrive/Colab Notebooks/RAG_Agent/Input/extracted_pages.json", "w", encoding="utf-8") as f:
    json.dump(pages, f, ensure_ascii=False, indent=2)


In [None]:
import spacy
# Load the small English pipeline once at module level; split_into_sentences
# reuses this `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")

def split_into_sentences(text):
    """
    Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to segment. ``None``, empty or whitespace-only input
            yields an empty list without invoking the pipeline.

    Returns:
        A list of sentence strings with surrounding whitespace removed.
        Spans that are empty after stripping are dropped, so callers never
        receive ``""`` entries.
    """
    if not text or not text.strip():
        return []

    stripped_sentences = (sent.text.strip() for sent in nlp(text).sents)
    return [sentence for sentence in stripped_sentences if sentence]


In [None]:
def chunk_text_from_pages(pages, chunk_size=300, overlap=50):
    """
    Group each page's sentences into chunks of roughly ``chunk_size`` words.

    Sentences are never split: a chunk is emitted as soon as the running word
    count reaches ``chunk_size``. The next chunk on the same page then starts
    with the last ``overlap`` words of the chunk just emitted. Chunks never
    span pages.

    Args:
        pages: Iterable of dicts with ``"page_number"`` and ``"text"`` keys.
        chunk_size: Word count at which the current chunk is emitted.
        overlap: Number of trailing *words* carried into the next chunk.
            Values below zero are treated as zero.

    Returns:
        A list of ``{"page_number": ..., "text": ...}`` dicts in page order.
    """
    overlap = max(overlap, 0)
    all_chunks = []

    for page in pages:
        current_chunk = []        # text pieces making up the chunk being built
        total_words = 0           # word count of current_chunk
        has_new_content = False   # True once a sentence was added after the last emit

        for sentence in split_into_sentences(page["text"]):
            words = sentence.split()
            if not words:
                # Whitespace-only "sentences" carry no content.
                continue

            current_chunk.append(sentence)
            total_words += len(words)
            has_new_content = True

            if total_words >= chunk_size:
                chunk_text = " ".join(current_chunk).strip()
                all_chunks.append({
                    "page_number": page["page_number"],
                    "text": chunk_text
                })

                # Carry over whole words. Slicing the joined string would count
                # characters and cut mid-word, and a plain [-0:] slice would
                # carry the entire chunk when overlap is 0.
                tail_words = chunk_text.split()[-overlap:] if overlap else []
                current_chunk = [" ".join(tail_words)] if tail_words else []
                total_words = len(tail_words)
                has_new_content = False

        # Emit the remainder only if it holds more than the carried-over tail;
        # otherwise the page would end with a chunk that merely repeats text.
        if has_new_content:
            all_chunks.append({
                "page_number": page["page_number"],
                "text": " ".join(current_chunk).strip()
            })

    return all_chunks

# Chunk every extracted page with the default chunk_size/overlap settings,
# then report the chunk count and preview the first 200 characters of chunk 0.
chunks = chunk_text_from_pages(pages)
print(f"Total chunks: {len(chunks)}")
print("Sample chunk:")
print(chunks[0]["text"][:200])


Total chunks: 2353
Sample chunk:
RTCU IDE Users Manual
© 2025 Logic IO, www.logicio.com
Version 9.98


In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both chunk and query vectors.
# NOTE: the LLM-loading cell further down rebinds the name `model` to the causal
# LM, so after that cell runs `model` no longer refers to this embedder.
model = SentenceTransformer('all-MiniLM-L6-v2')

def embed_chunks(chunks):
    """
    Attach an embedding vector to every chunk dict.

    The ``"text"`` of all chunks is encoded in one call to the module-level
    ``model``. Each chunk dict is then updated in place with an
    ``"embedding"`` key holding the vector as a plain (JSON-serialisable)
    list.

    Returns:
        The same ``chunks`` list that was passed in, now carrying embeddings.
    """
    vectors = model.encode(
        [entry["text"] for entry in chunks],
        show_progress_bar=True,
    )

    # Pair each chunk with its vector; .tolist() makes the vector JSON-safe.
    for entry, vector in zip(chunks, vectors):
        entry["embedding"] = vector.tolist()

    return chunks


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# embed_chunks adds an "embedding" key to each dict in `chunks` in place and
# returns that same list, so `embedded_chunks` and `chunks` are the same object.
embedded_chunks = embed_chunks(chunks)
print(f"Total embedded chunks: {len(embedded_chunks)}")
print("Sample chunk with embedding:")
print(embedded_chunks[0]["text"][:200])
print("\nVector snippet:", embedded_chunks[0]["embedding"][:5])  # First 5 values

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Total embedded chunks: 2353
Sample chunk with embedding:
RTCU IDE Users Manual
© 2025 Logic IO, www.logicio.com
Version 9.98

Vector snippet: [-0.01452592946588993, -0.0146857388317585, -0.09026658535003662, 0.048196788877248764, 0.01550771202892065]


In [None]:
import faiss
import numpy as np

def build_faiss_index(embedded_chunks):
    """
    Build a flat L2 (Euclidean) FAISS index over the chunk embeddings.

    Args:
        embedded_chunks: Non-empty list of dicts with ``"embedding"`` and
            ``"text"`` keys; ``"page_number"`` is optional.

    Returns:
        ``(index, texts, metadata)`` where ``texts[i]`` and ``metadata[i]``
        describe the vector stored at position ``i`` of ``index``.
    """
    # Dimensionality is taken from the first embedding (typically 384 for MiniLM).
    dimension = len(embedded_chunks[0]["embedding"])
    index = faiss.IndexFlatL2(dimension)

    # One float32 row per chunk, in the same order as the lookup lists below.
    matrix = np.array(
        [item["embedding"] for item in embedded_chunks],
        dtype=np.float32,
    )
    index.add(matrix)

    # Parallel lookup tables used to map search hits back to their source.
    texts = [item["text"] for item in embedded_chunks]
    metadata = [
        {"page_number": item.get("page_number")}
        for item in embedded_chunks
    ]

    return index, texts, metadata


In [None]:
def search_faiss(query, model, index, texts, metadata, top_k=3):
    """
    Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Natural-language query string.
        model: Embedding model exposing ``encode(list_of_str)``; it must be
            the same model that produced the vectors stored in ``index``.
        index: FAISS index to search.
        texts: Chunk texts, aligned with the index positions.
        metadata: Chunk metadata dicts, aligned with the index positions.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``{"text", "metadata", "distance"}`` dicts in the order
        returned by the index (nearest first). The list is shorter than
        ``top_k`` when the index holds fewer vectors.
    """
    # FAISS expects a 2-D float32 matrix with one row per query.
    query_matrix = np.asarray(model.encode([query]), dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_matrix, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with label -1; using it as a list index
        # would silently return the *last* chunk instead of "no result".
        if idx < 0:
            continue
        position = int(idx)
        results.append({
            "text": texts[position],
            "metadata": metadata[position],
            "distance": float(distance),
        })

    return results


In [None]:
# Build the index once; `texts` and `metadata` are lookup lists aligned with
# the vector positions inside `index`.
index, texts, metadata = build_faiss_index(embedded_chunks)

# Smoke-test retrieval: `model` is passed to search_faiss to encode the query.
query = "How does RTCU handle Modbus?"
results = search_faiss(query, model, index, texts, metadata, top_k=3)

# Show each hit with its source page and the first 500 characters of text.
for i, res in enumerate(results):
    print(f"\n🔹 Result {i+1} (Page {res['metadata']['page_number']}):\n")
    print(res['text'][:500])  # Show snippet



🔹 Result 1 (Page 1212):

© 2025 Logic IO, www.logicio.com
1172
RTCU IDE Users Manual
Standard Function Library
modbus: Functions for MODBUS
4.2.28
modbus: MODBUS communication
4.2.28.1
MODBUS is a messaging protocol for master/slave communication between devices connected
on different types of buses or networks and has been the serial de facto standard of the industry
since 1979. The MODBUS API is an alternative to the I/O Extension feature which gives the application full
control over the MODBUS communication - thus al

🔹 Result 2 (Page 1246):

© 2025 Logic IO, www.logicio.com
1206
RTCU IDE Users Manual
Standard Function Library
net_id : INT
The ID of the MODBUS connection. unit_id : INT
The address of the device on the MODBUS connection to receive the data. Note: if the MODBUS connection is opened in slave mode, this parameter is ignored.
frame : PTR
Address of the buffer that contains the package to send. size : INT
Number of bytes to send from the buffer. Returns: INT
0
- Success.

In [None]:
def build_rag_prompt(query, context_chunks):
    """
    Assemble the LLM prompt from the question and the retrieved chunks.

    The prompt consists of an instruction line, a ``### Context:`` section
    holding the chunk texts separated by blank lines, a ``### Question:``
    section holding ``query``, and a trailing ``### Answer:`` marker.
    """
    passages = [chunk["text"] for chunk in context_chunks]
    joined_passages = "\n\n".join(passages)

    sections = (
        "You are a helpful assistant. Use the information below to answer the question.",
        f"### Context:\n{joined_passages}",
        f"### Question:\n{query}",
        "### Answer:",
    )
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Alternative checkpoints that can be swapped in here:
# model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # You can also try 'tiiuae/falcon-rw-1b' or 'microsoft/phi-2'
model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# WARNING: this rebinds the global `model`, which up to this point referred to the
# SentenceTransformer embedder. Any cell that passes `model` to search_faiss after
# this line hands it the causal LM, which has no `.encode` method (this is the
# AttributeError shown in the notebook's final output).
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def generate_answer(prompt, max_tokens=300):
    """
    Generate a completion for ``prompt`` with the module-level LLM.

    Uses the module-level ``tokenizer`` and ``model`` (the causal LM) and
    samples with temperature 0.7 / top-p 0.95, so output varies between calls.

    Args:
        prompt: The full prompt text, e.g. from ``build_rag_prompt``.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence is prompt + continuation. Slice off the prompt
    # tokens rather than splitting the decoded text on "### Answer:": the
    # marker can recur in the generation (or in the retrieved context), and a
    # text split would then return only the text after its last occurrence.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return completion.strip()


In [None]:
# Must name the model whose vectors are stored in the FAISS index.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
_embedder_cache = {}


def _get_embedder(name=EMBEDDING_MODEL_NAME):
    """Return a SentenceTransformer for ``name``, loading it at most once."""
    if name not in _embedder_cache:
        _embedder_cache[name] = SentenceTransformer(name)
    return _embedder_cache[name]


def rag_pipeline(query, embedder=None, top_k=3):
    """
    Answer ``query`` using retrieval-augmented generation.

    Args:
        query: The user's question.
        embedder: Optional embedding model exposing ``encode``. Defaults to a
            cached SentenceTransformer for ``EMBEDDING_MODEL_NAME``.
        top_k: Number of chunks to retrieve as context.

    Returns:
        ``(answer, results)``: the generated answer string and the retrieved
        chunks that were placed in the prompt.
    """
    # Do not pass the global `model` here: by the time this runs it has been
    # rebound to the causal LM, which has no `.encode` and made retrieval fail
    # with AttributeError.
    if embedder is None:
        embedder = _get_embedder()

    # Step 1: Semantic search
    results = search_faiss(query, embedder, index, texts, metadata, top_k=top_k)

    # Step 2: Prompt building
    prompt = build_rag_prompt(query, results)

    # Step 3: Generate answer
    answer = generate_answer(prompt)

    return answer, results


In [None]:
# End-to-end run: retrieve context for the question, build the prompt, generate.
question = "What are the core integration features of RTCU?"
# rag_pipeline returns the generated answer and the retrieved chunks it used.
answer, retrieved_context = rag_pipeline(question)

print("🔍 Answer:")
print(answer)


AttributeError: 'PhiForCausalLM' object has no attribute 'encode'