In [11]:
from dotenv import load_dotenv
import os
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
current_dir = os.getcwd()
path = Path(current_dir)
BASE_DIR = path.parent

# Load environment variables from the .env file in the project root.
# The call is deliberately the last expression of the cell so the notebook
# displays load_dotenv's return value.
dotenv_file = BASE_DIR / ".env"
load_dotenv(dotenv_file)

True

# Load and Embed Manual File

In [8]:
import fitz  # PyMuPDF
from glob import glob

# Text chunking function to split text into manageable chunks
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Each chunk is cut at the last space found within the first ``chunk_size``
    characters so that words are not split. If that window contains no space
    at all (e.g. one very long token), the text is hard-split at
    ``chunk_size`` instead of producing an oversized chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings in their original order. Empty input yields
        an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        # A non-positive size can never make progress and would loop forever.
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    while len(text) > chunk_size:
        # Last space inside the window text[0:chunk_size]; -1 if there is none.
        split_point = text.rfind(' ', 0, chunk_size)
        if split_point == 0:
            # The only space is the leading one: drop surrounding whitespace
            # and retry rather than emitting an empty chunk.
            text = text.strip()
            continue
        if split_point < 0:
            # No space to break on: fall back to a hard cut at the size limit.
            split_point = chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:].strip()
    if text:
        chunks.append(text)
    return chunks


In [9]:
import ollama
import os

def emb_text(text):
    """Return the embedding vector for ``text`` from the configured Ollama model.

    The model name is read from the ``EMBEDDING_MODEL`` environment variable
    on every call.

    Args:
        text: The string to embed.

    Returns:
        The value stored under the ``"embedding"`` key of the Ollama response.

    Raises:
        ValueError: If ``EMBEDDING_MODEL`` is unset or empty.
    """
    model_name = os.getenv("EMBEDDING_MODEL")
    if not model_name:
        # Fail fast with a clear message instead of sending model=None to Ollama.
        raise ValueError(
            "EMBEDDING_MODEL environment variable is not set; "
            "check that the .env file was loaded."
        )
    response = ollama.embeddings(model=model_name, prompt=text)
    return response["embedding"]

In [10]:
# **1️⃣ Extract and embed text from the PDFs**
# Only the first MAX_PAGES_PER_PDF pages of each document are read.
MAX_PAGES_PER_PDF = 5

text_chunks = []
embedding_vectors = []

# The pattern contains no "**", so glob's recursive flag would have no effect
# and is omitted. sorted() makes the chunk order reproducible across runs.
for file_path in sorted(glob("documents_RAG/*.pdf")):
    # The context manager closes the PDF handle even if extraction fails.
    with fitz.open(file_path) as doc:
        # zip() with a range stops after MAX_PAGES_PER_PDF pages (or earlier
        # for shorter documents) without touching the remaining pages.
        pdf_text = "".join(
            page.get_text() for _, page in zip(range(MAX_PAGES_PER_PDF), doc)
        )

    # Split the PDF text into chunks and keep the chunked text
    chunks = chunk_text(pdf_text)
    text_chunks.extend(chunks)

    # Convert each chunk to an embedding, in the same order as text_chunks
    for chunk in chunks:
        embedding_vectors.append(emb_text(chunk))

# The two lists are paired by position downstream, so they must stay aligned.
assert len(text_chunks) == len(embedding_vectors)

print(len(embedding_vectors))

9


# Connect to Milvus

In [4]:
import os

from pymilvus import connections

# Connect to Milvus. Host and port can be overridden through the MILVUS_HOST
# and MILVUS_PORT environment variables; the defaults are the values that
# were previously hard-coded here.
MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")

connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)
print("Terhubung ke Milvus!")

Terhubung ke Milvus!


In [16]:
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, Index, utility
import numpy as np
import fitz  # PyMuPDF
from glob import glob
from dotenv import load_dotenv
import ollama
import os

# Validate everything *before* dropping the existing collection, so a bad
# configuration or an empty extraction run cannot destroy previously stored data.
collection_name = os.getenv("COLLECTION_NAME")
if not collection_name:
    raise ValueError(
        "COLLECTION_NAME environment variable is not set; "
        "check that the .env file was loaded."
    )
if not embedding_vectors:
    raise ValueError("No embeddings to insert; run the PDF extraction cell first.")
if len(text_chunks) != len(embedding_vectors):
    # zip() below would silently drop the unmatched tail.
    raise ValueError(
        f"text_chunks ({len(text_chunks)}) and embedding_vectors "
        f"({len(embedding_vectors)}) have different lengths."
    )

# Derive the vector dimension from the data instead of hard-coding it, so the
# schema always matches whatever embedding model is configured.
vector_dim = len(embedding_vectors[0])
if any(len(vector) != vector_dim for vector in embedding_vectors):
    raise ValueError("Embedding vectors do not all have the same dimension.")

# Drop existing collection if it exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)
    print("Existing collection dropped!")

# Schema: auto-generated primary key, the chunk text, and its embedding.
# NOTE(review): the VARCHAR max_length limit is presumably measured in bytes,
# so chunks with many multi-byte characters could exceed 2048 - confirm.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=vector_dim),
]

schema = CollectionSchema(fields, description="Koleksi embedding dokumen")

# Create new collection
collection = Collection(name=collection_name, schema=schema)
print("Koleksi berhasil dibuat!")

# Create index on vector field.
# NOTE(review): "IP" ranks by raw inner product; if the embeddings are not
# unit-normalised, longer vectors are favoured - confirm this is intended.
# Any search against this collection must use the same metric_type.
index_params = {
    "metric_type": "IP",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
print("Index berhasil dibuat!")

# Insert data: one row per chunk, pairing each text with its embedding.
entities = [
    {
        "text": text,
        "vector": vector
    }
    for text, vector in zip(text_chunks, embedding_vectors)
]

insert_result = collection.insert(entities)
collection.flush()  # persist the inserted rows
collection.load()   # load the collection into memory so it can be searched
# Report the number of rows Milvus actually accepted, not the input length.
print(f"Data berhasil dimasukkan dengan total {insert_result.insert_count} chunk.")

Existing collection dropped!
Koleksi berhasil dibuat!
Index berhasil dibuat!
Data berhasil dimasukkan dengan total 9 chunk.


In [17]:
# The metric here must match the one the index was created with ("IP").
search_params = {"metric_type": "IP", "params": {"nprobe": 10}}
TOP_K = 5  # number of nearest chunks to return

query_text = "This manual is a guide"
# Embed the query with the same helper used for the documents, so the query
# and the stored vectors always come from the same model configuration.
query_vector = emb_text(query_text)

results = collection.search(
    data=[query_vector],  # Query vector(s)
    anns_field="vector",  # The field to search
    param=search_params,
    limit=TOP_K,
    output_fields=["text"]  # Retrieve associated text
)

# Print the results: one list of hits per query vector
for hits in results:
    for hit in hits:
        print(f"Score: {hit.distance}, Text: {hit.entity.get('text')}")

Score: 179.9971466064453, Text: Introduction
This manual is a guide for using the MITSUBISHI CNC C80 Series.
This manual describes operations, production processes and maintenances for users who operate the MITSUBISHI CNC
installed machine tool. Read this manual thoroughly before using CNC unit. Moreover study the "Precautions for Safety" on
the next page before use to use the unit safely. Be sure to keep this manual always at hand.
CAUTION
For items described as "Restrictions" or "Usable State" in this manual, the instruction manual issued by the
machine tool builder (MTB) takes precedence over this manual.
Items not described in this manual must be interpreted as "not possible".
This manual is written on the assumption that all the applicable functions are included. Some of them, however,
may not be available for your NC system. Refer to the specifications issued by the machine tool builder before
use.
Refer to the Instruction Manual issued by each MTB for details on each machine too