In [11]:
from dotenv import load_dotenv
import os
from pathlib import Path

# The notebook is expected to run from a sub-folder of the project, so the
# project root (BASE_DIR) is the parent of the current working directory.
current_dir = os.getcwd()
path = Path(current_dir)
BASE_DIR = path.parent

# Load the project-level .env file; the call is the cell's last expression so
# its return value is shown as the cell output.
load_dotenv(dotenv_path=BASE_DIR / ".env")

True

In [14]:
# Milvus settings, read from the process environment (None when unset).
MILVUS_URL = os.environ.get("MILVUS_URL")
MILVUS_METRIC_TYPE = os.environ.get("MILVUS_METRIC_TYPE")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

# Model / API settings, read from the process environment (None when unset).
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
LLM_MODEL = os.environ.get("LLM_MODEL")

In [None]:
from glob import glob
from mistralai import Mistral
import time


# Initialize the Mistral client used by process_pdf_with_ocr below.
# The API key is the MISTRAL_API_KEY value read from the environment earlier.
client = Mistral(api_key=MISTRAL_API_KEY)



# Process a single PDF file using Mistral OCR
def process_pdf_with_ocr(file_path):
    """Upload a PDF to Mistral, run OCR on it and return the OCR pages.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The ``pages`` attribute of the OCR response (other cells read a
        ``.markdown`` attribute from each page), or an empty list if any step
        failed. The failure value is an empty list rather than an empty string
        so that the return type is the same on both paths; it is still falsy
        and still iterable.

    Errors are reported with ``print`` and never propagated, so a batch run
    keeps going when a single document fails.
    """
    try:
        # Upload the PDF file to Mistral
        print(f"Uploading {file_path}...")
        with open(file_path, "rb") as f:
            uploaded_pdf = client.files.upload(
                file={
                    "file_name": os.path.basename(file_path),
                    "content": f,
                },
                purpose="ocr"
            )

        # Get the signed URL for the uploaded file
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

        # Process the PDF with OCR
        print(f"Processing OCR for {file_path}...")
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            }
        )

        # The full page list is intentionally not printed here: it floods the
        # notebook output. Inspect the returned value instead.
        return ocr_response.pages

    except Exception as e:
        # Best-effort: report the problem and hand back "no pages".
        print(f"Error processing {file_path}: {str(e)}")
        return []



In [None]:
# Run OCR on one specific manual from the documents_RAG folder.
document_name = "iTNC530 機械操作手冊 MV154,MV154APC,MV204,UX300.pdf"
document_path = str(BASE_DIR / "documents_RAG" / document_name)
pdf_text = process_pdf_with_ocr(document_path)

c:\Users\surya\Downloads\Rapi\Vanessa\server_latest\AR-RAG-Project-Server-Docker\documents_RAG\iTNC530 機械操作手冊 MV154,MV154APC,MV204,UX300.pdf


In [None]:
# pdf_text is expected to be the list of OCR pages returned by
# process_pdf_with_ocr; pull the markdown out of every page.
markdowns = list(map(lambda ocr_page: ocr_page.markdown, pdf_text))

# Merge the per-page markdown into one newline-separated string.
all_markdown = "\n".join(markdowns)

# Show the merged document and keep it under the name later cells use.
print(all_markdown)

extracted_markdown = all_markdown

In [None]:
# Split text into manageable chunks with overlap.
# (The name is historical: the implementation is iterative.)
def recursive_chunk_text(text, chunk_size=1000, overlap=150):
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

    Each chunk is cut at the last space found in the first ``chunk_size``
    characters so that words are not split; if there is no space in that
    window the cut is made at ``chunk_size``. The next chunk starts
    ``overlap`` characters before the cut.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be >= 1).
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunk strings. Text no longer than ``chunk_size`` is
        returned unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    remaining = text

    # Iterating instead of recursing keeps long documents (thousands of
    # chunks) from hitting Python's recursion limit.
    while len(remaining) > chunk_size:
        # Try splitting at the closest space to avoid cutting words
        split_point = remaining.rfind(' ', 0, chunk_size)

        # If no space is found, just split at the chunk_size
        if split_point == -1:
            split_point = chunk_size

        # Starting point for the next chunk (with overlap)
        next_start = max(0, split_point - overlap)
        rest = remaining[next_start:].strip()

        if len(rest) >= len(remaining):
            # The only usable space sits inside the overlap window (common in
            # text with few spaces, e.g. CJK), so stepping back by `overlap`
            # would re-process exactly the same text forever. Fall back to a
            # hard cut at chunk_size and force at least one character of
            # forward progress.
            split_point = chunk_size
            next_start = max(1, split_point - overlap)
            rest = remaining[next_start:].strip()

        chunks.append(remaining[:split_point].strip())
        remaining = rest

    # Whatever is left fits in a single chunk
    chunks.append(remaining)
    return chunks

In [None]:
# Extract text from every PDF in documents_RAG using Mistral OCR and split it
# into overlapping chunks.
text_lines = []
for file_path in glob(os.path.join(BASE_DIR, "documents_RAG", "*.pdf")):
    # Process the PDF file with OCR (falsy result means the OCR failed)
    pdf_text = process_pdf_with_ocr(file_path)

    if pdf_text:
        # Chunk THIS document's markdown. Using the `extracted_markdown`
        # global here would re-chunk the single document processed in an
        # earlier cell once per PDF instead of the file just OCR'd.
        document_markdown = "\n".join(page.markdown for page in pdf_text)
        text_lines.extend(recursive_chunk_text(document_markdown))

        # Add a delay between API calls to avoid rate limiting
        time.sleep(1)

# Write all the chunks to a text file for external viewing.
# UTF-8 is set explicitly: the platform default encoding (e.g. cp1252 on
# Windows) cannot encode the non-ASCII text found in these manuals.
with open("chunks_output.txt", "w", encoding="utf-8") as file:
    for i, chunk in enumerate(text_lines):
        file.write(f"Chunk {i+1}:\n{chunk}\n\n")

In [None]:
import ollama

def emb_text(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` as the prompt to ``ollama.embeddings`` with the model named
    by ``EMBEDDING_MODEL`` and returns the ``"embedding"`` entry of the reply.
    """
    return ollama.embeddings(prompt=text, model=EMBEDDING_MODEL)["embedding"]


In [None]:
# Embed a throw-away sentence to discover the vector size of the model;
# embedding_dim is used when the Milvus schema is defined.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)

# Show the dimension and the first ten components, one per line.
print(embedding_dim, test_embedding[:10], sep="\n")


# Connect to Milvus

In [None]:
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, Index, utility, connections
import numpy as np
import fitz
from glob import glob

In [None]:
# Connect to Milvus: register the "default" connection alias against the
# server URI held in MILVUS_URL (read from the environment earlier).
connections.connect(alias="default", uri=MILVUS_URL)
print("Connected to Milvus!")

Terhubung ke Milvus!


In [None]:
# Start from a clean slate: if a collection with this name already exists it
# is removed (together with its data) before the schema is recreated.
collection_already_exists = utility.has_collection(COLLECTION_NAME)
if collection_already_exists:
    utility.drop_collection(COLLECTION_NAME)
    print(f"Existing collection {COLLECTION_NAME} dropped!")

In [None]:
# Define schema
# Milvus counts VARCHAR max_length in bytes, not characters. Chunks are up to
# 1000 characters long and the indexed manuals contain CJK text (3+ bytes per
# character in UTF-8), so a 2048-byte limit rejects valid chunks on insert.
# 65535 is the largest value Milvus accepts for a VARCHAR field.
TEXT_MAX_LENGTH = 65535

fields = [
    # Auto-generated integer primary key
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # Raw chunk text returned alongside search hits
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=TEXT_MAX_LENGTH),
    # Embedding of the chunk; dimension measured from the embedding model
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)
]

schema = CollectionSchema(fields, description="Koleksi embedding dokumen")

# Create new collection
collection = Collection(name=COLLECTION_NAME, schema=schema, consistency_level="Strong")
print(f"Koleksi '{COLLECTION_NAME}' berhasil dibuat!")

In [None]:
# Build an IVF_FLAT index over the "vector" field. The distance metric comes
# from MILVUS_METRIC_TYPE; nlist is the IVF cluster count.
index_params = dict(
    metric_type=MILVUS_METRIC_TYPE,
    index_type="IVF_FLAT",
    params=dict(nlist=128),
)

collection.create_index(index_params=index_params, field_name="vector")
print("Index berhasil dibuat!")

In [None]:
# Insert data
# tqdm is used below but is not imported anywhere else in this notebook.
from tqdm.auto import tqdm

# NOTE(review): `text_chunks` and `embedding_vectors` were not defined by any
# earlier cell, so this cell failed on a fresh kernel. They are derived here
# from `text_lines` (the OCR chunks) and `emb_text` — confirm this is the
# intended source. Blank chunks are skipped because they carry no content.
text_chunks = [chunk for chunk in text_lines if chunk.strip()]

# Embedding is the slow step, so this is where the progress bar belongs.
embedding_vectors = [
    emb_text(chunk)
    for chunk in tqdm(text_chunks, desc="Processing embeddings")
]

# One row per chunk; "id" is omitted because the schema auto-generates it.
entities = [
    {
        "text": text,
        "vector": vector
    }
    for text, vector in zip(text_chunks, embedding_vectors)
]

insert_result = collection.insert(entities)
collection.flush()
collection.load()
print(f"Data berhasil dimasukkan dengan total {len(text_chunks)} chunk.")

Existing collection dropped!
Koleksi berhasil dibuat!
Index berhasil dibuat!
Data berhasil dimasukkan dengan total 9 chunk.


In [17]:
# Search with the same metric the index was built with: a hard-coded metric
# that differs from MILVUS_METRIC_TYPE makes the search disagree with the
# index configuration.
search_params = {"metric_type": MILVUS_METRIC_TYPE, "params": {"nprobe": 10}}

query_text = "This manual is a guide"
# Reuse the notebook's embedding helper so that queries and stored chunks are
# always embedded by the same model.
query_vector = emb_text(query_text)

results = collection.search(
    data=[query_vector],  # Query vector(s)
    anns_field="vector",  # The field to search
    param=search_params,
    limit=5,  # Get top 5 results
    output_fields=["text"]  # Retrieve associated text
)

# Print the results (one list of hits per query vector)
for hits in results:
    for hit in hits:
        print(f"Score: {hit.distance}, Text: {hit.entity.get('text')}")

Score: 179.9971466064453, Text: Introduction
This manual is a guide for using the MITSUBISHI CNC C80 Series.
This manual describes operations, production processes and maintenances for users who operate the MITSUBISHI CNC
installed machine tool. Read this manual thoroughly before using CNC unit. Moreover study the "Precautions for Safety" on
the next page before use to use the unit safely. Be sure to keep this manual always at hand.
CAUTION
For items described as "Restrictions" or "Usable State" in this manual, the instruction manual issued by the
machine tool builder (MTB) takes precedence over this manual.
Items not described in this manual must be interpreted as "not possible".
This manual is written on the assumption that all the applicable functions are included. Some of them, however,
may not be available for your NC system. Refer to the specifications issued by the machine tool builder before
use.
Refer to the Instruction Manual issued by each MTB for details on each machine too