In [5]:
from dotenv import load_dotenv
import os
from pathlib import Path

current_dir = os.getcwd()
path = Path(current_dir)
BASE_DIR = path.parent  # Go one level up

load_dotenv(BASE_DIR / ".env")

True

In [6]:
MILVUS_URL = os.getenv("MILVUS_URL")
MILVUS_METRIC_TYPE = os.getenv("MILVUS_METRIC_TYPE")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
LLM_MODEL = os.getenv("LLM_MODEL")

## Extract File

In [None]:
from glob import glob
from mistralai import Mistral
import time


# Initialize Mistral client
client = Mistral(api_key=MISTRAL_API_KEY)


# Process a single PDF file using Mistral OCR
def process_pdf_with_ocr(file_path):
    try:
        # Upload the PDF file to Mistral
        print(f"Uploading {file_path}...")
        with open(file_path, "rb") as f:
            uploaded_pdf = client.files.upload(
                file={
                    "file_name": os.path.basename(file_path),
                    "content": f,
                },
                purpose="ocr"
            )
        
        # Get the signed URL for the uploaded file
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
        
        # Process the PDF with OCR
        print(f"Processing OCR for {file_path}...")
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            }
        )
        
        # Extract the text content from the OCR response
        # The actual structure depends on Mistral's API response format
        # This is a placeholder - adjust based on actual API response structure
        extracted_text = ocr_response.pages
        
        print(ocr_response.pages)

        return extracted_text
    
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return ""



In [None]:
document_path = os.path.join(BASE_DIR, "documents_RAG", "iTNC530 機械操作手冊 MV154,MV154APC,MV204,UX300.pdf")
pdf_text = process_pdf_with_ocr(document_path)

c:\Users\surya\Downloads\Rapi\Vanessa\server_latest\AR-RAG-Project-Server-Docker\documents_RAG\iTNC530 機械操作手冊 MV154,MV154APC,MV204,UX300.pdf


In [None]:
# Assuming pdf_text is a list of OCRPageObject elements
markdowns = [page.markdown for page in pdf_text]

# If you want to join all markdowns together into one single string (optional):
all_markdown = "\n".join(markdowns)

# Display or process the extracted markdowns
print(all_markdown)

extracted_markdown=all_markdown   

In [None]:
# Recursive function to split text into manageable chunks with overlap (kept from original code)
def recursive_chunk_text(text, chunk_size=1000, overlap=150):
    chunks = []
    
    # Base case: if the text is smaller than chunk_size, return it as is
    if len(text) <= chunk_size:
        chunks.append(text)
        return chunks
    
    # Try splitting the text at the closest space to avoid cutting words
    split_point = text.rfind(' ', 0, chunk_size)
    
    # If no space is found, just split at the chunk_size
    if split_point == -1:
        split_point = chunk_size
    
    # Split the text and add the first chunk
    chunks.append(text[:split_point].strip())
    
    # Calculate the starting point for the next chunk (with overlap)
    next_start = max(0, split_point - overlap)
    
    # Recursively process the remaining part, including the overlap
    chunks.extend(recursive_chunk_text(text[next_start:].strip(), chunk_size, overlap))
    
    return chunks

In [None]:
# Extract text from PDF files using Mistral OCR
text_lines = []
for file_path in glob(os.path.join(BASE_DIR, "documents_RAG", "*.pdf"), recursive=True):
    # Process the PDF file with OCR
    pdf_text = process_pdf_with_ocr(file_path)
    
    # Apply recursive chunking to split the extracted text into smaller sections
    if pdf_text:
        text_lines.extend(recursive_chunk_text(extracted_markdown))
        
        # Add a delay between API calls to avoid rate limiting
        time.sleep(1)

# Write all the chunks to a text file for external viewing
with open("chunks_output.txt", "w") as file:
    for i, chunk in enumerate(text_lines):
        file.write(f"Chunk {i+1}:\n{chunk}\n\n")

In [None]:
import ollama

def emb_text(text):
    response = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)
    return response["embedding"]


In [None]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])


## Load Extracted Chunks

In [3]:
# Read the saved text chunks from the file
text_chunks = []

with open(os.path.join(BASE_DIR, 'data', 'processed', 'chunks_output.txt'), "r", encoding="utf-8") as file:
    chunk = ""
    for line in file:
        # Detect new chunk start
        if line.startswith("Chunk "):
            if chunk:
                text_chunks.append(chunk.strip())  # Save previous chunk
            chunk = ""  # Start new chunk
        else:
            chunk += line  # Append line to chunk

    # Save the last chunk
    if chunk:
        text_chunks.append(chunk.strip())

# Print first 3 chunks for verification
print("\n".join(text_chunks[:3]))

# BLASER 

## we cut faster

## USER MANUAL

## FOR <br> MV154 / MV154APC series, MV204 series \& UX300 <br> Heidenhain iTNC530
# CONTENTS 

1. Safety ..... 5
1.1 Intended Use ..... 5
1.2.1 Safe installation ..... 5
1.2.2 Machine guarding ..... 6
1.2.3 Software ..... 6
1.2.4 Authorized personnel and training ..... 6
1.2.5 Safe working practice ..... 6
1.3 Safety Cautions List ..... 7
1.4 Safety Devices ..... 10
1.4.1 Emergency Stop ..... 10
1.4.2 Guard ..... 10
1.4.3 Window ..... 11
1.4.4 Door Interlock ..... 11
1.4.5 Cabinet door switch ..... 11
1.6 Residual Risks ..... 12
2. Introduction ..... 13
2.1 Consumption Material ..... 13
2.1.1 Lubrication oil for linear rail and ballscrew ..... 13
2.1.2 Lubrication oil for pneumatic system ..... 13
2.1.3 Cutting fluid ..... 13
2.1.4 Filter for cutting fluid ..... 13
2.1.5 Coolant of cooler ..... 14
2.2 Operation Panel ..... 15
2.3 Buttons/Knobs on chip conveyor
luid ..... 13
2.1.4 Filter for cutting fluid ..... 13
2.1.5 Coolant of cooler ...

In [6]:
import ollama

def emb_text(text):
    response = ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)
    return response["embedding"]
    


In [7]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])


1024
[0.2327725738286972, 0.42580944299697876, 0.19741831719875336, 0.4612889289855957, -0.4603482782840729, -0.14149177074432373, -0.18266350030899048, -0.07604783028364182, 0.39978229999542236, 0.8336597681045532]


In [19]:
# EMBED TEXT
embedding_vectors = [emb_text(text_chunk) for text_chunk in text_chunks]
print(len(embedding_vectors))

75


In [20]:
!pip install pickle

ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle

[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import pickle

with open(os.path.join(BASE_DIR, 'data', 'processed', 'embeddings.pkl'), 'wb') as f:
    pickle.dump(embedding_vectors, f)

## Connect to Milvus

In [2]:
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, Index, utility, connections
import numpy as np
import fitz
from glob import glob
import tqdm

In [7]:
# Connect to milvus
connections.connect(alias="default", uri=MILVUS_URL)
print("Connected to Milvus!")

Connected to Milvus!


In [8]:
# Drop existing collection if it exists
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)
    print(f"Existing collection {COLLECTION_NAME} dropped!")

Existing collection vanessa_rag_collection dropped!


In [None]:
# Define schema
fields = [
    FieldSchema(name="file_uuid", dtype=DataType.VARCHAR, max_length=36, is_primary=True),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=get_embedding_dimension())
]

schema = CollectionSchema(fields, description="Koleksi embedding dokumen")

# Create new collection
collection = Collection(name=COLLECTION_NAME, schema=schema, consistency_level="Strong")
print(f"Koleksi '{COLLECTION_NAME}' berhasil dibuat!")

Koleksi 'vanessa_rag_collection' berhasil dibuat!


In [18]:
# Create index on vector field
index_params = {
    "metric_type": MILVUS_METRIC_TYPE,
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}

collection.create_index(field_name="vector", index_params=index_params)
print("Index berhasil dibuat!")

Index berhasil dibuat!


In [25]:
from tqdm import tqdm

# Correct list comprehension
entities = [
    {
        "text": text,
        "vector": vector
    }
    for i, (text, vector) in enumerate(tqdm(zip(text_chunks, embedding_vectors), desc="Processing embeddings"))
]

# Insert into Milvus
insert_result = collection.insert(entities)
collection.flush()
collection.load()
print(f"Data berhasil dimasukkan dengan total {len(text_chunks)} chunk.")


Processing embeddings: 75it [00:00, 136178.70it/s]


Data berhasil dimasukkan dengan total 75 chunk.


In [26]:
search_params = {"metric_type": "IP", "params": {"nprobe": 3}}

query_text = "This manual is a guide"
query_vector = ollama.embeddings(model=EMBEDDING_MODEL, prompt=query_text)["embedding"]

results = collection.search(
    data=[query_vector],  # Query vector(s)
    anns_field="vector",  # The field to search
    param=search_params,
    limit=5,  # Get top 5 results
    output_fields=["text"]  # Retrieve associated text
)

# Print the results
for hits in results:
    for hit in hits:
        print(f"Score: {hit.distance}, Text: {hit.entity.get('text')}")

Score: 166.4105224609375, Text: n and acted upon.
DON'T run the machine until you have made clear to your supervisor that you understand the potential hazard of spindle rotation, the throwing of coolant and the throwing of swarf from the cutting process.
DON'T run the machine until you have read and understood all manuals provided with the machine.
DON'T run the machine until you have read and understood all the machine and control keys.
DON'T run the machine for the first time without a qualified instructor. Ask your supervisor for help when you need it.
PROTECT your eyes. Wear safety glasses with side shields at all times.
DON'T get caught in moving parts. Remove watches, rings, jewellery, neckties and loose fitting clothes.
PROTECT your head. Wear a safety helmet when working near overhead hazards.
KEEP your hair away from moving parts.
PROTECT your feet. Always wear safety shoes with steel toes and oil resistant soles.
Gloves are easily caught in moving parts. TAKE THEM OFF before 