In [None]:
from dotenv import load_dotenv
import os

# Load key/value pairs from the .env file into the process environment.
load_dotenv()

# Runtime settings read from the environment; each is None when its variable is unset.
MILVUS_URL = os.environ.get("MILVUS_URL")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
DOCUMENTS_PATH = os.environ.get("DOCUMENTS_PATH")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

In [None]:
import os
from glob import glob
from mistralai import Mistral
import time


# Initialize the Mistral client once at module level; process_pdf_with_ocr
# below uses this shared instance for upload, signed-URL and OCR calls.
# NOTE(review): MISTRAL_API_KEY comes from os.getenv and is None when the
# variable is unset — confirm how the SDK behaves in that case.
client = Mistral(api_key=MISTRAL_API_KEY)



# Process a single PDF file using Mistral OCR
def process_pdf_with_ocr(file_path):
    """Upload one PDF to Mistral, run OCR on it and return the resulting pages.

    Args:
        file_path: Path of the PDF file on local disk.

    Returns:
        ``ocr_response.pages`` as returned by ``client.ocr.process`` (the rest
        of this notebook reads ``.markdown`` from each element), or an empty
        string if any step raised. Both results work with ``if result:`` and
        with iteration, which is how the notebook consumes them.
    """
    try:
        # Upload the PDF file to Mistral
        print(f"Uploading {file_path}...")
        with open(file_path, "rb") as f:
            uploaded_pdf = client.files.upload(
                file={
                    "file_name": os.path.basename(file_path),
                    "content": f,
                },
                purpose="ocr",
            )

        # The OCR call below is given a document URL, so obtain a signed URL
        # for the file that was just uploaded.
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

        # Process the PDF with OCR
        print(f"Processing OCR for {file_path}...")
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
        )

        pages = ocr_response.pages

        # Report only the page count: printing the page objects themselves
        # floods the cell output with the full text of the document.
        print(f"OCR returned {len(pages)} page(s) for {file_path}")

        return pages

    except Exception as e:
        # Deliberately best-effort: one unreadable or rejected file is reported
        # and skipped so that a batch over many PDFs can continue.
        print(f"Error processing {file_path}: {e}")
        return ""



In [None]:
# One-off OCR run on a single manual, used to inspect the response in the next cell.
# On failure process_pdf_with_ocr returns an empty string, so pdf_text may be "".
pdf_text = process_pdf_with_ocr("./documents_RAG/iTNC530 機械操作手冊 MV154,MV154APC,MV204,UX300.pdf")

In [None]:
# pdf_text is expected to hold the OCR page objects; take the Markdown of each page.
markdowns = [ocr_page.markdown for ocr_page in pdf_text]

# Stitch the pages into one newline-separated document and show it.
all_markdown = "\n".join(markdowns)
print(all_markdown)

# Keep the combined text available under the extracted_markdown name as well.
extracted_markdown = all_markdown

In [None]:
# Split text into manageable chunks with overlap. The name is kept for existing
# callers, but the work is done in a loop (see the docstring for the reasons).
def recursive_chunk_text(text, chunk_size=1000, overlap=150):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Each chunk ends at the last space inside the current window when that space
    lies beyond the overlap region; otherwise the window is cut hard at
    ``chunk_size``. The next window starts ``overlap`` characters before the
    cut, so consecutive chunks share context.

    Differences from a naive recursive version, all of which only affect inputs
    that previously failed or produced unusable output:

    * Iterative, so the number of chunks is not bounded by Python's recursion
      limit.
    * A space at or before position ``overlap`` is ignored as a split point.
      Splitting there would make the next window start at 0 again and never
      advance (common in text with few spaces, e.g. CJK).
    * Empty chunks are not emitted while splitting.

    Args:
        text: The string to split.
        chunk_size: Maximum length of a chunk, in characters.
        overlap: Number of characters repeated at the start of the next chunk.

    Returns:
        A list of chunk strings. Text no longer than ``chunk_size`` is returned
        unchanged as a single-element list.

    Raises:
        ValueError: If ``text`` needs splitting and ``chunk_size`` is not
            positive or ``overlap`` is not smaller than ``chunk_size``.
    """
    # Base case: the text already fits, return it as is.
    if len(text) <= chunk_size:
        return [text]

    # Checked only on the splitting path so that calls which used to succeed on
    # short input keep working; with these values splitting cannot advance.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    # A usable split point must lie past the overlap region, otherwise the next
    # window would start at (or before) the start of the current one.
    min_split = max(overlap, 0) + 1

    chunks = []
    remaining = text
    while len(remaining) > chunk_size:
        # Prefer the last space in the window to avoid cutting words.
        split_point = remaining.rfind(" ", min_split, chunk_size)
        if split_point == -1:
            split_point = chunk_size

        piece = remaining[:split_point].strip()
        if piece:
            chunks.append(piece)

        # Step back by `overlap`; split_point > overlap guarantees progress.
        next_start = max(0, split_point - overlap)
        remaining = remaining[next_start:].strip()

    if remaining:
        chunks.append(remaining)

    return chunks

In [None]:
# Extract text from every PDF directly under DOCUMENTS_PATH using Mistral OCR
# and split it into chunks. Files are visited in sorted order so that chunk
# numbering is the same on every run.
text_lines = []
for file_path in sorted(glob(os.path.join(DOCUMENTS_PATH, "*.pdf"))):
    # Process the PDF file with OCR
    pdf_text = process_pdf_with_ocr(file_path)

    # process_pdf_with_ocr returns an empty string on failure; skip such files.
    if not pdf_text:
        continue

    # Build the Markdown of THIS file from its own OCR pages, so that each PDF
    # contributes its own content rather than text left over from another cell.
    file_markdown = "\n".join(page.markdown for page in pdf_text)

    # Split the extracted text into smaller sections, dropping blank chunks
    # because they carry nothing worth embedding.
    text_lines.extend(
        chunk for chunk in recursive_chunk_text(file_markdown) if chunk.strip()
    )

    # Add a delay between API calls to avoid rate limiting
    time.sleep(1)

# Write all the chunks to a text file for external viewing. The encoding is
# explicit because the OCR text may contain non-ASCII characters, which the
# platform default encoding cannot always represent.
with open("chunks_output.txt", "w", encoding="utf-8") as file:
    for i, chunk in enumerate(text_lines):
        file.write(f"Chunk {i+1}:\n{chunk}\n\n")

In [None]:
import ollama

def emb_text(text):
    """Return the embedding vector of ``text``.

    The vector is requested from ``ollama.embeddings`` using the model named by
    the module-level ``EMBEDDING_MODEL`` setting, and the ``"embedding"`` entry
    of the response is returned.
    """
    return ollama.embeddings(prompt=text, model=EMBEDDING_MODEL)["embedding"]


In [14]:
# Embed a throwaway sentence to learn the vector length the model produces;
# embedding_dim is used when the collection is created.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)

# Show the dimensionality followed by a short preview of the vector.
print(embedding_dim, test_embedding[:10], sep="\n")


768
[1.0914263725280762, 0.5967336893081665, -3.9346163272857666, -0.6990123391151428, 1.5423402786254883, -0.13473758101463318, 0.8982678651809692, -0.46930229663848877, 0.9009982347488403, -0.6395869851112366]


In [None]:
from pymilvus import MilvusClient

# Connect to the Milvus instance at MILVUS_URL (read from the environment).
# The collection cells below all go through this client.
milvus_client = MilvusClient(uri=MILVUS_URL)

In [None]:
# Start from a clean slate: if the collection already exists it is dropped,
# which discards everything previously stored under COLLECTION_NAME.
if milvus_client.has_collection(COLLECTION_NAME):
    milvus_client.drop_collection(COLLECTION_NAME)


In [None]:
# Create the collection with a vector dimension equal to the length of the
# probe embedding (embedding_dim).
# NOTE(review): the sample embedding printed earlier in this notebook has
# components well outside [-1, 1], so the vectors do not look unit-normalised.
# With a raw inner-product metric the vector magnitude influences ranking —
# confirm that "IP" (rather than "COSINE") is what is intended here.
milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)


In [None]:
from tqdm import tqdm

# Embed every chunk (with a progress bar) and build one row per chunk; the
# chunk's position in text_lines is used as the row's id.
data = [
    {"id": idx, "vector": emb_text(chunk), "text": chunk}
    for idx, chunk in enumerate(tqdm(text_lines, desc="Creating embeddings"))
]

# Insert all rows in one call; as the last expression of the cell, the insert
# result is displayed as the cell output.
milvus_client.insert(collection_name=COLLECTION_NAME, data=data)

Creating embeddings: 100%|██████████| 72/72 [00:04<00:00, 14.99it/s]


{'insert_count': 72, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'cost': 0}