In [2]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path
import pickle
import os

# --- Notebook configuration: file locations and service endpoints ---
output_path = '../content/output/'
file_path = output_path + 'fmea_handbook.pdf'   # PDF that gets partitioned below

# parents=True: create the intermediate '../content' folder as well, so the
# cell also works on a fresh checkout where none of the folders exist yet.
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
output_file_path = output_dir / "chunks.pkl"    # pickle cache for the partitioned chunks

# Ollama endpoint on the loopback address of the machine running the kernel.
OLLAMA_URL = "http://127.0.0.1:11434"

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Partition the PDF into title-based chunks.
# Chunking options: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,

    # -- layout analysis -------------------------------------------------
    strategy="hi_res",                     # required, otherwise tables cannot be inferred
    infer_table_structure=True,            # extract tables

    # -- embedded images -------------------------------------------------
    extract_image_block_types=["Image"],   # append 'Table' to also get table snapshots
    extract_image_block_to_payload=True,   # keep images as base64 in the element metadata
    # (alternative: pass image_output_dir_path=output_path to write image files instead;
    #  the older extract_images_in_pdf flag is deprecated)

    # -- chunking --------------------------------------------------------
    chunking_strategy="by_title",          # the other option is 'basic'
    max_characters=10000,                  # defaults to 500
    new_after_n_chars=6000,
    combine_text_under_n_chars=2000,       # defaults to 0
)



The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [6]:
# Cache the partitioning result on disk, but never overwrite the cache with
# an empty or missing result.
if not (chunks is None or len(chunks) == 0):
    with output_file_path.open("wb") as sink:
        pickle.dump(chunks, sink)

In [3]:
# Restore the cached chunks instead of re-running the partitioning step.
# NOTE: unpickling can execute arbitrary code - only load a chunks.pkl that
# was written by this notebook.
with output_file_path.open("rb") as source:
    chunks = pickle.load(source)

In [4]:
# Which element classes does the chunk list contain?
{str(type(el)) for el in chunks}

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [5]:
# Split the chunks by element class name: anything whose type name mentions
# "Table" goes to `tables`, composite text chunks go to `texts`.
tables = [chunk for chunk in chunks if "Table" in str(type(chunk))]
texts = [chunk for chunk in chunks if "CompositeElement" in str(type(chunk))]

In [6]:
def get_images_base64(chunks):
    """Collect the base64 image payloads nested inside composite chunks.

    Only chunks whose type name contains "CompositeElement" are inspected.
    Within each of them, every original element whose type name contains
    "Image" contributes its ``metadata.image_base64`` value.

    Args:
        chunks: Iterable of chunk objects exposing ``metadata.orig_elements``.

    Returns:
        list: The base64 payloads in document order. Elements without a
        payload are skipped, so every entry can be passed to
        ``base64.b64decode``.
    """
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" not in str(type(chunk)):
            continue

        # orig_elements may be missing or None; treat that as "no children"
        # instead of failing on iteration.
        for el in getattr(chunk.metadata, "orig_elements", None) or []:
            if "Image" not in str(type(el)):
                continue

            # An image element without a payload would otherwise put None
            # into the result and break the decoding step later on.
            image_b64 = getattr(el.metadata, "image_base64", None)
            if image_b64:
                images_b64.append(image_b64)
    return images_b64

# Base64 payloads of all images found inside the composite chunks.
images = get_images_base64(chunks)

In [7]:
import base64
from PIL import Image
import io
import os

# Decode every extracted image and store it as a JPEG so the set can be
# reviewed (and pruned) by hand.
images_dir = output_dir / "images"
# Create the images folder itself - creating only output_dir would make the
# first image.save() fail on a fresh run.
images_dir.mkdir(parents=True, exist_ok=True)

for index, img_b64 in enumerate(images, start=1):
    image_bytes = base64.b64decode(img_b64)
    # JPEG has no alpha channel, so normalise to RGB before saving.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    # Use a dedicated name: `file_path` holds the source PDF path and must
    # not be overwritten, or re-running the partition cell would break.
    image_path = images_dir / f"image_{index}.jpg"
    image.save(image_path)

In [8]:
# Remove the images you don't need by hand, then run this cell to load the
# remaining ones again as base64 strings.
images = []
for image_path in sorted(images_dir.iterdir(), key=lambda entry: entry.name):
    # Skip directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store):
    # they are not images and would either crash the read or pollute the list.
    if image_path.name.startswith(".") or not image_path.is_file():
        continue
    images.append(base64.b64encode(image_path.read_bytes()).decode("utf-8"))

In [9]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import requests

# Connect to the Qdrant instance reachable under the hostname "qdrant" on port 6333.
# NOTE(review): "qdrant" looks like a container/service hostname rather than
# localhost - presumably the kernel runs inside a container network; confirm,
# and use host="localhost" if Qdrant runs directly on this machine.
client = QdrantClient(host="qdrant", port=6333)

def embed_texts_ollama(
    texts,
    model="mxbai-embed-large:335m",
    url="http://host.docker.internal:11434/api/embeddings",
    timeout=None,
):
    """Embed one or several texts through an Ollama embeddings endpoint.

    One POST request is sent per text, with a JSON body of the form
    ``{"model": model, "prompt": text}``; the vector is read from the
    ``"embedding"`` key of the JSON response.

    Args:
        texts: A single string, or any iterable of strings (list, tuple, ...).
        model: Name of the embedding model to request.
        url: Full URL of the embeddings endpoint. Pass e.g.
            ``f"{OLLAMA_URL}/api/embeddings"`` when the default host name is
            not resolvable from where the kernel runs.
        timeout: Seconds to wait per request; ``None`` waits indefinitely.

    Returns:
        list: One embedding per input text, in input order. A single string
        yields a one-element list; an empty iterable yields ``[]`` without
        contacting the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Normalise the input to a list of prompts. Only a plain string counts as
    # a single text; every other iterable is embedded item by item, so tuples
    # are no longer cut down to their first element and an empty list no
    # longer fails on an index lookup.
    prompts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    for prompt in prompts:
        response = requests.post(
            url,
            json={"model": model, "prompt": prompt},
            timeout=timeout,
        )
        response.raise_for_status()
        embeddings.append(response.json()["embedding"])
    return embeddings

collection_name = "my_first_rag_collection"

# Create the collection on first use only; an existing collection is left
# untouched so the cell can be re-run safely.
collection_missing = not client.collection_exists(collection_name=collection_name)
if collection_missing:
    client.create_collection(
        collection_name=collection_name,
        # vector size 1024 matches the mxbai embedding dimension
        vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
    )

print(client.get_collections())

collections=[CollectionDescription(name='my_first_rag_collection')]


In [10]:
from qdrant_client.models import PointStruct
import uuid

def _chunk_text(chunk):
    """Return the text of a chunk (`page_content` if present, else `text`)."""
    page_content = getattr(chunk, "page_content", None)
    return page_content if page_content is not None else chunk.text


def _chunk_metadata(chunk):
    """Return the chunk metadata as a plain dict usable as a payload."""
    metadata = getattr(chunk, "metadata", None)
    if metadata is None:
        return {}
    # Metadata objects that are not mappings (such as the ones attached to
    # the partitioned PDF elements) are converted through their to_dict().
    if hasattr(metadata, "to_dict"):
        return metadata.to_dict()
    return dict(metadata)


def upload_chunks_to_qdrant(chunks, collection_name):
    """Embed the given chunks and upsert them into a Qdrant collection.

    Accepts document-style chunks (``page_content`` attribute plus a mapping
    in ``metadata``) as well as element-style chunks (``text`` attribute plus
    a ``metadata`` object offering ``to_dict()``), so the chunks produced
    earlier in this notebook can be uploaded directly.

    Each point gets a random UUID as id, the chunk embedding as vector and a
    payload consisting of ``"text"`` plus the chunk's metadata entries.

    Args:
        chunks: Iterable of chunk objects.
        collection_name: Name of the target Qdrant collection.

    Returns:
        None. Nothing is embedded or uploaded when ``chunks`` is empty.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to embed; also avoids sending an empty upsert request.
        return

    texts = [_chunk_text(chunk) for chunk in chunks]
    embeddings = embed_texts_ollama(texts)

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            # Metadata is spread after "text", as before, so payload key
            # order and precedence stay unchanged for existing callers.
            payload={"text": text, **_chunk_metadata(chunk)},
        )
        for chunk, text, embedding in zip(chunks, texts, embeddings)
    ]

    client.upsert(collection_name=collection_name, points=points)

In [None]:
# Embed the search term; embed_texts_ollama returns a list, so take the
# single vector out of it.
query_vector = embed_texts_ollama("FMEA")[0]

results = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=5
)

# query_points returns a response object, not a list: the scored hits are in
# its `.points` attribute. Iterating the response itself yields field tuples,
# which have no `.payload`.
for point in results.points:
    print(point.payload["text"], point.score)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (3599291810.py, line 5)