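This Python code block initializes a Weaviate client, defines a schema for a "NewUnstructuredDocument" class with properties for title, content, publication date, and URL, deletes any existing "NewUnstructuredDocument" and "UnstructuredDocument" classes, and then creates the schema in Weaviate. Adjust the Weaviate client's URL as needed.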

In [None]:
import weaviate

client = weaviate.Client("http://cda-DESKTOP:8080")

# Property definitions for the document class, kept separate from the class
# wrapper so each field is easy to scan.
document_properties = [
    {"name": "title", "dataType": ["string"], "description": "The title of the document"},
    {"name": "content", "dataType": ["text"], "description": "The content of the document"},
    {"name": "datePublished", "dataType": ["date"], "description": "The date the document was published"},
    {"name": "url", "dataType": ["string"], "description": "The URL of the document"},
]

document_class = {
    "class": "NewUnstructuredDocument",
    "description": "A class to store documents",
    "properties": document_properties,
}

schema = {"classes": [document_class]}

# Remove both classes before (re)creating the schema.
# Note: only "NewUnstructuredDocument" is defined in `schema`;
# "UnstructuredDocument" is deleted here but not recreated by this cell.
for class_to_drop in ('NewUnstructuredDocument', 'UnstructuredDocument'):
    client.schema.delete_class(class_to_drop)

client.schema.create(schema)

In [27]:
import os
import weaviate
#from weaviate.util import generate_uuid5
from unstructured.partition.pdf import partition_pdf
from pathlib import Path
from minio import Minio

# Initialize MinIO Client
# Endpoint and credentials are read from the environment (MINIO_ENDPOINT,
# MINIO_ACCESS_KEY, MINIO_SECRET_KEY) so real secrets do not have to be stored
# in the notebook. The fallbacks are the values this notebook used originally.
minioClient = Minio(
    os.environ.get("MINIO_ENDPOINT", "cda-DESKTOP:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "cda_cdaprod"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "cda_cdaprod"),
    secure=False,  # connect over plain HTTP, as before
)

# Define function to download files from MinIO bucket
def download_files_from_minio(bucket_name, prefix="", local_dir="downloaded_data"):
    """Download every object under ``prefix`` in ``bucket_name`` into ``local_dir``.

    Object keys are mirrored as relative paths below ``local_dir``; intermediate
    directories are created as needed.

    Args:
        bucket_name: Name of the bucket to list and download from.
        prefix: Only objects whose key starts with this prefix are downloaded.
        local_dir: Local directory that receives the files (created if missing).
    """
    objects = minioClient.list_objects(bucket_name, prefix=prefix, recursive=True)
    os.makedirs(local_dir, exist_ok=True)
    for obj in objects:
        # Keys ending in "/" are folder placeholders, not files; passing them to
        # fget_object would target a directory path and fail.
        if obj.object_name.endswith("/"):
            continue
        file_path = os.path.join(local_dir, obj.object_name)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        minioClient.fget_object(bucket_name, obj.object_name, file_path)
    print(f"Downloaded files from bucket {bucket_name} to {local_dir}")

# Define function to process PDFs and extract text
def process_pdfs_and_extract_text(local_dir="downloaded_data"):
    """Partition every PDF found under ``local_dir`` and return all elements.

    Args:
        local_dir: Directory that is searched (recursively) for ``*.pdf`` files.

    Returns:
        A flat list containing the elements returned by ``partition_pdf`` for
        each PDF, concatenated in sorted path order.
    """
    elements_list = []
    # Search recursively: download_files_from_minio mirrors object keys, so a
    # PDF stored under a key prefix ends up in a subdirectory of local_dir.
    # Sorting makes the processing order deterministic.
    for pdf_file in sorted(Path(local_dir).rglob("*.pdf")):
        elements_list.extend(partition_pdf(filename=str(pdf_file)))
    return elements_list

# Create the Weaviate client used by the upload and query code below
client = weaviate.Client(url="http://cda-DESKTOP:8080")

# Define function to upload extracted data to Weaviate
def upload_data_to_weaviate(elements, class_name="UnstructuredDocument"):
    """Create one Weaviate object per extracted element.

    Each object stores the element's text as ``content`` and its identifier as
    ``element_id``.

    Args:
        elements: Iterable of elements exposing ``.text`` and ``.metadata``.
        class_name: Weaviate class that receives the objects.
    """
    for element in elements:
        # Prefer an id carried on the metadata (original lookup). Otherwise fall
        # back to the element's own ``id``; without this fallback the stored
        # element_id is None whenever the metadata has no such attribute.
        element_id = getattr(element.metadata, "element_id", None)
        if element_id is None:
            element_id = getattr(element, "id", None)
        data_object = {
            "content": element.text,
            # Stringify so non-string ids (e.g. UUID objects) serialize cleanly
            "element_id": None if element_id is None else str(element_id),
            # Add more fields as needed
        }
        client.data_object.create(data_object, class_name=class_name)
    print("Uploaded data to Weaviate")

if __name__ == "__main__":
    # Pipeline: MinIO bucket -> local PDF files -> extracted elements -> Weaviate

    # 1) Fetch the contents of the "cda-datasets" bucket into the default local directory
    download_files_from_minio(bucket_name="cda-datasets")

    # 2) Partition the downloaded PDFs into text elements
    elements = process_pdfs_and_extract_text()

    # 3) Store each element as an object in Weaviate (default class)
    upload_data_to_weaviate(elements=elements)

Downloaded files from bucket cda-datasets to downloaded_data
Uploaded data to Weaviate


In [19]:
# Fetch the "content" and "element_id" fields of stored "UnstructuredDocument"
# objects and print each returned record
results = (
    client.query
    .get("UnstructuredDocument", ["content", "element_id"])
    .do()
)
documents = results['data']['Get']['UnstructuredDocument']
for document in documents:
    print(document)

{'content': '[15] Rishi Bommasani, Drew A. Hudson, Ehsan Adeli, and et al. 2022. On the Opportunities and Risks of Foundation Models. arXiv:2108.07258 [cs.LG] [16] Michael Brenner. 2010. Creating dynamic story plots with continual multiagent planning. In Proceedings of the 24th AAAI Conference on Artificial Intelligence. [17] Rodney A. Brooks, Cynthia Breazeal, Marko Marjanovic, Brian Scassellati, and Matthew Williamson. 2000. The Cog Project: Building a Humanoid Robot. In Computation for Metaphors, Analogy, and Agents (Lecture Notes on Artificial Intelligence, 1562), Chrystopher Nehaniv (Ed.). Springer-Verlag, Berlin, 52–87. [18] Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Sc

In [23]:
# Look up objects whose element_id equals a given value
# (replace the 'your-element-id-here' placeholder with an actual element ID)
element_id = 'your-element-id-here'
element_id_filter = {"path": ["element_id"], "operator": "Equal", "valueString": element_id}
query = client.query.get("UnstructuredDocument", ["content", "element_id"])
specific_result = query.with_where(element_id_filter).do()
print(specific_result)

{'data': {'Get': {'UnstructuredDocument': []}}}
