In [2]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv). Later cells read
# UNSTRUCTURED_API_KEY and UNSTRUCTURED_API_URL through os.getenv.
load_dotenv()

True

In [3]:
#Configs for local deployment
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig

from unstructured.chunking.title import chunk_by_title

In [4]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

import chromadb

from uuid import uuid4
from langchain.vectorstores.utils import filter_complex_metadata

In [None]:
# Local folders: the input documents and the place where the pipeline
# writes its results (read back later as .json files).
directory_with_pdfs = "images"
directory_with_results = "output"

# Partitioning is delegated to the Unstructured API; the key and endpoint
# come from the environment rather than being hardcoded here.
partitioner_config = PartitionerConfig(
    partition_by_api=True,
    api_key=os.getenv("UNSTRUCTURED_API_KEY"),
    partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
    strategy="hi_res",
    additional_partition_args={
        "split_pdf_page": True,
        "split_pdf_concurrency_level": 15,
    },
)

# Local source -> API partitioner -> local destination.
ingest_pipeline = Pipeline.from_configs(
    context=ProcessorConfig(),
    indexer_config=LocalIndexerConfig(input_path=directory_with_pdfs),
    downloader_config=LocalDownloaderConfig(),
    source_connection_config=LocalConnectionConfig(),
    partitioner_config=partitioner_config,
    uploader_config=LocalUploaderConfig(output_dir=directory_with_results),
)
ingest_pipeline.run()

In [1]:
# Leftover IPython introspection (`??` shows an object's source). Its recorded
# execution count is 1, i.e. it ran before the cell that imports
# PartitionerConfig, which is why the saved output reports "not found".
PartitionerConfig??

Object `PartitionerConfig` not found.


In [5]:
from unstructured.staging.base import elements_from_json

def load_processed_files(directory_path):
    """Load every ``.json`` file in a directory into one flat list of elements.

    Files are visited in sorted name order so the resulting element order
    (and therefore anything derived from it, such as chunking) is the same
    on every run; ``os.listdir`` on its own returns entries in arbitrary
    order.

    Args:
        directory_path: Directory whose ``.json`` files are passed to
            ``elements_from_json``. Other files are ignored.

    Returns:
        A list containing the elements of all readable ``.json`` files.
        A file that raises ``OSError`` while being read is reported on
        stdout and skipped.
    """
    elements = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            elements.extend(elements_from_json(filename=file_path))
        except OSError:
            # Best-effort load: report the unreadable file and carry on.
            print(f"Error: Could not read file {filename}.")

    return elements

# Gather all partitioned elements from the pipeline's output folder.
# NOTE(review): directory_with_results is assigned in the pipeline cell, which
# has no recorded execution count — make sure that assignment has run first.
elements = load_processed_files(directory_with_results)

In [6]:
# Keep every element except those whose category is "Header".
elements_filtered = [
    element
    for element in elements
    if element.category != "Header"
]

In [7]:
# Chunk the filtered elements with unstructured's chunk_by_title, using its
# default settings (no options are passed).
chunks = chunk_by_title(elements_filtered)

In [8]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [9]:
def filter_meta(meta: dict):
    """Return a new dict holding only the entries of ``meta`` with scalar values.

    An entry is kept when its value is a ``str``, ``int``, ``float`` or
    ``bool``; anything else (lists, dicts, ``None``, ...) is dropped.
    The input mapping is not modified.
    """
    scalar_types = (str, int, float, bool)
    return {
        name: item
        for name, item in meta.items()
        if isinstance(item, scalar_types)
    }

In [12]:
# Convert each chunk into a LangChain Document whose metadata contains only
# scalar values (see filter_meta).
documents = []
for element in chunks:
    metadata_im = element.metadata.to_dict()
    metadata = filter_meta(metadata_im)
    # Drop "orig_elements" when present. pop() with a default keeps the loop
    # from raising KeyError on a chunk whose metadata lacks that key.
    metadata.pop("orig_elements", None)
    #metadata["source"] = metadata["filename"]
    documents.append(Document(page_content=element.text, metadata=metadata))

In [14]:
# Embedding client created with its default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# populated by load_dotenv() — confirm.
embeddings = OpenAIEmbeddings()

In [15]:
# Embed the documents and index them in a Chroma vector store.
# NOTE(review): `Chroma` is imported twice in this notebook (langchain_chroma
# and langchain_community.vectorstores); by execution order the
# langchain_community one is the name in effect here — confirm which is intended.
vectorstore = Chroma.from_documents(documents, embeddings)

In [16]:
# Sanity check: query the store directly with a single keyword and display
# the returned Documents.
vectorstore.similarity_search(query="winter")

[Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub', 'is_continuation': True}, page_content='in this excellent republic, mid-winter thaws occur.'),
 Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub', 'is_continuation': True}, page_content='summer, when the sun has come to an understanding with the snow, a fine golf-course is found to reveal itself. But all winter long the sun blazes on Montana, while its altitude and the cold of its nights preserves its frozen mantle. Of the view I have already spoken: there is something to be said for a view in the intervals of falling-down, and in the meditation and quiescence which such falls sometimes entail.'),
 Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub', 'is_continuation': True}, page_content='winter, on a still evening those evidences of life are dumb, and yet the silence itself is pregnant with vitality. At sunset the high tops burn in ro

In [17]:
# Wrap the vector store in a retriever, using as_retriever's defaults
# (no arguments are passed).
retriever = vectorstore.as_retriever()

In [18]:
# Ask a natural-language question through the retriever and display the
# returned Documents.
retriever.invoke("where is ice hockey played")

[Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub'}, page_content='Now this arrangement of hog-score (usually called “the hog”), back score, sweeping score, “house” and crampit (or hack), scratched in the ice according to these directions, completes the construction of one end of the rink. At the other end a similar construction is made in alignment, the centre of the two houses being 39 yards from one another. Here is the rink ready for play, and the rest of the rules deal entirely with the game itself.'),
 Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub'}, page_content='[Image unavailable.]\n\nICE HOCKEY\n\nFrom the Drawing by Fleming Williams'),
 Document(metadata={'filename': 'winter-sports.epub', 'filetype': 'application/epub', 'is_continuation': True}, page_content='icemen should ever be allowed on the rink. How amazed would be the pioneers of outdoor artificial rinks if they could see the huge and perfect surfa