In [2]:
import os
from langchain_chroma import Chroma
# First we change the directory to the root directory of the project.
os.chdir(os.getcwd().replace("\\notebooks\\data", ""))
print("Current path: ", os.getcwd())

# Load local environment variables
from dotenv import load_dotenv
print("Environment variables are loaded = ", load_dotenv())

Current path:  D:\Code\uni_llm
Environment variables are loaded =  True


In [32]:
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# TODO(developer): Uncomment these variables before running the sample.
project_id = os.getenv("GOOGLE_PROJECT_ID")
location = "us" # Format is "us" or "eu"
processor_id = os.getenv("PROCESSOR_ID") # Create processor before running sample
processor_version = "rc" # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types


def process_document_layout_sample(
        project_id: str,
        location: str,
        processor_id: str,
        processor_version: str,
        file_path: str,
        mime_type: str,
) -> documentai.Document:
    process_options = documentai.ProcessOptions(
        layout_config=documentai.ProcessOptions.LayoutConfig(
            chunking_config=documentai.ProcessOptions.LayoutConfig.ChunkingConfig(
                chunk_size=100,
                include_ancestor_headings=True,
            )
        )
    )

    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    print("Document Layout Blocks")
    for block in document.document_layout.blocks:
        print(block)

    print("Document Chunks")
    for chunk in document.chunked_document.chunks:
        print(chunk)
        
    return document



def process_document(
        project_id: str,
        location: str,
        processor_id: str,
        processor_version: str,
        file_path: str,
        mime_type: str,
        process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document

In [None]:
documents = []

In [35]:
from langchain_core.documents import Document

file_path = ["data/uni_data/EERS_first.pdf", "data/uni_data/EERS_second.pdf", "data/uni_data/RR_first.pdf", "data/uni_data/RR_second.pdf", "data/uni_data/RR_third.pdf"]
for i, path in enumerate(file_path):
    if i <= 2:
        continue
    parsed_document = process_document_layout_sample(project_id, location, processor_id, processor_version, path, mime_type)
    document_name = "education and examination regulation"
    if i > 1:
        document_name = "regulation and rules"
    for chunk in parsed_document.chunked_document.chunks:
        doc = Document(page_content=chunk.content, metadata={"page": chunk.page_span.page_start, "document_name": document_name})
        documents.append(doc)

Document Layout Blocks
text_block {
  text: "Section 3 Procedures"
  type_: "heading-1"
  blocks {
    text_block {
      text: "ARTICLE 3.1 EXAMS"
      type_: "heading-2"
      blocks {
        text_block {
          text: "1. Students must register in time for their education, exams and re-sits. If they miss the deadline, they cannot participate in the exam. If exceptional circumstances prevented registering, they are eligible for dispensation from the registration deadline. See also Article 3.7 on Hardship."
          type_: "paragraph"
        }
        block_id: "3"
        page_span {
          page_start: 1
          page_end: 1
        }
      }
      blocks {
        text_block {
          text: "2."
          type_: "paragraph"
        }
        block_id: "4"
        page_span {
          page_start: 1
          page_end: 1
        }
      }
      blocks {
        text_block {
          text: "3."
          type_: "paragraph"
        }
        block_id: "5"
        page_span

In [36]:
from langchain_openai import OpenAIEmbeddings
# Get the API key from the environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")

In [38]:
# Initialize the OpenAIEmbeddings class
embedding = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=openai_api_key)

In [39]:
# Create a Chroma vector database OpenAI embeddings
db_openai = Chroma.from_documents(documents, embedding, persist_directory="./data/vectorDB/openai_vectorDB_google_parser")