## Import libraries

In [4]:
from dotenv import load_dotenv
from google.cloud import storage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
import fitz
import os

In [None]:
# Load the .env file into the process environment, then read the settings
# this notebook needs. Any variable that is not set comes back as None.
load_dotenv()

PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
GCS_BUCKET = os.environ.get("GCS_BUCKET")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

## Pull data from GCS

In [6]:
"""Function to convert pdf bytes to text"""
def get_data_from_pdf(pdf_bytes):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_bytes: Raw bytes of a PDF file.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # `filetype` is the documented way to tell PyMuPDF what an in-memory
    # stream contains.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

In [7]:
"""Function that reads all documents from GCS bucket"""
def read_all_docs(prefix="raw_data/"):
    """Read every object under ``prefix`` in the configured GCS bucket as text.

    Uses the module-level ``PROJECT_ID`` and ``GCS_BUCKET`` settings.

    Args:
        prefix: Object-name prefix to list. Defaults to ``"raw_data/"``, the
            value that used to be hard-coded here.

    Returns:
        dict: Maps each blob name to its text content. Blobs whose name ends
        in ``.pdf`` (case-insensitive) are downloaded as bytes and converted
        with ``get_data_from_pdf``; every other blob is fetched with
        ``download_as_text()``.
    """
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(GCS_BUCKET)

    contents = {}

    for blob in bucket.list_blobs(prefix=prefix):
        # Names ending in "/" are presumably folder placeholder objects
        # rather than real files, so they are skipped.
        if blob.name.endswith("/"):
            continue

        if blob.name.lower().endswith(".pdf"):
            contents[blob.name] = get_data_from_pdf(blob.download_as_bytes())
        else:
            contents[blob.name] = blob.download_as_text()

    return contents


In [8]:
# Trigger data extraction: read every document from the bucket into a
# {blob name: text} dict, which the chunking cell further down consumes.
contents = read_all_docs()
print("Import complete!")

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: No default Layer config

MuPDF error: format error: No default Layer config

MuPDF error: format error: No default Layer config

Import complete!


## Chunk and embed data

In [12]:
"""Function to chunk text data"""
def chunk_documents_batch(documents, chunk_size=1000, chunk_overlap=200):
    """Split a mapping of named texts into LangChain document chunks.

    Args:
        documents: dict mapping a document name to its full text.
        chunk_size: Maximum chunk length handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` produces
        for the wrapped inputs; each input carries its name under the
        ``"source"`` metadata key.
    """
    # Configure the splitter first; lengths are measured with len().
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
        length_function=len,
    )

    # Wrap each (name, text) pair in a LangChain Document so that the file
    # name is kept alongside the text.
    wrapped = []
    for name, text in documents.items():
        wrapped.append(Document(page_content=text, metadata={"source": name}))

    return splitter.split_documents(wrapped)

In [15]:
"""Function to embed documents"""
def embed_and_upload_to_pinecone(documents, batch_size=100):
    """Embed document chunks with Vertex AI and add them to the Pinecone index.

    Uses the module-level ``PINECONE_API_KEY``, ``PINECONE_ENVIRONMENT`` and
    ``PINECONE_INDEX_NAME`` settings.

    Args:
        documents: Sequence of LangChain documents to upload; it is sliced
            into batches, so it must support ``len()`` and slicing.
        batch_size: Number of documents sent per ``add_documents`` call.

    Raises:
        EnvironmentError: If any of the three Pinecone settings is missing.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not all([PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME]):
        raise EnvironmentError("Missing Pinecone credentials in .env file.")

    # Fail before any client is created: a zero step would crash range() and
    # a negative one would upload nothing while still reporting success.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    # Initialize Pinecone and fetch the index
    pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    index = pc.Index(PINECONE_INDEX_NAME)

    # Use Vertex AI embedding model
    embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")

    # Initialise vectorstore
    vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

    # Ceiling division, so an exact multiple of batch_size is not reported
    # as having one extra batch.
    total_batches = -(-len(documents) // batch_size)

    # Batch upload vectors to Pinecone
    for batch_number, start in enumerate(range(0, len(documents), batch_size), start=1):
        vector_store.add_documents(documents[start:start + batch_size])
        print(f"Uploaded batch {batch_number} of {total_batches}")

    print(f"Uploaded {len(documents)} documents to Pinecone index '{PINECONE_INDEX_NAME}'")


In [16]:
# Trigger chunking: split the extracted documents using the function's
# default chunk_size and chunk_overlap, then report how many chunks resulted.
split_docs = chunk_documents_batch(contents)
print(f"{len(split_docs)} chunks were created!")

# Trigger upload to Pinecone: embed the chunks and add them to the index in
# batches (default batch size), printing progress per batch.
embed_and_upload_to_pinecone(split_docs)


28859 chunks were created!
Uploaded batch 1 of 289
Uploaded batch 2 of 289
Uploaded batch 3 of 289
Uploaded batch 4 of 289
Uploaded batch 5 of 289
Uploaded batch 6 of 289
Uploaded batch 7 of 289
Uploaded batch 8 of 289
Uploaded batch 9 of 289
Uploaded batch 10 of 289
Uploaded batch 11 of 289
Uploaded batch 12 of 289
Uploaded batch 13 of 289
Uploaded batch 14 of 289
Uploaded batch 15 of 289
Uploaded batch 16 of 289
Uploaded batch 17 of 289
Uploaded batch 18 of 289
Uploaded batch 19 of 289
Uploaded batch 20 of 289
Uploaded batch 21 of 289
Uploaded batch 22 of 289
Uploaded batch 23 of 289
Uploaded batch 24 of 289
Uploaded batch 25 of 289
Uploaded batch 26 of 289
Uploaded batch 27 of 289
Uploaded batch 28 of 289
Uploaded batch 29 of 289
Uploaded batch 30 of 289
Uploaded batch 31 of 289
Uploaded batch 32 of 289
Uploaded batch 33 of 289
Uploaded batch 34 of 289
Uploaded batch 35 of 289
Uploaded batch 36 of 289
Uploaded batch 37 of 289
Uploaded batch 38 of 289
Uploaded batch 39 of 289
Uploade