In [1]:
# !/Users/davidramirez/google-cloud-sdk/bin/gcloud auth application-default login
# !gcloud config set account dramescalante@gmail.com
# !pip install boto3
# !pip install -U langchain-google-vertexai
# !pip install langchain-google-alloydb-pg
# !pip install langchain-google-firestore
# !pip install google-cloud-vision
# !pip install boto3 google-cloud-vision
# !pip install PyPDF2
# !pip install gcsfs

In [2]:
from langchain.schema.document import Document
from langchain_google_firestore import FirestoreVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.cloud import firestore
import boto3
from io import BytesIO
from PyPDF2 import PdfReader
from google.cloud import storage

In [3]:
# --- Google Cloud settings ---
project_id = "legal-advisor-001"  # GCP project handed to the Vertex AI and Firestore clients
location = "us-west1"  # region; not referenced by the cells visible in this notebook
collection = "vectorStoreLegal"  # Firestore collection that stores the embedded chunks
embed_model_name = "textembedding-gecko-multilingual@001"  # Vertex AI embedding model name
bucket_name = "legalrepo"  # Cloud Storage bucket holding the source .txt/.pdf files

# --- Chunking / retrieval settings ---
chunk_size = 1500  # forwarded to RecursiveCharacterTextSplitter(chunk_size=...)
chunk_overlap = 300  # forwarded to RecursiveCharacterTextSplitter(chunk_overlap=...)
k = 3  # presumably the number of neighbours to retrieve; unused in the visible cells - TODO confirm

In [4]:
# Set the project id
# !yes | gcloud config set project {project_id}


# Get the documents from GCP Storage

In [5]:
# Load .txt and .pdf blobs from Google Cloud Storage as LangChain documents
def read_text_files_from_gcs(bucket_name, folder_path):
    """Read every text and PDF blob under a prefix into ``Document`` objects.

    Args:
        bucket_name: Name of the Cloud Storage bucket to read from.
        folder_path: Prefix passed to ``list_blobs``; ``""`` lists the whole bucket.

    Returns:
        A list of ``Document`` objects, one per ``.txt`` or ``.pdf`` blob, with
        the blob name stored under the ``"filename"`` metadata key. Blobs with
        any other extension are skipped.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    data = []
    for blob in bucket.list_blobs(prefix=folder_path):
        print(blob.name)
        # Compare extensions case-insensitively so "FILE.PDF" is not silently dropped.
        lowered_name = blob.name.lower()
        if lowered_name.endswith('.txt'):
            text = blob.download_as_text()
        elif lowered_name.endswith('.pdf'):
            # download_as_bytes() is the supported replacement for the
            # deprecated download_as_string() alias.
            reader = PdfReader(BytesIO(blob.download_as_bytes()))
            # Separate pages with a newline so the last word of one page is not
            # fused with the first word of the next; `or ""` guards against a
            # page for which no text could be extracted.
            text = "\n".join((page.extract_text() or "") for page in reader.pages)
        else:
            continue
        data.append(Document(page_content=text, metadata={"filename": blob.name}))
    return data

# Chunking

In [6]:

# Break the loaded documents into overlapping chunks
def chunk_documents(data, chunk_size, chunk_overlap):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split, as accepted by ``split_documents``.
        chunk_size: Forwarded to the splitter's ``chunk_size`` argument.
        chunk_overlap: Forwarded to the splitter's ``chunk_overlap`` argument.

    Returns:
        The result of ``split_documents(data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

# Create FirestoreVectorStore

In [7]:
def create_vector_store(texts, project_id, embed_model_name, collection, batch_size=400):
    """Embed chunks with Vertex AI and write them to a Firestore vector store.

    Args:
        texts: Chunked documents to add to the store.
        project_id: GCP project used for both the embedding model and the
            Firestore client backing the store.
        embed_model_name: Vertex AI embedding model name.
        collection: Firestore collection the vectors are written to.
        batch_size: Maximum number of documents per ``add_documents`` call.
            Defaults to 400, the value previously hard-coded here
            (presumably chosen to stay under a request limit - TODO confirm).

    Returns:
        The ``FirestoreVectorStore`` the documents were added to.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embedding = VertexAIEmbeddings(
        model_name=embed_model_name,
        project=project_id,
    )

    # Bind the store to an explicit client for `project_id` so writes land in
    # the same Firestore project that the refresh step deletes from, instead
    # of whichever project the ambient credentials default to.
    vector_store = FirestoreVectorStore(
        collection=collection,
        embedding_service=embedding,
        client=firestore.Client(project=project_id),
    )

    # Stepping through the list in fixed-size slices never produces an empty
    # slice, so no add_documents([]) call is issued when len(texts) is zero or
    # an exact multiple of batch_size.
    for start in range(0, len(texts), batch_size):
        vector_store.add_documents(texts[start:start + batch_size])
    return vector_store

# Set up vector store

In [8]:
def create_update_vector_store(bucket_name, chunk_size, chunk_overlap, project_id, collection, embed_model_name):
    """Rebuild the Firestore vector store from the contents of a bucket.

    Reads every supported file in the bucket, chunks it, deletes all documents
    currently in ``collection`` and then writes the freshly embedded chunks.

    Args:
        bucket_name: Cloud Storage bucket containing the source documents.
        chunk_size: Chunk size forwarded to ``chunk_documents``.
        chunk_overlap: Chunk overlap forwarded to ``chunk_documents``.
        project_id: GCP project used for the Firestore client and embeddings.
        collection: Firestore collection that holds the vector store.
        embed_model_name: Vertex AI embedding model name.

    Returns:
        The vector store returned by ``create_vector_store``.

    Raises:
        ValueError: If no chunks were produced, so that an empty or
            unreadable bucket cannot wipe the existing collection.
    """
    # Read documents from the whole bucket (empty prefix) and chunk them
    # before touching Firestore, so a read failure leaves the store intact.
    data = read_text_files_from_gcs(bucket_name, "")
    texts = chunk_documents(data, chunk_size, chunk_overlap)
    if not texts:
        raise ValueError(
            f"No text chunks were produced from bucket '{bucket_name}'; "
            "existing vector store left untouched"
        )

    # Delete all existing documents. The collection reference is used
    # directly: building it does not query Firestore, so it is not an
    # existence check, and streaming a missing collection yields nothing.
    db = firestore.Client(project=project_id)
    collection_ref = db.collection(collection)
    for doc in collection_ref.stream():
        doc.reference.delete()

    # Create/Add documents to Vector store
    vector_store = create_vector_store(texts, project_id, embed_model_name, collection)
    return vector_store

In [9]:
# Record how many chunks are stored before the refresh so the change can be reported afterwards.
# The client is pinned to `project_id` so the count is taken from the same
# Firestore project that create_update_vector_store() deletes from.
db = firestore.Client(project=project_id)
collection_ref = db.collection(collection)
docs = collection_ref.stream()
previous_count = doc_count = sum(1 for _ in docs)

In [10]:
# Run this cell to create or refresh the vector store from the bucket contents.
vector_store = create_update_vector_store(
    bucket_name=bucket_name,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    project_id=project_id,
    collection=collection,
    embed_model_name=embed_model_name,
)

Creg015-2018.pdf
Creg038-2014.pdf
Creg075-2021.pdf
Ley_142_de_1994.pdf
Ley_1715_de_2014 (LEY DE ENERGIAS  RENOVABLES).pdf
Ley_2099_de_2021 (LEY DE TRANSICION ENERGETICA).pdf
1145
0 400
400 800


In [11]:
# Count the chunks now in the store and report the change.
# The client is pinned to `project_id` so the count is taken from the same
# Firestore project that create_update_vector_store() deletes from.
db = firestore.Client(project=project_id)
collection_ref = db.collection(collection)
docs = collection_ref.stream()
new_count = doc_count = sum(1 for _ in docs)
print(f"Updated from {previous_count} chunks to {new_count} chunks")

Updated from 798 chunks to 1145 chunks
