In [1]:
#!pip install langchain
#!pip install nltk
#!pip install "google-cloud-aiplatform>=1.38.0"
#!pip install pgvector
#!pip install psycopg2-binary
#!pip install unstructured

In [None]:
# Set GOOGLE_APPLICATION_CREDENTIALS to the service-account key file key.json
# (a relative path, so it is resolved against the notebook's working directory).
# NOTE(review): presumably picked up by the Vertex AI client created further down — confirm.
%env GOOGLE_APPLICATION_CREDENTIALS=key.json

In [None]:
# Google Cloud project number; read later through os.environ["GOOGLE_PROJECT_NUMBER"]
# when the embedding model is created.
# NOTE(review): the value is blank here and looks like a placeholder — fill it in before running.
%env GOOGLE_PROJECT_NUMBER=

In [None]:
# Password of the Postgres/pgvector database; read later through
# os.environ["PG_VECTOR_PASSWORD"] when the connection string is built.
# NOTE(review): %env echoes the value it sets, so clear this cell's value and
# output before sharing or committing the notebook.
%env PG_VECTOR_PASSWORD=

In [5]:
import nltk
import os
from langchain.embeddings import VertexAIEmbeddings
from langchain.text_splitter import NLTKTextSplitter
from langchain.vectorstores import PGVector
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores.pgvector import DistanceStrategy as ds
from urllib.parse import quote

In [None]:
# Download the NLTK sentence-tokenizer data needed by NLTKTextSplitter.
# Older NLTK releases load the 'punkt' resource, while recent ones (3.8.2 and
# later) look for 'punkt_tab' instead. nltk is installed without a version pin,
# so fetch both to keep the splitter working on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')

In [7]:
def create_chunks_from_documents(directory, chunk_size=1024, chunk_overlap=200):
    """
    Load every document found in a directory and split it into overlapping chunks.

    Description:
        The documents are read with DirectoryLoader and then split with
        NLTKTextSplitter, using the given chunk size and overlap.

    Parameters:
        directory (str): The path to the directory containing documents.
        chunk_size (int, optional): Forwarded to NLTKTextSplitter as
            ``chunk_size``. Defaults to 1024.
        chunk_overlap (int, optional): Forwarded to NLTKTextSplitter as
            ``chunk_overlap``. Defaults to 200.

    Returns:
        list: The chunks returned by ``NLTKTextSplitter.split_documents``.
        These are LangChain ``Document`` objects (chunk text plus the metadata
        of the source document), not plain strings.

    Dependencies:
        - DirectoryLoader: loads the documents from the given directory.
        - NLTKTextSplitter: splits text into chunks using NLTK (Natural
          Language Toolkit); the NLTK tokenizer data must have been downloaded
          beforehand.

    Example:
        directory_path = '/path/to/documents'
        chunks = create_chunks_from_documents(directory_path)
        smaller_chunks = create_chunks_from_documents(
            directory_path, chunk_size=512, chunk_overlap=50
        )

    Note:
        - Errors raised by the loader or the splitter (for example for an
          unreadable directory or invalid chunk settings) are not caught here.
    """
    doc_loader = DirectoryLoader(directory)
    documents = doc_loader.load()

    # Chunk size and overlap used to be hard-coded; the defaults keep the
    # previous behaviour for callers that pass only the directory.
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return text_splitter.split_documents(documents)

In [21]:
# Vertex AI is addressed with the project *number* rather than the project name, see
# https://stackoverflow.com/questions/66518534/httperror-403-with-consumer-invalid-as-the-reason-when-deploying-a-machine-learn
project_number = os.environ["GOOGLE_PROJECT_NUMBER"]

# Embedding model shared by all vector-store collections created below.
# Alternative model kept for reference: "textembedding-gecko@003".
embed_model = VertexAIEmbeddings(
    model_name="textembedding-gecko-multilingual@001",
    project=project_number,
)

In [None]:
# Load everything under ./data-clean/ and split it into chunks for embedding.
all_docs_chunks = create_chunks_from_documents(directory="./data-clean/")

In [10]:
# Build the SQLAlchemy connection URL for the Postgres/pgvector database.
# The password is read from the environment (KeyError if it is not set) and
# percent-encoded with no "safe" characters, so "@", ":" and also "/" inside
# the password cannot be mistaken for URL delimiters.
password = os.environ["PG_VECTOR_PASSWORD"]
CONNECTION_STRING = f"postgresql+psycopg2://mario:{quote(password, safe='')}@postgres_pgvector:5432/test_db"

In [None]:
# Store the chunks in Postgres as three collections that differ only in their
# distance strategy. The data ends up in two tables:
#   - langchain_pg_collection: collection names and metadata
#   - langchain_pg_embedding: embeddings with documents and metadata
# NOTE(review): every call receives the raw chunks and the embedding model, so
# the documents are presumably embedded once per collection (3x) — confirm.
_store_kwargs = {
    "documents": all_docs_chunks,
    "embedding": embed_model,
    "connection_string": CONNECTION_STRING,
}

# No distance_strategy is passed here, so the library default applies.
PGVector.from_documents(collection_name="all_docs_multi", **_store_kwargs)
PGVector.from_documents(
    collection_name="all_docs_multi_l2",
    distance_strategy=ds.EUCLIDEAN,
    **_store_kwargs,
)
PGVector.from_documents(
    collection_name="all_docs_multi_inner",
    distance_strategy=ds.MAX_INNER_PRODUCT,
    **_store_kwargs,
)