In [None]:
# Settings for the ingestion pipeline. Each value may be supplied through an
# environment variable of the same name so that real credentials do not have
# to be written into the notebook. The placeholder literals are only
# fallbacks and must be replaced (or overridden) before running.
import os

FOLDER_PATH = os.environ.get("FOLDER_PATH", "/path/to/your/folder")
ES_CLOUD_ID = os.environ.get("ES_CLOUD_ID", "your_elasticsearch_cloud_id")
INDEX_NAME = os.environ.get("INDEX_NAME", "your_index_name")
ES_USER = os.environ.get("ES_USER", "your_elasticsearch_username")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "your_elasticsearch_password")

In [None]:
from langchain.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import ElasticsearchStore
from elasticsearch import Elasticsearch

import os
from glob import glob
'''
Load the documents and split them into chunks to be pushed into Elasticsearch.
'''
def load_documents_from_folder(folder_path):
    """Load every PDF and Word (.docx) file found directly inside a folder.

    Args:
        folder_path: Directory to scan. Only entries matched by ``*``
            directly inside it are considered; sub-directories are not
            searched.

    Returns:
        A flat list holding everything returned by each loader's ``load()``
        call, in sorted path order. Entries with any other extension are
        skipped, so the list is empty when nothing matches.
    """
    # Local import: the file imports only the ``glob`` function itself.
    from glob import escape

    # escape() stops characters such as "[" in the folder name from being
    # read as glob wildcards; fspath() keeps path-like arguments working.
    pattern = os.path.join(escape(os.fspath(folder_path)), "*")

    documents = []
    # sorted() makes the load order reproducible across runs and platforms.
    for file in sorted(glob(pattern)):
        # Compare the extension case-insensitively so that "REPORT.PDF" or
        # "Notes.Docx" are not silently dropped.
        extension = os.path.splitext(file)[1].lower()
        if extension == ".pdf":
            loader = PyMuPDFLoader(file)
        elif extension == ".docx":
            loader = UnstructuredWordDocumentLoader(file)
        else:
            continue
        documents.extend(loader.load())
    return documents

def split_documents(documents, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Value passed as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded.
        chunk_overlap: Value passed as ``chunk_overlap`` to the splitter.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Load every supported file from FOLDER_PATH, then split the result into
# chunks; `split_docs` is what the ingestion cell below consumes.
docs = load_documents_from_folder(FOLDER_PATH)
split_docs = split_documents(docs)

In [None]:
'''
Push the document chunks into Elasticsearch.
'''

def create_elasticsearch_store(docs, model_name="all-MiniLM-L6-v2"):
    """Embed the given documents and index them in Elasticsearch.

    Args:
        docs: Documents passed to ``ElasticsearchStore.from_documents``.
        model_name: Name handed to ``HuggingFaceEmbeddings``. Defaults to
            "all-MiniLM-L6-v2", the model that used to be hard-coded.

    Returns:
        The store object returned by ``ElasticsearchStore.from_documents``.
    """
    # NOTE(review): the later cell that reopens the index hard-codes the
    # same model name; keep the two in sync if a different model is used.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Connection details come from the module-level settings.
    return ElasticsearchStore.from_documents(
        documents=docs,
        embedding=embeddings,
        es_cloud_id=ES_CLOUD_ID,
        index_name=INDEX_NAME,
        es_user=ES_USER,
        es_password=ES_PASSWORD,
    )
# Ingest the chunks in `split_docs` and keep the returned store handle.
elasticsearch_store = create_elasticsearch_store(split_docs)

In [None]:
# Build an Elasticsearch client from the cloud id and basic-auth credentials
# defined in the settings cell.
es_connection = Elasticsearch(
    basic_auth=(ES_USER, ES_PASSWORD),
    cloud_id=ES_CLOUD_ID,
)

# Wrap that client in a vector store bound to INDEX_NAME. This rebinds
# `elasticsearch_store` without calling `from_documents`.
elasticsearch_store = ElasticsearchStore(
    es_connection=es_connection,
    index_name=INDEX_NAME,
    embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
)