# Ingestion

Ingestion from the bucket into the OpenSearch database.

In [None]:
!pip install "minio<7.0"  opensearch-py langchain langchain-community sentence-transformers
# NOTE: sentence-transformers is needed by the HuggingFaceEmbeddings cell below; remove it only if you switch to a different embeddings backend.

In [None]:
from typing import List
import glob

## Read files

In [None]:
import os
import tempfile
import shutil

from minio import Minio
from minio.error import BucketAlreadyOwnedByYou, NoSuchKey

from urllib.parse import urlsplit

# Name of the bucket the source documents are read from.
BUCKET = "rag-demo-source"

# MINIO_ENDPOINT_URL may be "http://host:port", "https://host:port" or a bare
# "host:port". The MinIO client is given only the "host:port" part plus a
# separate TLS flag, so split the value instead of assuming an "http://" prefix.
_minio_endpoint = os.environ["MINIO_ENDPOINT_URL"]
if "://" in _minio_endpoint:
    _minio_url = urlsplit(_minio_endpoint)
    MINIO_HOST = _minio_url.netloc
    MINIO_SECURE = _minio_url.scheme == "https"
else:
    # No scheme given: fall back to plain HTTP.
    MINIO_HOST = _minio_endpoint.rstrip("/")
    MINIO_SECURE = False

# Initialize a MinIO client
mc = Minio(
    endpoint=MINIO_HOST,
    access_key=os.environ["AWS_ACCESS_KEY_ID"],
    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    secure=MINIO_SECURE,
)

In [None]:
# Scratch directory the bucket contents are downloaded into. It is wiped on
# every run so files left over from a previous run are not ingested again.
TMP_DIR = f"{tempfile.gettempdir()}/rag"
try:
    shutil.rmtree(TMP_DIR)
except FileNotFoundError:
    # First run: there is nothing to clean up yet.
    pass
os.makedirs(TMP_DIR, exist_ok=True)

In [None]:
# Copy every object in the bucket (recursing into prefixes) to TMP_DIR,
# using the object name as the path relative to TMP_DIR.
for remote_object in mc.list_objects(BUCKET, recursive=True):
    object_key = remote_object.object_name
    local_path = f"{TMP_DIR}/{object_key}"
    mc.fget_object(remote_object.bucket_name, object_key, local_path)
    print("Downloaded:", object_key)

## Load as Documents and split

In [None]:
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Map file extensions to document loaders and their arguments.
# Each value is a (loader class, keyword-arguments) pair; load_single_document
# instantiates it as loader_class(file_path, **kwargs). The keys also drive the
# glob patterns used to discover files, so an extension missing here is never
# picked up for ingestion.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    # Plain-text files are always read as UTF-8.
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}

def load_single_document(
    file_path: str,
) -> List[Document]:
    """Load one file with the loader registered for its extension.

    The extension is taken from the final path component and matched
    case-insensitively against LOADER_MAPPING, so "report.PDF" is handled
    like "report.pdf" and a dot inside a directory name is never mistaken
    for an extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The list of Document objects produced by the loader's load().

    Raises:
        ValueError: If no loader is registered for the file's extension.
    """
    ext = os.path.splitext(file_path)[1]
    try:
        loader_class, loader_args = LOADER_MAPPING[ext.lower()]
    except KeyError:
        raise ValueError(f"Unsupported file extension '{ext}'") from None

    loader = loader_class(file_path, **loader_args)
    return loader.load()

In [None]:
# Splitter settings: maximum chunk length and the overlap between
# consecutive chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Collect every downloaded file whose extension has a registered loader.
files = []
for ext in LOADER_MAPPING:
    files.extend(glob.glob(os.path.join(TMP_DIR, f"**/*{ext}"), recursive=True))

files_len = len(files)
docs = []
for count, f in enumerate(files, start=1):
    print(f"Processing file {f}, {count}/{files_len}")
    docs.extend(load_single_document(f))

if not docs:
    print("No new documents to load")
    exit(0)

print(f"Loaded {len(docs)} new documents")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
texts = text_splitter.split_documents(docs)
# No custom length function is passed to the splitter, so CHUNK_SIZE is
# measured in characters rather than tokens.
print(f"Split into {len(texts)} chunks of text (max. {CHUNK_SIZE} characters each)")

## Embeddings

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Embedding model settings. Flip use_gpu to run the model on CUDA instead
# of the CPU.
use_gpu = False
embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"

device = "cuda" if use_gpu else "cpu"
hfe = HuggingFaceEmbeddings(
    model_name=embeddings_model_name,
    model_kwargs={"device": device},
)

## Save to OpenSearch

In [None]:
from opensearchpy import OpenSearch

import os

In [None]:
# Connection details come from the environment; a missing variable raises
# KeyError immediately.
host, port = os.environ['OPENSEARCH_HOST'], os.environ['OPENSEARCH_PORT']
auth = (os.environ['OPENSEARCH_USER'], os.environ['OPENSEARCH_PASSWORD'])

# The connection uses TLS but skips certificate and hostname verification,
# and silences the warning about it.
opensearch_node = {'host': host, 'port': port}
client = OpenSearch(
    hosts=[opensearch_node],
    http_compress=True,
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [None]:
def delete_opensearch_index(opensearch_client, index_name):
    """Delete `index_name` if it exists.

    Args:
        opensearch_client: Client exposing `indices.exists` and
            `indices.delete`.
        index_name: Name of the index to remove.

    Returns:
        True when the index did not exist, otherwise the 'acknowledged'
        value from the delete response.

    Raises:
        Any error raised by the client (connection, authentication, ...).
        These are deliberately not swallowed, so a failed delete is never
        reported as "index not found".
    """
    print(f"Trying to delete index {index_name}")
    if not opensearch_client.indices.exists(index=index_name):
        print(f"Index {index_name} not found, nothing to delete")
        return True

    response = opensearch_client.indices.delete(index=index_name)
    acknowledged = response['acknowledged']
    # Only claim success once the cluster has acknowledged the request.
    if acknowledged:
        print(f"Index {index_name} deleted")
    return acknowledged

def create_index(opensearch_client, index_name):
    """Create `index_name` with the index-level `knn` setting turned on.

    Args:
        opensearch_client: Client exposing `indices.create`.
        index_name: Name of the index to create.

    Returns:
        The response's 'acknowledged' value coerced to bool.
    """
    knn_enabled = {"knn": True}
    index_body = {"settings": {"index": knn_enabled}}
    acknowledged = opensearch_client.indices.create(
        index=index_name, body=index_body
    )['acknowledged']
    return bool(acknowledged)

def create_index_mapping(opensearch_client, index_name, dimension=384):
    """Register the field mapping used for vector search on `index_name`.

    Two fields are declared: `vector_field` as a `knn_vector` with the given
    dimension, and `text` as a `keyword`.

    Args:
        opensearch_client: Client exposing `indices.put_mapping`.
        index_name: Name of the (already created) index.
        dimension: Length of the embedding vectors stored in `vector_field`.
            It must match the output size of the embedding model in use; the
            default of 384 keeps the previous hard-coded value.

    Returns:
        The response's 'acknowledged' value coerced to bool.
    """
    mapping = {
        "properties": {
            "vector_field": {
                "type": "knn_vector",
                "dimension": dimension
            },
            "text": {
                "type": "keyword"
            }
        }
    }
    response = opensearch_client.indices.put_mapping(index=index_name, body=mapping)
    return bool(response['acknowledged'])

In [None]:
INDEX_NAME = "rag_index"
# When True the index is dropped first, so every run starts from an empty index.
FORCE_RECREATE = True

if FORCE_RECREATE:
    delete_opensearch_index(client, INDEX_NAME)

index_exists = client.indices.exists(index=INDEX_NAME)

if index_exists:
    print("Opensearch index already exists")
else:
    print("Creating OpenSearch index")
    if not create_index(client, INDEX_NAME):
        raise RuntimeError(f"OpenSearch did not acknowledge creation of index {INDEX_NAME}")

    print("Creating OpenSearch index mapping")
    # Stop here rather than ingest into an index that lacks the vector mapping.
    if not create_index_mapping(client, INDEX_NAME):
        raise RuntimeError(f"OpenSearch did not acknowledge the mapping for index {INDEX_NAME}")
    print("OpenSearch Index mapping created")

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

# Vector store bound to INDEX_NAME, embedding with `hfe`. It reuses the
# credentials and the relaxed certificate settings of the `client` created
# earlier.
opensearch_url = f"https://{host}:{port}"
opensearch_vector_search = OpenSearchVectorSearch(
    opensearch_url=opensearch_url,
    index_name=INDEX_NAME,
    embedding_function=hfe,
    http_auth=auth,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [None]:
import hashlib

# Number of chunks sent to OpenSearch per add_texts call.
batch_size = 10
texts_len = len(texts)
# Ceiling division so a final, partially filled batch is counted too.
total_batches = (texts_len + batch_size - 1) // batch_size
for batch_number, i in enumerate(range(0, texts_len, batch_size), start=1):
    batch = texts[i:i + batch_size]
    print(f"Processing batch {batch_number}/{total_batches}")
    opensearch_vector_search.add_texts(
        texts=[t.page_content for t in batch],
        # Document ids must be identical across runs so re-ingesting the same
        # chunk overwrites it instead of adding a duplicate. The built-in
        # hash() is randomized per process for str, so use a SHA-256 digest
        # of the chunk text instead.
        # NOTE(review): no visible cell sets an 'ID' metadata key, so the
        # prefix may always be the literal "None" - confirm the intended key.
        ids=[
            f"{t.metadata.get('ID')}_{hashlib.sha256(t.page_content.encode('utf-8')).hexdigest()}"
            for t in batch
        ],
        metadatas=[t.metadata for t in batch],
        bulk_size=batch_size,
    )

print("Finished!")

In [None]:
# Sanity check: run one similarity search through the vector store that was
# just filled; the notebook shows the returned value as the cell output.
opensearch_vector_search.similarity_search("what is FPV drone?")