In [12]:
!pip install kfp[kubernetes] -q

In [36]:
import os

import kfp
from kfp import dsl
from kfp import kubernetes

In [None]:
# Kubeflow Pipelines client used at the end of the notebook to submit the run.
# No host or credentials are passed, so the client relies on its own defaults.
# NOTE(review): presumably this notebook runs inside the cluster (e.g. a Kubeflow
# notebook server) where the endpoint is discovered automatically — confirm.
kfp_client = kfp.Client()

# Steps

In [27]:
@dsl.component(
    base_image='python:3.11',
    packages_to_install=["opensearch-py==2.7.1"]
)
def setup_os(rag_index_name:str = "rag_index", force_recreate: bool = True):
    """Ensure the OpenSearch k-NN index used for RAG exists.

    Connection settings are read from the environment variables
    OPENSEARCH_HOST, OPENSEARCH_PORT, OPENSEARCH_USER and OPENSEARCH_PASSWORD.

    Args:
        rag_index_name: Name of the index to create.
        force_recreate: When True, delete the index first (if it exists) so
            that it is always created from scratch.

    Raises:
        KeyError: If one of the required environment variables is missing.
        ValueError: If OPENSEARCH_PORT is not an integer.
        RuntimeError: If OpenSearch does not acknowledge the index creation
            or the mapping update.
    """
    import os
    from opensearchpy import NotFoundError, OpenSearch

    def delete_opensearch_index(opensearch_client, index_name):
        """Delete ``index_name``; a missing index counts as success.

        Only the "index does not exist" case is swallowed. Any other failure
        (authentication, connectivity, ...) propagates instead of being
        reported as "not found".
        """
        print(f"Trying to delete index {index_name}")
        try:
            response = opensearch_client.indices.delete(index=index_name)
        except NotFoundError:
            print(f"Index {index_name} not found, nothing to delete")
            return True
        print(f"Index {index_name} deleted")
        return bool(response['acknowledged'])

    def create_index(opensearch_client, index_name):
        """Create ``index_name`` with k-NN enabled; return the acknowledged flag."""
        settings = {
            "settings": {
                "index": {
                    "knn": True
                }
            }
        }
        response = opensearch_client.indices.create(index=index_name, body=settings)
        return bool(response['acknowledged'])

    def create_index_mapping(opensearch_client, index_name):
        """Add the vector/text field mapping; return the acknowledged flag."""
        response = opensearch_client.indices.put_mapping(
            index=index_name,
            body={
                "properties": {
                    "vector_field": {
                        "type": "knn_vector",
                        # NOTE(review): must equal the embedding size produced by
                        # the ingestion step (presumably a 384-dim model) - confirm.
                        "dimension": 384
                    },
                    "text": {
                        "type": "keyword"
                    }
                }
            }
        )
        return bool(response['acknowledged'])

    host = os.environ['OPENSEARCH_HOST']
    # Environment variables are strings; convert so a malformed port fails
    # here with a clear error rather than deep inside the HTTP client.
    port = int(os.environ['OPENSEARCH_PORT'])
    auth = (
        os.environ['OPENSEARCH_USER'],
        os.environ['OPENSEARCH_PASSWORD']
    )

    # SECURITY NOTE: TLS is used, but certificate and hostname verification are
    # disabled. Acceptable for a demo cluster with self-signed certificates;
    # enable verification (and provide the CA bundle) for anything else.
    client = OpenSearch(
        hosts = [{'host': host, 'port': port}],
        http_compress = True,
        http_auth = auth,
        use_ssl = True,
        verify_certs = False,
        ssl_assert_hostname = False,
        ssl_show_warn = False
    )

    if force_recreate:
        delete_opensearch_index(client, rag_index_name)

    if client.indices.exists(index=rag_index_name):
        print("Opensearch index already exists")
        return

    print("Creating OpenSearch index")
    if not create_index(client, rag_index_name):
        raise RuntimeError(f"OpenSearch did not acknowledge creation of index {rag_index_name}")

    print("Creating OpenSearch index mapping")
    if not create_index_mapping(client, rag_index_name):
        raise RuntimeError(f"OpenSearch did not acknowledge the mapping for index {rag_index_name}")
    print("OpenSearch Index mapping created")

In [45]:
@dsl.component(
    base_image='python:3.11',
    packages_to_install=["minio<7.0"]
)
def download_data(bucket_name: str = "rag-demo-source", data_mount_point: str = "/data", data_folder: str = "raw") -> str:
    """Download the objects of a MinIO bucket onto the mounted data volume.

    The endpoint is read from MINIO_ENDPOINT_URL and the credentials from
    AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.

    Args:
        bucket_name: Bucket whose top-level objects are downloaded.
        data_mount_point: Directory where the data volume is mounted.
        data_folder: Sub-folder of ``data_mount_point`` receiving the files.

    Returns:
        ``data_folder``, i.e. the folder (relative to the mount point) that
        now contains the downloaded files.

    Raises:
        KeyError: If one of the required environment variables is missing.
    """
    import os
    from urllib.parse import urlparse

    from minio import Minio

    # Accept "http://host:port", "https://host:port" and a bare "host:port".
    # The "//" prefix makes urlparse treat a scheme-less value as a network
    # location instead of mistaking the host name for a URL scheme.
    endpoint_url = os.environ["MINIO_ENDPOINT_URL"]
    parsed_endpoint = urlparse(endpoint_url if "://" in endpoint_url else f"//{endpoint_url}")

    # Initialize a MinIO client
    mc = Minio(
        endpoint=parsed_endpoint.netloc,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        # TLS only when the configured URL asks for it (plain HTTP otherwise).
        secure=parsed_endpoint.scheme == "https",
    )

    for obj in mc.list_objects(bucket_name):
        # The listing is not recursive, so nested prefixes show up as
        # directory entries; those cannot be fetched as files.
        if getattr(obj, "is_dir", False):
            print("\t", "Skipped prefix", obj.object_name)
            continue
        mc.fget_object(bucket_name, obj.object_name, f"{data_mount_point}/{data_folder}/{obj.object_name}")
        print("\t", "Downloaded", obj.object_name)

    return data_folder

In [46]:
@dsl.component
def remove_unsupported_files():
    """Placeholder for the step that filters out unsupported files.

    TODO: not implemented yet - the component currently only logs a message.
    """
    # Log the step's own name so pipeline logs are not confused with the
    # download step.
    print("remove unsupported files")

@dsl.component
def ingest_os():
    """Placeholder for the step that ingests the data into OpenSearch.

    TODO: not implemented yet - the component currently only logs a message.
    """
    # Log the step's own name so pipeline logs are not confused with the
    # download step.
    print("ingest into OpenSearch")


In [47]:
@dsl.pipeline(
    name="Ingestion Pipeline",
    description="Ingest data from S3 to OpenSearch"
)
def add_pipeline(
    rag_index_name:str = "rag_index",
    rag_index_force_recreate:bool = True,
):
    """Set up the OpenSearch index, then download, filter and ingest the data.

    Steps, in order: ``setup_os`` -> ``download_data`` ->
    ``remove_unsupported_files`` -> ``ingest_os`` -> delete the PVC. The three
    data steps share one PVC that is created for the run.

    Every step has side effects outside of KFP (OpenSearch, the shared
    volume), so caching is disabled on all of them: a cache hit would skip
    the step and leave the index or the freshly created volume untouched.

    Args:
        rag_index_name: Name of the OpenSearch index to prepare.
        rag_index_force_recreate: Drop and recreate the index if it exists.
    """
    # Single source of truth for where the shared volume is mounted; also
    # passed to download_data so both sides cannot drift apart.
    data_mount_path = '/data'

    setup_os_task = setup_os(
        rag_index_name = rag_index_name,
        force_recreate = rag_index_force_recreate
    ).set_caching_options(
        enable_caching = False
    )
    kubernetes.use_secret_as_env(setup_os_task,
                                 secret_name='opensearch-secret',
                                 secret_key_to_env={
                                     'username': 'OPENSEARCH_USER',
                                     'password': 'OPENSEARCH_PASSWORD',
                                     'host': 'OPENSEARCH_HOST',
                                     'port': 'OPENSEARCH_PORT',
                                 })

    pvc_data_ingestion = kubernetes.CreatePVC(
        pvc_name_suffix='-data-ingestion',
        access_modes=['ReadWriteMany'],
        size='1Gi',
        storage_class_name='microk8s-hostpath',
    )

    # MINIO_ENDPOINT_URL is read from the notebook's environment when the
    # pipeline is compiled and baked into the task as a plain env variable.
    download_data_task = download_data(
        bucket_name = "rag-demo-source",
        data_mount_point = data_mount_path,
    ).set_env_variable(
        "MINIO_ENDPOINT_URL", os.environ["MINIO_ENDPOINT_URL"]
    ).set_caching_options(
        enable_caching = False
    ).after(setup_os_task)
    kubernetes.use_secret_as_env(download_data_task,
                                 secret_name='mlpipeline-minio-artifact',
                                 secret_key_to_env={
                                     'accesskey': 'AWS_ACCESS_KEY_ID',
                                     'secretkey': 'AWS_SECRET_ACCESS_KEY',
                                 })

    remove_unsupported_files_task = remove_unsupported_files().set_caching_options(
        enable_caching = False
    ).after(download_data_task)

    ingest_os_task = ingest_os().set_caching_options(
        enable_caching = False
    ).after(remove_unsupported_files_task)

    # All data steps see the same volume at the same path.
    for data_task in (download_data_task, remove_unsupported_files_task, ingest_os_task):
        kubernetes.mount_pvc(
            data_task,
            pvc_name=pvc_data_ingestion.outputs['name'],
            mount_path=data_mount_path,
        )

    # Clean up the volume once the last consumer has finished.
    kubernetes.DeletePVC(
        pvc_name=pvc_data_ingestion.outputs['name']
    ).after(ingest_os_task)

In [49]:
# Compile the pipeline function and submit it as a run; the returned
# RunPipelineResult is the cell's output.
run_arguments = dict(
    rag_index_name="rag_index",
    rag_index_force_recreate=True,
)
kfp_client.create_run_from_pipeline_func(add_pipeline, arguments=run_arguments)

RunPipelineResult(run_id=8205624f-7865-4b79-bd18-79318e85c723)