## Library Installation

In [14]:
# !pip install pymilvus==2.4.1
# !pip install langchain==0.1.6
# !pip install sentence-transformers
# !pip install PyPDF2==1.23
# !pip install pypdf  # needed by langchain's PyPDFLoader, which imports pypdf (not PyPDF2)

In [1]:
# For watson studio
# !pip install --force-reinstall typing-extensions
# !pip install --force-reinstall packaging
# !pip install grpcio==1.58.0

In [14]:
import logging
import os
from getpass import getpass

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from sentence_transformers import SentenceTransformer

## Credentials

In [15]:
### Milvus TLS certificate
# PEM-encoded certificate text. It is written to disk below because the
# connection call in this notebook takes a file path (server_pem_path),
# not the certificate contents.
cert="""-----BEGIN CERTIFICATE-----
MIIDzzCCAregAwIBAgIUHvv3+lYxL4ZyBS6R9qD0D9pqfaIwDQYJKoZIhvcNAQEL
BQAwdzELMAkGA1UEBhMCR0IxDzANBgNVBAgMBkxvbmRvbjESMBAGA1UEBwwJWW9y
ayBSb2FkMRswGQYDVQQKDBJDbGllbnQgRW5naW5lZXJpbmcxEjAQBgNVBAsMCUFT
IGFuZCBQVzESMBAGA1UEAwwJbG9jYWxob3N0MB4XDTIzMDgxMDE1MDIwMVoXDTMz
MDgwNzE1MDIwMVowdzELMAkGA1UEBhMCR0IxDzANBgNVBAgMBkxvbmRvbjESMBAG
A1UEBwwJWW9yayBSb2FkMRswGQYDVQQKDBJDbGllbnQgRW5naW5lZXJpbmcxEjAQ
BgNVBAsMCUFTIGFuZCBQVzESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAw5GJ9I0yB+FD53ro7tQnPWlMfMfO9jOojztA
EIyVFgFhoZ+CZJH+3y1e2GZm7a3wIbQS6f0Y1rZGMktAq+8UPASMSarVraiWYsrL
4znFboNFRJ2wInnPlYJis6lbCffahHzE+ye3Mx6zeSQAijImCRtaCCwZzD93kVFB
MDFHQGAEwga5plAgHhkfpXrqrzVRq1idNiojj0PRSofhb0ywWbGyjTlbC7u6odcH
as78+S6SbXHM5AqAqfTMRPZKrRmphEDYYGNG+VBfUuVI6vqd3fS7xA0AImZ7j/CW
QbFdh9TLXl+D5dVToykIgFdjtkez93ORG1HYskrZnVlGObgOnwIDAQABo1MwUTAd
BgNVHQ4EFgQUZEW3JzZOej4jLgbKcmn/t9IYQ58wHwYDVR0jBBgwFoAUZEW3JzZO
ej4jLgbKcmn/t9IYQ58wDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOC
AQEAZIwIRF7Wdx/QuseV13ALfZRjHWkFHaYLgUjXW+rIyCUEr1Iu421PF/CEJKMb
kQ3T+DGDBPjrWlTxQFAJoVpvsbVeaM6qRsqHe1z9xJ4tHYYUKwdeAJl5lnrGD027
HPP0qAUvm+D2NepMOJyomktB8J8TOS+2KWpot0HZtteFP4S53Lo7+tOu9374fF/2
eg/QZkZzKaiQhcnFir7etyQBBFvo4gXXHgo884hYA8DltGpA3zlIIkTKeftafjQ+
jyAnDpq+rQ3hfKMhjeC1ATausae6td6VRP55ZfOrM4t+DEbrmh/WGM3NzzMjf91M
b8a2Cp2o7BLIPi8LwWfkQM+4dQ==
-----END CERTIFICATE-----"""
# Write the certificate to "cert-milvus.pem" in the current working
# directory (overwriting any existing file of that name).
with open("cert-milvus.pem", "w") as file:
    file.write(cert)

# Connection settings come from the environment so that no secret is stored
# in the notebook. Host and port fall back to the workshop defaults; the
# password is asked for interactively when MILVUS_PASSWORD is not set.
# NOTE(review): any password that was previously committed in this cell
# should be treated as exposed and rotated.
milvus_host = os.environ.get("MILVUS_HOST", "158.175.191.130")
milvus_port = os.environ.get("MILVUS_PORT", "8080")
milvus_password = os.environ.get("MILVUS_PASSWORD") or getpass("Milvus password: ")

## Connect to Milvus

In [16]:
import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

from pymilvus import (
    connections,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    MilvusClient
)

# host = "na4.services.cloud.techzone.ibm.com"
# port = 40403
# apiuser = "ibmlhadmin"
# apikey = "password"
# server_pem_path = "cert-milvus.pem"

# client = MilvusClient(
#     uri="https://ibmlhadmin:password@na4.services.cloud.techzone.ibm.com:40403",
#     server_pem_path=server_pem_path,
#     server_name="watsonxdata",
#     secure=True,
# )

# Register the "default" connection alias used by every helper below.
# TLS is verified against the certificate file written in the credentials
# cell; server_name must match the name the certificate was issued for.
print("=== start connecting to Milvus ===")
connections.connect(
    alias="default",
    host=milvus_host,
    port=milvus_port,
    secure=True,
    server_pem_path="cert-milvus.pem",
    server_name="localhost",
    user="root",
    password=milvus_password,
)

=== start connecting to Milvus ===


In [22]:
"""Module providing utility functions for Milvus"""

import logging
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from sentence_transformers import SentenceTransformer

def create_collection(
    milvus_connection_alias: str = "default",
    collection_name: str = "ggf_collection",
    embedding_dim: int = 384,
    shards_num: int = 2,
    ) -> None:
    """Create a Milvus collection with the default embedding schema.

    The schema has five fields: an auto-generated INT64 primary key ("id"),
    the float vector ("embedding_vector"), the embedded text
    ("embedding_raw"), a "document_id" and a "metadata_json" string.
    Dynamic fields are enabled.

    Args:
        milvus_connection_alias: Alias of an already established pymilvus
            connection.
        collection_name: Name of the collection to create.
        embedding_dim: Dimension of the "embedding_vector" field. It must
            equal the output size of the embedding model used to fill the
            collection; 384 is the size produced by the
            all-MiniLM-L6-v2 model that the other helpers default to.
        shards_num: Number of shards for the new collection.
    """
    # Defining the default collection schema
    idx = FieldSchema(
        name = "id",
        description = "Embedding ID",
        dtype = DataType.INT64,
        is_primary = True,
        auto_id = True,
    )
    embedding_vector = FieldSchema(
        name = "embedding_vector",
        description = "Embedding vector",
        dtype = DataType.FLOAT_VECTOR,
        dim = embedding_dim,
    )
    embedding_raw = FieldSchema(
        name = "embedding_raw",
        description = "Embedding raw value",
        dtype = DataType.VARCHAR,
        max_length = 65535,
    )
    document_id = FieldSchema(
        name = "document_id",
        description = "Document ID",
        dtype = DataType.VARCHAR,
        max_length = 256
    )
    metadata = FieldSchema(
        name = "metadata_json",
        description = "Metadata in JSON format",
        dtype = DataType.VARCHAR,
        max_length = 65535
    )
    default_schema = CollectionSchema(
        fields = [
            idx,
            embedding_vector,
            embedding_raw,
            document_id,
            metadata
        ],
        description = "Default collection schema",
        enable_dynamic_field = True
    )
    logging.debug("Collection schema defined.")

    # Creating the default collection. Constructing a Collection with a
    # schema is what creates it on the server; the handle itself is not
    # needed afterwards, so it is not kept.
    Collection(
        name = collection_name,
        schema = default_schema,
        using = milvus_connection_alias,
        shards_num = shards_num,
    )
    
def embed_pdf_text(
    document_id: str,
    file_path: str,
    milvus_connection_alias: str = "default",
    collection_name: str = "collection_vietnam",
    hf_model_id: str = 'sentence-transformers/all-MiniLM-L6-v2'
    ) -> None:
    """Split a PDF into text chunks, embed them and insert them into Milvus.

    Args:
        document_id: Value stored in the "document_id" field of every chunk.
        file_path: Path of the PDF file to load.
        milvus_connection_alias: Alias of an already established pymilvus
            connection.
        collection_name: Name of an existing collection to insert into.
        hf_model_id: Sentence-transformers model used to embed the chunks.
            Its output size must match the collection's vector dimension.
    """
    # Loading text from pdf document
    loader = PyPDFLoader(file_path)
    text_splitter = CharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 100
    )
    docs = loader.load_and_split(
        text_splitter = text_splitter
    )

    # A PDF without extractable text yields no chunks; skip the insert
    # instead of sending empty columns to Milvus.
    if not docs:
        logging.warning("No text chunks extracted from PDF %s; nothing inserted.", file_path)
        return

    # Embed all chunks in one batched encode() call rather than one call
    # per chunk. list() turns the result into one vector per row.
    texts = [doc.page_content for doc in docs]
    model = SentenceTransformer(hf_model_id)
    embeddings = list(model.encode(texts))

    # Column-ordered data matching the non-auto-id schema fields:
    # embedding_vector, embedding_raw, document_id, metadata_json.
    data = [
        embeddings,
        texts,
        [document_id] * len(texts),
        ["{}"] * len(texts),
    ]
    logging.debug("Text was successfully embedded from PDF %s.", file_path)

    # store the data in Milvus collection
    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )
    collection.insert(data)

def build_vector_index(
    milvus_connection_alias: str = "techzone_connection",
    collection_name: str = "default_collection"
):
    """Create an IVF_FLAT / L2 index on the "embedding_vector" field.

    This cell used to define the function twice with identical bodies; the
    later definition replaced the earlier one, so its defaults are the ones
    kept here.

    NOTE(review): these defaults differ from the other helpers in this
    cell, which default to the "default" connection alias. Pass both
    arguments explicitly unless a "techzone_connection" alias is connected.

    Args:
        milvus_connection_alias: Alias of an already established pymilvus
            connection.
        collection_name: Name of the collection whose vectors are indexed.
    """
    # Parameters of the index being created
    index_params = {
        "metric_type":"L2",
        "index_type":"IVF_FLAT",
        "params":{"nlist":1024}
        }

    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )

    # Create an index from the embeddings vectors
    collection.create_index(
        field_name = "embedding_vector",
        index_params = index_params
    )

def similarity_search(
    user_question: str,
    limit: int = 3,
    milvus_connection_alias: str = "default",
    collection_name: str = "workhsop_collection",
    hf_model_id: str = 'sentence-transformers/all-MiniLM-L6-v2'
    ) -> list:
    """Return the stored text chunks closest to a question.

    The question is embedded with the given sentence-transformers model,
    the collection is searched on "embedding_vector" with the L2 metric,
    and the matching rows are fetched by primary key.

    Args:
        user_question: Natural-language question to search for.
        limit: Maximum number of chunks to return.
        milvus_connection_alias: Alias of an already established pymilvus
            connection.
        collection_name: Name of the collection to search.
        hf_model_id: Embedding model; should be the same model that was
            used to fill the collection.

    Returns:
        A list of dicts with the keys "id", "embedding_raw", "document_id"
        and "metadata_json", ordered from closest to farthest match. The
        list is empty when the search returns no hits.
    """

    # Search parameters
    search_params = {
        "metric_type": "L2", 
        "offset": 0, 
        "ignore_growing": False, 
        "params": {"nprobe": 10}
    }

    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )
    collection.load()
    logging.debug("Collection loaded.")

    # Everything after load() runs under try/finally so the collection is
    # released even when embedding, searching or querying raises.
    try:
        # Embedding model
        model = SentenceTransformer(hf_model_id)
        logging.debug("Embedding model loaded.")

        # Search the index for the `limit` closest vectors. No output
        # fields are requested here: only the primary keys are used, the
        # text is fetched by the query below.
        results = collection.search(
            data = [model.encode(user_question)],
            anns_field = "embedding_vector",
            param = search_params,
            limit = limit,
            expr = None,
            consistency_level = "Strong"
        )

        hit_ids = list(results[0].ids)
        if not hit_ids:
            logging.debug("Search returned no hits.")
            return []

        # Retrieving the text associated with the results ids
        results_text = collection.query(
            expr = "id in {}".format(hit_ids),
            output_fields = ["id", "embedding_raw", "document_id", "metadata_json"],
            consistency_level="Strong"
        )
    finally:
        collection.release()

    # query() does not promise rows in search-rank order, so put them back
    # in the order the hits were returned (closest first).
    rank_by_id = {pk: position for position, pk in enumerate(hit_ids)}
    results_text.sort(key=lambda row: rank_by_id.get(row["id"], len(rank_by_id)))
    logging.debug("Text chunks succesfully retrieved.")

    return results_text


E0610 21:04:11.776736000 6309933056 chttp2_connector.cc:266]           Failed to create channel args during subchannel creation: INTERNAL: Failed to create secure subchannel for secure name 'localhost'; Got args: {grpc.client_channel_factory=0x10a8d9a50, grpc.default_authority=localhost, grpc.enable_retries=1, grpc.internal.channel_credentials=0x33f8ed780, grpc.internal.event_engine=0x108621110, grpc.internal.subchannel_pool=0x33f484a80, grpc.internal.transport=(nil), grpc.keepalive_time_ms=55000, grpc.max_receive_message_length=-1, grpc.max_send_message_length=-1, grpc.primary_user_agent=grpc-python/1.60.0, grpc.resource_quota=0x33f481680, grpc.server_uri=dns:///158.175.191.130:8080, grpc.ssl_target_name_override=localhost}


In [23]:
# Create the working collection on first run; otherwise show what exists.
collection_name = "collection_vietnam"
has = utility.has_collection(collection_name)

if not has:
    print("The collection is not available")
    create_collection(milvus_connection_alias="default", collection_name=collection_name)
else:
    print("The collection is available")
    print(utility.list_collections())

    collection = Collection(name=collection_name)
    print(collection)

The collection is available
['collection', 'collection_vietnam']
<Collection>:
-------------
<name>: collection_vietnam
<description>: Default collection schema
<schema>: {'auto_id': True, 'description': 'Default collection schema', 'fields': [{'name': 'id', 'description': 'Embedding ID', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding_vector', 'description': 'Embedding vector', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'embedding_raw', 'description': 'Embedding raw value', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'document_id', 'description': 'Document ID', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'metadata_json', 'description': 'Metadata in JSON format', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}], 'enable_dynamic_field': True}



In [24]:
# Chunk, embed and insert the sample PDF into the working collection.
embed_pdf_text(
    document_id="policy",
    file_path="data/paper_flowers.pdf",
    milvus_connection_alias="default",
    collection_name=collection_name,
)

In [25]:
# Index the embedding vectors of the working collection.
build_vector_index(
    milvus_connection_alias="default",
    collection_name=collection_name,
)

In [26]:
# Retrieve the single closest chunk for a sample question; the bare call is
# the last expression so the notebook displays its return value.
similarity_search(
    "where is the experiment take place?",
    milvus_connection_alias="default",
    collection_name=collection_name,
    limit=1,
)

E0610 21:04:41.778774000 6206222336 chttp2_connector.cc:266]           Failed to create channel args during subchannel creation: INTERNAL: Failed to create secure subchannel for secure name 'localhost'; Got args: {grpc.client_channel_factory=0x10a8d9a50, grpc.default_authority=localhost, grpc.enable_retries=1, grpc.internal.channel_credentials=0x33f8ed780, grpc.internal.event_engine=0x3aa5546b0, grpc.internal.subchannel_pool=0x33f484a80, grpc.internal.transport=(nil), grpc.keepalive_time_ms=55000, grpc.max_receive_message_length=-1, grpc.max_send_message_length=-1, grpc.primary_user_agent=grpc-python/1.60.0, grpc.resource_quota=0x33f481680, grpc.server_uri=dns:///158.175.191.130:8080, grpc.ssl_target_name_override=localhost}


[{'embedding_raw': 'Rev. Ceres, Viçosa, v. 70, n. 2, p. 1-12, mar/apr, 2023\n6 Joelma Gonçalves et al.\nIn relation to irrigation, the rate of colonization (RC) \nincreased as the water availability increased in the pots. \nWith a quadratic tendency, the lowest colonization rate \noccurred with the replacement of 42.44% of the evaporated \nwater (Figure 2).\nThe absence of mycorrhizal colonization in treatments \nthat did not receive inoculant indicated that the method of exposition of the substrate to the solar energy was effective \nin the elimination of propagules of native fungi. However, \nthe rate of colonization found in the plants (32%) receiving \ninoculant is different from those reported in the literature. \nZubek et al.  (2015) found a colonization rate above 80% in \nViola tricolor , using a mix of Rhizophagus irregulares and \nFunneliformis mosseae .\nFigure 1:  Root fragments free of mycorrhizal colonization (A); unit of mycorrhizal infection with presence of hyphopodia 

E0610 21:05:11.785382000 6207369216 chttp2_connector.cc:266]           Failed to create channel args during subchannel creation: INTERNAL: Failed to create secure subchannel for secure name 'localhost'; Got args: {grpc.client_channel_factory=0x10a8d9a50, grpc.default_authority=localhost, grpc.enable_retries=1, grpc.internal.channel_credentials=0x33f8ed780, grpc.internal.event_engine=0x10a67e110, grpc.internal.subchannel_pool=0x33f484a80, grpc.internal.transport=(nil), grpc.keepalive_time_ms=55000, grpc.max_receive_message_length=-1, grpc.max_send_message_length=-1, grpc.primary_user_agent=grpc-python/1.60.0, grpc.resource_quota=0x33f481680, grpc.server_uri=dns:///158.175.191.130:8080, grpc.ssl_target_name_override=localhost}
