# Milvus 

In [None]:
import os

from dotenv import load_dotenv
from milvus import MilvusServer, debug_server, default_server
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    db,
    utility,
)

# load_dotenv() comes from python-dotenv; it is expected to copy the entries of a
# local .env file into the process environment before the lookups below.
load_dotenv()

# Both constants are None when the corresponding environment variable is unset.
DATABASE_PATH = os.environ.get("DATABASE_PATH")
DATA_PATH = os.environ.get("DATA_PATH")

## Set up the DB and connect

In [None]:
def run_milvius() -> None:
    """Run the ``debug_server`` instance imported from the ``milvus`` package."""
    debug_server.run()


def set_settings():
    """Apply the local configuration to ``default_server``.

    Inside a ``with default_server:`` block this sets the server's base
    directory to ``<DATA_PATH>/milvius`` and overrides three config keys
    (log level, proxy port, maximum segment size).

    NOTE(review): the proxy port is set to 19531 here, while ``create_db`` and
    ``connect_db`` in this file connect to port 19530 - confirm which is intended.
    """
    config_overrides = (
        ("system_Log_level", "info"),
        ("proxy_port", 19531),
        ("dataCoord.segment.maxSize", 1024),
    )
    with default_server:
        default_server.set_base_dir(os.path.join(DATA_PATH, "milvius"))
        for option, value in config_overrides:
            default_server.config.set(option, value)

def test_server():
    """Smoke-test the embedded Milvus server.

    Starts ``default_server``, connects to it on the port it reports via
    ``listen_port``, prints the server version and stops the server again.
    The server is stopped even when connecting or the version query fails.
    """
    default_server.start()
    try:
        connections.connect(host="127.0.0.1", port=default_server.listen_port)
        print(utility.get_server_version())
    finally:
        # Without this, a failed connect would leave the server process running.
        default_server.stop()


def drop_collection(collection: str) -> None:
    """Drop a Milvus collection by delegating to ``utility.drop_collection``.

    Args:
        collection: Name of the collection to drop (the notebook passes a
            collection name string).
    """
    utility.drop_collection(collection)

def create_db():
    """Create the ``embeddings`` database on the local Milvus server.

    Connects to 127.0.0.1:19530 (database ``default``) and then asks the
    server to create a database named ``embeddings``.

    NOTE(review): ``connect_db`` in this file connects with
    ``db_name="default"``, so collections created afterwards do not end up in
    ``embeddings`` - confirm whether that is intended.
    """
    connections.connect(host="127.0.0.1", port=19530, db_name="default")
    db.create_database("embeddings")

def connect_db():
    """Connect pymilvus to the local Milvus server (127.0.0.1:19530, database ``default``).

    Returns:
        Whatever ``connections.connect`` returns, passed through unchanged.
        NOTE(review): pymilvus documents ``connect`` as returning ``None`` -
        confirm before relying on the returned value.
    """
    return connections.connect(host="127.0.0.1", port=19530, db_name="default")

## Eine Collection erstellen

In [None]:
def create_collection(collection_name="sentence_embeddings_MINI_LM", dim=384):
    """Build the collection that stores sentences together with their MiniLM embeddings.

    The schema declares four fields in this order: ``filename`` (VARCHAR,
    primary key), ``sentence_id`` (INT64), ``sentence_text`` (VARCHAR) and
    ``sentence_MINI_LM_embed`` (FLOAT_VECTOR). Dynamic fields are enabled.

    Args:
        collection_name: Name of the collection. Defaults to the name used
            throughout this notebook.
        dim: Dimensionality of the embedding vectors. Defaults to 384.

    Returns:
        The ``Collection`` object, bound to the ``default`` connection alias.

    NOTE(review): ``filename`` is the primary key, but the insert cell in this
    notebook sends ``filename`` and ``sentence_id`` side by side, which suggests
    several rows per file - confirm that the primary key is unique enough.
    """
    # No default_value on the primary key: Milvus documents default values as
    # unavailable for primary-key fields.
    filename = FieldSchema(
        name="filename",
        dtype=DataType.VARCHAR,
        max_length=200,
        is_primary=True,
    )
    sentence_id = FieldSchema(
        name="sentence_id",
        dtype=DataType.INT64,
    )
    sentence_text = FieldSchema(
        name="sentence_text",
        dtype=DataType.VARCHAR,
        max_length=6000,
        default_value="",
    )
    # No default_value on the vector field either: defaults are documented for
    # scalar fields only, and an int default does not match FLOAT_VECTOR.
    sentence_mini_lm_embed = FieldSchema(
        name="sentence_MINI_LM_embed",
        dtype=DataType.FLOAT_VECTOR,
        dim=dim,
    )
    schema = CollectionSchema(
        fields=[filename, sentence_id, sentence_text, sentence_mini_lm_embed],
        description="Setences with MINI_LM Embeddings",
        enable_dynamic_field=True,
    )
    return Collection(name=collection_name, schema=schema, using="default", shards_num=2)

In [None]:
# Open the connection to the local Milvus server; `conn` holds whatever connect_db() returns.
conn = connect_db()

In [None]:
# Destructive: drops the MiniLM sentence collection so it can be rebuilt from scratch.
drop_collection("sentence_embeddings_MINI_LM")

In [None]:
# (Re)create the MiniLM sentence collection with the schema defined in create_collection().
create_collection()

## Save MINI_LM in Collection

In [None]:
import sys
sys.path.append('..')
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
from milvus_connect import connect_db
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
    db,
    utility,
)
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the precomputed sentence embeddings; later cells call len() and .tolist() on this object.
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [None]:
# connect_db here is the function imported from milvus_connect in this section's import cell.
connect_db()

In [None]:
# Fetch the "transcript_sentences" table as a DataFrame via the project helper db_get_df.
df = db_get_df("transcript_sentences")

In [None]:
# Sanity checks before inserting: the number of embeddings should match the number
# of DataFrame rows, and the length of one embedding is the vector dimension.
print(len(embeddings))
print(len(df))
print(len(embeddings[0]))
print(df.dtypes)

### Inserting Data

In [None]:
# Get a handle to the collection by name only (no schema is passed here).
collection = Collection("sentence_embeddings_MINI_LM")

In [None]:
def split_into_batches(data, batch_size):
    """Yield consecutive slices of ``data`` holding at most ``batch_size`` items.

    ``data`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(data)`` is not a multiple of ``batch_size``.
    """
    total = len(data)
    for lower in range(0, total, batch_size):
        upper = lower + batch_size
        yield data[lower:upper]

In [None]:
# Rows per insert call, and the index of the first batch to send
# (presumably raised to resume after a partially completed run).
batch_size = 100
start_point = 0


def _column_batches(values):
    """Split one column into batches and drop the batches before ``start_point``."""
    return list(split_into_batches(values, batch_size))[start_point:]


filename_batches = _column_batches(df["filename"].tolist())
sentence_id_batches = _column_batches(df["sentence_id"].tolist())
sentence_text = _column_batches(df["sentence_compound_split"].tolist())
embeddings_batches = _column_batches(embeddings.tolist())

# One entry per column, in the order filename, sentence_id, sentence_text, embedding.
batch_columns = [filename_batches, sentence_id_batches, sentence_text, embeddings_batches]

# Insert each batch into the collection
for i in tqdm(range(len(filename_batches))):
    batch_data = [column[i] for column in batch_columns]
    insert_result = collection.insert(batch_data)


### Building an Index

In [None]:
# Index settings for the embedding field: IVF_FLAT with 1024 clusters, L2 distance.
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024},
}
collection.create_index(
    field_name="sentence_MINI_LM_embed",
    index_params=index_params,
)

In [None]:
# Print the collection's metadata, one attribute per line, in a fixed order.
for _attribute in (
    "schema",         # the schema.CollectionSchema of the collection
    "description",    # the description of the collection
    "name",           # the name of the collection
    "is_empty",       # boolean indicating whether the collection is empty
    "num_entities",   # the number of entities in the collection
    "primary_field",  # the schema.FieldSchema of the primary key field
    "partitions",     # the list[Partition] object
    "indexes",        # the list[Index] object
):
    print(getattr(collection, _attribute))
# print(collection.properties)

In [None]:
# Flush the collection after the bulk insert.
# NOTE(review): num_entities is printed in the previous cell, i.e. before this flush -
# confirm whether the flush should run directly after the inserts instead.
collection.flush()

In [None]:
# Notebook display: list the collections known to the connected server.
utility.list_collections()

## Searching Data

In [None]:
import sys
sys.path.append("..")
from embedding_creation.embedding_creator_MINI_L6 import document_embedding_MINI_LM
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
from milvus_connect import connect_db
from pymilvus import Collection

In [None]:
# Connect to Milvus and open the collection holding the MiniLM sentence embeddings.
connect_db()
collection = Collection("sentence_embeddings_MINI_LM")
# Milvus only searches collections that are loaded; loading an already loaded
# collection is harmless.
collection.load()

# Search settings. The metric matches the "L2" metric the index on
# sentence_MINI_LM_embed is created with in this notebook.
search_params = {
    "metric_type": "L2",
    "offset": 0,
    "ignore_growing": False,
    "params": {"nprobe": 10},
}

In [None]:
# Embed the query text with the project's MiniLM embedding helper.
question = "Kangal"
question_embedding = document_embedding_MINI_LM(question)

In [None]:
# Nearest-neighbour search on the embedding field: 10 hits, unfiltered,
# returning the stored sentence text with each hit.
# NOTE(review): `data` is passed through as-is - confirm that
# document_embedding_MINI_LM returns a list of query vectors, not one flat vector.
results = collection.search(
    data=question_embedding,
    anns_field="sentence_MINI_LM_embed",
    param=search_params,
    limit=10,
    expr=None,
    output_fields=["sentence_text"],
    consistency_level="Strong",
)

## Save tf_idf in collection

In [None]:
import sys
sys.path.append('..')
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz

In [None]:
# Load the pickled TF-IDF vectorizer via the project helper load_pkl.
vertrizer = load_pkl("tfidf_vectorizer_compound_split_87k.pkl")

In [None]:
# Notebook display: size of the vectorizer's vocabulary.
# NOTE(review): get_vocab() is not a scikit-learn TfidfVectorizer method - presumably
# the pickled object is a custom class; confirm.
len(vertrizer.get_vocab())

In [None]:
# Reload the "transcript_sentences" table for the TF-IDF insert below.
df = db_get_df("transcript_sentences")

In [None]:
# Notebook display: column dtypes of the loaded DataFrame.
df.dtypes

In [None]:
# Column-wise payload: one list per field.
# NOTE(review): the order here is sentence_id, filename, text, vector, whereas
# create_collection() in this notebook declares filename, sentence_id, text, vector -
# confirm the schema of the target collection.
# NOTE(review): `collection` is whatever an earlier cell bound (the MiniLM collection
# in this notebook); no TF-IDF collection is created in the visible code - confirm.
data = [
    df["sentence_id"].tolist(),
    df["filename"].tolist(),
    df["sentence_compound_split"].tolist(),
    df["sentence_tf_idf_embed"].tolist(),
]

# Insert the data into the collection
insert_result = collection.insert(data)

In [None]:
import pandas as pd

# Take the "vectors" column of the DataFrame as a plain Python list.
vectors = df['vectors'].to_list()

# NOTE(review): only a single column is inserted here, while the collection schema
# defined in this notebook has four fields - confirm the intended target collection.
mr = collection.insert([vectors])

