In [1]:
import pandas as pd
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
import json

# Size of the embedding vectors; shared by the schema and the fallback vector
# so the two can never drift apart. Adjust to match your embedding model.
EMBEDDING_DIM = 1536


def build_columns(records, dim=EMBEDDING_DIM):
    """Convert row-oriented records into the column lists Milvus expects.

    The returned columns follow the schema's field order, skipping the
    auto-generated ``id`` primary key: lecture_name, video_name, start_time,
    end_time, lecture_summary, whole_video_summary, lecture_embedding.

    Args:
        records: Iterable-of-dict data as loaded from the JSON file. It is
            traversed several times, so it must be a list (or similar).
        dim: Required length of every embedding vector.

    Returns:
        A list of seven equally long lists, one per inserted field.

    Raises:
        ValueError: If a record carries an embedding whose length is not
            ``dim``.
    """
    embeddings = []
    for position, record in enumerate(records):
        raw_embedding = record.get("lecture_embedding")
        if raw_embedding is None:
            # A missing (or null) embedding falls back to a zero vector, the
            # same default the scalar fields get, instead of aborting the
            # whole load with a KeyError.
            embeddings.append([0.0] * dim)
            continue

        # JSON numbers may arrive as ints; Milvus needs plain floats.
        vector = [float(value) for value in raw_embedding]
        if len(vector) != dim:
            # Fail early with the offending row rather than a server-side
            # error that does not say which record was wrong.
            raise ValueError(
                f"Record {position}: expected an embedding of length {dim}, "
                f"got {len(vector)}"
            )
        embeddings.append(vector)

    return [
        [record.get("lecture_name", "") for record in records],
        [record.get("video_name", "") for record in records],
        [record.get("start_time", 0) for record in records],
        [record.get("end_time", 0) for record in records],
        [record.get("lecture_summary", "") for record in records],
        [record.get("whole_video_summary", "") for record in records],
        embeddings,
    ]


try:
    # Step 1: Connect to Milvus
    print("Connecting to Milvus...")
    connections.connect(host='127.0.0.1', port='19530')  # Update host and port as necessary

    # Step 2: Define Collection Schema
    collection_name = "gaits_lecture_data_collection"

    fields = [
        # Primary key is generated by Milvus, so it is not part of the inserted columns.
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="lecture_name", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="video_name", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="start_time", dtype=DataType.INT64),
        FieldSchema(name="end_time", dtype=DataType.INT64),
        FieldSchema(name="lecture_summary", dtype=DataType.VARCHAR, max_length=6000),
        FieldSchema(name="whole_video_summary", dtype=DataType.VARCHAR, max_length=6000),
        FieldSchema(name="lecture_embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
    ]

    schema = CollectionSchema(fields, description="Lecture Data Collection")

    # Step 3: Create Collection
    print("Creating collection...")
    collection = Collection(name=collection_name, schema=schema)

    # Step 4: Prepare Data for Insertion
    print("Preparing data for insertion...")
    file_path = "milvus_extracted_data.json"  # Replace with the actual file path to the JSON file

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Organize data into per-field columns, in schema order.
    data_to_insert = build_columns(data, EMBEDDING_DIM)

    # Step 5: Insert Data
    print("Inserting data into Milvus...")
    collection.insert(data_to_insert)
    # Seal the inserted rows so the index below is built over them.
    collection.flush()

    # Step 6: Build the Index for Fast Search
    print("Creating index...")
    index_params = {"index_type": "IVF_FLAT", "metric_type": "IP", "params": {"nlist": 128}}
    collection.create_index(field_name="lecture_embedding", index_params=index_params)

    # Step 7: Load the Collection
    collection.load()
    print("Data successfully inserted and indexed in Milvus.")

except Exception as e:
    # Top-level boundary of a notebook cell: report the failure instead of
    # dumping a traceback, keeping the original best-effort behavior.
    print(f"An error occurred: {e}")


Connecting to Milvus...
Creating collection...
Preparing data for insertion...
Inserting data into Milvus...
Creating index...
Data successfully inserted and indexed in Milvus.
