In [None]:
import pandas as pd
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from openai import OpenAI
import numpy as np
from tqdm import tqdm
from secret import OPENAI_API_KEY

# Embedding configuration shared by the OpenAI request and the Milvus schema.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536  # must equal `dim` of the FLOAT_VECTOR field defined below


def normalize_embedding(embedding):
    """Return *embedding* scaled to unit L2 norm as a plain list of floats.

    Args:
        embedding: Sequence of numbers (list, tuple or NumPy array).

    Returns:
        A list of floats with unit L2 norm. An all-zero vector is returned
        unchanged because it cannot be normalized.
    """
    # The OpenAI client returns a plain Python list, so convert explicitly
    # instead of relying on list/NumPy-scalar operator fallbacks.
    vector = np.asarray(embedding, dtype=float)
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector.tolist()
    return (vector / norm).tolist()


try:
    # Step 1: Load the Excel File
    file_path = 'merged_lecture_summaries.xlsx'  # Replace with the actual file path
    df = pd.read_excel(file_path)

    # Replace NaN text cells with empty strings *before* embedding, so the
    # embedding step never receives a float NaN as input.
    df['lecture_summary'] = df['lecture_summary'].fillna("")
    df['whole_video_summary'] = df['whole_video_summary'].fillna("")
    df["lecture_name"] = "GAIT: Generative AI Tools"

    # Step 2: Generate Embeddings for Lecture Summaries with Progress Bar
    # One client is shared by every request instead of being rebuilt per row.
    client = OpenAI(
        api_key=OPENAI_API_KEY,
    )

    def generate_embeddings(text):
        """Return the unit-normalized embedding of *text* as a list of floats.

        Args:
            text: The lecture summary to embed.

        Returns:
            A list of floats. Empty, whitespace-only or non-string input
            yields a zero vector of length ``EMBEDDING_DIM`` without calling
            the API, since the embeddings endpoint rejects empty input.
        """
        if not isinstance(text, str) or not text.strip():
            return [0.0] * EMBEDDING_DIM
        response = client.embeddings.create(
            input=text,
            model=EMBEDDING_MODEL,
        )
        return normalize_embedding(response.data[0].embedding)

    print("Generating embeddings...")
    tqdm.pandas(desc="Embedding Progress")
    df['lecture_embedding'] = df['lecture_summary'].progress_apply(generate_embeddings)

    # Step 3: Connect to Milvus
    print("Connecting to Milvus...")
    connections.connect(host='127.0.0.1', port='19530')  # Update host and port as necessary

    # Step 4: Create Milvus Collection
    collection_name = "gaits_lecture_data_collection"

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="lecture_name", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(name="video_name", dtype=DataType.VARCHAR, max_length=500),
        # `max_length` only applies to VARCHAR fields, so it is omitted for INT64.
        FieldSchema(name="start_time", dtype=DataType.INT64),
        FieldSchema(name="end_time", dtype=DataType.INT64),
        FieldSchema(name="lecture_summary", dtype=DataType.VARCHAR, max_length=6000),
        FieldSchema(name="whole_video_summary", dtype=DataType.VARCHAR, max_length=6000),
        FieldSchema(name="lecture_embedding", dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM),
    ]

    schema = CollectionSchema(fields, description="Lecture Data Collection")
    collection = Collection(name=collection_name, schema=schema)

    # Step 5: Insert Data into the Collection
    # Column order must follow `fields`, skipping the auto-generated `id`.
    print("Inserting data into Milvus...")
    data_to_insert = [
        df["lecture_name"].tolist(),
        df['video_name'].tolist(),
        df['start_time'].tolist(),
        df['end_time'].tolist(),
        df['lecture_summary'].tolist(),
        df['whole_video_summary'].tolist(),
        df['lecture_embedding'].tolist(),
    ]

    collection.insert(data_to_insert)
    # Seal the inserted rows so they are persisted before the index is built.
    collection.flush()

    # Step 6: Build the Index for Fast Search
    # Inner product on unit-length vectors ranks results like cosine similarity.
    print("Creating index...")
    index_params = {"index_type": "IVF_FLAT", "metric_type": "IP", "params": {"nlist": 128}}
    collection.create_index(field_name="lecture_embedding", index_params=index_params)

    # Step 7: Load and Verify Collection
    collection.load()
    print("Data and embeddings have been successfully uploaded to Milvus.")

except Exception as e:
    # Top-level boundary of this one-shot upload script: report the failure
    # instead of raising out of the notebook cell.
    print(f"An error occurred: {e}")


Generating embeddings...


Embedding Progress: 100%|██████████| 531/531 [06:07<00:00,  1.44it/s]


Connecting to Milvus...
Inserting data into Milvus...
Creating index...
Data and embeddings have been successfully uploaded to Milvus.
