In [29]:
import json
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

from pymilvus import connections
# Open the "default" connection alias against the Milvus server on localhost.
connections.connect(alias="default", host='localhost', port='19530')

# Column layout: an auto-generated INT64 primary key, two VARCHAR text
# columns, and a 384-dimensional float vector column.
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
filename_field = FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=128)
sentence_field = FieldSchema(name="sentence", dtype=DataType.VARCHAR, max_length=4096)
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384)
fields = [id_field, filename_field, sentence_field, embedding_field]

# Wrap the fields in a schema; dynamic fields are switched off, so only the
# four declared columns are part of the schema.
schema = CollectionSchema(
    fields=fields,
    description="Schema of docs v2",
    enable_dynamic_field=False,
)

# Bind the schema to the collection named "python_docV2".
collection = Collection(
    name="python_docV2",
    schema=schema,
    description="Python documentation collection V2",
)

# Index settings for the vector column: AUTOINDEX with the L2 metric and no
# extra parameters.
index_params = dict(index_type="AUTOINDEX", metric_type="L2", params={})

# Build the index on "embedding" under the explicit name 'embedding_index'.
collection.create_index(
    field_name="embedding",
    index_name='embedding_index',
    index_params=index_params,
)

Status(code=0, message=)

In [None]:
def insert_docs_to_db(batch_data):
    """Insert ``batch_data`` into the module-level ``collection``.

    Args:
        batch_data: The records to insert; passed unchanged as the ``data``
            argument of ``collection.insert``.

    Returns:
        Whatever ``collection.insert`` returns for this batch.
    """
    return collection.insert(data=batch_data)

In [None]:
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [None]:
# Read the preprocessed records from disk.
json_file_path = "processed_data.json"
data = load_json(file_path=json_file_path)

# Send every record to the collection in one insert call and show the outcome.
insert_result = insert_docs_to_db(batch_data=data)
print(f"Insert result: {insert_result}")

In [28]:
from pymilvus import connections, Collection, utility
# Drop the "python_docV2" collection on the connected server (destructive;
# the collection has to be created again before any further insert).
utility.drop_collection(collection_name="python_docV2")

In [23]:
import json

def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# The helper is defined before it is called so that this cell runs on a fresh
# kernel instead of relying on a definition left over from an earlier execution.
json_file_path = "processed_data.json"
data = load_json(json_file_path)

In [25]:
# Pick the longest 'sentence' value among the records; on a tie the earliest
# record wins, and an empty dataset yields the empty string.
longest_sentence = max(
    (record['sentence'] for record in data),
    key=len,
    default="",
)

print(f"Longest sentence: {longest_sentence}")



In [26]:
# Keep only the records whose 'sentence' is at most 2048 characters long.
filtered_data = []
for record in data:
    if len(record['sentence']) <= 2048:
        filtered_data.append(record)

# Report how many records were kept and how many were dropped.
print(f"Original number of entries: {len(data)}")
print(f"Number of entries after filtering: {len(filtered_data)}")
print(f"Entries removed: {len(data) - len(filtered_data)}")

Original number of entries: 90167
Number of entries after filtering: 90151
Entries removed: 16


In [27]:
# Rebind `data` to the length-filtered records, so every later use of `data`
# sees only the filtered list. NOTE(review): this overwrites the unfiltered
# list; the JSON file has to be reloaded to get it back.
data = filtered_data

In [30]:
def insert_docs_to_db(batch_data):
    """Insert ``batch_data`` into the module-level ``collection``.

    Args:
        batch_data: The records to insert; passed unchanged as the ``data``
            argument of ``collection.insert``.

    Returns:
        Whatever ``collection.insert`` returns for this batch.
    """
    return collection.insert(data=batch_data)

# Number of records sent per insert call.
chunk_size = 10000
# Ceiling division: a trailing partial chunk still counts as one chunk.
total_chunks = -(-len(data) // chunk_size)

for idx in range(total_chunks):
    # Slice out the idx-th window of records; the last window may be shorter.
    chunked_data = data[idx * chunk_size:(idx + 1) * chunk_size]

    # Insert this window and report the result returned for it.
    insert_result = insert_docs_to_db(chunked_data)
    print(f"Insert result for chunk {idx+1}/{total_chunks}: {insert_result}")

Insert result for chunk 1/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009038327810, success count: 10000, err count: 0)
Insert result for chunk 2/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009339793412, success count: 10000, err count: 0)
Insert result for chunk 3/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009627889666, success count: 10000, err count: 0)
Insert result for chunk 4/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009916248068, success count: 10000, err count: 0)
Insert result for chunk 5/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857010204606465, success count: 10000, err count: 0)
Insert result for chunk 6/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857010492964868, success count: 10000, err count: 0)
Insert result for chunk 7/10: (insert count: 10000, delete count: 0, upsert count: