In [16]:
import json
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

from pymilvus import connections
# 1. Connect to Milvus at localhost:19530 and register the connection as "default"
connections.connect(alias="default", host='localhost', port='19530')

# 2. Define fields: an INT64 primary key plus a 384-dimensional float vector
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
]

# 3. Build the schema (dynamic fields are disabled, so only the fields above are declared)
schema = CollectionSchema(
    fields=fields,
    description="Schema of wiki_v1",
    enable_dynamic_field=False,
)

# 4. Create the collection handle that the insert helpers in later cells use
collection = Collection(
    name="wiki_v1",
    schema=schema,
    description="Wiki collection V1",
)

# 5. Create a named HNSW index (L2 metric) on the vector field
index_params = {
    "index_type": "HNSW",
    "metric_type": "L2",
    "params": {"M": 30, "efConstruction": 200},
}

collection.create_index(
    field_name="vector",
    index_params=index_params,
    index_name='embedding_index',
)

Status(code=0, message=)

In [2]:
def insert_docs_to_db(batch_data):
    """Insert ``batch_data`` into the module-level ``collection``.

    Args:
        batch_data: The rows to insert, passed through unchanged as the
            ``data`` argument of ``collection.insert``.

    Returns:
        Whatever ``collection.insert`` returns for this batch.
    """
    return collection.insert(data=batch_data)

In [3]:
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file_path, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [14]:
# Drop the collection named "wiki_v1" through pymilvus' `utility` helper.
# NOTE(review): this cell's execution count (14) is lower than that of the
# cell that creates "wiki_v1" (16), so it looks like a manual reset step that
# was run before re-creating the collection -- confirm before running it in
# sequence, because it targets the same collection name the setup cell creates.
from pymilvus import connections, Collection, utility
utility.drop_collection("wiki_v1")

In [23]:
import json

def load_json(file_path):
    """Return the parsed content of the JSON file located at ``file_path``.

    The file is opened as UTF-8 text and closed again before the parsed
    object is handed back to the caller.
    """
    with open(file_path, 'r', encoding='utf-8') as json_file:
        content = json.load(json_file)
    return content

def insert_docs_to_db(batch_data):
    """Send one batch of rows to the module-level ``collection``.

    ``batch_data`` is forwarded as-is via ``collection.insert(data=...)`` and
    the result of that call is returned to the caller.
    """
    insert_result = collection.insert(data=batch_data)
    return insert_result

In [17]:
import os
import json

def load_json_in_chunks(file_path, chunk_size=10000):
    """Yield successive ``chunk_size``-sized slices of the JSON data in a file.

    The whole file is parsed with ``json.load`` first, so the complete
    document is held in memory; chunking only bounds the size of each yielded
    batch. The file handle is closed before the first chunk is yielded, so it
    is not kept open while the consumer processes each batch.

    Args:
        file_path: Path of a UTF-8 encoded JSON file whose top-level value
            supports ``len()`` and slicing (e.g. a JSON array).
        chunk_size: Maximum number of items per yielded chunk; must be > 0.

    Yields:
        Consecutive slices ``data[i:i + chunk_size]``; the last one may be
        shorter.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Read everything inside the ``with`` block, then leave it so the file is
    # closed before any chunk is handed to the consumer.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]

def insert_docs_to_db(batch_data):
    """Insert only the ``'id'`` and ``'vector'`` entries of each item.

    Every item of ``batch_data`` is reduced to a two-key dict before the batch
    is passed to ``collection.insert``; any other keys on the items are not
    sent. An item lacking either key raises ``KeyError``.

    Returns:
        The result of ``collection.insert`` for the reduced batch.
    """
    kept_keys = ('id', 'vector')
    rows = [{key: record[key] for key in kept_keys} for record in batch_data]
    return collection.insert(data=rows)

# Directory containing the JSON files.
# A raw string is used so the backslash in the Windows path is taken
# literally; the plain literal 'I:\Embeddings' relies on the invalid escape
# sequence "\E", which Python warns about and plans to reject.
dir_path = r'I:\Embeddings'  # Replace with your directory path
json_files = [
    os.path.join(dir_path, file_name)
    for file_name in os.listdir(dir_path)
    if file_name.endswith('.json')
]

for json_file_path in json_files:
    # Stream each file through the chunking generator so that every insert
    # call receives at most one chunk's worth of rows.
    for chunked_data in load_json_in_chunks(json_file_path):
        # Insert the current chunk into the database and report the outcome.
        insert_result = insert_docs_to_db(chunked_data)
        print(f"Insert result for chunk from file {json_file_path}: {insert_result}")


Insert result for chunk from file I:\Embeddings\encoded_vectors_1.json: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 444599461454807042, success count: 10000, err count: 0)
Insert result for chunk from file I:\Embeddings\encoded_vectors_1.json: (insert count: 5000, delete count: 0, upsert count: 0, timestamp: 444599461559664644, success count: 5000, err count: 0)
Insert result for chunk from file I:\Embeddings\encoded_vectors_10.json: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 444599462935920641, success count: 10000, err count: 0)
Insert result for chunk from file I:\Embeddings\encoded_vectors_10.json: (insert count: 5000, delete count: 0, upsert count: 0, timestamp: 444599463040778241, success count: 5000, err count: 0)
Insert result for chunk from file I:\Embeddings\encoded_vectors_100.json: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 444599464325021697, success count: 10000, err count: 0)
Insert result for chunk f

In [27]:
# Alias `filtered_data` as `data` for the chunked-insert loop in the next cell.
# NOTE(review): no module-level `filtered_data` is assigned anywhere in the
# visible notebook (the only visible binding is a local variable inside
# insert_docs_to_db), so this cell presumably depends on state from a deleted
# or out-of-order cell -- confirm where it is meant to come from.
data = filtered_data

In [30]:
def insert_docs_to_db(batch_data):
    """Insert a batch into the module-level ``collection`` and return the result.

    Args:
        batch_data: Rows handed unchanged to ``collection.insert`` as ``data``.
    """
    return collection.insert(data=batch_data)

# Number of rows sent per insert call
chunk_size = 10000
# Ceiling division: a trailing partial chunk counts as one more chunk
total_chunks = (len(data) + chunk_size - 1) // chunk_size

for idx, start_idx in enumerate(range(0, len(data), chunk_size)):
    end_idx = start_idx + chunk_size
    # Slicing past the end of `data` is safe; the last chunk is just shorter
    chunked_data = data[start_idx:end_idx]

    # Insert the chunk and report progress using a 1-based chunk counter
    insert_result = insert_docs_to_db(chunked_data)
    print(f"Insert result for chunk {idx+1}/{total_chunks}: {insert_result}")

Insert result for chunk 1/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009038327810, success count: 10000, err count: 0)
Insert result for chunk 2/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009339793412, success count: 10000, err count: 0)
Insert result for chunk 3/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009627889666, success count: 10000, err count: 0)
Insert result for chunk 4/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857009916248068, success count: 10000, err count: 0)
Insert result for chunk 5/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857010204606465, success count: 10000, err count: 0)
Insert result for chunk 6/10: (insert count: 10000, delete count: 0, upsert count: 0, timestamp: 443857010492964868, success count: 10000, err count: 0)
Insert result for chunk 7/10: (insert count: 10000, delete count: 0, upsert count: