In [6]:
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path

file_path = "/Users/apple/Documents/project/buddy/RAG/files/activity.json"
data = json.loads(Path(file_path).read_text())


# def metadata_func(record: dict, metadata: dict) -> dict:
#     metadata["title"] = record.get("title")
#     return metadata


def process_content(content):
    return json.loads(content)


loader = JSONLoader(
    file_path=file_path,
    jq_schema=".[] | {title: .title, content: .content}",
    # content_key="content",
    text_content=False,
    # metadata_func=metadata_func,
)
docs = loader.load()

for doc in docs:
    doc.page_content = process_content(doc.page_content)
    print(doc)

# https://python.langchain.com/v0.2/docs/integrations/document_loaders/json/

page_content='{'title': '周一活动', 'content': {'9:00-10:00': {'活动': '晨练太极拳', '地点': '活动室'}, '10:30-11:30': {'活动': '园艺活动（菠菜种植）', '地点': '花园'}, '14:00-15:00': {'活动': '手工制作（睡眠香囊）', '地点': '手工室'}, '15:30-16:30': {'活动': '观看老电影', '地点': '影音室'}}}' metadata={'source': '/Users/apple/Documents/project/buddy/RAG/files/activity.json', 'seq_num': 1}
page_content='{'title': '周二活动', 'content': {'9:00-10:00': {'活动': '八段锦（简化版）', '地点': '大厅'}, '10:30-11:30': {'活动': '书法练习', '地点': '书画室'}, '14:00-15:00': {'活动': '棋牌娱乐（象棋、围棋等）', '地点': '棋牌室'}, '15:30-16:30': {'活动': '养生讲座（春天养生1）', '地点': '会议室'}}}' metadata={'source': '/Users/apple/Documents/project/buddy/RAG/files/activity.json', 'seq_num': 2}
page_content='{'title': '周三活动', 'content': {'9:00-10:00': {'活动': '手指操', '地点': '大厅'}, '10:30-11:30': {'活动': '唱歌活动（合唱、独唱）', '地点': '音乐室'}, '14:00-15:00': {'活动': '烘焙活动（西式糕点：蛋挞）', '地点': '备餐间'}, '15:30-16:30': {'活动': '观看戏曲表演', '地点': '影音室/大厅'}}}' metadata={'source': '/Users/apple/Documents/project/buddy/RAG/files/activity.json', 'seq_nu

In [9]:
from pymilvus import MilvusClient
import time
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import torch.nn.functional as F


# Initialize torch settings for device-agnostic code.
N_GPU = torch.cuda.device_count()
DEVICE = torch.device("cuda:N_GPU" if torch.cuda.is_available() else "cpu")


# Download the model from huggingface model hub.
model_name = "BAAI/bge-large-en-v1.5"
encoder = SentenceTransformer(model_name, device=DEVICE)


# Get the model parameters and save for later.
EMBEDDING_DIM = encoder.get_sentence_embedding_dimension()
MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length()


# Inspect model parameters.
print(f"model_name: {model_name}")
print(f"EMBEDDING_DIM: {EMBEDDING_DIM}")
print(f"MAX_SEQ_LENGTH: {MAX_SEQ_LENGTH_IN_TOKENS}")

model_name: BAAI/bge-large-en-v1.5
EMBEDDING_DIM: 1024
MAX_SEQ_LENGTH: 512


In [13]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter


# CHUNK_SIZE = 512
# chunk_overlap = np.round(CHUNK_SIZE * 0.10, 0)
# print(f"chunk_size: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}")


# # Define the splitter.
# child_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=CHUNK_SIZE, chunk_overlap=chunk_overlap
# )


# # Chunk the docs.
# chunks = child_splitter.split_documents(docs)
# print(f"{len(docs)} docs split into {len(chunks)} child documents.")


# Encoder input is doc.page_content as strings.
list_of_strings = [doc.page_content for doc in docs if hasattr(doc, "page_content")]


# Embedding inference using HuggingFace encoder.
embeddings = torch.tensor(encoder.encode(list_of_strings))


# Normalize the embeddings.
embeddings = np.array(embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True))


# Milvus expects a list of `numpy.ndarray` of `numpy.float32` numbers.
converted_values = list(map(np.float32, embeddings))


# Create dict_list for Milvus insertion.
dict_list = []
for doc, vector in zip(docs, converted_values):
    # Assemble embedding vector, original text content, metadata.
    chunk_dict = {
        "content": doc.page_content,
        "source": doc.metadata.get("source", ""),
        # "title": doc.metadata.get("title", ""),
        "vector": vector,
    }
    dict_list.append(chunk_dict)


mc = MilvusClient("buddy.db")

# Create a collection with flexible schema and AUTOINDEX.
COLLECTION_NAME = "MilvusDocs"
mc.create_collection(
    COLLECTION_NAME,
    EMBEDDING_DIM,
    consistency_level="Eventually",
    auto_id=True,
    overwrite=True,
)


# Insert data into the Milvus collection.
print("Start inserting entities")
start_time = time.time()
mc.insert(COLLECTION_NAME, data=dict_list, progress_bar=True)


end_time = time.time()
print(f"Milvus insert time for {len(dict_list)} vectors: ", end="")
print(f"{round(end_time - start_time, 2)} seconds")

Start inserting entities
Milvus insert time for 7 vectors: 0.01 seconds


In [14]:
SAMPLE_QUESTION = "周三有什么安排？"

# Embed the question using the same encoder.
query_embeddings = torch.tensor(encoder.encode([SAMPLE_QUESTION]))
# Normalize embeddings to unit length.
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
# Convert the embeddings to list of list of np.float32.
query_embeddings = list(map(np.float32, query_embeddings))


# Define metadata fields you can filter on.
OUTPUT_FIELDS = list(dict_list[0].keys())
OUTPUT_FIELDS.remove("vector")

# Define how many top-k results you want to retrieve.
TOP_K = 2


# Run semantic vector search using your query and the vector database.
results = mc.search(
    COLLECTION_NAME,
    data=query_embeddings,
    output_fields=OUTPUT_FIELDS,
    limit=TOP_K,
    consistency_level="Eventually",
)

for result in results:
    print(result)
    print('-'*50)
    

[{'id': 452110790429769730, 'distance': 0.7258294820785522, 'entity': {'content': {'title': '周三活动', 'content': {'9:00-10:00': {'活动': '手指操', '地点': '大厅'}, '10:30-11:30': {'活动': '唱歌活动（合唱、独唱）', '地点': '音乐室'}, '14:00-15:00': {'活动': '烘焙活动（西式糕点：蛋挞）', '地点': '备餐间'}, '15:30-16:30': {'活动': '观看戏曲表演', '地点': '影音室/大厅'}}}, 'source': '/Users/apple/Documents/project/buddy/RAG/files/activity.json'}}, {'id': 452110809121161232, 'distance': 0.7258287668228149, 'entity': {'content': {'title': '周三活动', 'content': {'9:00-10:00': {'活动': '手指操', '地点': '大厅'}, '10:30-11:30': {'活动': '唱歌活动（合唱、独唱）', '地点': '音乐室'}, '14:00-15:00': {'活动': '烘焙活动（西式糕点：蛋挞）', '地点': '备餐间'}, '15:30-16:30': {'活动': '观看戏曲表演', '地点': '影音室/大厅'}}}, 'source': '/Users/apple/Documents/project/buddy/RAG/files/activity.json'}}]
--------------------------------------------------
