### 初始化一个chroma客户端进行连接。下面是本地操作。setting可以走默认不设置。

In [55]:
import chromadb

setting = chromadb.config.Settings(anonymized_telemetry=False)
chroma_client = chromadb.HttpClient(host='10.1.61.1', port=28000, settings=setting)


### 创建一个collection。chroma中collection类似于数据库中的table，用于存储相关文档的一组向量。

默认使用 all-MiniLM-L6-v2 库进行向量话。默认使用L2距离进行搜索。


In [63]:

collection_L2 = chroma_client.get_or_create_collection(name="curricula_help_L2")
print(f"Vector data with L2:{collection_L2.count()}")
collection_cosine = chroma_client.get_or_create_collection(name="curricula_help_cosine",metadata={"hnsw:space": "cosine"})
print(f"Vector data with cosine:{collection_cosine.count()}")


Vector data with L2:95
Vector data with cosine:95


### 如果删除某个collection可以用下面命令，根据name删除。

In [48]:
chroma_client.delete_collection(name="curricula_help_L2")
chroma_client.delete_collection(name="curricula_help_cosine")

### 添加文档到chroma中的collection “curricula_help_L2”下面
只使用默认的比对方法L2距离。

In [28]:
import json
import os
import json

# Read the documents from the JSON files in the helpdocument folder
documents = []
metadatas = []
ids = []
folder_path = './helpdocument'
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            documents.append(json_data["text"])
            metadatas.append({"title": json_data["metadata"]["profileName"], "url": json_data["metadata"]["show_url"]})
            ids.append(json_data["_id"])

# print(documents[0])
# print(metadatas[0]["title"], metadatas[0]["url"])
# print(ids[0])
# Add the documents to the collection
collection_L2.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)
collection_L2.count()

95

### 添加文档到chroma中的collection “curricula_help_cosine”下面
只使用比对方法cosine来测试距离。

In [30]:
import json
import os
import json

# Read the documents from the JSON files in the helpdocument folder
documents = []
metadatas = []
ids = []
folder_path = './helpdocument'
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            documents.append(json_data["text"])
            metadatas.append({"title": json_data["metadata"]["profileName"], "url": json_data["metadata"]["show_url"]})
            ids.append(json_data["_id"])

# print(documents[0])
# print(metadatas[0]["title"], metadatas[0]["url"])
# print(ids[0])
# Add the documents to the collection
collection_cosine.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)
collection_cosine.count()

95

### 使用L2方法向量化集合查询文档

In [100]:

L2_result=collection_L2.query(
    query_texts="how to add an assignment to my course?",
    n_results=10
)

top_documents=[]
top_metadatas=[]
indx=0
for item in L2_result["distances"][0]:
    if item < 5:
        # Perform desired action
        top_documents.append(L2_result['documents'][0][indx])
        top_metadatas.append(L2_result['metadatas'][0][indx])
        print(f"distance: {item}")
        print(f"metadata: {L2_result['metadatas'][0][indx]}")
        # print(f"document: {L2_result['documents'][0][indx]}")
        indx+=1


distance: 1.0182036040635443
metadata: {'title': 'Set up a blended course', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/setupablendedcourse.htm'}
distance: 1.0547094833460962
metadata: {'title': 'How to enable the system to automatically publish the "Pass" course grade?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoenablethesystemtoautomaticallypublishthepasscoursegrade.htm'}
distance: 1.065349913384584
metadata: {'title': 'How to add a series of assessment meetings as learning objects?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoaddaseriesofassessmentmeetingsaslearningobjects.htm'}
distance: 1.0779555658748086
metadata: {'title': 'How to add course content and publish to learners?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-lear

### 使用cosine方法向量化集合查询文档

In [75]:

cosine_result=collection_cosine.query(
    query_texts="how to add an assignment to my course?",
    n_results=3
)
indx=0
for item in cosine_result["distances"][0]:
    if item < 0.6:
        # Perform desired action
        print(f"distance: {item}")
        print(f"metadata: {cosine_result['metadatas'][0][indx]}")
        # print(f"document: {result['documents'][0][indx]}")
        indx+=1


distance: 0.509101800284714
metadata: {'title': 'Set up a blended course', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/setupablendedcourse.htm'}
distance: 0.5273547390660329
metadata: {'title': 'How to enable the system to automatically publish the "Pass" course grade?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoenablethesystemtoautomaticallypublishthepasscoursegrade.htm'}
distance: 0.5326749540532154
metadata: {'title': 'How to add a series of assessment meetings as learning objects?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoaddaseriesofassessmentmeetingsaslearningobjects.htm'}


# Azure OpenAI API进行向量化处理。

In [56]:
import chromadb.utils.embedding_functions as embedding_functions
import chromadb

setting = chromadb.config.Settings(anonymized_telemetry=False)
local_chroma = chromadb.HttpClient(host='10.1.61.1', port=28000, settings=setting)

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key="6d75bde81cb1477882ec7482c8e51e05",
                api_base="https://openai-jp-new.openai.azure.com/",
                api_type="azure",
                api_version="2024-02-01",
                model_name="text-embedding-3-large"
            )

AzureOpenAI_collection = chroma_client.get_or_create_collection(name="azure_ai",metadata={"hnsw:space": "cosine"}, embedding_function=openai_ef)

AzureOpenAI_collection.count()

0

# 用OpenAI的API向量化

In [57]:
import json
import os
import json

# Read the documents from the JSON files in the helpdocument folder
documents = []
metadatas = []
ids = []
folder_path = './helpdocument'
for filename in os.listdir(folder_path):
    if filename.endswith('.json'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            documents.append(json_data["text"])
            metadatas.append({"title": json_data["metadata"]["profileName"], "url": json_data["metadata"]["show_url"]})
            ids.append(json_data["_id"])

# print(documents[0])
# print(metadatas[0]["title"], metadatas[0]["url"])
# print(ids[0])
# Add the documents to the collection
AzureOpenAI_collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)
AzureOpenAI_collection.count()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


95

# 对Azure OpenAI 向量化后进行查询向量

In [76]:

AzureOpenAI_collection_result=AzureOpenAI_collection.query(
    query_texts="How to add an assignment to my course?",
    n_results=3
)
indx=0
for item in AzureOpenAI_collection_result["distances"][0]:
    if item < 1:
        # Perform desired action
        print(f"distance: {item}")
        print(f"metadata: {cosine_result['metadatas'][0][indx]}")
        # print(f"document: {result['documents'][0][indx]}")
        indx+=1


distance: 0.45042781986720937
metadata: {'title': 'Set up a blended course', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/setupablendedcourse.htm'}
distance: 0.45912710964862546
metadata: {'title': 'How to enable the system to automatically publish the "Pass" course grade?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoenablethesystemtoautomaticallypublishthepasscoursegrade.htm'}
distance: 0.5072284106356768
metadata: {'title': 'How to add a series of assessment meetings as learning objects?', 'url': 'https://cdn.maivenpoint.com/assets/webhelp/curricula-for-corporate-learning-user-guide/index.htm#!Documents/howtoaddaseriesofassessmentmeetingsaslearningobjects.htm'}
