In [None]:
# 文本数据库/向量检索
# 1.elasticsearch
# 2.faiss

In [1]:
# 1. elasticsearch

# Elasticsearch是一个开源的分布式、RESTful 风格的搜索和数据分析引擎，它的底层是开源库Apache Lucene。
# 它使用 Java 编写，内部采用 Lucene 做索引与搜索，但是它的目标是使全文检索变得更简单，简单来说，就是对Lucene 做了一层封装，它提供了一套简单一致的 RESTful API 来帮助我们实现存储和检索。一个分布式的实时文档存储，每个字段可以被索引与搜索；一个分布式实时分析搜索引擎；能胜任上百个服务节点的扩展，并支持 PB 级别的结构化或者非结构化数据。Elasticsearch已成为全文搜索领域的主流软件之一。


# 1.1 elasticsearch连接
# 下载elasticsearch然后解压缩进入bin目录执行elasticserach.bat启动该服务，第一次会显示用户名elastic的密码
# 如果忘了可以新建一个超级用户：elasticsearch-users.bat useradd fubin -p 199012 -r superuser

from elasticsearch import Elasticsearch
# Connect to the elastic cluster
es = Elasticsearch("http://localhost:9200", basic_auth=("fubin", "199012"))
resp = es.info()
print(resp)

{'name': 'DESKTOP-J73OJ47', 'cluster_name': 'elasticsearch', 'cluster_uuid': '7lw95BC_QYCLeDBL1B2Q0w', 'version': {'number': '8.9.1', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': 'a813d015ef1826148d9d389bd1c0d781c6e349f0', 'build_date': '2023-08-10T05:02:32.517455352Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [1]:
# 1.1 elasticsearch建立、插入数据、查询、删除、修改索引

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import csv
import json
import time
 
# Connect to the elastic cluster
es = Elasticsearch("http://localhost:9200", basic_auth=("fubin", "199012"))

# 建立索引前，定义索引库的约束文件mapping
# mapping常见属性: 1.默认 index = true 建立倒排索引，默认所有字段都建立索引; 2.analyzer使用哪种分词器；3.properties 指定子属性；4.type指定数据类型
# type数据类型：1.字符串类型：keyword 和 text; 2.数据类型：long integer short byte double float; 3.布尔：boolean; 4.日期：date; 5.对象：object （可用来表示复杂嵌套对象）
# 如下建立： [id: long]; [name: text (keyword, ngrams可分词)];  [brand: text (keyword)];  [price： float]
# 建立语句： PUT /heima {下面mapping内容...}
mappings = {
    "properties": {
        "id": {"type": "long"},
        "name": {
            "type": "text",
            "analyzer": "standard",
            "fields": {
                "keyword": {"type": "keyword"},
                "ngrams": {"type": "text", "analyzer": "ngram_analyzer"},
            }
        },
        "brand": {
            "type": "text",
            "fields": {
                "keyword": {"type": "keyword"},
            }
        },
        "price": {"type": "float"},
        "attributes": {
            "type": "nested",
            "properties": {
                "attribute_name": {"type": "text"},
                "attribute_value": {"type": "text"},
            }
        }
    }
}
settings = {
    "index": {"number_of_replicas": 2},
    "analysis": {
        "filter": {
            "ngram_filter": {
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 15,
            }
        },
        "analyzer": {
            "ngram_analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["lowercase", "ngram_filter"],
            }
        }
    }
}

# 删除/建立索引
INDEX_NAME = "fubin-index"
# check the existence of the index. If yes, remove it
if(es.indices.exists(index=INDEX_NAME)):
    print("The index has already existed, going to remove it")
    print('删除:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits'])
    es.options(ignore_status=404).indices.delete(index=INDEX_NAME) # 删除索引
# Create the index with the correct configurations
es.indices.create(index=INDEX_NAME, settings=settings,mappings=mappings)
print('删除/建立索引:', es.indices.get(index=INDEX_NAME))

# 建立/删除别名（索引库）
# res = es.indices.put_alias(index=INDEX_NAME, name="fubin-index-anothername") # 一个索引库取一个别名
es.indices.put_alias(index=[INDEX_NAME], name="fubin-index-anothername") # 多个索引库取一个别名
print('建立别名:', es.indices.get(index=INDEX_NAME))
es.indices.get_alias(index=INDEX_NAME)
print('获取指定索引库的别名:', es.indices.get(index=INDEX_NAME))
es.indices.get_alias(index="fubin-index-anothername", allow_no_indices=True, ignore_unavailable=True)
print('获取所有别名:', es.indices.get(index=INDEX_NAME))
# es.indices.delete_alias(index=INDEX_NAME, name="fubin-index-anothername") # 一个索引库删除其别名
es.indices.delete_alias(index=[INDEX_NAME], name="fubin-index-anothername") # 多个索引库删除其别名
print('删除别名:', es.indices.get(index=INDEX_NAME))


# 插入数据
doc1 = {
    "id": 1,
    "name": "HP EliteBook Model 1",
    "brand": "HP",
    "price": 38842.00,
    "attributes": [
        {"attribute_name": "cpu", "attribute_value": "Intel Core i7"},
        {"attribute_name": "memory", "attribute_value": "8GB"},
        {"attribute_name": "storage", "attribute_value": "256GB"}
    ]
}
doc2 = {
    "id": 2,
    "name": "MacBook Air 13",
    "brand": "Apple",
    "price": 9000.00,
    "attributes": [
        {"attribute_name": "cpu", "attribute_value": "M1"},
        {"attribute_name": "memory", "attribute_value": "16GB"},
        {"attribute_name": "storage", "attribute_value": "256GB"}
    ]
}
res = es.index(index=INDEX_NAME, id=1, document = doc1)
time.sleep(1)
# 查询所有
print('插入文档1后:', res, es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)
res = es.index(index=INDEX_NAME, id=2, document= doc2)
time.sleep(1)
# 查询所有
print('插入文档2后:', res, es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)
res = es.get(index=INDEX_NAME, id=1) # 获取某个文档
print('插入文档后查询id=1的文档：', res)

# 执行queryc查询语句
search_query = {
    "match": {
      "name.ngrams": "Air"
    }
}
res = es.search(index=INDEX_NAME, query=search_query)
print('执行query语句match[name:Air]:', res['hits']['hits'])
search_query = {
    "match": {
      "brand": "Apple"
    }
}
res = es.search(index=INDEX_NAME, query=search_query)
print('执行query语句match[brand:Apple]:', res['hits']['hits'])

# 删除文档
es.delete(index=INDEX_NAME, id=1)
time.sleep(1)
# 查询所有
print('删除文档1后:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)

# 批量删除文档（复合筛选条件）
query = {
    "match": {
      "brand": "Apple"
    }
}
es.delete_by_query(index=INDEX_NAME, query=query)
print('执行删除query文后:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)

# 文档导入数据：批量插入
colums = ["id", "name", "brand", "price", "cpu", "memory", "storage"]
with open("./data/es_examples.csv", "r") as fi:
    reader = csv.DictReader(fi, fieldnames=colums, delimiter=",", quotechar='"')
    # This skips the first row which is the header of the CSV file.
    next(reader)
    actions = []
    for row in reader:
        action = {"index": {"_index": INDEX_NAME, "_id": int(row["id"])}}
        doc = {
            "id": int(row["id"]),
            "name": row["name"],
            "price": float(row["price"]),
            "brand": row["brand"],
            "attributes": [
                {"attribute_name": "cpu", "attribute_value": row["cpu"]},
                {"attribute_name": "memory", "attribute_value": row["memory"]},
                {
                    "attribute_name": "storage",
                    "attribute_value": row["storage"],
                },
            ],
        }
        actions.append(json.dumps(action))
        actions.append(json.dumps(doc))
    bulk(es, actions, index=INDEX_NAME) # 批量发送请求
time.sleep(1)
print('文档导入数据:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)


The index has already existed, going to remove it
删除: [{'_index': 'fubin-index', '_id': 'hyXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 1}}}, {'_index': 'fubin-index', '_id': 'iCXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 1, 'name': 'HP EliteBook Model 1', 'price': 38842.0, 'brand': 'HP', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'Intel Core i7'}, {'attribute_name': 'memory', 'attribute_value': '8GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}, {'_index': 'fubin-index', '_id': 'iSXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 2}}}, {'_index': 'fubin-index', '_id': 'iiXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 2, 'name': 'MacBook Air 13', 'price': 9010.0, 'brand': 'Apple', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'M1'}, {'attribute_name': 'memory', 'attribute_value': '16GB'}, {'attribute_name': 'storage', 'attribute_value': 

In [107]:
# elasticsearch 简单查询
# elasticsearch自动将字符串类型转换为小写,检索时候先转化为小写在查询.

# 查全部: match_all
query = {
    'match_all': {}
}
print('match_all:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# 精确查询: term
query = {
    'term': {
        'brand': "apple" # 必须小写,大写查不到
    }
}
print('term:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# 精确查询匹配任意其中之一即可: terms
query = {
    'terms': {
        'brand': ["apple", 'Apple'] # 必须小写,大写查不到
    }
}
print('term:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# 关键字在多个域匹配查询: multi_match
query = {
    'multi_match': {
        'query': "MacBook",
        'fields': ['name', 'brand']
    }
}
print('multi_match:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# id匹配多值,将内在_id进行匹配
query = {
    'ids': {
        'values': ['iiXhP4oBmJoXJHE5grpR', "554644544"]
    }
}
print('ids:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# range范围查找: gt >, gte >=, lt <, lte <=
query = {
    'range': {
        'price': {
            'gte': 10, # >
            'lte': 10000 # <
        }
    }
}
print('range:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# prefix前缀查询
query = {
    'prefix': {
        'name': "macbook" # 一定要小写
    }
}
print('prefix:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# wildcard通配符查询
query = {
    'wildcard': {
        'name': "hp*" # 一定要小写
    }
}
print('wildcard:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# 模糊查询: 模糊相似,编辑距离
query = {
    'fuzzy': {
        'name': "hp" # 一定要小写
    }
}
print('fuzzy:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])


match_all: [{'_index': 'fubin-index', '_id': 'hyXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 1}}}, {'_index': 'fubin-index', '_id': 'iCXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 1, 'name': 'HP EliteBook Model 1', 'price': 38842.0, 'brand': 'HP', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'Intel Core i7'}, {'attribute_name': 'memory', 'attribute_value': '8GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}, {'_index': 'fubin-index', '_id': 'iSXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 2}}}, {'_index': 'fubin-index', '_id': 'iiXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 2, 'name': 'MacBook Air 13', 'price': 9010.0, 'brand': 'Apple', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'M1'}, {'attribute_name': 'memory', 'attribute_value': '16GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}]
term: [{'_index': 'fubin-index

In [16]:

query = {
    'match_all': {},
    'sort': {
        'price': {"order": "desc"}
    }
}
print('文档导入数据:', es.search(index=INDEX_NAME, query=query)) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)


BadRequestError: BadRequestError(400, 'parsing_exception', '[match_all] malformed query, expected [END_OBJECT] but found [FIELD_NAME]')

In [130]:
# elasticsearch 复合查询

# must 必须满足条件,布尔查询
query = {
    'bool': {
        'must': [
            {'prefix': {'name': "macbook" }},
            {'terms': {'brand': ["apple", 'Apple']}}
        ]
    }
}
print('must:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# should 必须满足任一条件
query = {
    'bool': {
        'should': [
            {'prefix': {'name': "macbook" }},
            {'terms': {'brand': ["apple", 'Apple']}}
        ]
    }
}
print('should:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# must_not 必须满足任一条件
query = {
    'bool': {
        'must_not': [
            {'prefix': {'name': "macbook1" }},
            {'terms': {'brand': ["apple1", 'Apple1']}}
        ]
    },
}
print('must_not:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])

# sort返回排序(??)
query = {
    'query': {'match_all': {}},
    'sort': {
        'price': {"order": "desc"}
    },
    'from':0,
    'size':1
}
query = {
    'query': {
        'match_all': {}
    },
    'sort': {
        'price': {"order": "desc"}
    },
    'from':0,
    'size':1
}
print('sort:', es.search(index=INDEX_NAME, query=query)['hits']['hits'])


must: [{'_index': 'fubin-index', '_id': 'iiXhP4oBmJoXJHE5grpR', '_score': 2.0, '_source': {'id': 2, 'name': 'MacBook Air 13', 'price': 9010.0, 'brand': 'Apple', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'M1'}, {'attribute_name': 'memory', 'attribute_value': '16GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}]
should: [{'_index': 'fubin-index', '_id': 'iiXhP4oBmJoXJHE5grpR', '_score': 2.0, '_source': {'id': 2, 'name': 'MacBook Air 13', 'price': 9010.0, 'brand': 'Apple', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'M1'}, {'attribute_name': 'memory', 'attribute_value': '16GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}]
must_not: [{'_index': 'fubin-index', '_id': 'hyXhP4oBmJoXJHE5grpR', '_score': 0.0, '_source': {'index': {'_index': 'fubin-index', '_id': 1}}}, {'_index': 'fubin-index', '_id': 'iCXhP4oBmJoXJHE5grpR', '_score': 0.0, '_source': {'id': 1, 'name': 'HP EliteBook Model 1', 'price': 38842.0, 'brand': 'HP', 'at

BadRequestError: BadRequestError(400, 'parsing_exception', 'unknown query [query]')

In [111]:
# elasticsearch 更新数据：改了价格
doc2_ = {
    "id": 2,
    "name": "MacBook Air 13",
    "brand": "Apple",
    "price": 9010.00,
    "attributes": [
        {"attribute_name": "cpu", "attribute_value": "M1"},
        {"attribute_name": "memory", "attribute_value": "16GB"},
        {"attribute_name": "storage", "attribute_value": "256GB"}
    ]
}
es.update(index=INDEX_NAME, id='iiXhP4oBmJoXJHE5grpR', doc =doc2_) # 注意这里是生成的内在_id
time.sleep(1)
print('update文档2后:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)

# 批量更新(??)
query = {
    'query': {
        'match':{
            'brand': 'Apple',
        }
    },
    'script':{
        "source": "ctx._source.price = 9020.00",
    }
}
es.update_by_query(index=INDEX_NAME, query=query)
time.sleep(1)
print('update文档2后:', es.search(index=INDEX_NAME, query={"match_all":{}})['hits']['hits']) # 还在缓存，并没有推到检索库中，需要sleep几秒 (这里debug卡住一会了)


update文档2后: [{'_index': 'fubin-index', '_id': 'hyXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 1}}}, {'_index': 'fubin-index', '_id': 'iCXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 1, 'name': 'HP EliteBook Model 1', 'price': 38842.0, 'brand': 'HP', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'Intel Core i7'}, {'attribute_name': 'memory', 'attribute_value': '8GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}, {'_index': 'fubin-index', '_id': 'iSXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'index': {'_index': 'fubin-index', '_id': 2}}}, {'_index': 'fubin-index', '_id': 'iiXhP4oBmJoXJHE5grpR', '_score': 1.0, '_source': {'id': 2, 'name': 'MacBook Air 13', 'price': 9010.0, 'brand': 'Apple', 'attributes': [{'attribute_name': 'cpu', 'attribute_value': 'M1'}, {'attribute_name': 'memory', 'attribute_value': '16GB'}, {'attribute_name': 'storage', 'attribute_value': '256GB'}]}}]


BadRequestError: BadRequestError(400, 'parsing_exception', 'unknown query [query]')

In [1]:
%%time
# faiss 向量检索（矢量数据检索，开源）
# 把我们自己的候选向量集封装成一个index数据库，它可以加速我们检索相似向量TopK的过程，其中有些索引还支持GPU构建
# Facebook AI Similarity Search，是FaceBook的AI团队针对大规模相似度检索问题开发的一个工具，使用C++编写，有python接口，对10亿量级的索引可以做到毫秒级检索的性能。

# 工业上：IndexFlatIP（内积）和IndexFlatL2（欧式距离）是精确检索；快速检索采用IndexIVFFlat k-means检索
# 只适合本地单机版
# 矢量数据库适合检索，不适合增删改查sql
# 分布式采用clickhouse性能好，milvus国内封装了faiss
# 分布式还可以采用分布式版本 distributed-faiss、京东的vsearch等

import faiss, time, gc
import numpy as np
np.random.seed(100)

gc.collect()

# 公司垃圾电脑跑
# 一千万 * 100
num = 10000000
dim = 100
startime = time.time()
data = np.random.random((num, dim)).astype(np.float32)
print('np cost time:', (time.time()-startime))

# 八种度量方式: METRIC_INNER_PRODUCT 内积 / METRIC_L1 曼哈顿距离 / METRIC_L2 欧式距离 / METRIC_Linf 无穷范数 / METRIC_Lp p范数 / METRIC_BrayCurits BC相异度 / METRIC_JensenShannon JS散度 等 
index = faiss.IndexFlatL2(dim) # 
%time index.add(data) # Wall time: 1.88 s

exampes = np.random.random((10, dim)).astype(np.float32)
startime = time.time()
distances, indices = index.search(exampes, 1000)
print('index cost time:', (time.time()-startime))
print('distances =', distances[:3])
print("indices =", indices[:3])

np cost time: 12.293491125106812
Wall time: 1.88 s
index cost time: 1.0377700328826904
distances = [[ 8.053272   8.506979   8.519906  ... 10.725521  10.725808  10.725841 ]
 [ 8.3548155  8.488422   8.53853   ... 10.603056  10.603208  10.603817 ]
 [ 7.5566     7.9136305  7.9478464 ...  9.965536   9.96581    9.965822 ]]
indices = [[5284755 3032200 3836790 ... 5269557 4512153 6469582]
 [2702196 3494834 9429217 ... 7880960 5332515 7527386]
 [9612960 5854851 6211074 ... 2737059 4941920 7193825]]
Wall time: 15.8 s


In [1]:
# faiss 各种建立检索方式(参考照片,根据实际需求选择合适的建立检索方式)

import psutil, os, time
import numpy as np
import faiss
num = 1000000 # 一百万
dim = 100
data = np.random.random((num, dim)).astype(np.float32)

# 检索方式对比:
# 1.Flat 暴力检索(最准确 + 最慢 + 占内存大, 50万) （***）
method = 'Flat'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# 耗时: 0.20311236381530762 s, 内存占用: 381.55078125 MB

# 2.IVFx Flat 倒排暴力检索(x表示k-means聚类,百万) （***）
method = 'IVF100,Flat'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.train(data)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# Wall time: 2.71 s
# Wall time: 9.87 s
# 耗时: 12.587925672531128 s, 内存占用: 394.2578125 MB

# # 3.PQx 乘积量化(速度快 + 内存较小 + 召回率较高,x为向量切分段数,切分越细越复杂越准确越慢)
method = 'PQ20'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.train(data)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# Wall time: 1min 18s
# Wall time: 1.77 s
# 耗时: 80.07125926017761 s, 内存占用: 28.52734375 MB

# # 4.IVFxPQy 倒排乘积量化(集成了上述方法,折衷)
method = 'IVF100,PQ20'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.train(data)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# Wall time: 1min 20s
# Wall time: 11.9 s
# 耗时: 92.77270889282227 s, 内存占用: 46.6953125 MB

# # 5.LSH 局部敏感哈希(两点距离远则哈希分桶相同概率小)(占内存少,召回率差,适合大规模资源稀缺)
method = 'LSH'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# Wall time: 46.8 ms
# 耗时: 0.04684090614318848 s, 内存占用: 13.33203125 MB

# # 6.HNSWx: 检索速度快.召回匹配Flat,检索时间为loglogn,无视向量规模,构建索引极慢占内存极大.
method = 'HNSW50'
print(method)
memory_size = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
time_stamp = time.time()
index = faiss.index_factory(dim, method, faiss.METRIC_L2)
%time index.add(data)
print("耗时:", time.time()-time_stamp, 's,', '内存占用:', psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - memory_size, 'MB')
# Wall time: 6min 53s
# 耗时: 413.1467287540436 s, 内存占用: 781.33984375 MB


HNSW50
Wall time: 6min 53s
耗时: 413.1467287540436 s, 内存占用: 781.33984375 MB
