In [1]:
import ast
import math
import os
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import psutil
from nano_vectordb import NanoVectorDB
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the path to Data/input_search_DB.csv under the current working directory.
path = os.path.join(os.getcwd(), 'Data', 'input_search_DB.csv')
# Read the comma-separated, UTF-8 encoded file into a DataFrame.
df = pd.read_csv(path, sep=',', encoding='utf-8')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name,product_postpress_type_name
0,2200006,得意先_1,A　2023年3月号 定期演奏会,['本文1'],
1,2107551,得意先_7,アーティストリスト2022年,"['本文1', '本文1', '本文2', '本文2', '表紙1', '表紙1']",
2,2200898,得意先_8,ミュージアムリーフレット,['本文'],
3,2202767,得意先_148,A小学校　2024学校案内パンフレット,"['本文1', '本文2', '表紙1', '表紙2']",
4,2203087,得意先_14,A社統合報告書2022（英文）,"['本文1', '表紙1']",


In [3]:
# Narrow the frame to the four columns used by the rest of the notebook;
# every other column is dropped.
df = df.loc[
    :,
    ['order_id', 'order_customer_name', 'product_name', 'part_type_name'],
]
# Preview the reduced frame as the cell output.
df.head()

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name
0,2200006,得意先_1,A　2023年3月号 定期演奏会,['本文1']
1,2107551,得意先_7,アーティストリスト2022年,"['本文1', '本文1', '本文2', '本文2', '表紙1', '表紙1']"
2,2200898,得意先_8,ミュージアムリーフレット,['本文']
3,2202767,得意先_148,A小学校　2024学校案内パンフレット,"['本文1', '本文2', '表紙1', '表紙2']"
4,2203087,得意先_14,A社統合報告書2022（英文）,"['本文1', '表紙1']"


In [4]:
# Text to embed: product name and part type name joined by a single space.
df["text"] = df["product_name"].str.cat(df["part_type_name"], sep=" ")

In [5]:
# Load the pretrained sentence-embedding model by its identifier.
model = SentenceTransformer("intfloat/multilingual-e5-small")
# Prefix every text with "query: " before encoding.
# NOTE(review): the search string in the query cell gets the same "query: " prefix;
# E5-style models presumably also define a "passage: " prefix for indexed documents —
# confirm which prefix is intended here.
texts = ["query: " + text for text in df["text"]]

# Time only the encode call itself.
start = timer()
embeddings = model.encode(texts, convert_to_numpy=True)
end = timer()

# Resident set size (RSS) of this process, converted from bytes by dividing by 1024**2,
# sampled after encoding has finished.
process = psutil.Process()
mem_mb = process.memory_info().rss / 1024**2
print(f"Encoding time: {end - start:.4f} seconds")
print(f"Memory usage: {mem_mb:.2f} MB")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: intfloat/multilingual-e5-small
Batches: 100%|██████████| 4/4 [00:00<00:00,  4.48it/s]

Encoding time: 0.8993 seconds
Memory usage: 791.04 MB





In [6]:
# Store the embeddings in a new 'embedding' column, converted from a NumPy array
# to nested Python lists (one list per row).
df['embedding'] = embeddings.tolist()

In [7]:
# Toggle: True writes the embedded DataFrame to CSV; False reloads a previously
# written file instead.
flag = True
# Single definition of the cache location so the write and read branches cannot drift apart.
embedded_csv_path = os.path.join(os.getcwd(), 'Data', 'output_search_DB_embedded.csv')
if flag:
    df.to_csv(embedded_csv_path, index=False, sep=',', encoding='utf-8')
else:
    df = pd.read_csv(embedded_csv_path, sep=',', encoding='utf-8')
    # The CSV stores each embedding as the text of a Python list. Parse it with
    # ast.literal_eval, which accepts only literals, rather than eval, which would
    # execute any expression found in the file.
    df['embedding'] = df['embedding'].apply(
        lambda cell: [float(value) for value in ast.literal_eval(cell)]
    )

In [8]:
# Print a summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   order_id             98 non-null     int64 
 1   order_customer_name  98 non-null     object
 2   product_name         98 non-null     object
 3   part_type_name       98 non-null     object
 4   text                 98 non-null     object
 5   embedding            98 non-null     object
dtypes: int64(1), object(5)
memory usage: 4.7+ KB


In [9]:
# Toggle for inflating the dataset by repetition.
flag = True
if flag:
    # Number of rows to reach (the result may overshoot, since whole copies are stacked).
    target = 1_000_000
    # Smallest number of whole copies of the frame that reaches `target` rows.
    n = math.ceil(target / len(df))
    # Stack the copies and renumber the index from 0.
    df = pd.concat([df for _ in range(n)], ignore_index=True)

In [10]:
# Overwrite order_id with consecutive integers 1..len(df), so every row has a distinct value.
# NOTE(review): presumably needed because the replication cell duplicates rows and
# order_id is later used as the record "__id__" — confirm.
df['order_id'] = range(1, len(df) + 1)

In [11]:
# Build one record dict per row for the vector store: "__id__" is the order id as
# a string, "__vector__" is the embedding as a NumPy array, and the remaining keys
# carry the row's descriptive fields.
# The columns are walked in lockstep with zip instead of DataFrame.iterrows(),
# which constructs a Series object for every row and is very slow at this row count.
data = [
    {
        "__id__": str(order_id),
        "__vector__": np.array(embedding),
        "customer_name": customer_name,
        "product_name": product_name,
        "part_type_name": part_type_name,
        "text": text,
    }
    for order_id, embedding, customer_name, product_name, part_type_name, text in zip(
        df["order_id"],
        df["embedding"],
        df["order_customer_name"],
        df["product_name"],
        df["part_type_name"],
        df["text"],
    )
]

In [12]:
# Display the embedding vector produced for the second encoded text (index 1).
embeddings[1]

array([ 0.08686243,  0.00368898,  0.00107138, -0.04675761,  0.07198764,
       -0.02520897,  0.02125641,  0.05702867,  0.03563521, -0.01163542,
        0.04603251, -0.00130556,  0.06788292,  0.00074451, -0.017533  ,
        0.03457747,  0.06847565, -0.01766942, -0.07427671, -0.05218086,
        0.04559273,  0.0328599 , -0.02091267,  0.01830516,  0.0755232 ,
        0.02460371, -0.03933692,  0.02722863,  0.07450794, -0.04297682,
       -0.06388988, -0.0423326 ,  0.06619136, -0.06336074,  0.0824189 ,
        0.00977511, -0.02668929, -0.03663327,  0.02967907, -0.00038882,
       -0.0142074 , -0.03371827,  0.06482528,  0.09704196,  0.05591549,
        0.10494497, -0.00350834,  0.03642689, -0.05414964, -0.0150751 ,
       -0.02350223,  0.05187542,  0.02454915,  0.05164273, -0.00728966,
       -0.05077475, -0.00479663, -0.05072277, -0.07437301, -0.02091098,
        0.10358013, -0.01892507,  0.03842646,  0.05637784,  0.08835746,
        0.0169538 ,  0.02478359,  0.02163713, -0.07780316, -0.02

In [13]:
# Report how many record dicts were built in `data`.
print(f"Number of records: {len(data)}")

Number of records: 1000090


In [14]:
# Create the vector store, sized to the embedding width (second axis of `embeddings`),
# with "orders.json" as its storage file.
vdb = NanoVectorDB(embedding_dim=embeddings.shape[1], storage_file="orders.json")

INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'orders.json'} 0 data


In [15]:
# Time only the upsert of all prepared records.
# NOTE(review): no explicit save of the store is visible in this section — confirm
# whether writing to the storage file is expected here.
start = timer()
vdb.upsert(data) 
end = timer()

# Process RSS converted from bytes by dividing by 1024**2, sampled after the upsert.
process = psutil.Process()
mem_mb = process.memory_info().rss / 1024**2
print(f"Upsert time: {end - start:.4f} seconds")
print(f"Memory usage: {mem_mb:.2f} MB")

Upsert time: 14.6476 seconds
Memory usage: 2798.24 MB


In [16]:
# Search string; it is encoded with the same model and "query: " prefix as the indexed texts.
query = "ーセプル"
# encode() receives a one-element list, so [0] picks out the single resulting vector.
query_embedding = model.encode(["query: " + query], convert_to_numpy=True)[0]

# Time only the vector search (encoding the query is excluded from the measurement).
# NOTE(review): better_than_threshold=0.1 is presumably a minimum similarity score for
# returned results — confirm against the nano-vectordb documentation.
start = timer()
results = vdb.query(query_embedding, top_k=5, better_than_threshold=0.1)
end = timer()
# Process RSS converted from bytes by dividing by 1024**2, sampled after the query.
process = psutil.Process()
mem_mb = process.memory_info().rss / 1024**2

print(f"Query time: {end - start:.4f} seconds")
print(f"Memory usage: {mem_mb:.2f} MB")

Batches: 100%|██████████| 1/1 [00:00<00:00, 27.27it/s]

Query time: 0.1386 seconds
Memory usage: 2742.62 MB





In [17]:
# Print one line per search hit: id, customer, product, part type, and the
# similarity metric formatted to four decimals.
for result in results:
    print(
        f"ID: {result['__id__']}, "
        f"Customer Name: {result['customer_name']}, "
        f"Product Name: {result['product_name']}, "
        f"Part Type Name: {result['part_type_name']}, "
        f"Score: {result['__metrics__']:.4f}"
    )

ID: 631074, Customer Name: 得意先_200, Product Name: hoge一括（2023年3月）※デザイン型変更あり, Part Type Name: ['本文1'], Score: 0.8355
ID: 887736, Customer Name: 得意先_200, Product Name: hoge一括（2023年3月）※デザイン型変更あり, Part Type Name: ['本文1'], Score: 0.8355
ID: 398030, Customer Name: 得意先_200, Product Name: hoge一括（2023年3月）※デザイン型変更あり, Part Type Name: ['本文1'], Score: 0.8355
ID: 455556, Customer Name: 得意先_200, Product Name: hoge一括（2023年3月）※デザイン型変更あり, Part Type Name: ['本文1'], Score: 0.8355
ID: 359908, Customer Name: 得意先_200, Product Name: hoge一括（2023年3月）※デザイン型変更あり, Part Type Name: ['本文1'], Score: 0.8355
