# Read CSV File

In [1]:
import pandas as pd

# Use a forward slash in the path: it resolves on every operating system,
# whereas the backslash form only worked on Windows and relied on "\i" being
# an unrecognised (and deprecated) escape sequence inside a normal string.
df = pd.read_csv("data/input_search_DB.csv")

df.head()

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name,product_postpress_type_name
0,2200006,得意先_1,A　2023年3月号 定期演奏会,['本文1'],
1,2107551,得意先_7,アーティストリスト2022年,"['本文1', '本文1', '本文2', '本文2', '表紙1', '表紙1']",
2,2200898,得意先_8,ミュージアムリーフレット,['本文'],
3,2202767,得意先_148,A小学校　2024学校案内パンフレット,"['本文1', '本文2', '表紙1', '表紙2']",
4,2203087,得意先_14,A社統合報告書2022（英文）,"['本文1', '表紙1']",


## Embed the DataFrame using the Ollama nomic-embed-text model

In [None]:
import requests
import json
from typing import List
from nano_vectordb import NanoVectorDB
import numpy as np

# Base URL of the Ollama server the embedding requests are sent to.
OLLAMA_HOST = "http://localhost:11434"
# Model name placed in every embedding request payload.
MODEL_NAME = "nomic-embed-text"
# Name of the DataFrame column whose contents get embedded.
TEXT_COLUMN = "text"


def get_embeddings(texts: List[str], timeout: float = 60.0) -> List[List[float]]:
    """Embed each text with the Ollama model named by ``MODEL_NAME``.

    One POST request per text is sent to ``{OLLAMA_HOST}/api/embeddings`` and
    the ``"embedding"`` field of the JSON reply is collected.

    Args:
        texts: Strings to embed, processed in order.
        timeout: Seconds passed to ``requests.post`` as its ``timeout`` so a
            stalled server cannot block the loop forever.

    Returns:
        Exactly one embedding per input text, in the same order as ``texts``.

    Raises:
        Exception: If the server answers with a non-200 status.
        RuntimeError: If a 200 reply contains no ``"embedding"`` field. The
            previous code skipped such replies silently, which left the result
            shorter than ``texts`` and misaligned with the DataFrame rows.
    """
    embeddings: List[List[float]] = []
    total = len(texts)

    for position, text in enumerate(texts, start=1):
        print(f"Processing {position}/{total}...")
        payload = {
            "model": MODEL_NAME,
            "prompt": text,
            "options": {"embedding_only": True},
        }

        # `json=` serialises the payload and sets the JSON content type.
        response = requests.post(
            f"{OLLAMA_HOST}/api/embeddings",
            json=payload,
            timeout=timeout,
        )

        if response.status_code != 200:
            raise Exception(f"Error getting embeddings: {response.text}")

        # The reply is parsed as one JSON document rather than line by line.
        body = response.json()
        if "embedding" not in body:
            raise RuntimeError(
                f"No embedding returned for text {position}/{total}: {response.text}"
            )
        embeddings.append(body["embedding"])

    return embeddings


def add_embeddings_to_dataframe(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """Embed one column of ``df`` and store the vectors in ``df['embeddings']``.

    The values of ``text_column`` are handed to ``get_embeddings`` and the
    result is written to an ``'embeddings'`` column on the same DataFrame
    object, which is then returned (the input is modified in place).
    """
    source_texts = df[text_column].to_list()
    df['embeddings'] = get_embeddings(source_texts)
    return df

if __name__ == "__main__":
    # Build the text to embed from the descriptive columns only. Besides
    # order_id, leave out:
    #   * pandas' auto-named index columns ("Unnamed: 0", ...), whose row
    #     numbers would otherwise be embedded as part of the text;
    #   * the text / embeddings columns themselves, so re-running this cell in
    #     the same kernel does not feed its own output back into the text.
    skipped_columns = {"order_id", TEXT_COLUMN.lower(), "embeddings"}
    text_columns = [
        col
        for col in df.columns
        if col.lower() not in skipped_columns
        and not col.lower().startswith("unnamed:")
    ]
    if not text_columns:
        raise ValueError("No columns available to build the text to embed")

    # Join the cells of each row with spaces, skipping missing values so the
    # literal string "nan" does not end up in the embedded text.
    df[TEXT_COLUMN] = df[text_columns].apply(
        lambda row: " ".join(str(cell) for cell in row if pd.notna(cell)),
        axis=1,
    )

    df_with_embeddings = add_embeddings_to_dataframe(df, TEXT_COLUMN)

    # Persist the rows together with their embeddings for the query step.
    df_with_embeddings.to_csv('data/demoV3_embedding.csv', index=False)

    print("Embedding completed successfully!")

## Upsert into and query NanoVectorDB

In [8]:
import json
import ast
import requests
from nano_vectordb import NanoVectorDB
import numpy as np
from torch.utils.benchmark import timer
import psutil
import os

# Forward slash so this names the same file the embedding step writes
# ('data/demoV3_embedding.csv') on every operating system; the backslash form
# relied on "\d" being an unrecognised escape sequence and only resolved on
# Windows.
CSV_PATH = "data/demoV3_embedding.csv"
# File name handed to NanoVectorDB as its storage_file.
STORAGE_FILE = "nano_data.json"

def print_memory_usage(note=""):
    """Print the current process's RSS and VMS memory in MB, tagged with ``note``."""
    bytes_per_mb = 1024 ** 2
    usage = psutil.Process(os.getpid()).memory_info()
    rss_mb = usage.rss / bytes_per_mb
    vms_mb = usage.vms / bytes_per_mb
    print(f"[Memory Usage] {note} - RSS: {rss_mb:.2f} MB, VMS: {vms_mb:.2f} MB")

def embed_query(query: str, timeout: float = 60.0):
    """Embed a single query string with the local Ollama embeddings endpoint.

    Args:
        query: Text to embed.
        timeout: Seconds passed to ``requests.post`` as its ``timeout`` so a
            stalled server cannot hang the cell.

    Returns:
        The value of the ``"embedding"`` field of the server's JSON reply.

    Raises:
        RuntimeError: If the server answers with a non-200 status, or the
            reply contains no ``"embedding"`` field. Previously an error reply
            surfaced as a bare ``KeyError`` and an empty reply returned
            ``None`` silently.
    """
    response = requests.post(
        "http://localhost:11434/api/embeddings",
        # `json=` serialises the payload and sets the JSON content type.
        json={
            "model": "nomic-embed-text",
            "prompt": query,
            "options": {"embedding_only": True},
        },
        timeout=timeout,
    )

    if response.status_code != 200:
        raise RuntimeError(f"Error getting embeddings: {response.text}")

    # The reply is parsed as one JSON document rather than line by line.
    body = response.json()
    if "embedding" not in body:
        raise RuntimeError(f"No embedding returned for query: {response.text}")
    return body["embedding"]

# Load the saved rows. Each embedding comes back from the CSV as a string, so
# parse it and convert it to a float32 array in one pass.
df = pd.read_csv(CSV_PATH)
df['embeddings'] = df['embeddings'].apply(
    lambda raw: np.array(ast.literal_eval(raw), dtype=np.float32)
)

# The length of the first stored vector is used as the database dimension.
EMBEDDING_DIM = len(df['embeddings'].iloc[0])

# One record per row: id and vector under the "__id__" / "__vector__" keys,
# the remaining fields kept alongside so they can be printed from the results.
data = [
    {
        "__id__": str(row["order_id"]),
        "__vector__": row["embeddings"],
        "customer_name": row["order_customer_name"],
        "product_name": row["product_name"],
        "part_type_name": row["part_type_name"],
        "text": row["text"],
    }
    for _, row in df.iterrows()
]

vdb = NanoVectorDB(embedding_dim=EMBEDDING_DIM, storage_file=STORAGE_FILE)

# Time the upsert on its own, then report process memory.
start = timer()
result = vdb.upsert(data)
end = timer()
print(f"Upsert time: {end - start:.4f} seconds")
print_memory_usage("After upserting to VDB")

query = "ーセプル"
print(f"input: {query}")
embedding = embed_query(query)

# Only the vector search is timed; embedding the query happens before `start`.
start = timer()
results = vdb.query(embedding, top_k=5, better_than_threshold=0.1)
end = timer()
print_memory_usage("After querying")

for rank, hit in enumerate(results, start=1):
    print(
        f"[{rank}] ID: {hit['__id__']}, "
        f"Customer Name: {hit['customer_name']}, "
        f"Product Name: {hit['product_name']}, "
        f"Parts Type Name: {hit['part_type_name']}"
    )
print(f"Query time: {end - start:.4f} seconds")


INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': 'nano_data.json'} 0 data


Upsert time: 0.0015 seconds
[Memory Usage] After upserting to VDB - RSS: 347.90 MB, VMS: 691.70 MB
input: ーセプル
[Memory Usage] After querying - RSS: 347.96 MB, VMS: 723.76 MB
[1] ID: 2203259, Customer Name: 得意先_17, Product Name: ブリスターパック台紙テストプリント, Parts Type Name: ['台紙', '台紙２']
[2] ID: 2203843, Customer Name: 得意先_27, Product Name: 2023 母の日用  ティーチケットセット, Parts Type Name: ['チケット', 'チケット・台紙', 'チケット・台紙', '台紙']
[3] ID: 2204374, Customer Name: 得意先_97, Product Name: 当日プログラム「ウィンタースポーツ」, Parts Type Name: ['本文', '本文', '表紙', '表紙']
[4] ID: 2204248, Customer Name: 得意先_200, Product Name: B　POP, Parts Type Name: ['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']
[5] ID: 2204787, Customer Name: 得意先_200, Product Name: D　10週（3/7付）　POP, Parts Type Name: ['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']
Query time: 0.0004 seconds
