In [None]:
import os
import faiss
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from datasets import load_dataset

General Information of the Vector DB

In [None]:
# Fetch the pricing dataset and keep a handle on its training split.
dataset = load_dataset("ed-donner/pricer-data")
train = dataset["train"]

# Column names first ...
print("Available feature columns:", train.column_names)

# ... then the full feature schema (column -> data type) on its own line.
print("\nSchema / Features:", train.features, sep="\n")

In [None]:
# Directory holding the saved FAISS index.
# Set the SHOPAI_VECTOR_STORE environment variable to point the notebook at a
# different location; the original path remains the default.
save_path = os.environ.get(
    "SHOPAI_VECTOR_STORE",
    "/home/lisa/Arupreza/ShopAI/product_vector_store",
)

# Hugging Face model id of the embedding model handed to the FAISS loader.
embedding_model = "nomic-ai/nomic-embed-text-v1"

In [None]:
# Instantiate the embedding model named by `embedding_model`.
# - device="cuda": the model is placed on the GPU.
# - trust_remote_code=True: allows the model repository's own Python code to run.
# - normalize_embeddings=True: requests normalized output vectors from the encoder.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs=dict(device="cuda", trust_remote_code=True),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [None]:
# Load the persisted vector store from `save_path`.
# NOTE(security): allow_dangerous_deserialization=True permits pickle-based
# loading, which can execute arbitrary code — only use it on index folders
# you created yourself or otherwise trust.
vectorstore = FAISS.load_local(
    folder_path=save_path,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Raw FAISS index wrapped by the LangChain vector store.
faiss_index = vectorstore.index

# ntotal -> how many vectors are stored, d -> length of each vector.
ntotal, dim = faiss_index.ntotal, faiss_index.d

print(f"✅ Vector store loaded from {save_path}")
print(f"Number of vectors stored: {ntotal}")
print(f"Dimensionality of each vector: {dim}")

In [None]:
# Inspect a single stored entry: its raw embedding and the document it maps to.
vector_id = 0  # position inside the FAISS index to inspect
vec = faiss_index.reconstruct(vector_id)

print(f"Vector ID: {vector_id}")
print("Vector shape:", len(vec))
# Only show the leading values; printing the full vector floods the output.
print("First 10 values:", vec[:10])

print("\n")

# Map the FAISS position to its docstore key, then fetch the stored document.
doc_id = vectorstore.index_to_docstore_id[vector_id]
doc = vectorstore.docstore.search(doc_id)

print("Document text snippet:", doc.page_content[:200])
print("Metadata:", doc.metadata)

Vector Visualization

In [None]:
import os
import faiss
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# -------------------
# 1. Reload FAISS vector store
# -------------------
# Index location; override with the SHOPAI_VECTOR_STORE environment variable.
save_path = os.environ.get(
    "SHOPAI_VECTOR_STORE",
    "/home/lisa/Arupreza/ShopAI/product_vector_store",
)
embedding_model = "nomic-ai/nomic-embed-text-v1"

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={"device": "cuda", "trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True}
)

# NOTE(security): allow_dangerous_deserialization=True enables pickle-based
# loading; only load index folders you trust.
vectorstore = FAISS.load_local(save_path, embeddings, allow_dangerous_deserialization=True)
faiss_index = vectorstore.index

print("✅ Vector store loaded with", faiss_index.ntotal, "vectors")

# -------------------
# 2. Extract vectors (first n_samples entries, not a random sample)
# -------------------
n_samples = 50000
n_vectors = min(n_samples, faiss_index.ntotal)
vectors = faiss_index.reconstruct_n(0, n_vectors)

# -------------------
# 3. KMeans clustering
# -------------------
n_clusters = 10  # tune this (try 10, 20, 50)
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(vectors)

# -------------------
# 4. Dimensionality reduction for visualization
# -------------------
# t-SNE requires perplexity < number of points; clamp it so a small index
# still works (stays at 30 for any index with more than 30 vectors).
perplexity = min(30, max(1, n_vectors - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
reduced = tsne.fit_transform(vectors)

# -------------------
# 5. Plot clusters
# -------------------
# "tab10" only has 10 distinct colours; beyond that, resample a continuous
# colormap into n_clusters bins so every cluster gets its own colour.
cmap = "tab10" if n_clusters <= 10 else plt.get_cmap("nipy_spectral", n_clusters)

fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap=cmap, s=10, alpha=0.7)
fig.colorbar(scatter, ax=ax, label="Cluster ID")
ax.set_title("Product Embeddings Clustered (t-SNE + KMeans)")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
plt.show()

Find Similar Products

In [None]:
import os, re, json
import json5
from typing import List
from pydantic import BaseModel
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# -------------------
# 1. Schema
# -------------------
# One extracted entry: a product name together with its price.
# NOTE(review): documented with `#` comments rather than class docstrings —
# a docstring may be surfaced in the schema text that
# parser.get_format_instructions() injects into the prompt; confirm before
# adding one.
class ProductInfo(BaseModel):
    product: str
    price: float

# Top-level object the model is asked to produce: a single "items" list of
# ProductInfo entries.
class ProductList(BaseModel):
    items: List[ProductInfo]

# Parser bound to ProductList; it supplies the format instructions embedded in
# the prompt and is the first parsing attempt made in ask().
parser = PydanticOutputParser(pydantic_object=ProductList)

# -------------------
# 2. Prompt
# -------------------
# Extraction instructions sent to the LLM. `{context}` and `{question}` are
# filled in per call; `{format_instructions}` is pre-filled from the parser.
# Doubled braces ({{ }}) are literal braces in the rendered prompt.
_extraction_template = """Extract all product names and their prices from the context below. 

⚠️ Rules:
- Only output a JSON object.
- JSON must have one key "items".
- "items" must be an array of {{ "product": string, "price": number }}.
- Do not include schema, explanation, or extra text.

Context:
{context}

Question: {question}

{format_instructions}
"""

prompt = PromptTemplate(
    template=_extraction_template,
    input_variables=["context", "question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# -------------------
# 3. Load vectorstore
# -------------------
# Index location; override with the SHOPAI_VECTOR_STORE environment variable.
save_path = os.environ.get(
    "SHOPAI_VECTOR_STORE",
    "/home/lisa/Arupreza/ShopAI/product_vector_store",
)

# Embedding model used to embed incoming queries for similarity search.
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={"device": "cuda", "trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True}
)

# NOTE(security): allow_dangerous_deserialization=True enables pickle-based
# loading; only load index folders you trust.
vectorstore = FAISS.load_local(save_path, embeddings, allow_dangerous_deserialization=True)

# -------------------
# 4. Load fine-tuned model
# -------------------
# Checkpoint directory of the fine-tuned model; override with the
# SHOPAI_MODEL_PATH environment variable.
model_path = os.environ.get(
    "SHOPAI_MODEL_PATH",
    "/home/lisa/Arupreza/ShopAI/price_prediction_peft/price_llama_lora/checkpoint-20000",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# device_map="auto" lets the library decide where to place the model weights.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

# Text-generation pipeline wrapped for LangChain; generation is capped at
# 512 new tokens per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    return_full_text=False,   # don't echo the full prompt
)
llm = HuggingFacePipeline(pipeline=pipe)

# -------------------
# 5. JSON cleaning helpers
# -------------------
# Tokens of interest inside a JSON-ish candidate, scanned left to right:
#   1. a complete double-quoted string  -> left untouched
#   2. a complete single-quoted string  -> group 1 holds its inner text
#   3. a bare key after "{" or ","      -> group 2 = prefix, group 3 = key
# Matching whole strings first guarantees that apostrophes, commas and colons
# *inside* string values are never rewritten.
_JSONISH_TOKEN = re.compile(
    r'"(?:\\[\s\S]|[^"\\])*"'
    r"|'((?:\\[\s\S]|[^'\\])*)'"
    r'|([{,]\s*)([^\W\d]\w*)(?=\s*:)'
)


def clean_output(text: str) -> str:
    """Extract the JSON-looking part of ``text`` and repair common LLM quirks.

    The candidate is the span from the first ``{`` to the last ``}``. Within
    it, single-quoted strings are converted to double-quoted strings and
    unquoted object keys are quoted. Content inside already valid
    double-quoted strings is left untouched.

    Args:
        text: Raw model output that may wrap a JSON object in extra prose.

    Returns:
        The repaired candidate string. It is not guaranteed to be valid JSON;
        callers still have to parse it.

    Raises:
        ValueError: If ``text`` contains no ``{...}`` span at all.
    """
    # Greedy match: spans from the first "{" to the last "}" in the text.
    match = re.search(r"\{[\s\S]*\}", text)
    if match is None:
        raise ValueError("❌ No JSON found in output")
    candidate = match.group(0).strip()

    def _requote(inner: str) -> str:
        """Re-wrap the body of a single-quoted string in double quotes."""
        def _escape(m: "re.Match") -> str:
            tok = m.group(0)
            if tok == "\\'":
                return "'"      # escaped apostrophe needs no escape in JSON
            if tok == '"':
                return '\\"'    # bare double quote must now be escaped
            return tok          # any other escape sequence stays as-is
        return '"' + re.sub(r'\\[\s\S]|"', _escape, inner) + '"'

    def _fix(m: "re.Match") -> str:
        if m.group(1) is not None:          # single-quoted string
            return _requote(m.group(1))
        if m.group(3) is not None:          # bare key
            return f'{m.group(2)}"{m.group(3)}"'
        return m.group(0)                   # double-quoted string: keep

    return _JSONISH_TOKEN.sub(_fix, candidate)

def safe_json_parse(text: str):
    """Parse ``text`` as JSON, falling back to progressively more lenient strategies.

    Order of attempts:
      1. strict ``json.loads`` on the raw text;
      2. strict ``json.loads`` on the output of ``clean_output(text)``;
      3. ``json5.loads`` on that same repaired text (tolerates missing commas,
         comments, trailing commas).

    Args:
        text: Raw model output.

    Returns:
        The decoded Python object.

    Raises:
        ValueError: If ``clean_output`` finds no JSON-looking span in ``text``.
        Exception: Whatever ``json5.loads`` raises when the last fallback fails.
    """
    try:
        return json.loads(text)  # strict
    except ValueError as strict_err:  # json.JSONDecodeError subclasses ValueError
        print("⚠️ Standard JSON failed:", strict_err)

    # Deliberately outside any try block: if there is no JSON-looking span,
    # there is nothing to hand to the later fallbacks, so the ValueError
    # propagates instead of surfacing as an unbound `repaired` variable.
    repaired = clean_output(text)

    try:
        return json.loads(repaired)
    except ValueError as repair_err:
        print("⚠️ Repaired JSON still failed:", repair_err)
        # final fallback: json5 (tolerates missing commas, comments, trailing commas)
        return json5.loads(repaired)

# -------------------
# 6. Ask function
# -------------------
def ask(query: str, top_k: int = 3):
    """Retrieve similar products and have the LLM extract them as structured data.

    Args:
        query: Natural-language request used both for the similarity search
            and as the question in the prompt.
        top_k: Number of documents to retrieve from the vector store.

    Returns:
        A ``ProductList`` built from the model output. An empty ``ProductList``
        is returned when the search yields no documents.

    Raises:
        Exception: Propagated from ``safe_json_parse`` / ``ProductList.model_validate``
            when the model output cannot be turned into a valid ``ProductList``.
    """
    docs = vectorstore.similarity_search(query, k=top_k)
    if not docs:
        # Nothing retrieved: there is no context to extract products from.
        return ProductList(items=[])

    all_context = "\n\n".join(d.page_content for d in docs)

    final_prompt = prompt.format(context=all_context, question=query)
    # Use the Runnable API; calling the LLM object directly is deprecated.
    output = llm.invoke(final_prompt)

    # Normally a plain string; also tolerate the raw transformers pipeline
    # shape (a list of dicts carrying "generated_text").
    if (
        isinstance(output, list)
        and output
        and isinstance(output[0], dict)
        and "generated_text" in output[0]
    ):
        output_text = output[0]["generated_text"]
    else:
        output_text = str(output)

    # First try structured parser
    try:
        return parser.parse(output_text)
    except Exception:
        # Broad on purpose: any parser failure triggers the lenient fallback,
        # which raises its own error if the output is unusable.
        print("⚠️ Raw output (before cleaning):\n", output_text)
        parsed = safe_json_parse(output_text)
        return ProductList.model_validate(parsed)

In [None]:
# -------------------
# 7. Run Example
# -------------------
if __name__ == "__main__":
    # Example request: retrieve the 10 closest products and extract their prices.
    query = "Find for me a gaming computer and give the price list?"
    results = ask(query, top_k=10)

    print("\n✅ Structured Results:")
    # Convert each validated entry to a plain dict, then print one per line.
    rows = [entry.model_dump() for entry in results.items]
    for row in rows:
        print(row)