In [1]:
pip install pandas sentence-transformers scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.2.3-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl.metadata (4.9 kB)
Collecting httpx<1,>=0.23.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting shellingham (from huggingface-hub>=0.

In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Map every product name to its closest master category by comparing sentence
# embeddings, then write the result (prediction + similarity score) to a CSV.

# --- Configuration: the values a reader is most likely to change -------------
INPUT_CSV = "convertcsv.csv"
OUTPUT_CSV = "mapped_products.csv"
MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 64

# Master category list (EDIT THIS)
master_categories = [
    "Electronics",
    "Home & Kitchen",
    "Books",
    "Clothing",
    "Sports & Outdoors",
    "Toys & Games",
    "Beauty & Personal Care"
]

# --- Load the data ------------------------------------------------------------
# The input is semicolon-separated. Assigning two names requires the file to
# have exactly two columns; pandas raises if the count does not match.
df = pd.read_csv(INPUT_CSV, sep=";")
df.columns = ["ProductName", "Category"]

# Fail early with a readable message instead of an obscure error from the
# similarity computation further down.
if df.empty:
    raise ValueError(f"{INPUT_CSV} contains no product rows to map")

# Remember which rows actually carry a product name. `astype(str)` below turns
# missing values into the literal text "nan", which would otherwise be embedded
# and assigned a category as if it were a real product.
has_name = df["ProductName"].notna()

# --- Embed products and categories -------------------------------------------
# Load model (first run may take time)
model = SentenceTransformer(MODEL_NAME)

product_embeddings = model.encode(
    df["ProductName"].astype(str).tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True
)
category_embeddings = model.encode(master_categories)

# --- Pick the best category per product --------------------------------------
# similarity[r, c] is the cosine similarity between product r and category c.
similarity = cosine_similarity(product_embeddings, category_embeddings)
best_idx = similarity.argmax(axis=1)
confidence = similarity.max(axis=1)

# Rows without a product name get an empty prediction and confidence rather
# than a match computed from the placeholder text "nan".
df["PredictedCategory"] = pd.Series(
    [master_categories[i] for i in best_idx], index=df.index
).where(has_name)
df["Confidence"] = pd.Series(confidence, index=df.index).where(has_name)

# --- Save output ---------------------------------------------------------------
df.to_csv(OUTPUT_CSV, index=False)

print("✅ Mapping completed successfully")


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

✅ Mapping completed successfully
