## **Section 1: E5 Model**

In [None]:
!pip install -qU \
    torch==2.5.1 \
    transformers \
    pinecone

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### **Section 1.1: Initial**

In [None]:
# Libraries
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pinecone import Pinecone, ServerlessSpec

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# CHANGES WRT SECTION 1.3 START   ----------------------------

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL", environment="us-west1-gcp")

index_name = "product-embeddings"

# If the index doesn't exist, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
    name=index_name,
    dimension=768,                                      # Replace with your model dimensions, DEFAULT 2
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-west1-gcp"
    )
)

# Connect to pc index
index = pc.Index(index_name)

# CHANGES WRT SECTION 1.3 END   ----------------------------

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Load Data
df = pd.read_csv('master_data.csv', encoding='windows-1252')
df.head()

Unnamed: 0,itemcode,itemdesc,catcode,category,company,brand,packaging,flavor,color,qty,uomdesc,pack_size,launchdate,audittype
0,30001,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,6,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,CDBOX,,,350.0,GM,350.00 GM,201301.0,1
1,30002,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),6,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,CDBOX,,,400.0,GM,400.00 GM,200901.0,1
2,30003,SIGMA GIFT SELECTION 360 GM,6,BISCUITS,ASIA AGRO FOODS (PVT) LTD,SIGMA GIFT SELECTION,CDBOX,,,360.0,GM,360.00 GM,201808.0,1
3,30004,BISCO FIRST CHOICE CLASSIC 300 GM,6,BISCUITS,BISCO FOODS (PVT) LTD,BISCO FIRST CHOICE CLASSIC,CDBOX,,,300.0,GM,300.00 GM,201705.0,1
4,30006,KIST GIFT ASSORTMENT 400 GM,6,BISCUITS,CARGILLS QUALITY DAIRIES (PVT) LTD,KIST GIFT ASSORTMENT,CDBOX,,,400.0,GM,400.00 GM,201401.0,1


In [None]:
# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

# Generate embeddings for all items in your dataset
embeddings = []
for index, row in df.iterrows():
    # Combine itemdesc, category, and brand
    text = f"{row['itemdesc']} {row['category']} {row['brand']}"
    embedding = get_embedding(text)
    embeddings.append(embedding)

# Convert list to numpy array for easier manipulation
embeddings = np.array(embeddings)

In [None]:
print(embeddings)

[[-0.36126047 -0.22316462 -0.70406204 ... -0.02674151  0.75151104
  -0.04409679]
 [-0.32190576 -0.34300658 -0.7000368  ... -0.20726684  0.642019
   0.15255094]
 [-0.5050028  -0.11133112 -0.8511407  ...  0.06591822  0.34988013
   0.07019906]
 ...
 [-0.34379375 -0.8302665  -0.4450665  ... -0.44972357  0.0866977
   0.05328827]
 [-0.47959176 -0.48679858 -0.24120246 ... -0.23492353  0.42319003
   0.3595234 ]
 [-0.01915236 -1.0020916  -0.45837045 ... -0.0561334   0.4351942
   0.00729315]]


In [None]:
# Demo query to test retrieval
query = "Rohit\sibha"
query_embedding = get_embedding(query).reshape(1, -1)

# Calculate cosine similarity between query and item embeddings
similarities = cosine_similarity(query_embedding, embeddings)

# Find the top 5 most similar items
top_k = 5
top_k_indices = similarities.argsort()[0][-top_k:][::-1]

# Display the top 5 most similar items
for idx in top_k_indices:
    print(f"Item: {df.iloc[idx]['itemdesc']}, Similarity: {similarities[0][idx]:.4f}")

Item: SIHINA CHICKEN CURRY SOYA 35 GM, Similarity: 0.8089
Item: RANI SANDALWOOD WITH ROSE WATER & SAFFRON 90 GM, Similarity: 0.7999
Item: RANI SANDALWOOD WTH ROSE WATER & SAFFRON 65 GM CDBOX, Similarity: 0.7983
Item: RANI SANDALWOOD WTH ROSE WATER & SAFFRON 140 GM CDBOX, Similarity: 0.7977
Item: DIVYA SANDALWOOD SUDU HANDUN RATH HANDUN KOKUM 70 GM CDBOX, Similarity: 0.7971


### **Section 1.2: E5 with testing on 2 set of Dataframes**

In [None]:
# import torch
# import pandas as pd
# from transformers import AutoModel, AutoTokenizer
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# # Load Data
# df = pd.read_csv('master_data.csv')

# # Define device
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Initialize tokenizer and model
# model_id = "intfloat/e5-base-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).to(device)
# model.eval()

# # Function to get embeddings from text using the model
# def get_embedding(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
#     return embeddings.cpu().numpy()

# Function to get embeddings from text using the model
### Continuing from def get_embedding

# Generate embeddings for both strategies: itemdesc only, and itemdesc + other fields
embeddings_desc = []
embeddings_full = []

for index, row in df.iterrows():
    # Match with itemdesc only
    text_desc = row['itemdesc']
    embedding_desc = get_embedding(text_desc)
    embeddings_desc.append(embedding_desc)

    # Match with itemdesc, itemcode, category, company, and brand
    text_full = f"{row['itemdesc']} {row['itemcode']} {row['category']} {row['company']} {row['brand']}"
    embedding_full = get_embedding(text_full)
    embeddings_full.append(embedding_full)

# Convert lists to numpy arrays
embeddings_desc = np.array(embeddings_desc)
embeddings_full = np.array(embeddings_full)

# Function to perform retrieval and output results
def retrieve(query, top_k=5):
    # Query matching with itemdesc only
    query_embedding_desc = get_embedding(query).reshape(1, -1)
    similarities_desc = cosine_similarity(query_embedding_desc, embeddings_desc)
    top_k_indices_desc = similarities_desc.argsort()[0][-top_k:][::-1]

    # Query matching with itemdesc + other fields
    query_embedding_full = get_embedding(query).reshape(1, -1)
    similarities_full = cosine_similarity(query_embedding_full, embeddings_full)
    top_k_indices_full = similarities_full.argsort()[0][-top_k:][::-1]

    # Extract the results for both strategies
    result_desc = df.iloc[top_k_indices_desc][['itemdesc']]
    result_full = df.iloc[top_k_indices_full][['itemdesc', 'itemcode', 'category', 'company', 'brand']]

    # Display results in a tabular format (like df.head())
    print("\nTop 5 Matches for itemdesc only:")
    display(result_desc)

    print("\nTop 5 Matches for itemdesc + itemcode + category + company + brand:")
    display(result_full)

    return result_desc, result_full

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)


Top 5 Matches for itemdesc only:


Unnamed: 0,itemdesc
0,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...
10741,RED LION GIFT CAKE 800 GM CDBOX (RED LION BAKERS)
1,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS)
1222,RED LION SEASONS GREETINGS CDBOX 1000 GM (ADJ ...
1215,RED LION CAKE 600 GM (ADJ DISTRI)



Top 5 Matches for itemdesc + itemcode + category + company + brand:


Unnamed: 0,itemdesc,itemcode,category,company,brand
0,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,30001,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT
1,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),30002,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT
10741,RED LION GIFT CAKE 800 GM CDBOX (RED LION BAKERS),48912,CAKE,RED LION BAKERS LANKA LTD,RED LION
1222,RED LION SEASONS GREETINGS CDBOX 1000 GM (ADJ ...,31846,CAKE,ADJ DISTRIBUTORS,RED LION
1205,RED LION CHOCOLATE FLAVOURED ICING CAKE CDBOX ...,31822,CAKE,ADJ DISTRIBUTORS,RED LION


(                                                itemdesc
 0      RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...
 10741  RED LION GIFT CAKE 800 GM CDBOX (RED LION BAKERS)
 1         REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS)
 1222   RED LION SEASONS GREETINGS CDBOX 1000 GM (ADJ ...
 1215                   RED LION CAKE 600 GM (ADJ DISTRI),
                                                 itemdesc  itemcode  category  \
 0      RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...     30001  BISCUITS   
 1         REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS)     30002  BISCUITS   
 10741  RED LION GIFT CAKE 800 GM CDBOX (RED LION BAKERS)     48912      CAKE   
 1222   RED LION SEASONS GREETINGS CDBOX 1000 GM (ADJ ...     31846      CAKE   
 1205   RED LION CHOCOLATE FLAVOURED ICING CAKE CDBOX ...     31822      CAKE   
 
                          company                     brand  
 0               ADJ DISTRIBUTORS  RED LION GIFT ASSORTMENT  
 1               ADJ DISTRIBUTORS  RED 

### **Section 1.3: E5 integration with Pinecone** (for storing the embeddings), <br> Strategies are still used for both { (only itemdesc) , (itemdesc + itemcode + category + company + brand)}

In [None]:
# # Libraries
# import torch
# import pandas as pd
# from transformers import AutoModel, AutoTokenizer
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from pinecone import Pinecone, ServerlessSpec

# # Define device
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # Set up Pinecone environment
# pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL", environment="us-west1-gcp")

# index_name = "product-embeddings"

# # If the index doesn't exist, create it
# if index_name not in pc.list_indexes().names():
#     pc.create_index(
#     name=index_name,
#     dimension=768,                                      # Replace with your model dimensions, DEFAULT 2
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-west1-gcp"
#     )
# )

# # Connect to pc index
# index = pc.Index(index_name)

# # Initialize tokenizer and model
# model_id = "intfloat/e5-base-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).to(device)
# model.eval()

# # Load Data
# df = pd.read_csv('master_data.csv', encoding='windows-1252')
# df.head()

# # Function to get embeddings from text using the model
# def get_embedding(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
#     return embeddings.cpu().numpy()

# # Generate embeddings for all items in your dataset
# embeddings = []
# for index, row in df.iterrows():
#     # Combine itemdesc, category, and brand
#     text = f"{row['itemdesc']} {row['category']} {row['brand']}"
#     embedding = get_embedding(text)
#     embeddings.append(embedding)

# # Convert list to numpy array for easier manipulation
# embeddings = np.array(embeddings)

# Function to get embeddings from text using the model
### Continuing from def get_embedding

# Store embeddings in Pinecone for both strategies: itemdesc only, and itemdesc + other fields
def store_embeddings_in_pinecone():
    vectors_desc = []
    vectors_full = []
    ids_desc = []
    ids_full = []

    for index, row in df.iterrows():
        # Generate embedding for itemdesc only
        text_desc = row['itemdesc']
        embedding_desc = get_embedding(text_desc)
        vectors_desc.append(embedding_desc)
        ids_desc.append(f"desc_{row['itemcode']}")

        # Generate embedding for itemdesc, itemcode, category, company, brand
        text_full = f"{row['itemdesc']} {row['itemcode']} {row['category']} {row['company']} {row['brand']}"
        embedding_full = get_embedding(text_full)
        vectors_full.append(embedding_full)
        ids_full.append(f"full_{row['itemcode']}")

    # Insert vectors into Pinecone (for both strategies)
    index.upsert(vectors=zip(ids_desc, vectors_desc))
    index.upsert(vectors=zip(ids_full, vectors_full))

# Store embeddings in Pinecone
store_embeddings_in_pinecone()

AttributeError: 'int' object has no attribute 'upsert'

In [None]:
# Function to perform retrieval using Pinecone
def retrieve(query, top_k=5):
    # Generate embedding for the query
    query_embedding = get_embedding(query).reshape(1, -1)

    # Query Pinecone for itemdesc-only matching
    query_id = "query_desc"
    query_results_desc = index.query(queries=query_embedding.tolist(), top_k=top_k, include_metadata=True, namespace="desc")

    # Query Pinecone for itemdesc + other fields matching
    query_results_full = index.query(queries=query_embedding.tolist(), top_k=top_k, include_metadata=True, namespace="full")

    # Process results and convert to DataFrame format
    result_desc = pd.DataFrame(query_results_desc['matches'], columns=['id', 'score', 'metadata'])
    result_full = pd.DataFrame(query_results_full['matches'], columns=['id', 'score', 'metadata'])

    # Extract relevant metadata and display the results
    result_desc['itemdesc'] = result_desc['metadata'].apply(lambda x: x['itemdesc'])
    result_full['itemdesc'] = result_full['metadata'].apply(lambda x: x['itemdesc'])
    result_full['itemcode'] = result_full['metadata'].apply(lambda x: x['itemcode'])
    result_full['category'] = result_full['metadata'].apply(lambda x: x['category'])
    result_full['company'] = result_full['metadata'].apply(lambda x: x['company'])
    result_full['brand'] = result_full['metadata'].apply(lambda x: x['brand'])

    # Display results in tabular format (like df.head())
    print("\nTop 5 Matches for itemdesc only:")
    display(result_desc[['itemdesc', 'score']].head())

    print("\nTop 5 Matches for itemdesc + itemcode + category + company + brand:")
    display(result_full[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())

    return result_desc, result_full

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

#### Section 1.3.1: Debugging PineCone Insertion

In [None]:
# Libraries
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from pinecone import Pinecone, ServerlessSpec

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL")

index_name = "product-embeddings"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Make sure the dimension matches your model output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west1-gcp"
        )
    )

# Connect to Pinecone index
index = pc.Index(index_name)

# # Initialize tokenizer and model
# model_id = "intfloat/e5-base-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModel.from_pretrained(model_id).to(device)
# model.eval()

# # Load Data
# df = pd.read_csv('master_data.csv', encoding='windows-1252')
# df.head()

# # Function to get embeddings from text using the model
# def get_embedding(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
#     with torch.no_grad():
#         outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
#     return embeddings.cpu().numpy()

# # Generate embeddings for all items in your dataset
# embeddings = []
# for index, row in df.iterrows():
#     # Combine itemdesc, category, and brand
#     text = f"{row['itemdesc']} {row['category']} {row['brand']}"
#     embedding = get_embedding(text)
#     embeddings.append(embedding)

# # Convert list to numpy array for easier manipulation
# embeddings = np.array(embeddings)

# Function to get embeddings from text using the model
### Continuing from def get_embedding

# Store embeddings in Pinecone for both strategies: itemdesc only, and itemdesc + other fields
def store_embeddings_in_pinecone():
    vectors_desc = []
    vectors_full = []
    ids_desc = []
    ids_full = []

    # Loop through the DataFrame to generate embeddings and prepare vectors
    for index, row in df.iterrows():
        # Generate embedding for itemdesc only
        text_desc = row['itemdesc']
        embedding_desc = get_embedding(text_desc)
        vectors_desc.append(embedding_desc)
        ids_desc.append(f"desc_{row['itemcode']}")

        # Generate embedding for itemdesc, itemcode, category, company, brand
        text_full = f"{row['itemdesc']} {row['itemcode']} {row['category']} {row['company']} {row['brand']}"
        embedding_full = get_embedding(text_full)
        vectors_full.append(embedding_full)
        ids_full.append(f"full_{row['itemcode']}")

    # Convert vectors to the format required by Pinecone
    vectors_desc_data = [{"id": id, "values": vec.tolist()} for id, vec in zip(ids_desc, vectors_desc)]
    vectors_full_data = [{"id": id, "values": vec.tolist()} for id, vec in zip(ids_full, vectors_full)]

    # Insert vectors into Pinecone (for both strategies)
    index.upsert(vectors=vectors_desc_data)
    index.upsert(vectors=vectors_full_data)

# Store embeddings in Pinecone
store_embeddings_in_pinecone()

# Function to perform retrieval using Pinecone
def retrieve(query, top_k=5):
    # Generate embedding for the query
    query_embedding = get_embedding(query).reshape(1, -1)

    # Query Pinecone for itemdesc-only matching
    query_id = "query_desc"
    query_results_desc = index.query(queries=query_embedding.tolist(), top_k=top_k, include_metadata=True, namespace="desc")

    # Query Pinecone for itemdesc + other fields matching
    query_results_full = index.query(queries=query_embedding.tolist(), top_k=top_k, include_metadata=True, namespace="full")

    # Process results and convert to DataFrame format
    result_desc = pd.DataFrame(query_results_desc['matches'], columns=['id', 'score', 'metadata'])
    result_full = pd.DataFrame(query_results_full['matches'], columns=['id', 'score', 'metadata'])

    # Extract relevant metadata and display the results
    result_desc['itemdesc'] = result_desc['metadata'].apply(lambda x: x['itemdesc'])
    result_full['itemdesc'] = result_full['metadata'].apply(lambda x: x['itemdesc'])
    result_full['itemcode'] = result_full['metadata'].apply(lambda x: x['itemcode'])
    result_full['category'] = result_full['metadata'].apply(lambda x: x['category'])
    result_full['company'] = result_full['metadata'].apply(lambda x: x['company'])
    result_full['brand'] = result_full['metadata'].apply(lambda x: x['brand'])

    # Display results in tabular format (like df.head())
    print("\nTop 5 Matches for itemdesc only:")
    display(result_desc[['itemdesc', 'score']].head())

    print("\nTop 5 Matches for itemdesc + itemcode + category + company + brand:")
    display(result_full[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())

    return result_desc, result_full

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

AttributeError: 'int' object has no attribute 'upsert'

#### Section 1.3.2: Debugging with Mini Data (20 rows)

In [None]:
import pandas as pd

# Load Data
dfm = pd.read_csv('master_data.csv', encoding='windows-1252')
dfm.head()

# Select the first 20 rows
reduced_dfm = dfm.head(20)

# Save the reduced data to a new CSV file
reduced_dfm.to_csv('reduced_file.csv', index=False)  # Replace 'reduced_file.csv' with the desired file name

In [None]:
# Libraries
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from pinecone import Pinecone, ServerlessSpec

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL")

index_name = "product-embeddings"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Make sure the dimension matches your model output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west1-gcp"
        )
    )

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Load Data
df = pd.read_csv('reduced_file.csv', encoding='windows-1252')
df.head()

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

# Generate embeddings for all items in your dataset
embeddings = []
for index, row in df.iterrows():
    # Combine itemdesc, category, and brand
    text = f"{row['itemdesc']} {row['category']} {row['brand']}"
    embedding = get_embedding(text)
    embeddings.append(embedding)

# Convert list to numpy array for easier manipulation
embeddings = np.array(embeddings)

# Function to get embeddings from text using the model
### Continuing from def get_embedding

# Store embeddings in Pinecone for both strategies: itemdesc only, and itemdesc + other fields
def store_embeddings_in_pinecone():
    vectors_desc = []
    vectors_full = []
    ids_desc = []
    ids_full = []

    # Loop through the DataFrame to generate embeddings and prepare vectors
    for index_num, row in df.iterrows():  # Use index_num here to avoid conflict
    # for index, row in df.iterrows():
        # Generate embedding for itemdesc only
        text_desc = row['itemdesc']
        embedding_desc = get_embedding(text_desc)
        vectors_desc.append(embedding_desc)
        ids_desc.append(f"desc_{row['itemcode']}")

        # Generate embedding for itemdesc, itemcode, category, company, brand
        text_full = f"{row['itemdesc']} {row['itemcode']} {row['category']} {row['company']} {row['brand']}"
        embedding_full = get_embedding(text_full)
        vectors_full.append(embedding_full)
        ids_full.append(f"full_{row['itemcode']}")

    # Convert vectors to the format required by Pinecone
    vectors_desc_data = [{"id": id, "values": vec.tolist()} for id, vec in zip(ids_desc, vectors_desc)]
    vectors_full_data = [{"id": id, "values": vec.tolist()} for id, vec in zip(ids_full, vectors_full)]

    # Insert vectors into Pinecone (for both strategies)
    vector_index.upsert(vectors=vectors_desc_data, namespace="desc") # Specify namespace
    vector_index.upsert(vectors=vectors_full_data, namespace="full") # Specify namespace

# Store embeddings in Pinecone
print(f"Before store_embeddings: {type(index)}")
store_embeddings_in_pinecone()
print(f"After store_embeddings: {type(index)}")

# Function to perform retrieval using Pinecone
def retrieve(query, top_k=5):
    query_embedding = get_embedding(query).reshape(1, -1)

    query_results_desc = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="desc")
    query_results_full = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="full")

    # Process results and convert to DataFrame format (Corrected)
    def process_results(results):
        matches = results['matches']
        df = pd.DataFrame(matches)

        if not df.empty: # Check if there are any matches
            # Extract metadata safely, handling potential missing keys
            for col in ['itemdesc', 'itemcode', 'category', 'company', 'brand']:
                df[col] = df['metadata'].apply(lambda x: x.get(col, None)) # Use .get()

            # Remove the 'values' and 'metadata' columns, which are no longer needed
            df = df.drop(columns=['values', 'metadata'], errors='ignore')

        return df

    result_desc = process_results(query_results_desc)
    result_full = process_results(query_results_full)


    print("\nTop 5 Matches for itemdesc only:")
    display(result_desc[['itemdesc', 'score']].head())

    print("\nTop 5 Matches for itemdesc + itemcode + category + company + brand:")
    display(result_full[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())

    return result_desc, result_full

# Example query (same as before)
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Before store_embeddings: <class 'int'>
After store_embeddings: <class 'int'>


ValueError: invalid __array_struct__

In [None]:
get_embedding(query).reshape(1, -1).tolist()[0]

[-0.32156145572662354,
 -0.284949392080307,
 -0.49618977308273315,
 -0.2820149064064026,
 0.42545223236083984,
 -0.11723895370960236,
 0.028629641979932785,
 0.08783744275569916,
 -0.5438091158866882,
 0.16556555032730103,
 -0.5503515005111694,
 0.5945003032684326,
 -0.5844578146934509,
 0.27570846676826477,
 -0.6395530104637146,
 0.7527806758880615,
 0.3460014760494232,
 0.5406359434127808,
 0.3445587754249573,
 -0.10109362006187439,
 -0.5472133755683899,
 -0.4778062701225281,
 0.0024170256219804287,
 -0.15011171996593475,
 0.4442983865737915,
 -0.21649840474128723,
 0.3443528115749359,
 0.15276499092578888,
 -1.0362446308135986,
 0.12086106836795807,
 0.3868924081325531,
 0.7908179759979248,
 0.6440797448158264,
 -0.618558406829834,
 -0.623938262462616,
 0.08369415998458862,
 -0.6444043517112732,
 -0.6124579906463623,
 -0.7592822909355164,
 -0.8373408317565918,
 -0.09994234889745712,
 -0.6037479043006897,
 -0.6265375018119812,
 0.4693703353404999,
 -0.5039514899253845,
 -0.0822237655

#### Section 1.3.3: Still Debugging

In [None]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from IPython.display import display

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL")  # Replace with your API key

index_name = "product-embeddings"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Make sure the dimension matches your model output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west1-gcp"  # Or your preferred region
        )
    )

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Load Data
df = pd.read_csv('reduced_file.csv', encoding='windows-1252')  # Or your CSV file
df = df.fillna('') # Handle missing values. Very Important.
df.head()

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

def store_embeddings_in_pinecone():
    vectors = []
    ids = []
    metadata_list = []  # List to store metadata dictionaries

    for index, row in df.iterrows():  # Correct way to iterate and access row
        text = row['itemdesc']
        embedding = get_embedding(text)
        vectors.append(embedding)
        ids.append(f"desc_{row['itemcode']}")
        metadata_list.append(row.to_dict())  # Store metadata for each row

    vectors_data = [{"id": id, "values": vec.tolist(), "metadata": metadata} for id, vec, metadata in zip(ids, vectors, metadata_list)]
    vector_index.upsert(vectors=vectors_data, namespace="desc")

# Store embeddings in Pinecone
store_embeddings_in_pinecone()

def retrieve(query, top_k=5):
    query_embedding = get_embedding(query).reshape(1, -1)

    query_results = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="desc")

    def process_results(results):
        matches = results['matches']
        data = []  # List to store dictionaries for DataFrame creation

        if matches:  # Check if there are any matches
            for match in matches:
                metadata = match.get('metadata', {})  # Safely get metadata, handle missing
                data.append({
                    'id': match['id'],
                    'score': match['score'],
                    **metadata  # Unpack metadata into the dictionary
                })

            df_results = pd.DataFrame(data)  # Create DataFrame DIRECTLY from the list of dictionaries
            return df_results
        else:
            return pd.DataFrame()  # Return empty DataFrame if no matches

    result = process_results(query_results)

    print("\nTop Matches for itemdesc:")
    if not result.empty:  # Check if the DataFrame is empty before displaying
        display(result[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())
    else:
        print("No matches found.")

    return result

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)


Top Matches for itemdesc:


Unnamed: 0,itemdesc,itemcode,category,company,brand,score
0,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,30001.0,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,0.933212
1,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),30002.0,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,0.867457
2,CHERISH GIFT ASSORTMENT 400 GM,30013.0,BISCUITS,CHERISH BISCUIT (PVT) LTD,CHERISH GIFT ASSORTMENT,0.849081
3,SIGMA GIFT SELECTION 360 GM,30003.0,BISCUITS,ASIA AGRO FOODS (PVT) LTD,SIGMA GIFT SELECTION,0.843979
4,LUCKYLAND GIFT SELECTION ASSORTMENT 350 GM,30033.0,BISCUITS,LUCKYLAND BISCUITS MFS,LUCKYLAND GIFT SELECTION ASSORTMENT,0.840915



Top Matches for itemdesc:


Unnamed: 0,itemdesc,itemcode,category,company,brand,score
0,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,30001.0,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,0.933212
1,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),30002.0,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,0.867457
2,CHERISH GIFT ASSORTMENT 400 GM,30013.0,BISCUITS,CHERISH BISCUIT (PVT) LTD,CHERISH GIFT ASSORTMENT,0.849081
3,SIGMA GIFT SELECTION 360 GM,30003.0,BISCUITS,ASIA AGRO FOODS (PVT) LTD,SIGMA GIFT SELECTION,0.843979
4,LUCKYLAND GIFT SELECTION ASSORTMENT 350 GM,30033.0,BISCUITS,LUCKYLAND BISCUITS MFS,LUCKYLAND GIFT SELECTION ASSORTMENT,0.840915


Unnamed: 0,id,score,audittype,brand,catcode,category,color,company,flavor,itemcode,itemdesc,launchdate,pack_size,packaging,qty,uomdesc
0,desc_30001,0.933212,1.0,RED LION GIFT ASSORTMENT,6.0,BISCUITS,,ADJ DISTRIBUTORS,,30001.0,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,201301.0,350.00 GM,CDBOX,350.0,GM
1,desc_30002,0.867457,1.0,RED LION GIFT ASSORTMENT,6.0,BISCUITS,,ADJ DISTRIBUTORS,,30002.0,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),200901.0,400.00 GM,CDBOX,400.0,GM
2,desc_30013,0.849081,1.0,CHERISH GIFT ASSORTMENT,6.0,BISCUITS,,CHERISH BISCUIT (PVT) LTD,,30013.0,CHERISH GIFT ASSORTMENT 400 GM,200810.0,400.00 GM,PLPCH,400.0,GM
3,desc_30003,0.843979,1.0,SIGMA GIFT SELECTION,6.0,BISCUITS,,ASIA AGRO FOODS (PVT) LTD,,30003.0,SIGMA GIFT SELECTION 360 GM,201808.0,360.00 GM,CDBOX,360.0,GM
4,desc_30033,0.840915,1.0,LUCKYLAND GIFT SELECTION ASSORTMENT,6.0,BISCUITS,,LUCKYLAND BISCUITS MFS,,30033.0,LUCKYLAND GIFT SELECTION ASSORTMENT 350 GM,201704.0,350.00 GM,CDBOX,350.0,GM


#### Section 1.3.4: Testing on Master Data

In [None]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from IPython.display import display

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL")  # Replace with your API key

index_name = "product-embeddings-master"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Make sure the dimension matches your model output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # Or your preferred region
        )
    )

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Load Data
df = pd.read_csv('master_data.csv', encoding='windows-1252')  # Or your CSV file
df = df.fillna('') # Handle missing values. Very Important.
df.head()

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

def store_embeddings_in_pinecone():
    vectors = []
    ids = []
    metadata_list = []  # List to store metadata dictionaries

    for index, row in df.iterrows():  # Correct way to iterate and access row
        text = row['itemdesc']
        embedding = get_embedding(text)
        vectors.append(embedding)
        ids.append(f"desc_{row['itemcode']}")
        metadata_list.append(row.to_dict())  # Store metadata for each row

    vectors_data = [{"id": id, "values": vec.tolist(), "metadata": metadata} for id, vec, metadata in zip(ids, vectors, metadata_list)]
    vector_index.upsert(vectors=vectors_data, namespace="desc")

# Store embeddings in Pinecone
store_embeddings_in_pinecone()

def retrieve(query, top_k=5):
    query_embedding = get_embedding(query).reshape(1, -1)

    query_results = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="desc")

    def process_results(results):
        matches = results['matches']
        data = []  # List to store dictionaries for DataFrame creation

        if matches:  # Check if there are any matches
            for match in matches:
                metadata = match.get('metadata', {})  # Safely get metadata, handle missing
                data.append({
                    'id': match['id'],
                    'score': match['score'],
                    **metadata  # Unpack metadata into the dictionary
                })

            df_results = pd.DataFrame(data)  # Create DataFrame DIRECTLY from the list of dictionaries
            return df_results
        else:
            return pd.DataFrame()  # Return empty DataFrame if no matches

    result = process_results(query_results)

    print("\nTop Matches for itemdesc:")
    if not result.empty:  # Check if the DataFrame is empty before displaying
        display(result[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())
    else:
        print("No matches found.")

    return result

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 24 Feb 2025 08:10:25 GMT', 'Content-Type': 'application/json', 'Content-Length': '119', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '28529', 'x-pinecone-request-id': '6634281989724520347', 'x-envoy-upstream-service-time': '0', 'server': 'envoy'})
HTTP response body: {"code":11,"message":"Error, message length too large: found 94835394 bytes, the limit is: 4194304 bytes","details":[]}


In [None]:
import os
os.kill(os.getpid(), 9)  # Kill the current process

#### Section 1.3.5: Handling 4MB Limitation of Pinecone

In [None]:
import pandas as pd

data = pd.read_csv('master_data.csv', encoding='windows-1252')
data.head()

Unnamed: 0,itemcode,itemdesc,catcode,category,company,brand,packaging,flavor,color,qty,uomdesc,pack_size,launchdate,audittype
0,30001,RED LION GIFT ASSORTMENT 350 GM (ADJ DISTRIBUT...,6,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,CDBOX,,,350.0,GM,350.00 GM,201301.0,1
1,30002,REDLION GIFT ASSORTM 400 GM (ADJ DISTRIBUTORS),6,BISCUITS,ADJ DISTRIBUTORS,RED LION GIFT ASSORTMENT,CDBOX,,,400.0,GM,400.00 GM,200901.0,1
2,30003,SIGMA GIFT SELECTION 360 GM,6,BISCUITS,ASIA AGRO FOODS (PVT) LTD,SIGMA GIFT SELECTION,CDBOX,,,360.0,GM,360.00 GM,201808.0,1
3,30004,BISCO FIRST CHOICE CLASSIC 300 GM,6,BISCUITS,BISCO FOODS (PVT) LTD,BISCO FIRST CHOICE CLASSIC,CDBOX,,,300.0,GM,300.00 GM,201705.0,1
4,30006,KIST GIFT ASSORTMENT 400 GM,6,BISCUITS,CARGILLS QUALITY DAIRIES (PVT) LTD,KIST GIFT ASSORTMENT,CDBOX,,,400.0,GM,400.00 GM,201401.0,1


In [None]:
import pandas as pd

# Load the DataFrame (assuming 'df' is your DataFrame)
df = pd.read_csv('master_data.csv', encoding='windows-1252')

# # Remove specified columns
# columns_to_remove = ['itemcode', 'catcode', 'packaging', 'flavour', 'colour', 'quantity', 'uomdesc', 'pack_size', 'launchdate', 'audittype', 'brand']
# df = df.drop(columns=columns_to_remove, errors='ignore')

# Remove the last 10000 rows
df = df[:-22000]

# Save the DataFrame to a CSV file
df.to_csv('data.csv', index=False)

In [None]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from IPython.display import display

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up Pinecone environment
pc = Pinecone(api_key="pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL")  # Replace with your API key

index_name = "product-embeddings-master"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Make sure the dimension matches your model output
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # Or your preferred region
        )
    )

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Load Data
df = pd.read_csv('data.csv', encoding='windows-1252')  # Or your CSV file
df = df.fillna('') # Handle missing values. Very Important.
df.head()

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

def store_embeddings_in_pinecone():
    vectors = []
    ids = []
    metadata_list = []  # List to store metadata dictionaries

    for index, row in df.iterrows():  # Correct way to iterate and access row
        text = row['itemdesc']
        embedding = get_embedding(text)
        vectors.append(embedding)
        ids.append(f"desc_{row['itemcode']}")
        metadata_list.append(row.to_dict())  # Store metadata for each row

    vectors_data = [{"id": id, "values": vec.tolist(), "metadata": metadata} for id, vec, metadata in zip(ids, vectors, metadata_list)]
    vector_index.upsert(vectors=vectors_data, namespace="desc")

# Store embeddings in Pinecone
store_embeddings_in_pinecone()

def retrieve(query, top_k=5):
    query_embedding = get_embedding(query).reshape(1, -1)

    query_results = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="desc")

    def process_results(results):
        matches = results['matches']
        data = []  # List to store dictionaries for DataFrame creation

        if matches:  # Check if there are any matches
            for match in matches:
                metadata = match.get('metadata', {})  # Safely get metadata, handle missing
                data.append({
                    'id': match['id'],
                    'score': match['score'],
                    **metadata  # Unpack metadata into the dictionary
                })

            df_results = pd.DataFrame(data)  # Create DataFrame DIRECTLY from the list of dictionaries
            return df_results
        else:
            return pd.DataFrame()  # Return empty DataFrame if no matches

    result = process_results(query_results)

    print("\nTop Matches for itemdesc:")
    if not result.empty:  # Check if the DataFrame is empty before displaying
        display(result[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())
    else:
        print("No matches found.")

    return result

# Example query
query = "RED LION GIFT ASSORTMENT"
retrieve(query)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Tue, 25 Feb 2025 04:40:13 GMT', 'Content-Type': 'application/json', 'Content-Length': '119', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '2050', 'x-pinecone-request-id': '9041961092332270391', 'x-envoy-upstream-service-time': '13', 'server': 'envoy'})
HTTP response body: {"code":11,"message":"Error, message length too large: found 27293025 bytes, the limit is: 4194304 bytes","details":[]}


#### **Section 1.3.7: Embeddings are successfully saved in PineCone via Chunking**
This part of the code was executed in Philadelphia's Server, hence the output is not present here

In [None]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from pinecone import Pinecone, ServerlessSpec

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Set up Pinecone environment
pinecone_api = "pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL"
pc = Pinecone(api_key=pinecone_api)  # Replace with your API key

index_name = "product-embeddings-test"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # Ensure the dimension matches your model output (E5 model output size)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # Or your preferred region
        )
    )

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Load Data
file_path = "/content/master_data"  # Your file path
df = pd.read_csv(file_path, encoding='windows-1252')  # Or your CSV file
df = df.fillna('')  # Handle missing values. Very Important.
print("Data loaded.")

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

def store_embeddings_in_pinecone(chunk_size=100):
    vectors = []
    ids = []
    metadata_list = []  # List to store metadata dictionaries

    # Iterate through the rows and collect embeddings
    for index, row in df.iterrows():
        text = row['itemdesc']
        embedding = get_embedding(text)
        vectors.append(embedding)
        ids.append(f"desc_{row['itemcode']}")
        metadata_list.append(row.to_dict())  # Store metadata for each row

        # When we reach the chunk size, upload to Pinecone
        if len(vectors) >= chunk_size:
            vectors_data = [{"id": id, "values": vec.tolist(), "metadata": metadata}
                            for id, vec, metadata in zip(ids, vectors, metadata_list)]
            vector_index.upsert(vectors=vectors_data, namespace="desc")
            print(f"Uploaded {len(vectors_data)} embeddings to Pinecone.")

            # Reset the lists to start a new chunk
            vectors = []
            ids = []
            metadata_list = []

    # Upload any remaining embeddings that didn't fill the last chunk
    if vectors:
        vectors_data = [{"id": id, "values": vec.tolist(), "metadata": metadata}
                        for id, vec, metadata in zip(ids, vectors, metadata_list)]
        vector_index.upsert(vectors=vectors_data, namespace="desc")
        print(f"Uploaded {len(vectors_data)} embeddings to Pinecone.")

# Store embeddings in Pinecone in chunks
store_embeddings_in_pinecone(chunk_size=100)

In [None]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import numpy as np
from pinecone import Pinecone

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

# Set up Pinecone environment
pinecone_api = "pcsk_4GT8aD_FPuF7yJHHbz2h8Tpn9GRrAXjTULo69KzemEafbBwyUawYMFz3hYXpSFkTtqkrdL"
pc = Pinecone(api_key=pinecone_api)  # Replace with your API key

index_name = "product-embeddings-test"

# Connect to Pinecone index
vector_index = pc.Index(index_name)

# Initialize tokenizer and model
model_id = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

# Function to get embeddings from text using the model
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

# Function to retrieve data from Pinecone based on query
def retrieve_from_pinecone(query, top_k=5):
    query_embedding = get_embedding(query).reshape(1, -1)

    query_results = vector_index.query(vector=query_embedding.tolist()[0], top_k=top_k, include_metadata=True, namespace="desc")

    def process_results(results):
        matches = results['matches']
        data = []  # List to store dictionaries for DataFrame creation

        if matches:  # Check if there are any matches
            for match in matches:
                metadata = match.get('metadata', {})  # Safely get metadata, handle missing
                data.append({
                    'id': match['id'],
                    'score': match['score'],
                    **metadata  # Unpack metadata into the dictionary
                })

            df_results = pd.DataFrame(data)  # Create DataFrame directly from the list of dictionaries
            return df_results
        else:
            return pd.DataFrame()  # Return empty DataFrame if no matches

    result = process_results(query_results)

    print("\nTop Matches for itemdesc:")
    if not result.empty:  # Check if the DataFrame is empty before displaying
        print(result[['itemdesc', 'itemcode', 'category', 'company', 'brand', 'score']].head())
    else:
        print("No matches found.")

    return result

# Example query
# query = "WRISHAV"
query = input("Enter Your Query: ")
# print("\n")
# print(f"Query: {query}")
retrieve_from_pinecone(query)
print("\n")