In [None]:
import pandas as pd
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
from tqdm import tqdm  # For progress tracking
import itertools    
import json


In [None]:
with open("creds.json", "r") as f:
    creds = json.load(f)["pinecone"]

pc = Pinecone(
    api_key=api_key, 
)

# Check if the index already exists
if 'product-codes-index' not in pc.list_indexes().names():
    # Create an index
    pc.create_index(
        name='product-codes-index',
        dimension=1024,  
        metric='cosine',  
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

In [None]:
# Load and process the dataset
df = pd.read_csv("data/product_codes_HS17_V202501.csv")
df = df.rename(columns={"code": "id", "description": "text"})

# Ensure ID is a string
df["id"] = pd.to_numeric(df["id"], errors="coerce").astype("Int64").astype(str)

# Convert text to string and drop missing values
df["text"] = df["text"].astype(str)
df = df.dropna(subset=["id", "text"])

# Convert to list of dicts for processing
data = df.to_dict(orient="records")


In [73]:
BATCH_SIZE = 96

# Function to process in batches
def batch_generator(iterable, batch_size):
    """Yield successive batch_size chunks from iterable."""
    for i in range(0, len(iterable), batch_size):
        yield iterable[i : i + batch_size]

# Store all embeddings
all_embeddings = []

# Process embeddings in batches
for batch in tqdm(batch_generator(data, BATCH_SIZE), desc="Processing Batches"):
    batch_texts = [d["text"] for d in batch]

    # Generate embeddings for the batch
    batch_embeddings = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=batch_texts,
        parameters={"input_type": "passage", "truncate": "END"}
    )

    # Append results
    all_embeddings.extend(batch_embeddings)


Processing Batches: 57it [01:19,  1.40s/it]


In [74]:
# Connect to the index
index = pc.Index("product-codes-index")

In [88]:
len(batch_data)

96

In [100]:
BATCH_SIZE = 96

for start_idx in range(0, len(data), BATCH_SIZE):
    end_idx = min(start_idx + BATCH_SIZE, len(data))
    
    batch_data = data[start_idx:end_idx]
    batch_embeddings = [e.values for e in all_embeddings[start_idx:end_idx]]  # Extract float lists

    # Prepare the upsert format (ID, vector, metadata)
    upsert_data = [
        (d["id"], emb, {"description": d["text"]})  # (id, vector, metadata)
        for d, emb in zip(batch_data, batch_embeddings)
    ]

    # Perform upsert
    index.upsert(vectors=upsert_data)

print(f"Upsert completed for {len(all_embeddings)} records.")

Upsert completed for 5384 records.


In [103]:
def get_autocomplete_suggestions(query, top_k=5):
    """
    Query Pinecone to get the top-k autocomplete suggestions based on input text.
    """
    # Convert query text into an embedding
    query_embedding = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=[query],
        parameters={"input_type": "query", "truncate": "END"}
    )[0].values  # Extract list of floats

    # Perform a similarity search in Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Extract product descriptions from results
    suggestions = [match.metadata["description"] for match in results.matches]
    
    return suggestions

query_text = "organic"
suggestions = get_autocomplete_suggestions(query_text)

print(f"Autocomplete suggestions for '{query_text}':")
for s in suggestions:
    print(f"- {s}")

Autocomplete suggestions for 'organic':
- Organic surface-active agents: non-ionic (other than soap), whether or not put up for retail sale
- Organic surface-active agents: cationic (other than soap), whether or not put up for retail sale
- Organic surface-active agents: anionic (other than soap), whether or not put up for retail sale
- Organic surface-active products and preparations for washing the skin, in the form of liquid or cream and put up for retail sale, whether or not containing soap
- Dyes: synthetic organic products of a kind used as fluorescent brightening agents


In [77]:
records = []
for d, e in zip(data, all_embeddings):
    records.append({
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    })

# Upsert the records into the index
index.upsert(
    vectors=records,
    namespace="example-namespace"
)


PineconeException: UNKNOWN:Error received from peer  {grpc_message:"Error, message length too large: found 22757507 bytes, the limit is: 4194304 bytes", grpc_status:11, created_time:"2025-02-07T02:32:17.995844+00:00"}

In [None]:


data = df.to_dict(orient="records")

# get embeddings
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)



In [3]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key="pcsk_ytxDk_UB2VfK6P7Nk5qAzj1kYyV9MYSWiZDKC7HMqvrWgCsQBaCUxZVdB9tUM2BXsWsYX",  # Replace with your API key
)

# Check if the index already exists
if 'product-codes-index' not in pc.list_indexes().names():
    # Create an index
    pc.create_index(
        name='product-codes-index',
        dimension=1536,  # Adjust based on your embedding model
        metric='cosine',  # 'cosine', 'euclidean', or 'dotproduct'
        spec=ServerlessSpec(
            cloud='gcp',
            region='europe-west1'  # UK region
        )
    )

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '2597340bf492bdd8d5a91e507a12939a', 'Date': 'Fri, 07 Feb 2025 01:59:43 GMT', 'Server': 'Google Frontend', 'Content-Length': '106', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource cloud: gcp region: europe-west1 not found"},"status":404}


# Country codes

In [104]:

# Initialize Pinecone with API key and serverless spec for UK region
pc = Pinecone(
    api_key=api_key,  # Replace with your API key
)

# Check if the index already exists
if 'country-codes-index' not in pc.list_indexes().names():
    # Create an index
    pc.create_index(
        name='country-codes-index',
        dimension=1024,  # Adjust based on your embedding model
        metric='cosine',  # 'cosine', 'euclidean', or 'dotproduct'
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )

In [107]:
df.columns

Index(['country_code', 'country_name', 'country_iso2', 'country_iso3'], dtype='object')

In [113]:
# Load and process the dataset
df = pd.read_csv("data/country_codes_V202501.csv") # Index(['country_code', 'country_name', 'country_iso2', 'country_iso3'], dtype='object')
df = df.rename(columns={"country_code": "id", "country_name": "text"})

# Ensure ID is a string (Pinecone requires string IDs)
df["id"] = pd.to_numeric(df["id"], errors="coerce").astype("Int64").astype(str)

# Convert text to string and drop missing values
df["text"] = df["text"].astype(str)
df = df.dropna(subset=["id", "text"])

# Convert to list of dicts for processing
country_data = df.to_dict(orient="records")

BATCH_SIZE = 96

# Function to process in batches
def batch_generator(iterable, batch_size):
    """Yield successive batch_size chunks from iterable."""
    for i in range(0, len(iterable), batch_size):
        yield iterable[i : i + batch_size]

# Store all embeddings
all_country_embeddings = []

# Process embeddings in batches
for batch in tqdm(batch_generator(country_data, BATCH_SIZE), desc="Processing Batches"):
    batch_texts = [d["text"] for d in batch]

    # Generate embeddings for the batch
    batch_embeddings = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=batch_texts,
        parameters={"input_type": "passage", "truncate": "END"}
    )

    # Append results
    all_country_embeddings.extend(batch_embeddings)



Processing Batches: 3it [00:04,  1.40s/it]


In [114]:
country_index = pc.Index("country-codes-index")

In [124]:
BATCH_SIZE = 96

for start_idx in range(0, len(country_data), BATCH_SIZE):
    end_idx = min(start_idx + BATCH_SIZE, len(country_data))
    
    batch_data = country_data[start_idx:end_idx]
    batch_embeddings = [e.values for e in all_country_embeddings[start_idx:end_idx]]  # Extract float lists

    # Prepare the upsert format (ID, vector, metadata)
    upsert_data = [
        (d["id"], emb, {"description": d["text"]})  # (id, vector, metadata)
        for d, emb in zip(batch_data, batch_embeddings)
    ]

    # Perform upsert
    country_index.upsert(vectors=upsert_data)

print(f"Upsert completed for {len(country_data)} records.")

Upsert completed for 238 records.


In [121]:
upsert_data = [
    (d["id"], emb, {"description": d["text"]})  # (id, vector, metadata)
    for d, emb in zip(batch_data, batch_embeddings)
]


In [122]:
upsert_data

[('4',
  dict_values(['4', 'Afghanistan', 'AF', 'AFG']),
  {'description': 'Afghanistan'}),
 ('8', dict_values(['8', 'Albania', 'AL', 'ALB']), {'description': 'Albania'}),
 ('12',
  dict_values(['12', 'Algeria', 'DZ', 'DZA']),
  {'description': 'Algeria'}),
 ('16',
  dict_values(['16', 'American Samoa', 'AS', 'ASM']),
  {'description': 'American Samoa'}),
 ('20',
  dict_values(['20', 'Andorra', 'AD', 'AND']),
  {'description': 'Andorra'}),
 ('24', dict_values(['24', 'Angola', 'AO', 'AGO']), {'description': 'Angola'}),
 ('28',
  dict_values(['28', 'Antigua and Barbuda', 'AG', 'ATG']),
  {'description': 'Antigua and Barbuda'}),
 ('31',
  dict_values(['31', 'Azerbaijan', 'AZ', 'AZE']),
  {'description': 'Azerbaijan'}),
 ('32',
  dict_values(['32', 'Argentina', 'AR', 'ARG']),
  {'description': 'Argentina'}),
 ('36',
  dict_values(['36', 'Australia', 'AU', 'AUS']),
  {'description': 'Australia'}),
 ('40',
  dict_values(['40', 'Austria', 'AT', 'AUT']),
  {'description': 'Austria'}),
 ('44',


In [127]:
import eco_style
import json

In [128]:
json.dumps(eco_style.dark())

'{"config": {"background": "#122B39", "font": "Circular Std", "title": {"color": "#b4c8d8", "fontSize": 14, "fontWeight": 400}, "mark": {"line": {"interpolate": "monotone"}}, "view": {"stroke": "transparent", "width": 400, "height": 300}, "range": {"category": ["#36B7B4", "#E6224B", "#F4C245", "#0063AF", "#00A767", "#179FDB", "#EB5C2E"], "diverging": ["#E6224B", "#E54753", "#C9C9C9", "#179FDB", "#122B39"], "heatmap": ["#C9C9C9", "#179FDB", "#0063AF", "#122B39"], "ordinal": ["#00A767", "#36B7B4", "#179FDB", "#0063AF", "#243B5A"]}, "axisX": {"domainColor": "#b4c8d8", "domainOpacity": 0.5, "grid": false, "labelAngle": 0, "labelColor": "#b4c8d8", "labelOpacity": 0.7, "orient": "bottom", "tickColor": "#b4c8d8", "tickCount": 10, "tickOpacity": 0.5, "title": "", "titleAlign": "center", "titleAnchor": "middle", "titleColor": "#b4c8d8", "titleFontSize": 12, "titleOpacity": 0.8, "titleX": 207, "titleY": -15}, "axisY": {"domainColor": "#b4c8d8", "domainOpacity": 0.5, "format": ".0f", "gridColor":