In [None]:
from pinecone import Pinecone , ServerlessSpec

In [None]:
import os  # required by os.getenv below; no earlier visible cell imports it

# Create the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable so that no secret is stored in the notebook itself.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [None]:
# Name of the Pinecone index used by the rest of the notebook.
index_name = "learning-pinecone"

# Embedding settings passed to Pinecone when the index is created: the model
# to use, and a field map pointing the model's "text" input at the
# "chunk_text" field of each record.
embedding_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "chunk_text"},
}

# Only create the index when it is not already there, so this cell can be
# re-run without attempting a second creation.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embedding_config,
    )

In [None]:
# Handle to the index; the upsert and search cells below go through it.
index = pc.Index(index_name)

# Print the index statistics as reported by Pinecone.
index_stats = index.describe_index_stats()
print(index_stats)

In [None]:
# Install openpyxl into the kernel's environment; it is imported in the next cell
# (presumably as the engine pandas uses to read the .xlsx file - TODO confirm).
%pip install openpyxl

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import uuid # For generating guaranteed unique IDs
import numpy as np
import openpyxl

In [None]:
# Accumulator for the Pinecone records built further down in this cell.
records = []

# --- 1. Load the Excel File ---
excel_path = "Book4.xlsx"
try:
    # Read the first sheet of the Excel file
    raw_df = pd.read_excel(excel_path)
except FileNotFoundError as err:
    # Raise rather than call exit(): exit() is meant for interactive shells,
    # whereas raising stops the cell with a clear traceback naming the file.
    raise FileNotFoundError(
        f"Error: '{excel_path}' not found. Please check the file path."
    ) from err
print(f"Loaded {len(raw_df)} records from {excel_path}")

# Ensure all data is treated as strings for concatenation and metadata, with
# missing cells becoming an empty string ('').
# The missing-value mask is taken from the raw frame BEFORE the string cast:
# casting first would turn NaN into the literal text 'nan', which a later
# fillna('') can no longer detect. The mask is passed as a plain array so no
# label alignment is involved.
df = raw_df.astype(str).mask(raw_df.isna().to_numpy(), '')

# Get all original column names (captured before any derived column is added)
columns = df.columns.tolist()

# --- 2. Create the Combined Text for Embedding ---

def combine_row_data(row, column_names):
    """Render one row as a single "name: value | name: value" string.

    Each entry of ``column_names`` contributes ``"<column>: <value>"``, where
    the value is looked up as ``row[column]``. The pieces are joined with
    ``" | "`` in the order the column names are given. Prefixing every value
    with its column name gives the embedding model context for the value.
    """
    labelled_values = (f"{name}: {row[name]}" for name in column_names)
    return " | ".join(labelled_values)

print("\nCombining all column data into the required 'chunk_text' field...")
# Build one combined string per row from the original columns and store it in
# a new 'chunk_text' column. `columns` is forwarded to combine_row_data as its
# second positional argument.
df['chunk_text'] = df.apply(combine_row_data, axis=1, args=(columns,))

# --- 3. Format Data into Pinecone Records ---

# Each record is one flat dictionary holding:
#   'id'          - a freshly generated random UUID string,
#   'chunk_text'  - the combined row text (the field named in the index's
#                   field_map, i.e. the text to be embedded),
#   <column>      - every original column, stored as a string.
# Original columns literally named 'id' or 'chunk_text' are skipped so they
# cannot overwrite the two keys above.
# Note: the ids come from uuid4, so every run of this cell yields new ids.
metadata_columns = [name for name in columns if name not in ('chunk_text', 'id')]

for _, row in df.iterrows():
    record = {
        'id': str(uuid.uuid4()),
        'chunk_text': row['chunk_text'],
    }
    # Copy the remaining original columns, in their original order.
    record.update((name, str(row[name])) for name in metadata_columns)
    records.append(record)

print(f"\nSuccessfully created {len(records)} Pinecone records in the 'records' variable.")

# --- Example of the final 'records' structure (Dictionary) ---
# Preview the first record, if there is one. No vector appears in it: only the
# id, the text to embed, and the copied column values are stored locally.
if len(records) > 0:
    first_record = records[0]
    preview_lines = [
        "\nExample of the first record (Dictionary format):",
        f"  ID (Key): {first_record['id']}",
        # Only the first 50 characters of the text are shown.
        f"  Content for Embedding: {first_record['chunk_text'][:50]}...",
        f"  All Metadata Keys: {list(first_record.keys())}",
    ]
    for line in preview_lines:
        print(line)

In [None]:
# Upsert the records into "namespace1" in batches instead of one request.
# Pinecone documents a per-request cap for text records on integrated-embedding
# indexes (96 at the time of writing - confirm against the current limits), so
# a single call fails once the spreadsheet has more rows than that.
# With an empty `records` list the loop body never runs and no request is sent.
UPSERT_BATCH_SIZE = 96
for batch_start in range(0, len(records), UPSERT_BATCH_SIZE):
    batch = records[batch_start:batch_start + UPSERT_BATCH_SIZE]
    index.upsert_records("namespace1", batch)

In [None]:
# Print the index statistics again, now that records have been upserted.
# NOTE(review): the reported counts may lag briefly behind a recent upsert -
# confirm against Pinecone's consistency guarantees.
index_stats = index.describe_index_stats()
print(index_stats)

In [None]:
# Query "namespace1" with plain text and request up to 10 results.
search = index.search(
    namespace="namespace1",
    query={
        "top_k": 10,
        "inputs": {
            'text': "syringe price"
        },
    },
)

# Show the top hit. Guard against an empty hit list so the cell reports it
# instead of failing with an IndexError on hits[0].
hits = search['result']['hits']
if hits:
    print(hits[0])
else:
    print("No hits returned for this query.")
