# Vector Databases: Pinecone

In [12]:
import os
from itertools import islice

import pinecone
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Load variables from the .env file located by find_dotenv(); override=True asks
# python-dotenv to let .env values take precedence over variables already set in
# the environment. The call's return value is what the notebook displays.
load_dotenv(find_dotenv(), override=True) 



True

In [5]:
# Create the Pinecone client. The API key is read from the environment so it never
# appears in the notebook. The legacy `environment` argument is dropped: every index
# in this notebook is serverless, and its location is chosen per index through
# ServerlessSpec(cloud=..., region=...) at creation time.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [6]:
# List the indexes this client can see; the bare expression makes the notebook render the result.
pc.list_indexes() 

[
    {
        "name": "udemy-ml-engineer-course",
        "metric": "cosine",
        "host": "udemy-ml-engineer-course-4z04feo.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "region": "us-east-1",
                "cloud": "aws",
                "read_capacity": {
                    "mode": "OnDemand",
                    "status": {
                        "state": "Ready",
                        "current_shards": null,
                        "current_replicas": null
                    }
                }
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 3,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "udemy-ml-engineer-course-dim-1536",
        "metric": "dotproduct",
        "host": "udemy-ml-engineer-course-dim-1536-4z04feo.svc.aped-4627-b74a.pinecone.io",
        "spe

In [10]:
# Create the 1536-dimensional dot-product serverless index only when it is missing, so
# the cell can be re-run (e.g. Restart Kernel -> Run All) without failing because the
# index name is already taken.
index_name_1536 = "udemy-ml-engineer-course-dim-1536"

if not pc.has_index(index_name_1536):
    pc.create_index(
        name=index_name_1536,
        dimension=1536,
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(index_name_1536)

{
    "name": "udemy-ml-engineer-course-dim-1536",
    "metric": "dotproduct",
    "host": "udemy-ml-engineer-course-dim-1536-4z04feo.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers":

In [11]:
# List the indexes again to confirm that the newly created index is present.
pc.list_indexes() 

[
    {
        "name": "udemy-ml-engineer-course",
        "metric": "cosine",
        "host": "udemy-ml-engineer-course-4z04feo.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "region": "us-east-1",
                "cloud": "aws",
                "read_capacity": {
                    "mode": "OnDemand",
                    "status": {
                        "state": "Ready",
                        "current_shards": null,
                        "current_replicas": null
                    }
                }
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 3,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "udemy-ml-engineer-course-dim-1536",
        "metric": "dotproduct",
        "host": "udemy-ml-engineer-course-dim-1536-4z04feo.svc.aped-4627-b74a.pinecone.io",
        "spe

In [8]:
# Handle to the "udemy-ml-engineer-course" index, which the index listing in this
# notebook reports as a 3-dimensional cosine index.
index = pc.Index(name="udemy-ml-engineer-course")

In [10]:
# Toy (id, values) pairs with three components each.
# NOTE(review): "Dog", "Cat" and "Elephant" share identical values — confirm this is intentional.
toy_vectors = [
    ("Dog", [4.0, 0.0, 1.0]),
    ("Cat", [4.0, 0.0, 1.0]),
    ("Chicken", [2.0, 2.0, 1.0]),
    ("Matis", [6.0, 2.0, 3.0]),
    ("Elephant", [4.0, 0.0, 1.0]),
]

# Keep the call as the last expression so the notebook displays the upsert response.
index.upsert(vectors=toy_vectors)

UpsertResponse(upserted_count=5, _response_info={'raw_headers': {'date': 'Sun, 15 Feb 2026 10:02:08 GMT', 'content-type': 'application/json', 'content-length': '19', 'connection': 'keep-alive', 'x-pinecone-request-lsn': '1', 'x-pinecone-request-logical-size': '86', 'x-pinecone-request-latency-ms': '259', 'x-envoy-upstream-service-time': '260', 'x-pinecone-response-duration-ms': '261', 'grpc-status': '0', 'server': 'envoy'}})

### Getting to know the FineWeb dataset

In [14]:
# Open the "sample-10BT" configuration of FineWeb (train split) in streaming mode,
# so records are iterated lazily rather than materialised up front.
fw = load_dataset(
    "HuggingFaceFW/FineWeb",
    name="sample-10BT",
    split="train",
    streaming=True,
)



In [15]:
# Display the dataset object's summary (feature names and shard count).
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    num_shards: 15
})

In [16]:
# Display the column names together with their declared value types.
fw.features

{'text': Value('string'),
 'id': Value('string'),
 'dump': Value('string'),
 'url': Value('string'),
 'date': Value('string'),
 'file_path': Value('string'),
 'language': Value('string'),
 'language_score': Value('float64'),
 'token_count': Value('int64')}

### Upserting text data using an embedding algorithm 

In [17]:
# Load the sentence-embedding model; its embedding dimension is what the "text" index
# created later in this notebook is sized to.
model = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1840.87it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [18]:
# Create the cosine-similarity "text" index, sized to the embedding model's output
# dimension, only when it is missing. The guard makes the cell safe to re-run instead
# of failing because the index name already exists.
text_index_name = "text"

if not pc.has_index(text_index_name):
    pc.create_index(
        name=text_index_name,
        dimension=model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(text_index_name)


{
    "name": "text",
    "metric": "cosine",
    "host": "text-4z04feo.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "vary": "origin, access-control-request-method, access-control-request-headers",
            "access-control-allow-origin": "*",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",
        

In [19]:
# Handle to the "text" index that receives the sentence embeddings.
index_2 = pc.Index(name="text")

In [24]:
# Number of FineWeb documents to embed; a subset keeps the run time manageable.
subset_size = 10000
# Number of vectors sent per upsert request.
batch_size = 100

# Pull only the first `subset_size` records from the streaming dataset.
records = list(islice(fw, subset_size))

# Embed all texts with a single encode() call so the model processes them in
# mini-batches instead of one forward pass per document.
if records:
    embeddings = model.encode(
        [record["text"] for record in records],
        show_progress_bar=False,
    ).tolist()
else:
    embeddings = []

# Build (id, values, metadata) tuples in the same shape the upsert call expects.
vectors_to_upsert = [
    (
        str(record["id"]),
        embedding,
        {
            "language": record["language"],
            "date": record["date"],
            "url": record["url"],
        },
    )
    for record, embedding in zip(records, embeddings)
]

# Upsert the vectors to the database in batches.
for start in range(0, len(vectors_to_upsert), batch_size):
    index_2.upsert(vectors=vectors_to_upsert[start:start + batch_size])

print(f"Upserted {len(vectors_to_upsert)} vectors to the index")



Upserted 10000 vectors to the index
