In [1]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv, find_dotenv
from sentence_transformers import SentenceTransformer

In [2]:
# Open the FineWeb 10BT sample in streaming mode (streaming=True), so records
# are consumed through iteration instead of being materialised up front.
fw = load_dataset(
    'HuggingFaceFW/fineweb',
    name='sample-10BT',
    split='train',
    streaming=True,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# Resolve the .env file path first, then load it; override=True is passed so
# values from the file take precedence over variables that are already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [6]:
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable rather than being written into the notebook.
pc = Pinecone(
    api_key=os.environ.get('PINECONE_API_KEY'),
)

In [7]:
# Sentence-embedding model used for every vector in this notebook.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Inspect the indexes that already exist in this Pinecone project.
existing_indexes = pc.list_indexes()
existing_indexes

[
    {
        "name": "sample-index",
        "metric": "euclidean",
        "host": "sample-index-02jckfa.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "region": "us-east-1",
                "cloud": "aws",
                "read_capacity": {
                    "mode": "OnDemand",
                    "status": {
                        "state": "Ready",
                        "current_shards": null,
                        "current_replicas": null
                    }
                }
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "my-index",
        "metric": "cosine",
        "host": "my-index-02jckfa.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "region": "us-east-1",

In [11]:
# Create the serverless 'text' index only when it does not exist yet, so this
# cell can be re-run (e.g. Restart Kernel -> Run All) without create_index
# being called for a name that is already taken.
index_name = 'text'
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        # The index dimension is taken from the embedding model so the two
        # cannot drift apart.
        dimension=model.get_sentence_embedding_dimension(),
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Display the index configuration whether it was just created or was already
# present.
# NOTE(review): a pre-existing 'text' index is reused as-is; confirm its
# dimension and metric match the values above before upserting.
pc.describe_index(index_name)

{
    "name": "text",
    "metric": "cosine",
    "host": "text-02jckfa.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "region": "us-east-1",
            "cloud": "aws",
            "read_capacity": {
                "mode": "OnDemand",
                "status": {
                    "state": "Ready",
                    "current_shards": null,
                    "current_replicas": null
                }
            }
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null,
    "_response_info": {
        "raw_headers": {
            "content-type": "application/json",
            "access-control-allow-origin": "*",
            "vary": "access-control-request-headers",
            "access-control-expose-headers": "*",
            "x-pinecone-api-version": "2025-10",
            "x-cloud-trace-context": "c8b0f9df6

In [14]:
# Number of records to pull from the streaming dataset.
sample_size = 1000

# Materialise the first `sample_size` records of the stream. take() stops the
# iteration at the limit, replacing the manual counter-and-break loop.
sample_items = list(fw.take(sample_size))

# Encode every text in a single batched call instead of one model invocation
# per document; encode() handles the mini-batching internally.
sample_embeddings = model.encode(
    [item['text'] for item in sample_items],
    batch_size=32,
    show_progress_bar=False,
)

# Build (id, values, metadata) tuples for upserting:
#   - id is the record's 'id' field converted to str,
#   - values is the embedding converted to a plain Python list,
#   - metadata carries the record's 'language' field.
vector_to_upsert = [
    (str(item['id']), embedding.tolist(), {'language': item['language']})
    for item, embedding in zip(sample_items, sample_embeddings)
]

In [16]:
# Handle to the 'text' index, used for the upsert calls.
target_index_name = 'text'
index = pc.Index(target_index_name)

In [17]:
# Send the prepared vectors to the index in consecutive slices of
# `batch_size`, one upsert call per slice.
batch_size = 100
batch_starts = range(0, len(vector_to_upsert), batch_size)
for start in batch_starts:
    index.upsert(vectors=vector_to_upsert[start:start + batch_size])

print("Subset of the dataset upserted in Pinecone index")

Subset of the dataset upserted in Pinecone index
