## FineWeb vectorDB

In [1]:
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fw = load_dataset("HuggingFaceFW/fineweb", name='sample-10BT', split='train', streaming=True)

In [3]:
fw

IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
    num_shards: 15
})

In [4]:
fw.features

{'text': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'dump': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'date': Value(dtype='string', id=None),
 'file_path': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'language_score': Value(dtype='float64', id=None),
 'token_count': Value(dtype='int64', id=None)}

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [7]:
load_dotenv(find_dotenv(), override=True)

True

In [8]:
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [9]:
pc.list_indexes()

[
    {
        "name": "ben-start-index",
        "metric": "cosine",
        "host": "ben-start-index-i2oc4nb.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 3,
        "deletion_protection": "disabled",
        "tags": null
    }
]

In [10]:
pc.create_index(
    name="ben-fineweb-vector",
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

{
    "name": "ben-fineweb-vector",
    "metric": "cosine",
    "host": "ben-fineweb-vector-i2oc4nb.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [11]:
index = pc.Index(name='ben-fineweb-vector')

In [12]:
index

<pinecone.data.index.Index at 0x322575060>

In [13]:
# Define the number of items you want to process (subset size)

subset_size = 100 # Only take 10, 000 items

# Iterate over the dataset and prepare data for upserting

vectors_to_upsert = []

for i, item in enumerate(fw):
    if i >= subset_size:
        break

    text = item['text']
    unique_id = str(item['id'])
    language=item['language']

    # Create an embedding for the text
    embedding = model.encode(text, show_progress_bar=False).tolist()

    # Prepare metadata
    metadata = {'language': language}

    # Append the tuple (id, embedding, metadata)
    vectors_to_upsert.append((unique_id, embedding, metadata))


# Upsert data to Pinecone in batches

batch_size = 10 # Adjust based on your environment and dataset size

for i in range(0, len(vectors_to_upsert), batch_size):
    print("Current batch ", i+batch_size)
    batch = vectors_to_upsert[i:i + batch_size]
    print("Batch value ", batch)
    index.upsert(vectors=batch)


print("Subset of data upserted to Pinecone Index.")

Current batch  10
Batch value  [('<urn:uuid:39147604-bfbe-4ed5-b19c-54105f8ae8a7>', [-0.07137056440114975, -0.012872940860688686, 0.0957353487610817, 0.014749758876860142, -0.005443932022899389, 0.06845620274543762, 0.012210480868816376, 0.01889190264046192, 0.06273901462554932, -0.001546548563055694, 0.03348218649625778, -0.024711819365620613, -0.07785329967737198, -0.020623095333576202, -0.05137189105153084, -0.05435721203684807, -0.053589414805173874, -0.01099426206201315, -0.09039899706840515, 0.06976143270730972, -0.052547305822372437, -0.11004109680652618, 0.018547991290688515, 0.07281851023435593, 0.03385881707072258, 0.006531232502311468, 0.050778500735759735, 0.023879820480942726, -0.005901073105633259, -0.001958848675712943, -0.039048630744218826, 0.02282853238284588, 0.04294576495885849, 0.019258510321378708, 0.002372132148593664, 0.027770398184657097, 0.03706519305706024, -0.028697552159428596, 0.05579996109008789, 0.024242309853434563, 0.0003278878575656563, -0.08616065979