In [None]:
# Install necessary packages (if not already installed)
!pip install sentence_transformers datasets

In [None]:
# Import required libraries
import pandas as pd
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer


In [None]:

# Fetch the "train" split of the corpus that will be embedded.
dataset_name = "Forbu14/LoiLibre"
ds = load_dataset(dataset_name, split="train")

# Instantiate the sentence-embedding model used to encode the texts.
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')


In [None]:

# Encode the "text" column of `ds` in fixed-size batches and collect the
# resulting embeddings together with the dataset row index of each one.

# Number of rows sent to the model per `encode` call
batch_size = 16
num_rows = len(ds)
# Ceiling division: a trailing partial batch must be counted too, otherwise the
# last `num_rows % batch_size` rows would never be embedded.
num_batches = (num_rows + batch_size - 1) // batch_size

# Per-batch embedding arrays, and the dataset row index for every embedding
embedding_list = []
embedding_index = []

# Process the dataset in batches
for batch_idx in tqdm(range(num_batches)):
    # Row range covered by this batch; the final batch is clamped to the
    # dataset length so the recorded indices never run past the last row.
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, num_rows)
    batch = ds[start_idx:end_idx]
    texts = batch["text"]

    # Generate embeddings for the batch
    embeddings = model.encode(texts)

    # Store the embeddings and the row indices they correspond to
    embedding_list.append(embeddings)
    embedding_index.extend(range(start_idx, end_idx))

# Concatenate the per-batch results into a single array (one row per embedding)
embedding_array = np.concatenate(embedding_list)

In [None]:

# Assemble a two-column table: one embedding vector per row, paired with the
# index recorded for it in `embedding_index`.
df_embedding = pd.DataFrame(
    data={
        "embedding": [vector for vector in embedding_array],
        "index": embedding_index,
    }
)

# Persist the table to disk in Parquet format
df_embedding.to_parquet(path="embedding.parquet")

# Report how many embeddings were written
print(f"Total embeddings generated: {df_embedding.shape[0]}")

In [None]:
# Display the number of rows in the embeddings DataFrame built above
# (the frame is named `df_embedding`; `embedding_df` is never defined).
len(df_embedding)