In [20]:
# Scratch cell: prints the literal string "Q".
print("Q")

Q


In [21]:
# Synchronous Example
from atoma_sdk import AtomaSDK
import os
import uuid


def get_embedding(text):
    """Request an embedding for ``text`` from the Atoma embeddings endpoint.

    Every call opens its own ``AtomaSDK`` client, authenticated with the
    ``ATOMA_API_KEY`` environment variable (an empty string when unset), and
    tags the request with a freshly generated ``user-<uuid4>`` identifier.

    Args:
        text: Input passed to ``embeddings.create`` as ``input_``.

    Returns:
        The response object returned by ``embeddings.create``, unmodified.
    """
    api_key = os.getenv("ATOMA_API_KEY", "")

    with AtomaSDK(bearer_auth=api_key) as client:
        return client.embeddings.create(
            input_=text,
            model="intfloat/multilingual-e5-large-instruct",
            encoding_format="float",
            user=f"user-{uuid.uuid4()}",
        )


# Example usage
# text = "The quick brown fox jumped over the lazy dog"
# embedding = get_embedding(text)
# print(embedding)

In [22]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer for the same model name that get_embedding requests,
# so token counts computed locally refer to that model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")


def truncate_text(text, max_tokens=512):
    """Shorten ``text`` so it tokenizes to at most ``max_tokens`` tokens.

    ``max_tokens`` is the total budget *including* the special tokens the
    tokenizer wraps around a sequence (e.g. BOS/EOS). Text that already fits
    is returned unchanged; otherwise the content tokens are cut and decoded
    back to a string.

    Args:
        text: The string to measure and, if needed, truncate.
        max_tokens: Maximum total token count, special tokens included.

    Returns:
        ``text`` itself when it fits, else the decoded truncated prefix. The
        result never contains literal special-token markers such as ``<s>``.
    """
    # Reserve room for the special tokens that are re-added when the returned
    # string is tokenized again downstream.
    content_budget = max(max_tokens - tokenizer.num_special_tokens_to_add(), 0)

    # Tokenize only the content. With truncation disabled the tokenizer may
    # warn that the sequence exceeds the model maximum; that is harmless here
    # because the ids are only counted and sliced, never fed to a model.
    content_ids = tokenizer(text, truncation=False, add_special_tokens=False)[
        "input_ids"
    ]

    # Same "already fits" condition as counting the ids with specials added.
    if len(content_ids) <= content_budget:
        return text

    # Decode without special tokens so no "<s>"-style marker leaks into the
    # text. NOTE(review): decode -> re-encode is not guaranteed to reproduce
    # the exact same ids, so the final count may differ slightly.
    return tokenizer.decode(content_ids[:content_budget], skip_special_tokens=True)


# Example usage
# text = "The quick brown fox jumped over the lazy dog"
# truncated = truncate_text(text)
# print(f"Original length: {len(tokenizer(text)['input_ids'])}")
# print(f"Truncated length: {len(tokenizer(truncated)['input_ids'])}")

In [None]:
def process_to_paragraphs(text, title_word_limit=10):
    """Split raw text into titled paragraphs using a line-length heuristic.

    The text is split on newlines and blank lines are ignored. A line with
    fewer than ``title_word_limit`` whitespace-separated words is treated as a
    title that starts a new section; every other line is appended to the
    current section's content.

    A section is emitted only when it has both a title and at least one
    content line. Consequently, content appearing before the first title and
    titles with no content after them are dropped.

    Args:
        text: The raw document text.
        title_word_limit: Lines with fewer words than this are titles.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts in document order,
        where ``content`` is the section's lines joined with single spaces.
    """
    paragraphs = []
    current_title = None
    current_content = []

    def flush_section():
        # Emit the section collected so far, if it is complete.
        if current_title and current_content:
            paragraphs.append(
                {"title": current_title, "content": " ".join(current_content)}
            )

    for line in text.split("\n"):
        # Skip empty / whitespace-only lines
        if not line.strip():
            continue

        if len(line.split()) < title_word_limit:
            # Short line: close the previous section and start a new one
            flush_section()
            current_title = line
            current_content = []
        else:
            current_content.append(line)

    # Emit the final section. No short-line merging is needed here: content
    # lines always have at least `title_word_limit` words by construction.
    flush_section()

    return paragraphs


# # Process the text into paragraphs
# text = scraped_data[0]["content"]
# structured_paragraphs = process_to_paragraphs(text)


# # Print content length for each paragraph
# for i, p in enumerate(structured_paragraphs):
#     print(f"\nParagraph {i+1}:")
#     print(f"Title length: {len(p['title'])}")
#     print(f"Content length: {len(p['content'])}")
#     print(f"Content tokens: {len(tokenizer(p['content'])['input_ids'])}")

In [27]:
import glob
import json
from datetime import datetime
import os

# Embed every paragraph of today's "embed.1.2" documents and write the result
# to a matching "embed.1.3" file, one output file per input file.

# Get today's date in YYYYMMDD format
date_str = datetime.now().strftime("%Y%m%d")

# File-name suffixes: inputs are YYYYMMDD_<site>_embed.1.2.json,
# outputs are YYYYMMDD_<site>_embed.1.3.json
input_suffix = "_embed.1.2.json"
output_suffix = "_embed.1.3.json"

# Find all embed.1.2 files for today in the current working directory
embed_1_2_files = glob.glob(f"{date_str}_*{input_suffix}")

for file_path in embed_1_2_files:
    print(f"Processing {file_path}")

    # Read the documents to embed
    with open(file_path, "r", encoding="utf-8") as f:
        scraped_data = json.load(f)

    # Process each document; documents without "content" are left out of the output
    embedded_data = []
    for doc in scraped_data:
        if not doc.get("content"):
            continue

        # Process text into titled paragraphs
        paragraphs = process_to_paragraphs(doc["content"])

        # Embed each paragraph separately (one API request per paragraph)
        doc_paragraph_embeddings = []
        for para in paragraphs:
            # Format text as "# {title}\n{content}"
            formatted_text = f"# {para['title']}\n{para['content']}"

            # 500 is below truncate_text's default of 512; presumably headroom
            # under the model's sequence limit - TODO confirm
            truncated_text = truncate_text(formatted_text, max_tokens=500)
            embedding = get_embedding(truncated_text)

            doc_paragraph_embeddings.append(embedding.data[0].embedding)

        # Store original doc data along with paragraph embeddings
        embedded_doc = {**doc, "paragraph_embeddings": doc_paragraph_embeddings}
        embedded_data.append(embedded_doc)

    # Generate output filename.
    # Recover the site by stripping the known "<date>_" prefix and input suffix
    # rather than splitting on "_", so site names containing underscores are
    # kept whole and cannot collide on the same output file.
    file_name = os.path.basename(file_path)
    site = file_name[len(date_str) + 1 : -len(input_suffix)]
    output_file = f"{date_str}_{site}{output_suffix}"

    # Save embedded data
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(embedded_data, f, ensure_ascii=False, indent=4)

    print(f"Saved embeddings to {output_file}")

Processing 20250207_learn.bluefin.io_scraped_data.json
Saved embeddings to 20250207_learn.bluefin.io_embed.1.3.json
Processing 20250207_docs.sudo.finance_scraped_data.json
Saved embeddings to 20250207_docs.sudo.finance_embed.1.3.json
Processing 20250207_docs.suilend.fi_scraped_data.json
Saved embeddings to 20250207_docs.suilend.fi_embed.1.3.json
Processing 20250207_docs.aftermath.finance_scraped_data.json
Saved embeddings to 20250207_docs.aftermath.finance_embed.1.3.json
Processing 20250207_omnibtclabs.gitbook.io_scraped_data.json
Saved embeddings to 20250207_omnibtclabs.gitbook.io_embed.1.3.json
Processing 20250207_cetus-1.gitbook.io_scraped_data.json
Saved embeddings to 20250207_cetus-1.gitbook.io_embed.1.3.json
Processing 20250207_doc-en.mole.fi_scraped_data.json
Saved embeddings to 20250207_doc-en.mole.fi_embed.1.3.json
Processing 20250207_naviprotocol.gitbook.io_scraped_data.json
Saved embeddings to 20250207_naviprotocol.gitbook.io_embed.1.3.json
Processing 20250207_docs.scallop.i

Token indices sequence length is longer than the specified maximum sequence length for this model (1392 > 512). Running this sequence through the model will result in indexing errors


Saved embeddings to 20250207_docs.scallop.io_embed.1.3.json
Processing 20250207_docs.typus.finance_scraped_data.json
Saved embeddings to 20250207_docs.typus.finance_embed.1.3.json
Processing 20250207_docs.flowx.finance_scraped_data.json
Saved embeddings to 20250207_docs.flowx.finance_embed.1.3.json
Processing 20250207_docs.kriya.finance_scraped_data.json
Saved embeddings to 20250207_docs.kriya.finance_embed.1.3.json
Processing 20250207_docs.mstable.io_scraped_data.json
Saved embeddings to 20250207_docs.mstable.io_embed.1.3.json
Processing 20250207_docs.strater.xyz_scraped_data.json
Saved embeddings to 20250207_docs.strater.xyz_embed.1.3.json
Processing 20250207_docs.walrus.site_scraped_data.json
Saved embeddings to 20250207_docs.walrus.site_embed.1.3.json
Processing 20250207_haedal-protocol.gitbook.io_scraped_data.json
Saved embeddings to 20250207_haedal-protocol.gitbook.io_embed.1.3.json
Processing 20250207_unihouse.gitbook.io_scraped_data.json
Saved embeddings to 20250207_unihouse.gi

In [26]:
# Inspect the records built by the embedding loop. `embedded_data` is reassigned
# for every input file, so this shows only the file processed last.
# NOTE(review): this displays every record in full, including the embedding
# vectors; consider displaying a slice or summary instead.
embedded_data

[{'url': 'https://docs.sui.io',
  'type': 'html',
  'pdf': None,
  'text_raw': "Sui Documentation Skip to main content Sui Documentation Guides Concepts Standards References Search Get started Sui Documentation Discover the power of Sui through examples, guides, and concepts Developers Getting started Sui Developer Basics Move Validators and Node operators Validator configuration Run a Sui Full node Sui Bridge Node configuration About Sui Tokenomics Cryptography Standards References Sui dApp Kit Sui API Sui framework (GitHub) Rust SDK (GitHub) Resources Sui ecosystem directory Sui blog Sui dev cheat sheet Build your dApp on Sui Why Sui? Sui is the first internet-scale programmable blockchain platform Unmatched scalability, instant settlement A safe smart contract language accessible to mainstream developers Ability to define rich and composable on-chain assets Better user experience for web3 apps Scalability Sui scales horizontally to meet the demands of applications. Network capacity 