In [None]:
import os
from pathlib import Path

from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

# Sample text: replace with the long document to be chunked.
fac_simile = "Il tuo testo lungo da chunkare va qui. Può essere un testo scientifico, un articolo, un documento legale ecc."

# Embedding model shared by every chunker built below.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# (None when unset) and forwarded through `model_kwargs` under the `token`
# key, which replaces the deprecated `use_auth_token` spelling.
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    model_kwargs={"token": os.getenv("HF_TOKEN")},
)

# Configurations to try, none of which fixes a chunk size.
# Each threshold type is paired with the amounts to sweep; "gradient" is run
# once without an explicit amount.
_threshold_grid = (
    ("percentile", (95, 90, 85, 80)),
    ("standard_deviation", (1.0, 0.75, 0.5)),
    ("interquartile", (1.5, 1.0, 0.75)),
)

chunker_configs = [{"breakpoint_threshold_type": "gradient"}]
chunker_configs += [
    {"breakpoint_threshold_type": kind, "breakpoint_threshold_amount": amount}
    for kind, amounts in _threshold_grid
    for amount in amounts
]

# Run the semantic chunker once per configuration and save every result to
# its own Markdown file in the current working directory.
for idx, config in enumerate(chunker_configs):
    config_number = idx + 1  # 1-based number used in the log output and file name
    print(f"\n--- Configurazione {config_number} ---")
    print(f"Parametri: {config}")

    # The configuration dict is expanded into keyword arguments of the chunker.
    semantic_chunker = SemanticChunker(embeddings=embedding_model, **config)
    semantic_chunks = semantic_chunker.create_documents([fac_simile])

    # One "### Chunk N" heading per chunk, followed by its stripped text.
    filename = f"semantic_chunks_config_{config_number}.md"
    sections = [
        f"### Chunk {position}\n\n" + doc.page_content.strip() + "\n\n"
        for position, doc in enumerate(semantic_chunks, start=1)
    ]
    Path(filename).write_text("".join(sections), encoding="utf-8")

    print(f"Salvato in: {filename}")
    print(f"Numero di chunk creati: {len(semantic_chunks)}")


In [None]:
# The embedding batch size is a setting of the embedding model, not of
# SemanticChunker (whose constructor has no `embedding_batch_size` argument
# and would raise TypeError). It is therefore set on the embeddings'
# `encode_kwargs`, which are forwarded to the underlying encode call.
embedding_model.encode_kwargs["batch_size"] = 16

semantic_chunker = SemanticChunker(
    embeddings=embedding_model,
    breakpoint_threshold_type="gradient",  # or "percentile" to experiment with fixed thresholds
    buffer_size=80,  # larger context window, intended to make the chunks more coherent
)
