In [17]:
import os
from chunk_experiment.util.helpers import (
    semantic_chunks_to_text,
    process_batch_chunk_output,

)
from chunk_experiment.util.chunk_highlight import TextHighlighter
import time
# 语义分块
from chunk_experiment.util.embedding_api import EmbeddingClient
from chunk_experiment.src.semantic_chunk import SemanticChunker, EmbeddingModel

# 递归分块方法1(chunk_size)
from chunk_experiment.src.recursive_chunk import RecursiveCharacterTextSplitter

# 递归分块方法2(sentence&chunk_size)
from chunk_experiment.util.sentence_split import GeneralTextSplitter


In [None]:
# Corpus directory to benchmark; use the commented-out line for the Chinese corpus.
input_dir = "en_files"
# input_dir = "zh_files"

# Every regular *.txt file directly inside input_dir. os.listdir() returns names in
# arbitrary order, so the list is sorted to make each run visit files in the same order.
file_paths = sorted(
    candidate
    for candidate in (os.path.join(input_dir, name) for name in os.listdir(input_dir))
    if candidate.endswith(".txt") and os.path.isfile(candidate)
)

# Sample texts for the highlighting demo (original note: 100 units of length each).
# NOTE(review): the recorded highlight outputs further down report fragment offsets
# well beyond 100 characters, so they were presumably produced with a longer
# sample_text_en than the one defined here -- confirm before relying on them.
sample_text_en =("This is a sample text designed to demonstrate how to highlight text chunks in Jupyter Notebook. By r")
sample_text =("这是一个用于展示如何在Jupyter Notebook中高亮显示文本分块的示例文本。通过随机选取文本片段，并使用分块API进行分块处理，最终以不同颜色高亮显示每个分块，确保相邻分块颜色不同,从而提高文")

# Semantic Chunker

In [18]:
# semantic_chunker
# Embedding service endpoint. When the CHUNK_EXPERIMENT_EMBEDDING_URL environment
# variable is set to a non-empty value it replaces the built-in default, so the
# notebook can target another deployment without editing this cell.
TEST_URL = (
    os.environ.get("CHUNK_EXPERIMENT_EMBEDDING_URL")
    or "https://ai-platform-cloud-proxy.polymas.com/ai/common/kb-get-embedding"
)
embedding_client = EmbeddingClient(embedding_url=TEST_URL)
# Wrap the client in the object that is passed to SemanticChunker as embedding_model.
embedding_model = EmbeddingModel(embedding_client)

## Semantic_cumulative_chunker

In [None]:
# Keyword settings for the semantic chunker running in "cumulative" mode.
_cumulative_chunker_options = {
    "embedding_model": embedding_model,
    "min_characters_per_sentence": 5,
    "similarity_threshold": None,  # None -> use a dynamically computed threshold
    "similarity_percentile": 90,
    "similarity_window": 1,
    "mode": "cumulative",
    "initial_sentences": 1,
    "min_sentences": 1,
    "chunk_size": 50,
    "min_chunk_size": 20,
    "threshold_step": 0.05,
    "sep": "🐮🍺",
}
# Only the bound .chunk method is kept; later cells call it directly with a text.
semanticcumulativechunker = SemanticChunker(**_cumulative_chunker_options).chunk

In [None]:
# Benchmark: average wall-clock chunking time per document for the cumulative
# semantic chunker.
if not file_paths:
    # Fail early with a clear message instead of a ZeroDivisionError at the average.
    raise ValueError("file_paths is empty; there are no documents to benchmark")

total_time = 0.0
for file_path in file_paths:
    # File reading stays outside the timed region so only chunking is measured.
    # A dedicated name keeps the sample_text defined earlier from being overwritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        document_text = f.read()

    # perf_counter() is a monotonic, high-resolution clock intended for intervals.
    start_time = time.perf_counter()
    semanticcumulativechunker(document_text)  # call under test
    end_time = time.perf_counter()

    total_time += (end_time - start_time)

average_time = total_time / len(file_paths)
# Report the real number of documents instead of a hard-coded "100".
print(f"Semantic_cumulative_chunker {len(file_paths)}个文档平均运行时间:{average_time:.4f}秒")

## Semantic_window_chunker

In [21]:
# Keyword settings for the semantic chunker running in "window" mode.
_window_chunker_options = {
    "embedding_model": embedding_model,
    "min_characters_per_sentence": 5,
    "similarity_threshold": None,  # None -> use a dynamically computed threshold
    "similarity_percentile": 90,
    "similarity_window": 1,
    "mode": "window",
    "initial_sentences": 1,
    "min_sentences": 1,
    "chunk_size": 50,
    "min_chunk_size": 20,
    "threshold_step": 0.05,
    "sep": "🐮🍺",
}
# Only the bound .chunk method is kept; later cells call it directly with a text.
semanticwindowchunker = SemanticChunker(**_window_chunker_options).chunk

In [None]:
# Benchmark: average wall-clock chunking time per document for the window
# semantic chunker.
if not file_paths:
    # Fail early with a clear message instead of a ZeroDivisionError at the average.
    raise ValueError("file_paths is empty; there are no documents to benchmark")

total_time = 0.0
for file_path in file_paths:
    # File reading stays outside the timed region so only chunking is measured.
    # A dedicated name keeps the sample_text defined earlier from being overwritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        document_text = f.read()

    # perf_counter() is a monotonic, high-resolution clock intended for intervals.
    start_time = time.perf_counter()
    semanticwindowchunker(document_text)  # call under test
    end_time = time.perf_counter()

    total_time += (end_time - start_time)

average_time = total_time / len(file_paths)
# Report the real number of documents instead of a hard-coded "100".
print(f"Semantic_window_chunker {len(file_paths)}个文档平均运行时间:{average_time:.4f}秒")

# RecursiveChunker

## RecursiveChunker(按照chunk_size递归)

In [22]:
# Separators passed to the recursive splitter, in the order given here:
# blank line, newline, Chinese sentence/clause punctuation, space, empty string.
_recursive_separators = ["\n\n", "\n", "。", "？", "！", "，", " ", ""]

recursivesplitter = RecursiveCharacterTextSplitter(
    separators=_recursive_separators,
    chunk_overlap=10,
    chunk_size=50,
)
# Later cells call the bound split_text method directly with a text.
recursivechunker = recursivesplitter.split_text

In [None]:
# Benchmark: average wall-clock chunking time per document for the
# chunk_size-based recursive chunker.
if not file_paths:
    # Fail early with a clear message instead of a ZeroDivisionError at the average.
    raise ValueError("file_paths is empty; there are no documents to benchmark")

total_time = 0.0
for file_path in file_paths:
    # File reading stays outside the timed region so only chunking is measured.
    # A dedicated name keeps the sample_text defined earlier from being overwritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        document_text = f.read()

    # perf_counter() is a monotonic, high-resolution clock intended for intervals.
    start_time = time.perf_counter()
    recursivechunker(document_text)  # call under test
    end_time = time.perf_counter()

    total_time += (end_time - start_time)

average_time = total_time / len(file_paths)
# Report the real number of documents instead of a hard-coded "100".
print(f"RecursiveChunker {len(file_paths)}个文档平均运行时间:{average_time:.4f}秒")

## RecursiveChunker(按照sentence&chunk_size递归)

In [None]:
# Sentence-aware splitter configured with max_sentence_length=500.
_sentence_splitter_options = {"max_sentence_length": 500}
recursive_sentence_splitter = GeneralTextSplitter(**_sentence_splitter_options)
# batch_chunk is called with a list of texts by the later cells.
recursive_sentence_chunker = recursive_sentence_splitter.batch_chunk

In [None]:
# Benchmark: average wall-clock chunking time per document for the
# sentence & chunk_size based recursive chunker.
if not file_paths:
    # Fail early with a clear message instead of a ZeroDivisionError at the average.
    raise ValueError("file_paths is empty; there are no documents to benchmark")

total_time = 0.0
for file_path in file_paths:
    # File reading stays outside the timed region so only chunking is measured.
    # A dedicated name keeps the sample_text defined earlier from being overwritten.
    with open(file_path, 'r', encoding='utf-8') as f:
        document_text = f.read()

    # perf_counter() is a monotonic, high-resolution clock intended for intervals.
    start_time = time.perf_counter()
    recursive_sentence_chunker([document_text])  # batch API: one-element list
    end_time = time.perf_counter()

    total_time += (end_time - start_time)

average_time = total_time / len(file_paths)
# The label names the sentence-based variant so this line is distinguishable from
# the chunk_size-only benchmark, and the count is the real number of documents.
print(f"RecursiveChunker(sentence&chunk_size) {len(file_paths)}个文档平均运行时间:{average_time:.4f}秒")

### 高亮展示

In [24]:
# Value passed as max_length to every TextHighlighter in the highlight demo cells.
max_len = 500

# Highlight demo: cumulative semantic chunker applied to the English sample text.
_highlighter_options = dict(
    long_text=sample_text_en,
    chunking_api=semanticcumulativechunker,
    max_length=max_len,
)
highlighter = TextHighlighter(**_highlighter_options)
# semantic_chunks_to_text is supplied as the wrapper for the chunker's output.
highlighter.display_highlighted_text(wrapper_func=semantic_chunks_to_text)


随机选取的文本片段起始索引: 1568, 长度: 500
选取的文本片段:
arameter configurations, to adapt to different types of text and application requirements. For instance, the chunking process can dynamically adjust chunk sizes based on text length, paragraph structure, or semantic relevance. At the same time, the choice of colors can be customized based on user preferences, thereby meeting different aesthetic and usability needs. This approach not only improves the technical level of text processing but also brings new possibilities for practical applications.

文本已分成 4 块。
分块结果:
Chunk 1: arameter configurations, to adapt to different types of text and application requirements.For instance,
Chunk 2: the chunking process can dynamically adjust chunk sizes based on text length,paragraph structure,or semantic relevance.
Chunk 3: At the same time,the choice of colors can be customized based on user preferences,thereby meeting different aesthetic and usability needs.
Chunk 4: This approach not only improves the technica

In [25]:
# Highlight demo: window semantic chunker applied to the English sample text.
_highlighter_options = dict(
    long_text=sample_text_en,
    chunking_api=semanticwindowchunker,
    max_length=max_len,
)
highlighter = TextHighlighter(**_highlighter_options)
# semantic_chunks_to_text is supplied as the wrapper for the chunker's output.
highlighter.display_highlighted_text(wrapper_func=semantic_chunks_to_text)


随机选取的文本片段起始索引: 969, 长度: 500
选取的文本片段:
ents, the analysis results become intuitive and easy to understand, facilitating team collaboration and decision-making. In the education field, teachers can use this highlighting and chunking technique to provide students with visual learning materials, such as emphasizing grammar points, key phrases, and paragraph structures in texts. The flexibility of this technique makes it suitable for a wide range of scenarios, offering great convenience for both teaching and research purposes. To achieve

文本已分成 2 块。
分块结果:
Chunk 1: ents, the analysis results become intuitive and easy to understand, facilitating team collaboration and decision-making.In the education field,teachers can use this highlighting and chunking technique to provide students with visual learning materials,such as emphasizing grammar points,
Chunk 2: key phrases,and paragraph structures in texts.The flexibility of this technique makes it suitable for a wide range of scenarios,offering g

In [26]:
# Highlight demo: chunk_size-based recursive chunker applied to the English sample text.
_highlighter_options = dict(
    long_text=sample_text_en,
    chunking_api=recursivechunker,
    max_length=max_len,
)
highlighter = TextHighlighter(**_highlighter_options)
# No wrapper_func is passed here; the chunker output is used as returned.
highlighter.display_highlighted_text()


随机选取的文本片段起始索引: 673, 长度: 500
选取的文本片段:
. Additionally, this approach can be applied in natural language processing (NLP) tasks for text visualization, such as displaying sentiment analysis results, keyword extraction, or topic modeling outputs. By using different colors or formats to distinguish positive, negative, and neutral sentiments, the analysis results become intuitive and easy to understand, facilitating team collaboration and decision-making. In the education field, teachers can use this highlighting and chunking technique t

文本已分成 13 块。
分块结果:
Chunk 1: .

Additionally,

this

approach

can

be

applied
Chunk 2: e

appliedin

natural

language

processing

(NLP)

tasks
Chunk 3: LP)

tasksfor

text

visualization,

such

as

displaying
Chunk 4: displayingsentiment

analysis

results,

keyword
Chunk 5: ,

keywordextraction,

or

topic

modeling

outputs.

By
Chunk 6: tputs.

Byusing

different

colors

or

formats

to
Chunk 7: ormats

todistinguish

positive,

negative,

and

neutr

In [27]:
# Highlight demo: sentence-based recursive chunker. Its batch API is called with a
# list of texts, so the sample text is wrapped in a one-element list.
_highlighter_options = dict(
    long_text=[sample_text_en],
    chunking_api=recursive_sentence_chunker,
    max_length=max_len,
)
highlighter = TextHighlighter(**_highlighter_options)
# process_batch_chunk_output is supplied as the wrapper for the batch result.
highlighter.display_highlighted_text(wrapper_func=process_batch_chunk_output)

长文本长度小于或等于max_length (500)，返回整个文本。
选取的文本片段:
['This is a sample text designed to demonstrate how to highlight text chunks in Jupyter Notebook. By randomly selecting text fragments and using a chunking API to process the text into chunks, each chunk can be highlighted with different colors. This ensures that adjacent chunks are displayed in distinct colors, enhancing the readability and visual appeal of the text. This method is highly useful in industrial scenarios such as data analysis, text processing, and report generation. For example, in long reports, highlighting key content from different sections can help readers quickly locate important information, thereby improving reading efficiency and comprehension. Additionally, this approach can be applied in natural language processing (NLP) tasks for text visualization, such as displaying sentiment analysis results, keyword extraction, or topic modeling outputs. By using different colors or formats to distinguish positive, negative, and