In [38]:
import os
import time
from langchain_text_splitters import HTMLSectionSplitter
from langchain.text_splitter import (
    HTMLHeaderTextSplitter,
    PythonCodeTextSplitter,
    MarkdownTextSplitter,
    LatexTextSplitter,
)
from chunk_experiment.util.chunk_highlight import TextHighlighter

In [60]:
html_files_path="chunk_experiment/data/html/arxiv"
latex_files_path="chunk_experiment/data/latex/arxiv"
md_files_path="chunk_experiment/data/markdown/markdown-documentation-transformers"
python_files_path="chunk_experiment/data/python"

html_splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1"),("h2", "Header 2"),("h3", "Header 3"),("h4", "Header 4"),("h5", "Header 5"),("h6", "Header 6")])
python_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=20)
md_splitter = MarkdownTextSplitter(chunk_size=50, chunk_overlap=10)
tex_splitter = LatexTextSplitter(chunk_size=50, chunk_overlap=10)

In [61]:
def choose_type(type):
    if type==".html":
        return html_files_path,html_splitter
    elif type==".tex":
        return latex_files_path,tex_splitter
    elif type==".md":
        return md_files_path,md_splitter
    elif type==".py":
        return python_files_path,python_splitter

# HTML

In [72]:
type=".html"
paths,splitter = choose_type(type)

file_paths = [os.path.join(paths, f) for f in os.listdir(paths) if f.endswith(type)]
file_paths = file_paths[:100]
total_time = 0
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        sample_text = f.read()

    # html需要移除xml声明
    sample_text = sample_text.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    print(f"\n测试文件: {file_path}")
    start_time = time.time()
    splitter.split_text(sample_text)
    end_time = time.time()
    total_time += end_time - start_time
    
average_time = total_time / len(file_paths)
print(f"100个文档平均运行时间:{average_time:.4f}秒")


测试文件: chunk_experiment/data/html/arxiv/340.html

测试文件: chunk_experiment/data/html/arxiv/205.html

测试文件: chunk_experiment/data/html/arxiv/74.html

测试文件: chunk_experiment/data/html/arxiv/439.html

测试文件: chunk_experiment/data/html/arxiv/252.html

测试文件: chunk_experiment/data/html/arxiv/481.html

测试文件: chunk_experiment/data/html/arxiv/23.html

测试文件: chunk_experiment/data/html/arxiv/194.html

测试文件: chunk_experiment/data/html/arxiv/301.html

测试文件: chunk_experiment/data/html/arxiv/244.html

测试文件: chunk_experiment/data/html/arxiv/35.html

测试文件: chunk_experiment/data/html/arxiv/497.html

测试文件: chunk_experiment/data/html/arxiv/182.html

测试文件: chunk_experiment/data/html/arxiv/478.html

测试文件: chunk_experiment/data/html/arxiv/356.html

测试文件: chunk_experiment/data/html/arxiv/213.html

测试文件: chunk_experiment/data/html/arxiv/62.html

测试文件: chunk_experiment/data/html/arxiv/9.html

测试文件: chunk_experiment/data/html/arxiv/268.html

测试文件: chunk_experiment/data/html/arxiv/287.html

测试文件: chunk_experiment/da

## 高亮展示

In [74]:
max_len = 200000
highlighter = TextHighlighter(
    long_text=sample_text,
    chunking_api=splitter.split_text,
    max_length=max_len
)

# 显示高亮文本
highlighter.display_highlighted_text()

随机选取的文本片段起始索引: 1057671, 长度: 200000
选取的文本片段:
</apply><apply id="S4.E1.m1.1.1.1.1.1.1.1.1.1.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1"><times id="S4.E1.m1.1.1.1.1.1.1.1.1.1.2.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.2"></times><apply id="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.3"><csymbol cd="ambiguous" id="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.1.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.3">subscript</csymbol><ci id="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.2.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.2">𝑤</ci><ci id="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.3.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.3.3">𝑖</ci></apply><apply id="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.4"><csymbol cd="ambiguous" id="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.1.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.4">subscript</csymbol><ci id="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.2.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.2">Φ</ci><ci id="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.3.cmml" xref="S4.E1.m1.1.1.1.1.1.1.1.1.1.4.3">𝑖</ci></apply><app

# LaTex

In [69]:
type=".tex"
paths,splitter = choose_type(type)
file_paths = [os.path.join(paths, f) for f in os.listdir(paths) if f.endswith(type)]
file_paths = file_paths[:100]

total_time = 0
for file_path in file_paths:
    with open(file_path, "r", encoding="latin1") as f:
        sample_text = f.read()

    print(f"\n测试文件: {file_path}")
    start_time = time.time()
    chunks = splitter.split_text(sample_text)
    end_time = time.time()
    total_time += end_time - start_time
    
average_time = total_time / len(file_paths)
print(f"100个文档平均运行时间:{average_time:.4f}秒")


测试文件: chunk_experiment/data/latex/arxiv/113.tex

测试文件: chunk_experiment/data/latex/arxiv/107.tex

测试文件: chunk_experiment/data/latex/arxiv/12.tex

测试文件: chunk_experiment/data/latex/arxiv/339.tex

测试文件: chunk_experiment/data/latex/arxiv/477.tex

测试文件: chunk_experiment/data/latex/arxiv/311.tex

测试文件: chunk_experiment/data/latex/arxiv/305.tex

测试文件: chunk_experiment/data/latex/arxiv/463.tex

测试文件: chunk_experiment/data/latex/arxiv/488.tex

测试文件: chunk_experiment/data/latex/arxiv/259.tex

测试文件: chunk_experiment/data/latex/arxiv/503.tex

测试文件: chunk_experiment/data/latex/arxiv/265.tex

测试文件: chunk_experiment/data/latex/arxiv/271.tex

测试文件: chunk_experiment/data/latex/arxiv/270.tex

测试文件: chunk_experiment/data/latex/arxiv/502.tex

测试文件: chunk_experiment/data/latex/arxiv/264.tex

测试文件: chunk_experiment/data/latex/arxiv/258.tex

测试文件: chunk_experiment/data/latex/arxiv/489.tex

测试文件: chunk_experiment/data/latex/arxiv/304.tex

测试文件: chunk_experiment/data/latex/arxiv/462.tex

测试文件: chunk_experime

## 高亮展示

In [53]:
max_len = 20000
highlighter = TextHighlighter(
    long_text=sample_text,
    chunking_api=splitter.split_text,
    max_length=max_len
)

# 显示高亮文本
highlighter.display_highlighted_text()

随机选取的文本片段起始索引: 76840, 长度: 20000
选取的文本片段:
(3)}(t) \le d_i (t), \forall t=1,2,\ldots,e, i=1,2,\ldots,n \\\nonumber
%\prod_{t=1}^{e}[d_{i}(t)-(\lambda_p^i+D^{(p)}(t))]=0,\forall p=1,2,3, i=1,2,\ldots,n.\\\nonumber
%\end{eqnarray}
%After this reformulation, we're very possibly able to use sequential quadratic programming(SQL) to solve this optimization problem.
%\end{corollary}



%%%%%  We will discuss them in DIscussion
% Then we can ask the following questions:
% \begin{problem}
% Suppose we have a random sample $S \subset \Tn$ of saze $n$.  Then how
% often can we get a unique solution?
% \end{problem}
% \begin{problem}
% Comparing with tropical Fermat-Weber points.  Does an (the?) optimal
% solution for Problem \ref{optimization} contains a tropical
% Fermat-Weber point of the sample $S$?  How does it relate to?
% Tropical Fermat-Weber points can be found at \url{https://arxiv.org/abs/1604.04674}.
% \end{problem}

% \begin{proposition}
% With given a sample $\{d_1,\cdots,d_n\}$, suppo

# Markdown

In [70]:
type=".md"
paths,splitter = choose_type(type)

file_paths = [os.path.join(paths, f) for f in os.listdir(paths) if f.endswith(type)]
file_paths = file_paths[:100]
total_time = 0
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        sample_text = f.read()

    print(f"\n测试文件: {file_path}")
    start_time = time.time()
    splitter.split_text(sample_text)
    end_time = time.time()
    total_time += end_time - start_time
    
average_time = total_time / len(file_paths)
print(f"100个文档平均运行时间:{average_time:.4f}秒")


测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/28.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/79.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/124.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/175.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/186.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/217.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/357.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/246.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/306.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/141.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/110.md

测试文件: chunk_experiment/data/markdown/markdown-documentation-transformers/281.md

测试文件: chunk_experiment/data/m

## 高亮展示

In [55]:
max_len = 20000
highlighter = TextHighlighter(
    long_text=sample_text,
    chunking_api=splitter.split_text,
    max_length=max_len
)

# 显示高亮文本
highlighter.display_highlighted_text()

随机选取的文本片段起始索引: 6097, 长度: 20000
选取的文本片段:
, a high number of gradient accumulation steps can result in a more pronounced training slowdown. Consider the following example. Let’s say, the `per_device_train_batch_size=4` without gradient accumulation hits the GPU’s limit. If you would like to train with batches of size 64, do not set the `per_device_train_batch_size` to 1 and `gradient_accumulation_steps` to 64. Instead, keep `per_device_train_batch_size=4` and set `gradient_accumulation_steps=16`. This results in the same effective batch size while making better use of the available GPU resources.

For additional information, please refer to batch size and gradient accumulation benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537) and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957).

## Gradient Checkpointing

Some large models may still face memory issues even when the batch size is set to 1 an

# Python

In [None]:
type=".py"
paths,splitter = choose_type(type)

file_paths = [os.path.join(paths, f) for f in os.listdir(paths) if f.endswith(type)]
file_paths = file_paths[:100]
total_time = 0
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as f:
        sample_text = f.read()

    print(f"\n测试文件: {file_path}")
    start_time = time.time()
    splitter.split_text(sample_text)
    end_time = time.time()
    
average_time = total_time / len(file_paths)
print(f"100个文档平均运行时间:{average_time:.4f}秒")

## 高亮展示

In [78]:
max_len = 20000
highlighter = TextHighlighter(
    long_text=sample_text,
    chunking_api=splitter.split_text,
    max_length=max_len
)

# 显示高亮文本
highlighter.display_highlighted_text()

随机选取的文本片段起始索引: 1083, 长度: 20000
选取的文本片段:
 -> List[np.ndarray]:
        """Embed a batch of texts and return a list of embedding vectors.

        Args:
            texts (List[str]): List of texts to embed.

        Returns:
            List[np.ndarray]: List of embedding vectors.
        """
        # 中文注释: 返回模拟的随机嵌入向量，实际情况需调用真实模型方法。
        # return [np.random.rand(self.embedding_dim).astype(np.float32) for _ in texts]
        # embeddings = embedding_api(texts)
        # 每十个文本一组，避免一次请求过多文本
        batch_size = 10
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]
            response = self.embedding_client.get_embeddings(batch_texts)
            batch_embeddings = response.get("data", {}).get("resultList", [])
            embeddings.extend(batch_embeddings)
        return [np.array(embedding).astype(np.float32) for embedding in embeddings]

    def similarity(self, embedding1: np.ndarray, embedding2: np.ndar