## Chunker

In [1]:
# import sys
# sys.path.append("../")
import numpy as np
import pandas as pd

from frony_document_processor.parser import ParserTXT
from frony_document_processor.parser import ParserPDF
from frony_document_processor.parser import ParserPPTX
from frony_document_processor.parser import ParserPDFImage
from frony_document_processor.parser import ParserImage

from frony_document_processor.chunker import RuleBasedTextChunker
from frony_document_processor.chunker import LLMBasedTextChunker
from frony_document_processor.chunker import LLMBasedImageChunker

from frony_document_processor.embedder import OpenAIEmbedder
from frony_document_processor.embedder import SentenceTransformerEmbedder

  from .autonotebook import tqdm as notebook_tqdm


## RuleBasedTextChunker

In [2]:
# Parse the sample PDF; the result is displayed below as a page-level
# DataFrame with `page_number` and `page_content` columns.
parser = ParserPDF()
df = parser.parse("test_files/test_pdf.pdf")
df

Unnamed: 0,page_number,page_content
0,1,"Provided proper attribution is provided, Googl..."
1,2,"1 Introduction\nRecurrent neural networks, lon..."
2,3,Figure 1: The Transformer - model architecture...
3,4,Scaled Dot-Product Attention Multi-Head Attent...
4,5,output values. These are concatenated and once...
5,6,"Table 1: Maximum path lengths, per-layer compl..."
6,7,n\nlength is smaller than the representation d...
7,8,Table 2: The Transformer achieves better BLEU ...
8,9,Table 3: Variations on the Transformer archite...
9,10,Table 4: The Transformer generalizes well to E...


In [3]:
# Split the parsed pages into rule-based chunks. `chunker.chunk` hands back an
# iterator: its first item is the total number of chunks, and every item after
# that is one chunk record.
chunker = RuleBasedTextChunker()
chunks = chunker.chunk(df)
total_chunks = next(chunks)
print(total_chunks)

# Collect the remaining records straight into a DataFrame so `df_chunk` is only
# ever bound to a DataFrame (never to an intermediate list).
df_chunk = pd.DataFrame(list(chunks))
df_chunk

631


create documents... (rule_short): 100%|██████████| 507/507 [00:01<00:00, 376.55it/s]
create documents... (rule_long): 100%|██████████| 124/124 [00:00<00:00, 401.36it/s]


Unnamed: 0,page_number,chunk_type,chunk_id,chunk_content
0,1,rule_short,0,"Provided proper attribution is provided, Googl..."
1,1,rule_short,1,reproduce the tables and figures in this paper...
2,1,rule_short,2,Attention Is All You Need\n3202 guA 2 ]LC.sc[...
3,1,rule_short,3,Google Brain Google Brain Google Research Goog...
4,1,rule_short,4,†\nLlion Jones∗ Aidan N. Gomez∗ Łukasz Kaiser∗...
...,...,...,...,...
626,2,rule_long,119,| | p | u | t- | In | p ...
627,1,rule_long,120,| 0 | | | | | ...
628,15,rule_long,121,| 2 | ehT | waL | lliw | reven | eb ...
629,15,rule_long,122,| 4 | | | | | ...


In [4]:
# Embed every chunk text with the OpenAI embedder, then confirm that one
# vector came back per chunk and inspect the resulting matrix shape.
embedder = OpenAIEmbedder(
    model_id="text-embedding-3-small",
    embed_dim=1536,
)
embed = embedder.embed(
    df_chunk["chunk_content"].to_list()
)
print(len(embed))
np.array(embed).shape

631


(631, 1536)

In [5]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from typing import List
from dotenv import load_dotenv
load_dotenv()

class SentenceTransformerEmbedder():
    def __init__(self, model_id: str, embed_dim: int, device: str = "cpu", precision: str = "fp16"):
        self.model = SentenceTransformer(model_id, device=device)
        self.embed_dim = embed_dim
        self.precision = precision

    def embed(self, data: str | List[str], batch_size: int = 4):
        data = [data] if isinstance(data, str) else data
        sorted = np.array([len(i) for i in data]).argsort()
        embed = self.model.encode([data[i] for i in sorted], batch_size=batch_size, normalize_embeddings=True, convert_to_tensor=True)
        embed = embed.half() if self.precision == "fp16" else embed
        embed[sorted] = embed.clone()
        embed = embed.tolist()
        return embed


In [6]:
# Rebind `embedder` to a SentenceTransformer-backed embedder running on the CPU.
embedder = SentenceTransformerEmbedder(
    model_id="all-MiniLM-L6-v2",
    embed_dim=384,
    device="cpu",
)

In [7]:
%%time

# Time `embedder.embed` over every chunk text.
embed = embedder.embed(df_chunk["chunk_content"].to_list())

CPU times: total: 24.2 s
Wall time: 6.47 s


In [8]:
# Sanity check: how many embeddings came back, and the first ten components
# of the first one.
print(len(embed))
np.array(embed[0][:10])

631


array([-0.1151123 ,  0.01026154,  0.00456619, -0.05773926,  0.05285645,
       -0.01487732,  0.03738403, -0.07794189, -0.00886536,  0.00239754])

In [9]:
%%time

embed = embedder.model.encode(df_chunk["chunk_content"].to_list(), batch_size=4, normalize_embeddings=True, convert_to_tensor=True).half().tolist()

CPU times: total: 26.5 s
Wall time: 7.83 s


In [10]:
# Same sanity check for the baseline run: embedding count and the first ten
# components of the first vector.
print(len(embed))
np.array(embed[0][:10])

631


array([-0.1151123 ,  0.01026154,  0.00456619, -0.05773926,  0.05285645,
       -0.01487732,  0.03738403, -0.07794189, -0.00886536,  0.00239754])

In [11]:
# Shuffle all chunk texts with a fixed seed (frac=1.0 keeps every row) to
# eyeball a reproducible random cross-section of the chunks.
df_chunk["chunk_content"].sample(frac=1.0, random_state=42)

509    convolutional neural networks that include an ...
271    |    |      | 4                               ...
218    1 2\nrate over the course of training, accordi...
145    The dimensionality of input and output is d = ...
495    ni   | ym   | noinipo   | .   | >SOE<   | >dap...
                             ...                        
71     1 n\nsequence (y , ..., y ) of symbols one ele...
106    √1\nof . Additive attention computes the compa...
270    |  3 | (C)  | 2                               ...
435    |    | tir   | ta   | yt   | n   | st   | e   ...
102    into a matrix Q. The keys and values are also ...
Name: chunk_content, Length: 631, dtype: object