## Chunker

In [1]:
import numpy as np
import pandas as pd

from frony_document_manager.parser_dev import ParserTXT
from frony_document_manager.parser_dev import ParserPDF
from frony_document_manager.parser_dev import ParserPPTX
from frony_document_manager.parser_dev import ParserPDFImage
from frony_document_manager.parser_dev import ParserImage

from frony_document_manager.chunker_dev import RuleBasedTextChunker
from frony_document_manager.chunker_dev import LLMBasedTextChunker
from frony_document_manager.chunker_dev import LLMBasedImageChunker

  from .autonotebook import tqdm as notebook_tqdm


## RuleBasedTextChunker

In [9]:
# Parse the sample PDF; the displayed frame has one row per page
# (columns page_number / page_content).
pdf_path = "test_files/test_pdf.pdf"
parser = ParserPDF()
df = parser.parse(pdf_path)
df

Unnamed: 0,page_number,page_content
0,1,"Providedproperattributionisprovided,Googlehere..."
1,2,"1 Introduction\nRecurrentneuralnetworks,longsh..."
2,3,Figure1: TheTransformer-modelarchitecture.\nTh...
3,4,ScaledDot-ProductAttention Multi-HeadAttention...
4,5,output values. These are concatenated and once...
5,6,"Table1: Maximumpathlengths,per-layercomplexity..."
6,7,"n d,\nlength is smaller than the representatio..."
7,8,Table2: TheTransformerachievesbetterBLEUscores...
8,9,Table3: VariationsontheTransformerarchitecture...
9,10,Table4: TheTransformergeneralizeswelltoEnglish...


In [10]:
# Rule-based chunking of the parsed pages held in `df`.
chunker = RuleBasedTextChunker()
chunks = chunker.chunk(df)

# The first item pulled from the generator is treated as the chunk count;
# everything yielded after it is collected as chunk records.
total_chunks = next(chunks)
print(total_chunks)

# Drain the remaining items in one pass and tabulate them.
df_chunk = pd.DataFrame(list(chunks))
df_chunk

218


create documents... (rule_short): 100%|██████████| 145/145 [00:00<00:00, 590.68it/s]
create documents... (rule_long): 100%|██████████| 73/73 [00:00<00:00, 434.20it/s]


Unnamed: 0,page_number,chunk_type,chunk_id,chunk_content
0,1,rule_short,0,"Providedproperattributionisprovided,Googlehere..."
1,1,rule_short,1,gleBrain GoogleResearch GoogleResearch\navaswa...
2,1,rule_short,2,ukhin@gmail.com\nAbstract\nThedominantsequence...
3,1,rule_short,3,"re, the Transformer,\nbasedsolelyonattentionme..."
4,1,rule_short,4,LEU on the WMT 2014 English-\nto-German transl...
...,...,...,...,...
213,15,rule_long,68,| | 6. | Not | eth | att | hea ...
214,1,rule_long,69,| p | u | t | La | y | e ...
215,15,rule_long,70,| | | | | | ...
216,15,rule_long,71,| >dap< |\n| 3 | | | | ...


## LLMBasedTextChunker

In [2]:
# Parse the sample PDF into the page-level frame consumed by the LLM text chunker below.
pdf_path = "test_files/test_pdf.pdf"
parser = ParserPDF()
df = parser.parse(pdf_path)
df

Unnamed: 0,page_number,page_content
0,1,"Providedproperattributionisprovided,Googlehere..."
1,2,"1 Introduction\nRecurrentneuralnetworks,longsh..."
2,3,Figure1: TheTransformer-modelarchitecture.\nTh...
3,4,ScaledDot-ProductAttention Multi-HeadAttention...
4,5,output values. These are concatenated and once...
5,6,"Table1: Maximumpathlengths,per-layercomplexity..."
6,7,"n d,\nlength is smaller than the representatio..."
7,8,Table2: TheTransformerachievesbetterBLEUscores...
8,9,Table3: VariationsontheTransformerarchitecture...
9,10,Table4: TheTransformergeneralizeswelltoEnglish...


In [3]:
# LLM-based text chunking of the parsed pages held in `df`.
chunker = LLMBasedTextChunker()
chunks = chunker.chunk(df)

# The first item pulled from the generator is treated as the chunk count;
# everything yielded after it is collected as chunk records.
total_chunks = next(chunks)
print(total_chunks)

# Drain the remaining items in one pass and tabulate them.
# NOTE(review): the recorded run took ~6-7 s per item and was interrupted — presumably each item triggers an LLM call; confirm.
df_chunk = pd.DataFrame(list(chunks))
df_chunk

18


create documents... (llm_text):   6%|▌         | 1/18 [00:05<01:37,  5.74s/it]

5     1935
4     1933
2     1931
6     1928
7     1928
1     1924
9     1923
12    1923
11    1923
10    1920
8     1920
3     1398
15     558
14     557
13     554
dtype: int64


create documents... (llm_text):  11%|█         | 2/18 [00:12<01:41,  6.32s/it]

9     1841
8     1838
5     1833
12    1833
6     1832
11    1831
4     1831
10    1828
7     1828
1     1824
2     1824
3     1318
15     708
13     707
14     701
dtype: int64


create documents... (llm_text):  17%|█▋        | 3/18 [00:18<01:32,  6.14s/it]

1     1948
2     1947
4     1947
7     1947
5     1947
9     1947
6     1946
10    1946
8     1946
11    1946
12    1945
3     1411
15     527
13     506
14     506
dtype: int64


create documents... (llm_text):  22%|██▏       | 4/18 [00:24<01:28,  6.32s/it]

5     1919
7     1917
1     1917
12    1917
4     1916
8     1916
2     1916
9     1915
6     1914
11    1914
10    1913
3     1382
15     607
13     601
14     592
dtype: int64


create documents... (llm_text):  28%|██▊       | 5/18 [00:31<01:24,  6.53s/it]

9     1803
11    1801
8     1798
12    1798
10    1796
1     1791
6     1786
2     1780
7     1779
5     1777
4     1768
3     1275
15     723
14     707
13     705
dtype: int64


create documents... (llm_text):  33%|███▎      | 6/18 [00:38<01:17,  6.49s/it]

11    1910
5     1909
9     1908
2     1907
12    1907
7     1906
1     1906
4     1905
6     1905
10    1903
8     1903
3     1369
13     631
15     631
14     629
dtype: int64


create documents... (llm_text):  39%|███▉      | 7/18 [00:44<01:11,  6.46s/it]

1     1998
2     1998
4     1998
6     1998
5     1998
7     1998
8     1998
12    1998
9     1998
10    1998
11    1998
3     1460
13     540
14     540
15     540
dtype: int64


create documents... (llm_text):  44%|████▍     | 8/18 [00:51<01:05,  6.59s/it]

5     1934
4     1930
2     1930
1     1927
7     1925
6     1924
8     1919
12    1914
11    1913
10    1910
9     1909
3     1400
15     542
13     536
14     529
dtype: int64


create documents... (llm_text):  50%|█████     | 9/18 [00:58<00:59,  6.65s/it]

5     1938
4     1933
6     1931
7     1930
2     1929
1     1926
9     1924
12    1919
10    1917
11    1915
8     1902
3     1402
13     628
15     626
14     625
dtype: int64


create documents... (llm_text):  56%|█████▌    | 10/18 [01:04<00:51,  6.47s/it]

1     1939
7     1939
5     1939
4     1937
2     1936
6     1936
12    1933
11    1929
10    1927
8     1927
9     1924
3     1402
15     510
13     465
14     415
dtype: int64


create documents... (llm_text):  61%|██████    | 11/18 [01:11<00:45,  6.51s/it]

1     1875
6     1871
8     1871
11    1871
12    1871
5     1870
10    1869
9     1868
2     1868
4     1866
7     1866
3     1345
15     612
13     609
14     601
dtype: int64


create documents... (llm_text):  67%|██████▋   | 12/18 [01:19<00:43,  7.18s/it]

5     1855
7     1853
1     1853
4     1852
2     1851
11    1846
8     1844
12    1844
6     1843
9     1842
10    1830
3     1328
15     486
13     458
14     424
dtype: int64


create documents... (llm_text):  67%|██████▋   | 12/18 [01:25<00:42,  7.13s/it]


KeyboardInterrupt: 

## LLMBasedImageChunker

In [3]:
# Parse the sample PDF with the image parser; one row per page is displayed.
# NOTE(review): page_content looks like base64-encoded image data in the output — confirm against the parser.
pdf_path = "test_files/test_pdf.pdf"
parser = ParserPDFImage()
df = parser.parse(pdf_path)
df

Unnamed: 0,page_number,page_content
0,1,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
1,2,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
2,3,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
3,4,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
4,5,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
5,6,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
6,7,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
7,8,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
8,9,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...
9,10,iVBORw0KGgoAAAANSUhEUgAACfYAAAzlCAIAAABT38lbAA...


In [4]:
# LLM-based chunking of the page images held in `df`.
chunker = LLMBasedImageChunker()
chunks = chunker.chunk(df)

# The first item pulled from the generator is treated as the chunk count;
# everything yielded after it is collected as chunk records.
total_chunks = next(chunks)
print(total_chunks)

# Drain the remaining items in one pass and tabulate them.
df_chunk = pd.DataFrame(list(chunks))
df_chunk

15


create documents... (llm_base):   0%|          | 0/15 [00:00<?, ?it/s]

create documents... (llm_base): 100%|██████████| 15/15 [01:42<00:00,  6.86s/it]


Unnamed: 0,page_number,chunk_type,chunk_id,chunk_content
0,1,llm_base,0,"논문 ""Attention Is All You Need""의 요약은 다음과 같습니다.\..."
1,2,llm_base,0,### 1. 서론\n- **재귀 신경망**: 언어 모델링과 기계 번역에서 주류 방법...
2,3,llm_base,0,다음은 주제별로 나눈 요약입니다.\n\n### 1. Transformer 모델 구조...
3,4,llm_base,0,### 주제별 요약\n\n#### 1. Scaled Dot-Product Atten...
4,5,llm_base,0,다음은 주제별로 요약한 내용입니다.\n\n### 1. 멀티헤드 어텐션\n- 멀티헤드...
5,6,llm_base,0,### 주제별 요약\n\n#### 1. 레이어 유형 및 복잡도\n- **Self-A...
6,7,llm_base,0,다음은 주제별로 나눈 요약입니다.\n\n### 1. 모델 구조\n- 문장 표현의 차...
7,8,llm_base,0,### 주제별 요약\n\n#### 1. 모델 성능\n- **BLEU 점수**: Tr...
8,9,llm_base,0,다음 이미지는 Transformer 아키텍처에 대한 다양한 실험 결과와 영어 구문 ...
9,10,llm_base,0,이 문서는 Transformer 모델에 대한 내용을 다루고 있습니다. 주제별로 요약...


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

# Synthetic classification data (500k samples x 100 features); the fixed
# random_state keeps the dataset and the split identical across reruns.
X, y = make_classification(n_samples=500000, n_features=100, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Random forest with 100 trees; n_jobs=-1 asks scikit-learn to use all cores.
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Fit on the training split only; X_test / y_test stay available for evaluation.
rf.fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed as soon as the dump finishes, even if pickling raises.
with open("rf.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)