In [22]:
import re
import unicodedata
from pdfminer.high_level import extract_text
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pprint import pprint

In [23]:
# Relative path of the PDF that the extraction and preprocessing cells below read.
path = "../../data/input_data/paper_008.pdf"

In [24]:
# Raw text extracted from the PDF by pdfminer; later cells split/chunk this
# unprocessed string directly.
pdf_miner_doc = extract_text(path)

In [28]:
# Show the raw extracted text for visual inspection.
# NOTE(review): this prints the entire document; printing a slice would keep
# the saved notebook output short.
print(pdf_miner_doc)

Cite This: J. Phys. Chem. C 2017, 121, 26163-26171

Article

pubs.acs.org/JPCC

Between Scylla and Charybdis: Balancing Among Structural Stability
and Energy Density of Layered NCM Cathode Materials for Advanced
Lithium-Ion Batteries
Lea de Biasi,*,†
Pascal Hartmann,*,†,⊥

Aleksandr O. Kondrakov,
†,∥
and Jürgen Janek

Torsten Brezesinski,*,†

Holger Geßwein,

†,⊥

‡,§

†
Battery and Electrochemistry Laboratory, Institute of Nanotechnology and
Technology, Hermann-von-Helmholtz-Platz 1, 76344 Eggenstein-Leopoldshafen, Germany
§
Helmholtz Institute Ulm for Electrochemical Energy Storage, Helmholtzstraße 11, 89081 Ulm, Germany
⊥
BASF SE, 67056 Ludwigshafen, Germany
Institute of Physical Chemistry, Justus-Liebig-University Giessen, Heinrich-Buﬀ-Ring 17, 35392 Giessen, Germany
*S Supporting Information

∥

‡
Institute for Applied Materials, Karlsruhe Institute of

ABSTRACT: Two major strategies are currently pursued to
improve the energy density of
lithium-ion batteries using
LiNixCoyMnzO2 

In [29]:
def _find_section_start(text: str, keyword_pattern: str) -> int:
    """Locate where the trailing section introduced by ``keyword_pattern`` starts.

    A heading-like occurrence is preferred: the keyword alone on its line,
    optionally surrounded by non-word decoration (bullets, colon, spaces).
    The keyword frequently re-appears in running text *after* its own heading
    (e.g. "The authors declare no conflict of interest." or a cited title
    containing "references"), so simply taking the last occurrence anywhere
    would cut too late and leave the heading in the output.

    Args:
        text (str): Text to search.
        keyword_pattern (str): Regex fragment matching the section keyword
            (without flags; matching is case-insensitive).

    Returns:
        int: Offset of the keyword of the last heading-like occurrence; if
        there is none, offset of the last occurrence anywhere; -1 if the
        keyword does not occur at all.
    """
    flags = re.IGNORECASE | re.MULTILINE

    # Keyword must be the only word-like content on its line to count as a heading.
    heading_pattern = rf"^[^\w\n]*(?P<keyword>{keyword_pattern})[^\w\n]*$"
    headings = list(re.finditer(heading_pattern, text, flags))
    if headings:
        # Cut at the keyword itself (not at the line start) so the result is
        # identical to the previous behaviour whenever the heading was already
        # the last occurrence.
        return headings[-1].start("keyword")

    # Fallback: previous behaviour, last occurrence anywhere in the text.
    mentions = list(re.finditer(keyword_pattern, text, flags))
    return mentions[-1].start() if mentions else -1


def remove_last_section_from_pdf(file_path: str) -> str:
    """
    Extract the text of a PDF and drop everything from a trailing section onward.

    The section used as the cut point depends on marker strings found in the
    extracted text:

    * contains "www.MaterialsViews.com"            -> "Acknowledgements"
    * contains "www.advancedsciencenews.com" or
      "www.chemelectrochem.org"                    -> "Conflict of Interest"
    * otherwise                                    -> "References"

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: NFKD-normalized text up to (not including) the chosen section
        keyword, or the whole normalized text if the keyword is not found.
    """
    full_text = extract_text(file_path)

    # Unicode normalization. NFKD also expands ligatures such as U+FB00 into
    # plain letters, so keyword matching below only has to deal with ordinary
    # characters.
    full_text = unicodedata.normalize("NFKD", full_text)

    # Pick the section keyword from the marker strings present in the text.
    if "www.MaterialsViews.com" in full_text:
        keyword = "Acknowledgements"
    elif (
        "www.advancedsciencenews.com" in full_text
        or "www.chemelectrochem.org" in full_text
    ):
        keyword = "Conflict of Interest"
    else:
        keyword = "References"

    if keyword == "Conflict of Interest":
        # Tolerate letter-spaced or line-wrapped headings
        # ("C o n f l i c t  o f ...") while still requiring every letter.
        keyword_pattern = r"\s*".join(
            re.escape(ch) for ch in keyword if not ch.isspace()
        )
    else:
        keyword_pattern = re.escape(keyword)

    cut = _find_section_start(full_text, keyword_pattern)
    if cut < 0:
        return full_text
    return full_text[:cut]

In [31]:
# Text of the same PDF with the trailing section removed. The function calls
# extract_text itself, so the PDF is parsed again here rather than reusing
# pdf_miner_doc.
preprocessed_text = remove_last_section_from_pdf(path)

In [36]:
# Show the preprocessed text for visual inspection.
# NOTE(review): this prints the entire document; printing a slice would keep
# the saved notebook output short.
print(preprocessed_text)

Cite This: J. Phys. Chem. C 2017, 121, 26163-26171

Article

pubs.acs.org/JPCC

Between Scylla and Charybdis: Balancing Among Structural Stability
and Energy Density of Layered NCM Cathode Materials for Advanced
Lithium-Ion Batteries
Lea de Biasi,*,†
Pascal Hartmann,*,†,⊥

Aleksandr O. Kondrakov,
†,∥
and Jürgen Janek

Torsten Brezesinski,*,†

Holger Geßwein,

†,⊥

‡,§

†
Battery and Electrochemistry Laboratory, Institute of Nanotechnology and
Technology, Hermann-von-Helmholtz-Platz 1, 76344 Eggenstein-Leopoldshafen, Germany
§
Helmholtz Institute Ulm for Electrochemical Energy Storage, Helmholtzstraße 11, 89081 Ulm, Germany
⊥
BASF SE, 67056 Ludwigshafen, Germany
Institute of Physical Chemistry, Justus-Liebig-University Giessen, Heinrich-Buff-Ring 17, 35392 Giessen, Germany
*S Supporting Information

∥

‡
Institute for Applied Materials, Karlsruhe Institute of

ABSTRACT: Two major strategies are currently pursued to
improve the energy density of
lithium-ion batteries using
LiNixCoyMnzO2

In [37]:
# Split the preprocessed text into overlapping chunks, breaking only at blank
# lines ("\n\n" is the sole separator).
# NOTE(review): with from_tiktoken_encoder, chunk_size/chunk_overlap are
# presumably measured in tiktoken tokens rather than characters - confirm
# against the LangChain documentation.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    separators=["\n\n"],
    chunk_size=500,
    chunk_overlap=100
)
texts = text_splitter.split_text(preprocessed_text)

In [38]:
# Number of chunks produced by the splitter above.
len(texts)

26

In [39]:
# Display the chunk list to inspect chunk boundaries and the overlap between
# consecutive chunks.
texts

['Cite This: J. Phys. Chem. C 2017, 121, 26163-26171\n\nArticle\n\npubs.acs.org/JPCC\n\nBetween Scylla and Charybdis: Balancing Among Structural Stability\nand Energy Density of Layered NCM Cathode Materials for Advanced\nLithium-Ion Batteries\nLea de Biasi,*,†\nPascal Hartmann,*,†,⊥\n\nAleksandr O. Kondrakov,\n†,∥\nand Jürgen Janek\n\nTorsten Brezesinski,*,†\n\nHolger Geßwein,\n\n†,⊥\n\n‡,§\n\n†\nBattery and Electrochemistry Laboratory, Institute of Nanotechnology and\nTechnology, Hermann-von-Helmholtz-Platz 1, 76344 Eggenstein-Leopoldshafen, Germany\n§\nHelmholtz Institute Ulm for Electrochemical Energy Storage, Helmholtzstraße 11, 89081 Ulm, Germany\n⊥\nBASF SE, 67056 Ludwigshafen, Germany\nInstitute of Physical Chemistry, Justus-Liebig-University Giessen, Heinrich-Buff-Ring 17, 35392 Giessen, Germany\n*S Supporting Information\n\n∥\n\n‡\nInstitute for Applied Materials, Karlsruhe Institute of',
 '∥\n\n‡\nInstitute for Applied Materials, Karlsruhe Institute of\n\nABSTRACT: Two majo

In [18]:
# Split the raw (unprocessed) text on blank lines to see the individual
# blocks that the "\n\n" separator yields.
pdf_miner_doc.split(sep="\n\n")

['Cite This: J. Phys. Chem. C 2017, 121, 26163-26171',
 'Article',
 'pubs.acs.org/JPCC',
 'Between Scylla and Charybdis: Balancing Among Structural Stability\nand Energy Density of Layered NCM Cathode Materials for Advanced\nLithium-Ion Batteries\nLea de Biasi,*,†\nPascal Hartmann,*,†,⊥',
 'Aleksandr O. Kondrakov,\n†,∥\nand Jürgen Janek',
 'Torsten Brezesinski,*,†',
 'Holger Geßwein,',
 '†,⊥',
 '‡,§',
 '†\nBattery and Electrochemistry Laboratory, Institute of Nanotechnology and\nTechnology, Hermann-von-Helmholtz-Platz 1, 76344 Eggenstein-Leopoldshafen, Germany\n§\nHelmholtz Institute Ulm for Electrochemical Energy Storage, Helmholtzstraße 11, 89081 Ulm, Germany\n⊥\nBASF SE, 67056 Ludwigshafen, Germany\nInstitute of Physical Chemistry, Justus-Liebig-University Giessen, Heinrich-Buﬀ-Ring 17, 35392 Giessen, Germany\n*S Supporting Information',
 '∥',
 '‡\nInstitute for Applied Materials, Karlsruhe Institute of',
 'ABSTRACT: Two major strategies are currently pursued to\nimprove the energy

In [15]:
# NOTE(review): these imports sit in the middle of the notebook, and this
# cell's execution count (In [15]) is lower than that of the top import cell
# (In [22]); moving them to the first import cell would make a
# restart-and-run-all order explicit.
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Initialize the semantic chunk splitter using OpenAI embeddings.
# NOTE(review): this rebinds `text_splitter`, which another cell assigns to a
# RecursiveCharacterTextSplitter, and it chunks the raw `pdf_miner_doc`
# rather than `preprocessed_text` - confirm both are intended.
# Presumably requires OpenAI credentials in the environment - verify.
text_splitter = SemanticChunker(OpenAIEmbeddings())
chunks = text_splitter.split_text(pdf_miner_doc)

In [16]:
# Display the chunks returned by the SemanticChunker.
chunks

['Cite This: J. Phys. Chem. C 2017, 121, 26163-26171\n\nArticle\n\npubs.acs.org/JPCC\n\nBetween Scylla and Charybdis: Balancing Among Structural Stability\nand Energy Density of Layered NCM Cathode Materials for Advanced\nLithium-Ion Batteries\nLea de Biasi,*,†\nPascal Hartmann,*,†,⊥\n\nAleksandr O. Kondrakov,\n†,∥\nand Jürgen Janek\n\nTorsten Brezesinski,*,†\n\nHolger Geßwein,\n\n†,⊥\n\n‡,§\n\n†\nBattery and Electrochemistry Laboratory, Institute of Nanotechnology and\nTechnology, Hermann-von-Helmholtz-Platz 1, 76344 Eggenstein-Leopoldshafen, Germany\n§\nHelmholtz Institute Ulm for Electrochemical Energy Storage, Helmholtzstraße 11, 89081 Ulm, Germany\n⊥\nBASF SE, 67056 Ludwigshafen, Germany\nInstitute of Physical Chemistry, Justus-Liebig-University Giessen, Heinrich-Buﬀ-Ring 17, 35392 Giessen, Germany\n*S Supporting Information\n\n∥\n\n‡\nInstitute for Applied Materials, Karlsruhe Institute of\n\nABSTRACT: Two major strategies are currently pursued to\nimprove the energy density of\