In [1]:
"""
Документация по либам:
- langchain_community:
    - TextLoader, Docx2txtLoader, PyPDFLoader -> работа с обработкой файлов
    - RecursiveCharacterTextSplitter -> разбиение текста на более мелкие части с учетом определённых ограничений

"""
from langchain_community.document_loaders import TextLoader, Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
def file_to_chunk(file_name: str,
                  sep: str,
                  chunk_size: int,
                  chunk_overlap: int):

    all_chunks = []

    file_format = file_name.split('.')[-1]
    file_path = file_name

    if file_format == "txt":
        loader = TextLoader(file_path)

    elif file_format == "docx":
        loader = Docx2txtLoader(file_path)

    elif file_format == "pdf":
        loader = PyPDFLoader(file_path)

    file = loader.load()

    """
    RecursiveCharacterTextSplitter - инструмент для разбиения текста на более мелкие части с учетом определённых ограничений
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators = sep,
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        is_separator_regex = False,
        add_start_index = False
    )

    for docs in file:
        content = docs.page_content
        chunks = text_splitter.split_text(content)

        print(content)

In [3]:

def create_documents():
    """
    create_documents - функция, которая предназначена для создания объектов documents
    - documents: 
    - separator:
    - chunk_size:
    - chunk_overlap:
    """
    documents = []
    sep = '\n'
    chunk_size = 2048
    chunk_overlap = 128

    path = "D:\\ProgPrj\\vsProjects\\echo_hack\\dl_pipeline\\pipe_1\\datasets\\small-bench\\SUSI Partners - 230510_SUSI-Partners-Sustainability-Report-2022.pdf"
    chunks = file_to_chunk(path, sep, chunk_size, chunk_overlap)

    return chunks



"""
ПРИМЕР ТОГО ЧТО МЫ ПОЛУЧАЕМ:
Document(
    metadata=
        {
        'source': 'D:\\ProgPrj\\vsProjects\\echo_hack\\if_llm\\pipe_1\\datasets\\small-bench\\SUSI Partners - 230510_SUSI-Partners-Sustainability-Report-2022.pdf', 
        'page': 1
        }, 
    page_content='CONTENT\nINTRODUCTION\n →Note from the Co-CEOs 3\n →SUSI at a glance 4\n →Our stakeholders 5\nSUSTAINABILITY STRATEGY\n →Our sustainability journey 6\n →Sustainability strategy and oversight 7\n →Increasing transparency and comparabil
""";

In [4]:
с = create_documents()

SUSTAINABILITY REPORT
Annual report 2022
IMPACTFUL RETURNS
CONTENT
INTRODUCTION
 →Note from the Co-CEOs 3
 →SUSI at a glance 4
 →Our stakeholders 5
SUSTAINABILITY STRATEGY
 →Our sustainability journey 6
 →Sustainability strategy and oversight 7
 →Increasing transparency and comparability in the market 9
SUSTAINABLE INVESTING
 →Driving forward the energy transition 11
 →ESG integration 13
 →Our investment platforms 17
 →Climate risks & TCFD 21
CORPORATE SUSTAINABILITY
 →Our people - the centre of our success 23
 →ICT as a key enabler of corporate operations 26
OUTLOOK 27
ABOUT THIS REPORT
This report addresses all stakeholders of SUSI Partners AG and its affiliates and provides an overview of the 
firm’s efforts in ensuring the sustainability of its investments and the company as a whole, with a particular 
focus on the respective impacts on society and the environment more broadly. All readers are encouraged to 
read through the important legal information regarding this report at the 