In [1]:
%reload_ext autoreload
%autoreload 2

import sys

sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")

In [2]:
from dotenv import load_dotenv
import os
import pandas as pd
from uniflow.transform.client import Client
from uniflow.transform.config import TransformOpenAIConfig
from uniflow.model.config import OpenAIModelConfig
from langchain.document_loaders import PyPDFLoader
from uniflow.schema import Context, GuidedPrompt

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
markdown_file = "README.md"

In [6]:
dir_cur = os.getcwd()
# input_file = "../../" + markdown_file
input_file = os.path.join(f"{dir_cur}", markdown_file)
print(input_file)

/home/ubuntu/uniflow/example/transform/README.md


In [7]:
markdown_str = ""
with open(input_file, 'r') as file:
   markdown_str = file.read()

In [8]:
markdown_str

'# Examples\n## Base Config\nThe base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:\n\n| Parameter | Type | Default | Description |\n| --- | --- | --- | --- |\n| `flow_name` | `str` | [ModelFlow] | The name of the flow to run. |\n| `guided_prompt_template` | `GuidedPrompt` | [Default](../../README.md#2-prompting) | The template to use for the guided prompt. |\n| `num_threads` | `int` | 1 | The number of threads to use. |\n| `model_config` | `ModelConfig` | `ModelConfig` | The model configuration to use. |\n\nHere are the default parameters for the `ModelConfig`:\n\n| Parameter | Type | Default | Description |\n| --- | --- | --- | --- |\n| `model_name` | `str` | `gpt-3.5-turbo-1106` | The name of the model to use. |\n\nThe [model.ipynb notebook](./model.ipynb) shows a basic example of how to use the base `Config`, where it also passes the `OpenAIModelConfig` as a `model_config` argument.\n\n## OpenAIConfig\nThe `OpenAICon

In [9]:
from typing import (Dict, List, TypedDict)

In [10]:
class HeaderType(TypedDict):
    """Header type as typed dict."""

    level: int
    name: str
    data: str

In [11]:
class LineType(TypedDict):
    """Line type as typed dict."""

    metadata: Dict[str, str]
    content: str

In [12]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4")
]

In [13]:
def markdown_splitter(markdown_str, headers_to_split_on=headers_to_split_on):
    # Final output
    lines_with_metadata: List[LineType] = []
    # Content and metadata of the chunk currently being processed
    current_content: List[str] = []
    current_metadata: Dict[str, str] = {}
    # Keep track of the nested header structure
    header_stack: List[HeaderType] = []
    initial_metadata: Dict[str, str] = {}

    markdown_str += "\n# end"
    lines = markdown_str.split("\n")

    for line in lines:
        stripped_line = line.strip()

        for sep, name in headers_to_split_on:
            # Check if line starts with a header that we intend to split on
            if stripped_line.startswith(sep) and (
                # Header with no text OR header is followed by space
                # Both are valid conditions that sep is being used a header
                len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
            ):
                # Ensure we are tracking the header as metadata
                if name is not None:
                    # Get the current header level
                    current_header_level = sep.count("#")

                    # Pop out headers of lower or same level from the stack
                    while (
                        header_stack
                        and header_stack[-1]["level"] >= current_header_level
                    ):
                        # We have encountered a new header
                        # at the same or higher level
                        popped_header = header_stack.pop()
                        # Clear the metadata for the
                        # popped header in initial_metadata
                        if popped_header["name"] in initial_metadata:
                            initial_metadata.pop(popped_header["name"])

                    # Push the current header to the stack
                    header: HeaderType = {
                        "level": current_header_level,
                        "name": name,
                        "data": stripped_line[len(sep) :].strip(),
                    }
                    header_stack.append(header)
                    # Update initial_metadata with the current header
                    initial_metadata[name] = header["data"]

                # Add the previous line to the lines_with_metadata
                # only if current_content is not empty
                if current_content:
                    lines_with_metadata.append(
                        {
                            "content": "\n".join(current_content),
                            "metadata": current_metadata.copy(),
                        }
                    )
                    current_content.clear()

                break
            else:
                if stripped_line:
                    current_content.append(stripped_line)
                    current_metadata = initial_metadata.copy()
    
    return lines_with_metadata

In [14]:
markdown_splitter(markdown_str)

[{'content': '## Base Config', 'metadata': {'Header 1': 'Examples'}},
 {'content': 'The base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:\nThe base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:\nThe base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:\nThe base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:\n| Parameter | Type | Default | Description |\n| Parameter | Type | Default | Description |\n| Parameter | Type | Default | Description |\n| Parameter | Type | Default | Description |\n| --- | --- | --- | --- |\n| --- | --- | --- | --- |\n| --- | --- | --- | --- |\n| --- | --- | --- | --- |\n| `flow_name` | `str` | [ModelFlow] | The name of the flow to run. |\n| `flow_name` | `str` | [ModelFlow] | The name of the flow to run.

In [15]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [16]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_str)
md_header_splits

[Document(page_content='The base `Config` is the base configuration that all other configurations inherit from. Here are the default parameters:  \n| Parameter | Type | Default | Description |\n| --- | --- | --- | --- |\n| `flow_name` | `str` | [ModelFlow] | The name of the flow to run. |\n| `guided_prompt_template` | `GuidedPrompt` | [Default](../../README.md#2-prompting) | The template to use for the guided prompt. |\n| `num_threads` | `int` | 1 | The number of threads to use. |\n| `model_config` | `ModelConfig` | `ModelConfig` | The model configuration to use. |  \nHere are the default parameters for the `ModelConfig`:  \n| Parameter | Type | Default | Description |\n| --- | --- | --- | --- |\n| `model_name` | `str` | `gpt-3.5-turbo-1106` | The name of the model to use. |  \nThe [model.ipynb notebook](./model.ipynb) shows a basic example of how to use the base `Config`, where it also passes the `OpenAIModelConfig` as a `model_config` argument.', metadata={'Header 1': 'Examples', 'He