# Use an LLM to Write a Table of Contents (TOC) for All Example Notebooks

In [39]:
!pip3 install -q pandas tabulate uniflow==0.0.29


In [31]:

import pandas as pd
# Display configuration: lift pandas' row/column truncation so frames are shown
# in full, and set the display width to 20 characters.
for _option, _value in (
    ("display.max_rows", None),     # None -> no limit on rows shown
    ("display.max_columns", None),  # None -> no limit on columns shown
    ("display.width", 20),
):
    pd.set_option(_option, _value)
# Optional: uncomment to also display the full content of each cell.
# pd.set_option('display.max_colwidth', None)

from uniflow.flow.client import TransformClient
from uniflow.flow.flow_factory import FlowFactory
from uniflow.flow.config import TransformConfig
from uniflow.op.model.model_config import OpenAIModelConfig
from uniflow.viz import Viz
from uniflow.op.prompt import PromptTemplate, Context

In [4]:
import os

def list_ipynb_files(directory):
    """Recursively collect the paths of all ``.ipynb`` files under ``directory``.

    Jupyter's autosave folders (``.ipynb_checkpoints``) are pruned from the
    walk: they contain checkpoint copies of notebooks, so listing them would
    add entries that do not correspond to real example notebooks.

    Args:
        directory: Root folder to search.

    Returns:
        list[str]: One path per notebook, each built with ``os.path.join``
        from the walked folder and the file name, in ``os.walk`` order.
    """
    ipynb_files = []
    for root, dirs, files in os.walk(directory):
        # Editing ``dirs`` in place tells os.walk not to descend into them.
        dirs[:] = [d for d in dirs if d != '.ipynb_checkpoints']
        for file in files:
            if file.endswith('.ipynb'):
                ipynb_files.append(os.path.join(root, file))
    return ipynb_files

# Walk the current directory, i.e. run this notebook from inside the cloned 'example' folder.
# NOTE: the URL-building cell below drops only the leading '.' of each path, so
# pointing this at a different root also requires changing that cell.
all_ipynb = list_ipynb_files('./')
len(all_ipynb)


25

In [5]:
# Peek at the first three discovered notebook paths
all_ipynb[:3]

['./toc.ipynb',
 './rater/openai_evaluate_answer_completeness_accuracy_for_given_questions.ipynb',
 './rater/openai_compare_generated_answers_to_grounding_answer.ipynb']

In [8]:
# Base GitHub URL of the example folder; each notebook's relative path is appended to it to form its link.
home_url = "https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example"

In [17]:
# Build the GitHub URL of every notebook: dropping the leading "." turns
# "./sub/name.ipynb" into "/sub/name.ipynb", which is appended to home_url.
all_ipynb_urls = [
    f"{home_url}{local_path[1:]}"
    for local_path in all_ipynb
]
all_ipynb_urls[:3]

['https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/toc.ipynb',
 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_evaluate_answer_completeness_accuracy_for_given_questions.ipynb',
 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_compare_generated_answers_to_grounding_answer.ipynb']

In [11]:
# Wrap each notebook URL in a uniflow Context, stored under the "filename" key,
# so the list can be passed to the transform client as input data.
all_ipynb_urls_context = [
    Context(context={"filename": notebook_url})
    for notebook_url in all_ipynb_urls
]
all_ipynb_urls_context[:3]

[Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/toc.ipynb'}),
 Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_evaluate_answer_completeness_accuracy_for_given_questions.ipynb'}),
 Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_compare_generated_answers_to_grounding_answer.ipynb'})]

In [13]:
# Task prompt sent to the model. For each notebook it asks for four things: the
# title, a 3-sentence summary, the input data type, and the uniflow flow type.
# It also asks that the reply follow the few-shot example format and include the url.
instruction = """
Assume you are an experienced ML technical writer known well about uniflow (https://www.cambioml.com/docs/uniflow/uniflow.flow.html#module-uniflow.flow.client). Here is a list of jupyter notebooks using uniflow. For each notebook, do following tasks only once:
1. retrieve the exactly title of each notebook, i.e. the notebook header. Only one title per each notebook. 
2. provide a 3-sentence, concise, unique and informative summary of each notebook. 
3. identify the input data type each notebook is using (including PDF, HTML, TXT, Jupyter Notebook, etc). 
4. identify the uniflow model type each notebook is using (such as 'TransformAzureOpenAIFlow', 'TransformGoogleFlow', 'TransformGoogleMultiModalModelFlow','TransformHuggingFaceFlow', 'TransformLMQGFlow', 'TransformOpenAIFlow', etc.)
Follow the format of the examples below to include each notebook's title, summary, input_data_type, uniflow_type and url in response. 
"""

# OpenAI model settings: request a JSON object as the reply and use temperature 0.
openai_model_config = OpenAIModelConfig(
    response_format={"type": "json_object"},
    temperature=0,
)

# Single few-shot example listing the fields expected for each notebook.
# NOTE(review): every field value here is the literal placeholder "..." -
# presumably elided; confirm whether a real example should be filled in.
few_shot_example = Context(
    context="""...""",
    title="""...""",
    summary="""...""",
    input_data_type="""...""",
    uniflow_type="""...""",
    url="""...""",
)

# Prompt = the task instruction plus the few-shot example.
toc_prompt_template = PromptTemplate(
    instruction=instruction,
    few_shot_prompt=[few_shot_example],
)

# Complete configuration for the "TransformOpenAIFlow" flow.
transform_config = TransformConfig(
    flow_name="TransformOpenAIFlow",
    model_config=openai_model_config,
    prompt_template=toc_prompt_template,
)


In [20]:
# Leave this notebook (toc.ipynb) out of the TOC. It is matched by its URL
# rather than by list position, because os.walk does not guarantee that it is
# the first notebook found.
this_notebook_url = home_url + '/toc.ipynb'
data = [
    ctx
    for url, ctx in zip(all_ipynb_urls, all_ipynb_urls_context)
    if url != this_notebook_url
]

# Run the configured transform flow over every remaining notebook URL.
client = TransformClient(transform_config)
client_output = client.run(data)
# client_output ## uncomment this line to print the raw outputs

## Process the outputs

In [48]:
import pandas as pd

# Fields copied from every notebook entry returned by the model; 'title'
# becomes the index, the other four become the DataFrame columns.
notebook_fields = ('title', 'summary', 'input_data_type', 'uniflow_type', 'url')

# Flatten the nesting client_output -> 'output' -> 'response' -> 'notebooks'
# into one flat record per notebook entry.
flattened_data = [
    {field: notebook[field] for field in notebook_fields}
    for item in client_output
    for output in item['output']
    for response in output['response']
    for notebook in response['notebooks']
]

# Build the table, index it by title and report its (rows, columns) shape.
df = pd.DataFrame(flattened_data).set_index('title')
print(df.shape)


(29, 4)


### Output cleaning
It seems that the LLM makes up some data, so let's remove those fabricated rows from the dataframe. The criterion: if a row's url does not start with `home_url`, the LLM made that row up and we can remove it.

In [24]:
# Show home_url again: the filter below keeps only rows whose url starts with this prefix
home_url

'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example'

In [37]:
# Keep only rows whose 'url' starts with home_url. na=False makes a missing or
# non-string url count as "no match", so such rows are dropped instead of
# putting NaN into the boolean mask (which pandas refuses to index with).
df_filtered = df[df['url'].str.startswith(home_url, na=False)]
print(df_filtered.shape)
df_filtered

(23, 4)


Unnamed: 0_level_0,summary,input_data_type,uniflow_type,url
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OpenAI Evaluate Answer Completeness Accuracy for Given Questions,This notebook uses uniflow to evaluate the completeness and accuracy of answers for given questions using OpenAI model. It provides insights into the performance of the model in generating accurate and complete answers.,Jupyter Notebook,TransformOpenAIFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_evaluate_answer_completeness_accuracy_for_given_questions.ipynb
OpenAI Compare Generated Answers to Grounding Answer,This notebook compares the answers generated by OpenAI language model to a grounding answer for evaluation. It provides a method to assess the quality of the generated answers.,Jupyter Notebook,TransformOpenAIFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/openai_compare_generated_answers_to_grounding_answer.ipynb
Bedrock Evaluate Answer Completeness Accuracy for Given Questions,This notebook uses uniflow to evaluate the completeness and accuracy of answers for given questions using the Bedrock model. It provides insights into the quality of answers and helps in identifying areas for improvement.,Jupyter Notebook,BedrockFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/bedrock_evaluate_answer_completeness_accuracy_for_given_questions.ipynb
Huggingface Evaluate Answer Completeness Accuracy for Given Questions,This notebook uses Huggingface model to evaluate the completeness and accuracy of answers for given questions. It provides a comprehensive analysis of the model's performance in understanding and answering questions.,Jupyter Notebook,TransformHuggingFaceFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/huggingface_evaluate_answer_completeness_accuracy_for_given_questions.ipynb
PDF Extraction and Text Cleaning with Uniflow,"This notebook demonstrates how to use uniflow for PDF extraction and text cleaning, including data clustering for further analysis. It provides a step-by-step guide for processing PDF data and preparing it for clustering.",PDF,TransformLMQGFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/pipeline/pipeline_s3_txt.ipynb
PDF Extraction and Text Cleaning with Data Clustering,"This notebook demonstrates the process of extracting text from PDF documents, cleaning the text data, and clustering the cleaned text data for further analysis. It provides a comprehensive pipeline for preprocessing PDF data and preparing it for downstream tasks.",PDF,TransformLMQGFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/pipeline/pipeline_pdf_extract_transform.ipynb
Pipeline Web Summary,"This notebook demonstrates the use of uniflow for PDF extraction, text cleaning, and data clustering to generate a summary of web content. It showcases the end-to-end pipeline for web content analysis using uniflow.","PDF, HTML",TransformLMQGFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/pipeline/pipeline_web_summary.ipynb
PDF Extraction and Text Cleaning with Data Clustering,"This notebook demonstrates the process of extracting text from PDF documents, cleaning the text data, and clustering the cleaned text data for further analysis.",PDF,TransformLMQGFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/pipeline/pipeline_pdf.ipynb
"LLM Based PDF Extraction, Text Cleaning, Data Clustering","This notebook demonstrates the use of LLM for PDF extraction, text cleaning, and data clustering. It showcases the end-to-end workflow of processing PDF documents, cleaning the text data, and clustering similar documents based on their content.",PDF,TransformLLMFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/vector_database/setup_resources.ipynb
Extract PDF with Recursive Splitter,"This notebook demonstrates how to use uniflow to extract text from PDF files using a recursive splitter, and then clean the extracted text data. It also showcases data clustering techniques to organize the extracted text data.",PDF,TransformLMQGFlow,https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/extract/extract_pdf_with_recursive_splitter.ipynb


## Save to markdown

Now we can save the dataframe to a markdown file and use it in the README file!

In [47]:
# Render the filtered table as Markdown text, keeping the title index as the first column.
df_markdown = df_filtered.to_markdown(index=True)
file_path = 'toc_examples.md'
# Explicit UTF-8 so non-ASCII characters in the LLM-written text are saved the
# same way on every platform instead of using the locale's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(df_markdown)


## End of the notebook

Check out more Uniflow use cases in the [example folder](https://github.com/CambioML/uniflow/tree/main/example/model#examples)!

<a href="https://www.cambioml.com/" title="Title">
    <img src="./image/cambioml_logo_large.png" style="height: 100px; display: block; margin-left: auto; margin-right: auto;"/>
</a>