# Script to write a table of content (TOC) for all example notebooks

In [38]:
# !pip3 install -q uniflow==0.0.24

import pandas as pd
# Adjusting display settings to avoid truncation
pd.set_option('display.max_rows', None)  # Adjust to display all rows
pd.set_option('display.max_columns', None)  # Adjust to display all columns
pd.set_option('display.width', None)  # Adjust to ensure each row uses optimal width
pd.set_option('display.max_colwidth', None)  # Adjust to display full content of each cell

from uniflow.flow.client import TransformClient
from uniflow.flow.flow_factory import FlowFactory
from uniflow.flow.config import TransformConfig
from uniflow.op.model.model_config import OpenAIModelConfig
from uniflow.viz import Viz
from uniflow.op.prompt import PromptTemplate, Context

In [4]:
import os

def list_ipynb_files(directory):
    ipynb_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.ipynb'):
                ipynb_files.append(os.path.join(root, file))
    return ipynb_files

# Replace with the actual path to the cloned 'example' directory
all_ipynb = list_ipynb_files('./')
len(all_ipynb)


39

In [5]:
all_ipynb[:3]

['./example_toc.ipynb',
 './rater/huggingface_classification.ipynb',
 './rater/bedrock_classification.ipynb']

In [9]:
home_html = "https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example"

In [45]:
all_ipynb_htmls = [home_html+ipynb[1:] for ipynb in all_ipynb] ## remove the first file, which is this notebook
all_ipynb_htmls[:3]

['https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/example_toc.ipynb',
 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/huggingface_classification.ipynb',
 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/bedrock_classification.ipynb']

In [46]:
all_ipynb_htmls_context = [Context(context={"filename": c}) for c in all_ipynb_htmls]
all_ipynb_htmls_context[:3]

[Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/example_toc.ipynb'}),
 Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/huggingface_classification.ipynb'}),
 Context(context={'filename': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/bedrock_classification.ipynb'})]

In [50]:
transform_config = TransformConfig(
    flow_name="TransformOpenAIFlow",
    model_config=OpenAIModelConfig(
        response_format={"type": "json_object"},
        temperature=0),
    prompt_template=PromptTemplate(
            instruction="""
            Assume you are an experienced ML technical writer. Here is a list of jupyter notebooks. First, identify the formal title of each notebook, i.e. the notebook header. Only one title per each notebook. Then, provide a 3-sentence, concise and informative summary of each notebook. Next, identify the input data type each notebook is using (including PDF, HTML, TXT, Jupyter Notebook, etc). Last, identify the uniflow model type each notebook is using (such as 'TransformAzureOpenAIFlow', 'TransformGoogleFlow', 'TransformGoogleMultiModalModelFlow','TransformHuggingFaceFlow', 'TransformLMQGFlow', 'TransformOpenAIFlow', etc.)
            Follow the format of the examples below to include each notebook's title, summary, input_data_type, uniflow_type and url in response. 
            """,
            few_shot_prompt=[
                Context(
                    context="""...""",
                    title="""...""",
                    summary="""...""",
                    input_data_type="""...""",
                    uniflow_type="""...""",
                    url="""...""",
                ),
            ],
        )
    )


In [51]:
client = TransformClient(transform_config)
client_output = client.run(all_ipynb_htmls_context[1:5])
client_output

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:10<00:00,  2.62s/it]


[{'output': [{'response': [{'notebooks': [{'title': 'Huggingface Classification',
        'summary': 'This notebook demonstrates how to use Huggingface models for text classification tasks. It includes fine-tuning a pre-trained model on a custom dataset and evaluating its performance.',
        'input_data_type': 'Jupyter Notebook',
        'uniflow_type': 'TransformHuggingFaceFlow',
        'url': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/huggingface_classification.ipynb'}]}],
    'error': 'No errors.'}],
  'root': <uniflow.node.Node at 0x124850d90>},
 {'output': [{'response': [{'few_shot_prompt': [{'context': 'https://github.com/CambioML/uniflow-llm-based-pdf-extraction-text-cleaning-data-clustering/tree/main/example/rater/bedrock_classification.ipynb',
        'title': 'Bedrock Classification',
        'summary': 'This notebook demonstrates the process of classifying different types of bedrock using machine le

In [44]:
import pandas as pd

# Flatten the data
flattened_data = []
for item in client_output:
    for output in item['output']:
        for response in output['response']:
            for notebook in response['notebooks']:
                flattened_data.append({
                    'title': notebook['title'],
                    'summary': notebook['summary'],
                    'input_data_type': notebook['input_data_type'],
                    'uniflow_type': notebook['uniflow_type']
                })

# Create DataFrame
df = pd.DataFrame(flattened_data)
df


Unnamed: 0,title,summary,input_data_type,uniflow_type
0,huggingface_classification.ipynb,"This notebook demonstrates the use of Hugging Face models for text classification tasks. It includes loading a pre-trained model, fine-tuning on a custom dataset, and evaluating the model's performance.",Jupyter Notebook,TransformHuggingFaceFlow
1,bedrock_classification.ipynb,"This notebook demonstrates the process of classifying bedrock types using machine learning. It includes data preprocessing, model training, and evaluation.",Jupyter Notebook,TransformHuggingFaceFlow
2,LLM-based PDF Extraction and Text Cleaning,"This notebook demonstrates the process of using Large Language Models (LLM) for extracting text from PDF documents and cleaning the extracted text. It covers the steps of preprocessing the PDF, extracting text using LLM, and performing text cleaning to improve the quality of the extracted text.",PDF,TransformHuggingFaceFlow
3,Data Clustering with LLM-based Embeddings,"This notebook showcases the application of Large Language Models (LLM) for generating embeddings of text data and using these embeddings for data clustering. It includes the steps of preprocessing the text data, generating LLM-based embeddings, and performing clustering using the embeddings.",Jupyter Notebook,TransformHuggingFaceFlow
4,PDF Extraction and Text Cleaning,"This notebook demonstrates the process of extracting text from PDF documents and cleaning the text data for further analysis. It includes techniques for handling special characters, removing stop words, and performing lemmatization.",PDF,TransformHuggingFaceFlow
5,Data Clustering with LLM-based Model,"This notebook showcases the use of a large language model (LLM) for data clustering tasks. It covers the process of fine-tuning the LLM for clustering, applying it to the input data, and visualizing the clustered results.",Jupyter Notebook,TransformOpenAIFlow


GPT4's output is unstable and the below parsing doesn't work.

In [28]:
title, summary, input_data_type, uniflow_type, url = [], [], [], [], []

for o, u in zip(output, all_ipynb_htmls_context[1:5]):
    title.append(o[0]['output'][0]['response'][0]['title'])
    summary.append(o[0]['output'][0]['response'][0]['summary'])
    input_data_type.append(o[0]['output'][0]['response'][0]['input_data_type'])
    uniflow_type.append(o[0]['output'][0]['response'][0]['uniflow_type'])
    url.append(u)


# Sample data
data = {
    'title': title,
    'summary': summary,
    'input_data_type': input_data_type,
    'uniflow_type': uniflow_type,
    'URL': url
}



# Creating a DataFrame
df = pd.DataFrame(data)
df['URL'] = df['URL'].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')


# Displaying the DataFrame
display(HTML(df.to_html(escape=False)))


KeyError: 0