# Example of extracting an HTML file

### Load packages

In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import pprint

# Make the current directory and its two parent directories importable
# (appended in this order, after the existing search-path entries).
for _extra_path in (".", "..", "../.."):
    sys.path.append(_extra_path)

In [2]:
!{sys.executable} -m pip install bs4

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
from uniflow.flow.client import ExtractClient
from uniflow.flow.config import ExtractHTMLConfig
from uniflow.viz import Viz
from uniflow.flow.flow_factory import FlowFactory

# Display the flow names currently registered with FlowFactory.
FlowFactory.list()

  from .autonotebook import tqdm as notebook_tqdm


{'extract': ['ExtractHTMLFlow',
  'ExtractImageFlow',
  'ExtractIpynbFlow',
  'ExtractMarkdownFlow',
  'ExtractPDFFlow',
  'ExtractTxtFlow',
  'ExtractS3TxtFlow'],
 'transform': ['TransformAzureOpenAIFlow',
  'TransformCopyFlow',
  'TransformHuggingFaceFlow',
  'TransformLMQGFlow',
  'TransformOpenAIFlow'],
 'rater': ['RaterFlow']}

### Prepare the input data

Besides loading a local HTML file by `filename`, we can also load an online HTML page by providing its `url`.

In [4]:
# Alternative input: uncomment to extract from an online page via "url" instead of a local file.
# data = [{"url": f'https://github.com/CambioML/uniflow'}]

In [4]:
# Input for the extract client: one entry pointing at a local HTML file
# (path is relative to the directory the notebook is run from).
data = [{"filename": "../transform/data/raw_input/22.11_information-theory.html"}]

### Load the html file via ExtractClient

In [5]:
# Build the extract client from a default-constructed HTML extraction config.
html_config = ExtractHTMLConfig()
client = ExtractClient(html_config)

In [6]:
# Run the extraction on `data`; the extracted text is read further down
# as output[0]['output'][0]['text'].
output = client.run(data)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.72it/s]


### Output

Let's take a look at the extracted output.

In [7]:
# Preview the first 15 extracted chunks. A chunk longer than 100 characters is
# shown as its first 60 characters followed by "..."; shorter chunks are shown
# in full.
text = output[0]['output'][0]['text']
for i, chunk in enumerate(text[:15]):
    preview = chunk[:60] + "..." if len(chunk) > 100 else chunk
    pprint.pprint(f"chunk_{i}: {preview}")

'chunk_0: Quick search'
'chunk_1: Show Source'
'chunk_2: Table Of Contents'
'chunk_3: 1. Introduction\n2. Preliminaries\n2.1. Data Manipulation\n2.2....'
'chunk_4: Table Of Contents'
'chunk_5: 1. Introduction\n2. Preliminaries\n2.1. Data Manipulation\n2.2....'
'chunk_6: Open the notebook in Colab'
'chunk_7: Open the notebook in Colab'
'chunk_8: Open the notebook in Colab'
'chunk_9: Open the notebook in Colab'
'chunk_10: Open the notebook in SageMaker Studio Lab'
'chunk_11: The universe is overflowing with information. Information pr...'
'chunk_12: Section 4.1'
'chunk_13: Section 4.1'
'chunk_14: Consider the following thought experiment. We have a friend ...'


### Comparison with `unstructured`

- Text content: Both `unstructured` and our `ExtractHTMLFlow` perform well.

- Table content: Both `unstructured` and our `ExtractHTMLFlow` perform well.

- List content: Both `unstructured` and our `ExtractHTMLFlow` perform well.

- Code block: Our `ExtractHTMLFlow` performs better.

- Code in text: Both `unstructured` and our `ExtractHTMLFlow` need improvement.

In [8]:
from unstructured.partition.html import partition_html

# Partition the same local HTML file with `unstructured` for comparison.
html_elements = partition_html(filename=data[0]["filename"])

# Print elements 60-79 of the partition; the chunk labels restart from 0.
elements_slice = html_elements[60:80]
for idx, element in enumerate(elements_slice):
    pprint.pprint(f"chunk_{idx}: {element}")

'chunk_0: pytorch'
'chunk_1: mxnet'
'chunk_2: tensorflow'
'chunk_3: import'
'chunk_4: torch'
'chunk_5: from'
'chunk_6: torch.nn'
'chunk_7: import'
'chunk_8: NLLLoss'
'chunk_9: def'
'chunk_10: nansum'
'chunk_11: ):'
'chunk_12: # Define nansum, as pytorch does not offer it inbuilt.'
'chunk_13: return'
'chunk_14: torch'
'chunk_15: isnan'
'chunk_16: )]'
'chunk_17: sum'
'chunk_18: ()'
'chunk_19: def'


## End of the notebook

Check out more Uniflow use cases in the [example folder](https://github.com/CambioML/uniflow/tree/main/example/model#examples)!

<a href="https://www.cambioml.com/" title="Title">
    <img src="../image/cambioml_logo_large.png" style="height: 100px; display: block; margin-left: auto; margin-right: auto;"/>
</a>