# L2: Normalizing the Content

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
from dotenv import load_dotenv

In [3]:
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.html import partition_html
from unstructured.staging.base import dict_to_elements, elements_to_json

In [4]:




s = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url = os.getenv("UNSTRUCTURED_API_URL")
    
)

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access Utils File and Helper Functions:</b> To access helper functions and other related files for this notebook, 1) click on the <em>"View"</em> option on the top menu of the notebook and then 2) click on <em>"File Browser"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>


## Example Document: Medium Blog HTML Page

In [None]:
from IPython.display import Image
Image(filename="images/HTML_demo.png", height=600, width=600)

In [5]:
filename = "images/medium_blog.html"
elements = partition_html(filename=filename)

In [7]:
element_dict

[{'type': 'Title',
  'element_id': '58ce54f95c4ba051d7ba46498beed83d',
  'text': 'Open in app',
  'metadata': {'category_depth': 0,
   'link_texts': ['Open in app'],
   'link_urls': ['https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2F6c2659eda4af&%7Efeature=LoOpenInAppButton&%7Echannel=ShowPostUnderCollection&source=---two_column_layout_nav----------------------------------'],
   'last_modified': '2024-08-23T22:59:59',
   'languages': ['eng'],
   'file_directory': 'images',
   'filename': 'medium_blog.html',
   'filetype': 'text/html'}},
 {'type': 'Title',
  'element_id': 'ee5ff1290aadcf1065fd0cd002bad569',
  'text': 'Sign in',
  'metadata': {'category_depth': 0,
   'link_texts': ['Sign in'],
   'link_urls': ['/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2Funstructured-io%2Feffortless-document-extraction-a-guide-to-using-unstructured-api-and-data-connectors-6c2659eda4af&source=post_page---two_column_layout_nav-----------------------global_nav--------

In [8]:
example_output

'[\n  {\n    "type": "UncategorizedText",\n    "element_id": "ad980fa3117623716a040b733c8e84f2",\n    "text": "--",\n    "metadata": {\n      "last_modified": "2024-08-23T22:59:59",\n      "languages": [\n        "eng"\n      ],\n      "parent_id": "304e44851f57298c74adc6c6bd72dfd1",\n      "file_directory": "images",\n      "filename": "medium_blog.html",\n      "filetype": "text/html"\n    }\n  },\n  {\n    "type": "NarrativeText",\n    "element_id": "23a7f3e28178ea0fa2b3e98b0275d2e3",\n    "text": "In the vast digital universe, data is the lifeblood that drives decision-making and innovation. But not all data is created equal. Unstructured data in images and documents often hold a wealth of information that can be challenging to extract and analyze.",\n    "metadata": {\n      "last_modified": "2024-08-23T22:59:59",\n      "languages": [\n        "eng"\n      ],\n      "parent_id": "304e44851f57298c74adc6c6bd72dfd1",\n      "file_directory": "images",\n      "filename": "medium_blog

In [6]:
element_dict = [el.to_dict() for el in elements]
example_output = json.dumps(element_dict[11:15], indent=2)
print(example_output)

[
  {
    "type": "UncategorizedText",
    "element_id": "ad980fa3117623716a040b733c8e84f2",
    "text": "--",
    "metadata": {
      "last_modified": "2024-08-23T22:59:59",
      "languages": [
        "eng"
      ],
      "parent_id": "304e44851f57298c74adc6c6bd72dfd1",
      "file_directory": "images",
      "filename": "medium_blog.html",
      "filetype": "text/html"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "23a7f3e28178ea0fa2b3e98b0275d2e3",
    "text": "In the vast digital universe, data is the lifeblood that drives decision-making and innovation. But not all data is created equal. Unstructured data in images and documents often hold a wealth of information that can be challenging to extract and analyze.",
    "metadata": {
      "last_modified": "2024-08-23T22:59:59",
      "languages": [
        "eng"
      ],
      "parent_id": "304e44851f57298c74adc6c6bd72dfd1",
      "file_directory": "images",
      "filename": "medium_blog.html",
      "filetype": "t

In [None]:
from IPython.core.display import JSON
JSON(example_output)

In [None]:
JSON(json.dumps(element_dict[:], indent=2))

## Example Doc: MSFT PowerPoint on OpenAI

In [None]:
Image(filename="images/pptx_slide.png", height=600, width=600) 

## Example Document: PDF on Chain-of-Thought

In [5]:
Image(filename="images/cot_paper.png", height=600, width=600) 

NameError: name 'Image' is not defined

In [6]:
filename = "images/CoT.pdf"
with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(), 
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    strategy='hi_res',
    pdf_infer_table_structure=True,
    languages=["eng"],
)
try:
    resp = s.general.partition(req)
    print(json.dumps(resp.elements[:3], indent=2))
except SDKError as e:
    print(e)

INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 0
INFO: Concurrency level set to 5
INFO: Splitting pages 1 to 1 (1 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (1) to be split efficiently. Partitioning without split.
INFO: Successfully partitioned the document.


[
  {
    "type": "Title",
    "element_id": "826446fa7830f0352c88808f40b0cc9b",
    "text": "B All Experimental Results",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "CoT.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "055f2fa97fbdee35766495a3452ebd9d",
    "text": "This section contains tables for experimental results for varying models and model sizes, on all benchmarks, for standard prompting vs. chain-of-thought prompting.",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "parent_id": "826446fa7830f0352c88808f40b0cc9b",
      "filename": "CoT.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "9bf5af5255b80aace01b2da84ea86531",
    "text": "For the arithmetic reasoning benchmarks, some chains of thought (along with the equations produced) were correct, except the model p

In [13]:
[x for x in resp.elements if x['type'] == 'NarrativeText']

[{'type': 'NarrativeText',
  'element_id': '055f2fa97fbdee35766495a3452ebd9d',
  'text': 'This section contains tables for experimental results for varying models and model sizes, on all benchmarks, for standard prompting vs. chain-of-thought prompting.',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'parent_id': '826446fa7830f0352c88808f40b0cc9b',
   'filename': 'CoT.pdf'}},
 {'type': 'NarrativeText',
  'element_id': '9bf5af5255b80aace01b2da84ea86531',
  'text': 'For the arithmetic reasoning benchmarks, some chains of thought (along with the equations produced) were correct, except the model performed an arithmetic operation incorrectly. A similar observation was made in Cobbe et al. (2021). Hence, we can further add a Python program as an external calculator (using the Python eval function) to all the equations in the generated chain of thought. When there are multiple equations in a chain of thought, we propagate the external calculat

In [15]:
import chromadb

client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


True

In [16]:
#Set the parameters for chroma
collection = client.create_collection(
    name="winter_sports",
    metadata={"hnsw:space": "cosine"}
)

In [18]:
for element in resp.elements:
    parent_id = element["metadata"].get("parent_id")
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
    
    )

INFO: HTTP Request: GET https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz "HTTP/1.1 200 OK"
/Users/marcushausch/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:10<00:00, 7.88MiB/s]


In [19]:
collection.count()

6

In [20]:
results = collection.peek()
print(results["documents"])

['20', 'This section contains tables for experimental results for varying models and model sizes, on all benchmarks, for standard prompting vs. chain-of-thought prompting.', 'Prior best Prompting N/A (ﬁnetuning) 55a GSM8K SVAMP ASDiv 57.4b 75.3c AQuA 37.9d MAWPS 88.4e UL2 20B Standard Chain of thought 4.4 (+0.3) + ext. calc 4.1 6.9 10.1 12.5 (+2.4) 16.9 (+0.9) 23.6 (+3.1) 28.3 16.0 20.5 34.3 23.6 16.6 19.1 (+2.5) 42.7 LaMDA 137B Standard Chain of thought 14.3 (+7.8) + ext. calc 6.5 17.8 29.5 37.5 (+8.0) 46.6 (+6.5) 20.6 (-4.9) 42.1 40.1 25.5 53.4 20.6 43.2 57.9 (+14.7) 69.3 GPT-3 175B (text-davinci-002) Chain of thought 46.9 (+31.3) 68.9 (+3.2) 71.3 (+1.0) 35.8 (+11.0) 87.1 (+14.4) Standard 15.6 65.7 70.3 24.8 72.7 + ext. calc 49.6 70.3 71.1 35.8 87.5 Codex (code-davinci-002) Chain of thought 63.1 (+43.4) 76.4 (+6.5) 80.4 (+6.4) 45.3 (+15.8) 92.6 (+13.9) Standard 19.7 69.9 74.0 29.5 78.7 + ext. calc 65.4 77.0 80.0 45.3 93.3 PaLM 540B Standard Chain of thought 56.9 (+39.0) 79.0 (+9.6) 7