# L2: Normalizing the Content

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
from dotenv import load_dotenv

In [3]:
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.html import partition_html
from unstructured.staging.base import dict_to_elements, elements_to_json

In [4]:




s = UnstructuredClient(
    api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
    server_url = os.getenv("UNSTRUCTURED_API_URL")
    
)

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access Utils File and Helper Functions:</b> To access helper functions and other related files for this notebook, 1) click on the <em>"View"</em> option on the top menu of the notebook and then 2) click on <em>"File Browser"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>


## Example Document: Medium Blog HTML Page

In [None]:
from IPython.display import Image
Image(filename="images/HTML_demo.png", height=600, width=600)

In [None]:
filename = "images/medium_blog.html"
elements = partition_html(filename=filename)

In [None]:
element_dict = [el.to_dict() for el in elements]
example_output = json.dumps(element_dict[11:15], indent=2)
print(example_output)

In [None]:
from IPython.core.display import JSON
JSON(example_output)

In [None]:
JSON(json.dumps(element_dict[:], indent=2))

## Example Doc: MSFT PowerPoint on OpenAI

In [None]:
Image(filename="images/pptx_slide.png", height=600, width=600) 

## Example Document: PDF on Chain-of-Thought

In [5]:
Image(filename="images/cot_paper.png", height=600, width=600) 

NameError: name 'Image' is not defined

In [6]:
filename = "images/CoT.pdf"
with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(), 
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    strategy='hi_res',
    pdf_infer_table_structure=True,
    languages=["eng"],
)
try:
    resp = s.general.partition(req)
    print(json.dumps(resp.elements[:3], indent=2))
except SDKError as e:
    print(e)

INFO: Preparing to split document for partition.
INFO: Starting page number set to 1
INFO: Allow failed set to 0
INFO: Concurrency level set to 5
INFO: Splitting pages 1 to 1 (1 total)
INFO: Determined optimal split size of 2 pages.
INFO: Document has too few pages (1) to be split efficiently. Partitioning without split.
INFO: Successfully partitioned the document.


[
  {
    "type": "Title",
    "element_id": "826446fa7830f0352c88808f40b0cc9b",
    "text": "B All Experimental Results",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "CoT.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "055f2fa97fbdee35766495a3452ebd9d",
    "text": "This section contains tables for experimental results for varying models and model sizes, on all benchmarks, for standard prompting vs. chain-of-thought prompting.",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "parent_id": "826446fa7830f0352c88808f40b0cc9b",
      "filename": "CoT.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "9bf5af5255b80aace01b2da84ea86531",
    "text": "For the arithmetic reasoning benchmarks, some chains of thought (along with the equations produced) were correct, except the model p

In [13]:
[x for x in resp.elements if x['type'] == 'NarrativeText']

[{'type': 'NarrativeText',
  'element_id': '055f2fa97fbdee35766495a3452ebd9d',
  'text': 'This section contains tables for experimental results for varying models and model sizes, on all benchmarks, for standard prompting vs. chain-of-thought prompting.',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'parent_id': '826446fa7830f0352c88808f40b0cc9b',
   'filename': 'CoT.pdf'}},
 {'type': 'NarrativeText',
  'element_id': '9bf5af5255b80aace01b2da84ea86531',
  'text': 'For the arithmetic reasoning benchmarks, some chains of thought (along with the equations produced) were correct, except the model performed an arithmetic operation incorrectly. A similar observation was made in Cobbe et al. (2021). Hence, we can further add a Python program as an external calculator (using the Python eval function) to all the equations in the generated chain of thought. When there are multiple equations in a chain of thought, we propagate the external calculat

In [15]:
import chromadb

client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


True

In [16]:
#Set the parameters for chroma
collection = client.create_collection(
    name="winter_sports",
    metadata={"hnsw:space": "cosine"}
)

In [18]:
for element in resp.elements:
    parent_id = element["metadata"].get("parent_id")
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
    
    )

INFO: HTTP Request: GET https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz "HTTP/1.1 200 OK"
/Users/marcushausch/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:10<00:00, 7.88MiB/s]


In [19]:
collection.count()

6