# Extracting Text from Webpages and Images

In [1]:
#Upgrade dependencies
!pip install --upgrade pip
!pip install --upgrade sagemaker
!pip install --upgrade beautifulsoup4
!pip install --upgrade html5lib
!pip install --upgrade requests
!pip install --upgrade textract-trp

Collecting sagemaker
  Downloading sagemaker-2.237.1-py3-none-any.whl.metadata (16 kB)
Downloading sagemaker-2.237.1-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.237.0
    Uninstalling sagemaker-2.237.0:
      Successfully uninstalled sagemaker-2.237.0
Successfully installed sagemaker-2.237.1
Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Installing collected packages: html5lib
Successfully installed html5lib-1.1
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl.metadata (3.0 kB)
Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Installing collected packages: textract-trp
Successfully installed textract-trp-0.1.3


## 1. Extracting information from a webpage

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
# Download the blog landing page. Without a timeout, requests waits forever on
# an unresponsive server and blocks the kernel; 10 seconds is ample here.
page = requests.get('https://aws.amazon.com/blogs/machine-learning/', timeout=10)
# Display the HTTP status code (200 means the page was fetched successfully).
page.status_code

200

In [4]:
# Parse the downloaded bytes with Python's built-in HTML parser.
raw_html = page.content
soup = BeautifulSoup(raw_html, features='html.parser')

In [5]:
# Render the parsed document as indented HTML to inspect its structure.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html class="no-js aws-lng-en_US" data-aws-assets="https://a0.awsstatic.com" data-css-version="1.0.538" data-js-version="1.0.681" data-static-assets="https://a0.awsstatic.com" lang="en-US" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   AWS Machine Learning Blog
  </title>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="default-src 'self' data: https://a0.awsstatic.com https://prod.us-east-1.ui.gcr-chat.marketing.aws.dev; base-uri 'none'; connect-src 'self' *.akamaized.net *.googlevideo.com/videoplayback https://*.analytics.console.aws.a2z.com https://*.harmony.a2z.com https://*.marketing.aws.dev https://*.panorama.console.api.aws https://*.prod.chc-features.uxplatform.aws.dev https://112-tzm-766.mktoresp.com https://112-tzm-766.mktoutil.com https://a0.awsstatic.com https://a0.p.awsstatic.com https://a1.awsstatic.com https://ama

In [6]:
# Text content of the page's <title> element.
title_tag = soup.title
print(title_tag.text)

AWS Machine Learning Blog


In [7]:
# Attribute access by tag name yields the first matching element, so this is
# the first <article> (blog post) on the page.
first_article = soup.article
print(first_article.prettify())

<article class="blog-post" typeof="TechArticle" vocab="https://schema.org/">
 <meta content="en-US" property="inLanguage"/>
 <meta content="https://d2908q01vomqb2.cloudfront.net/f1f836cb4ea6efb2a0b1b99f41ad8b103eff4b59/2024/12/17/featured-images-ML-17989-1120x630.jpg" property="image"/>
 <div class="lb-row lb-snap">
  <div class="lb-col lb-mid-6 lb-tiny-24">
   <a href="https://aws.amazon.com/blogs/machine-learning/simplify-multimodal-generative-ai-with-amazon-bedrock-data-automation/" property="url" rel="bookmark">
    <img alt="" class="attachment-large size-large wp-post-image" height="576" src="https://d2908q01vomqb2.cloudfront.net/f1f836cb4ea6efb2a0b1b99f41ad8b103eff4b59/2024/12/17/featured-images-ML-17989-1024x576.jpg" width="1024"/>
   </a>
  </div>
  <div class="lb-col lb-mid-18 lb-tiny-24">
   <h2 class="lb-bold blog-post-title">
    <a href="https://aws.amazon.com/blogs/machine-learning/simplify-multimodal-generative-ai-with-amazon-bedrock-data-automation/" property="url" rel

In [8]:
# The post headline lives in a <span> inside the article's <h2>.
headline_span = soup.article.h2.span
print(headline_span.prettify())

<span property="name headline">
 Simplify multimodal generative AI with Amazon Bedrock Data Automation
</span>



In [9]:
# Same headline element, but only its text without the surrounding markup.
headline_span = soup.article.h2.span
print(headline_span.text)

Simplify multimodal generative AI with Amazon Bedrock Data Automation


In [10]:
# Publication date text from the article's <time> element.
published_tag = soup.article.time
print(published_tag.text)

17 DEC 2024


In [11]:
# Summary text: the first <p> inside the article's <section>.
summary_paragraph = soup.article.section.p
print(summary_paragraph.text)

Amazon Bedrock Data Automation in public preview, offers a unified experience for developers of all skillsets to easily automate the extraction, transformation, and generation of relevant insights from documents, images, audio, and videos to build generative AI–powered applications. In this post, we demonstrate how to use Amazon Bedrock Data Automation in the AWS Management Console and the AWS SDK for Python (Boto3) for media analysis and intelligent document processing (IDP) workflows.


In [12]:
# Markup of the byline: the first <span> in the article's <footer>, which
# wraps one nested author span per author.
byline_span = soup.article.footer.span
print(byline_span.prettify())

<span>
 by
 <span property="author" typeof="Person">
  <span property="name">
   Ian Lodge
  </span>
 </span>
 ,
 <span property="author" typeof="Person">
  <span property="name">
   Alex Pieri
  </span>
 </span>
 , and
 <span property="author" typeof="Person">
  <span property="name">
   Raj Pathak
  </span>
 </span>
</span>



In [13]:
# URL of the post: the href attribute of the first link in the article's <div>.
post_link = soup.article.div.a
print(post_link['href'])

https://aws.amazon.com/blogs/machine-learning/simplify-multimodal-generative-ai-with-amazon-bedrock-data-automation/


In [14]:
# Print a short summary (title, byline, date, teaser, link) for every post
# found on the page.
for article in soup.find_all('article'):
    print('==========================================')
    print(article.h2.span.text)
    # Each author is a <span property="author"> that wraps a name <span>;
    # skip author spans that have no nested span.
    author_names = [
        author.span.text
        for author in article.footer.find_all('span', {"property": "author"})
        if author.span is not None
    ]
    # Joining the names avoids the dangling ", " that printing each author
    # with end=', ' left in front of "on <date>".
    print(f"by {', '.join(author_names)} on {article.time.text}")
    print(article.section.p.text)
    print(article.div.a['href'])
    

Simplify multimodal generative AI with Amazon Bedrock Data Automation
by Ian Lodge, Alex Pieri, Raj Pathak, on 17 DEC 2024
Amazon Bedrock Data Automation in public preview, offers a unified experience for developers of all skillsets to easily automate the extraction, transformation, and generation of relevant insights from documents, images, audio, and videos to build generative AI–powered applications. In this post, we demonstrate how to use Amazon Bedrock Data Automation in the AWS Management Console and the AWS SDK for Python (Boto3) for media analysis and intelligent document processing (IDP) workflows.
https://aws.amazon.com/blogs/machine-learning/simplify-multimodal-generative-ai-with-amazon-bedrock-data-automation/
How TUI uses Amazon Bedrock to scale content creation and enhance hotel descriptions in under 10 seconds
by Hin Yee Liu, Nikolaos Zavitsanos, on 17 DEC 2024
TUI Group is one of the world’s leading global tourism services, providing 21 million customers with an unmatch

In [15]:
# Collect one row per post in the order
# [title, authors, published, summary, link].
blog_posts = []
for article in soup.find_all('article'):
    # Each author is a <span property="author"> that wraps a name <span>;
    # skip author spans that have no nested span.
    author_names = [
        author.span.text
        for author in article.footer.find_all('span', {"property": "author"})
        if author.span is not None
    ]
    blog_posts.append([
        article.h2.span.text,
        ', '.join(author_names),
        article.time.text,
        article.section.p.text,
        article.div.a['href'],
    ])
    

In [16]:
import pandas as pd
import time

In [17]:
# Column labels follow the order in which each blog_posts row was assembled.
column_names = ['title', 'authors', 'published', 'summary', 'link']
df = pd.DataFrame(blog_posts, columns=column_names)

In [18]:
# Convert date strings such as "17 DEC 2024" (day, abbreviated month, year)
# into datetime values.
published_dates = pd.to_datetime(df['published'], format='%d %b %Y')
df['published'] = published_dates

In [19]:
# Lift the column-width limit so long summaries and links are shown in full.
# (The bare read of pd.options.display.max_rows that preceded this had no
# effect: it was neither assigned nor the cell's last expression.)
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,title,authors,published,summary,link
0,Simplify multimodal generative AI with Amazon Bedrock Data Automation,"Ian Lodge, Alex Pieri, Raj Pathak",2024-12-17,"Amazon Bedrock Data Automation in public preview, offers a unified experience for developers of all skillsets to easily automate the extraction, transformation, and generation of relevant insights from documents, images, audio, and videos to build generative AI–powered applications. In this post, we demonstrate how to use Amazon Bedrock Data Automation in the AWS Management Console and the AWS SDK for Python (Boto3) for media analysis and intelligent document processing (IDP) workflows.",https://aws.amazon.com/blogs/machine-learning/simplify-multimodal-generative-ai-with-amazon-bedrock-data-automation/
1,How TUI uses Amazon Bedrock to scale content creation and enhance hotel descriptions in under 10 seconds,"Hin Yee Liu, Nikolaos Zavitsanos",2024-12-17,"TUI Group is one of the world’s leading global tourism services, providing 21 million customers with an unmatched holiday experience in 180 regions. The TUI content teams are tasked with producing high-quality content for its websites, including product details, hotel information, and travel guides, often using descriptions written by hotel and third-party partners. In this post, we discuss how we used Amazon SageMaker and Amazon Bedrock to build a content generator that rewrites marketing content following specific brand and style guidelines.",https://aws.amazon.com/blogs/machine-learning/how-tui-uses-amazon-bedrock-to-scale-content-creation-and-enhance-hotel-descriptions-in-under-10-seconds/
2,Llama 3.3 70B now available in Amazon SageMaker JumpStart,"Marc Karp, Adriana Simmons, Lokeshwaran Ravi, Melanie Li, Saurabh Trikande, Yotam Moss",2024-12-16,"Today, we are excited to announce that the Llama 3.3 70B from Meta is available in Amazon SageMaker JumpStart. Llama 3.3 70B marks an exciting advancement in large language model (LLM) development, offering comparable performance to larger Llama versions with fewer computational resources. In this post, we explore how to deploy this model efficiently on Amazon SageMaker AI, using advanced SageMaker AI features for optimal performance and cost management.",https://aws.amazon.com/blogs/machine-learning/llama-3-3-70b-now-available-in-amazon-sagemaker-jumpstart/
3,AWS re:Invent 2024 Highlights: Top takeaways from Swami Sivasubramanian to help customers manage generative AI at scale,Swami Sivasubramanian,2024-12-16,"We spoke with Dr. Swami Sivasubramanian, Vice President of Data and AI, shortly after AWS re:Invent 2024 to hear his impressions—and to get insights on how the latest AWS innovations help meet the real-world needs of customers as they build and scale transformative generative AI applications.",https://aws.amazon.com/blogs/machine-learning/aws-reinvent-2024-highlights-top-takeaways-from-swami-sivasubramanian-to-help-customers-manage-generative-ai-at-scale/
4,Multi-tenant RAG with Amazon Bedrock Knowledge Bases,"Emanuele Levi, Dani Mitchell, Mehran Nikoo",2024-12-16,"Organizations are continuously seeking ways to use their proprietary knowledge and domain expertise to gain a competitive edge. With the advent of foundation models (FMs) and their remarkable natural language processing capabilities, a new opportunity has emerged to unlock the value of their data assets. As organizations strive to deliver personalized experiences to customers using […]",https://aws.amazon.com/blogs/machine-learning/multi-tenant-rag-with-amazon-bedrock-knowledge-bases/


## 2. Extracting text from images

In [20]:
import boto3

In [21]:
# S3 bucket and object key of the first image to send to Amazon Textract.
s3BucketName = 'c144486a3735933l8791011t1w547619329106-labbucket-ce3vqwekw7af'
documentName = 'lab31/simple-document-image.jpg'

In [22]:
# Create the Amazon Textract client.
textract = boto3.client('textract')

# Ask Textract to detect the text in the image stored at the given S3 location.
response = textract.detect_document_text(
    Document=dict(S3Object=dict(Bucket=s3BucketName, Name=documentName))
)

# Dump the raw response to inspect its block structure.
print(response)

{'DocumentMetadata': {'Pages': 1}, 'Blocks': [{'BlockType': 'PAGE', 'Geometry': {'BoundingBox': {'Width': 1.0, 'Height': 1.0, 'Left': 0.0, 'Top': 0.0}, 'Polygon': [{'X': 0.0, 'Y': 0.0}, {'X': 1.0, 'Y': 0.0}, {'X': 1.0, 'Y': 1.0}, {'X': 0.0, 'Y': 1.0}]}, 'Id': 'a4379c6b-9457-4390-b2f0-12f7a96d1425', 'Relationships': [{'Type': 'CHILD', 'Ids': ['65392e62-580a-4892-af87-75610b112316', 'b2f25431-72e9-4d94-b187-924c6bfba50f', '94110839-2008-4ae6-82c0-83845503f7f3', '564b686a-60e2-41bb-b929-53adc4397439']}]}, {'BlockType': 'LINE', 'Confidence': 99.52398681640625, 'Text': 'Amazon.com, Inc. is located in Seattle, WA', 'Geometry': {'BoundingBox': {'Width': 0.512660026550293, 'Height': 0.06824082136154175, 'Left': 0.06333211064338684, 'Top': 0.1989629715681076}, 'Polygon': [{'X': 0.06337157636880875, 'Y': 0.20793944597244263}, {'X': 0.5759921669960022, 'Y': 0.1989629715681076}, {'X': 0.5759671330451965, 'Y': 0.2590251564979553}, {'X': 0.06333211064338684, 'Y': 0.26720380783081055}]}, 'Id': '65392

In [23]:
# Print every detected line of text and accumulate them into one string.
print("\nText\n========")
line_texts = [
    block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"
]
for line_text in line_texts:
    # ANSI escape codes: switch to blue for the line, then reset the colour.
    print('\033[94m' + line_text + '\033[0m')
# Every line is prefixed with a space, so the result starts with one as well.
text = "".join(" " + line_text for line_text in line_texts)


Text
[94mAmazon.com, Inc. is located in Seattle, WA[0m
[94mIt was founded July 5th, 1994 by Jeff Bezos[0m
[94mAmazon.com allows customers to buy everything from books to blenders[0m
[94mSeattle is north of Portland and south of Vancouver, BC.[0m


In [24]:
# Switch to the second sample in the same bucket: an employment application.
documentName = 'lab31/employmentapp.png'

In [25]:
# Analyze the document with the existing Textract client, requesting only
# table extraction.
source_document = {'S3Object': {'Bucket': s3BucketName, 'Name': documentName}}
response = textract.analyze_document(
    Document=source_document,
    FeatureTypes=["TABLES"],
)


In [26]:
from trp import Document

# Wrap the raw Textract response so its tables can be walked page by page.
doc = Document(response)

# Print every cell as Table[row][column]; the indices restart for each table.
# The page loop variable is named doc_page so it does not overwrite `page`,
# the requests response used by the web-scraping cells above.
for doc_page in doc.pages:
    for table in doc_page.tables:
        for row_idx, row in enumerate(table.rows):
            for col_idx, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(row_idx, col_idx, cell.text))

Table[0][0] = Applicant 
Table[0][1] = Information 
Table[1][0] = Full Name: Jane 
Table[1][1] = Doe 
Table[2][0] = Phone Number: 
Table[2][1] = 555-0100 
Table[3][0] = Home Address: 
Table[3][1] = 123 Any Street, Any Town, USA 
Table[4][0] = Mailing Address: 
Table[4][1] = same as home address 
Table[0][0] = 
Table[0][1] = 
Table[0][2] = Previous Employment 
Table[0][3] = History 
Table[0][4] = 
Table[1][0] = Start Date 
Table[1][1] = End Date 
Table[1][2] = Employer Name 
Table[1][3] = Position Held 
Table[1][4] = Reason for leaving 
Table[2][0] = 1/15/2009 
Table[2][1] = 6/30/2011 
Table[2][2] = AnyCompany 
Table[2][3] = Assistant Baker 
Table[2][4] = Family relocated 
Table[3][0] = 7/1/2011 
Table[3][1] = 8/10/2013 
Table[3][2] = AnyCompany Bread 
Table[3][3] = Baker 
Table[3][4] = Better opportunity 
Table[4][0] = 8/15/2013 
Table[4][1] = present 
Table[4][2] = Example Corp. 
Table[4][3] = Head Baker 
Table[4][4] = N/A, current employer 
