## Required Libraries

In [None]:
!pip install beautifulsoup4   # HTML parsing library
!pip install lxml             # Fast parser for BeautifulSoup
!pip install html5lib         # Lenient HTML parser (alternative)
!pip install pandas           # For data storage/CSV export
!pip install requests         # For downloading web pages


In [2]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

### Beautiful Soap Objects

In [3]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [4]:
#store in a string as variable 
html = """<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>"""

In [5]:
soup = BeautifulSoup(html, "html.parser")

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>



## Tags

In [7]:
obj = soup.title
print("Tag Objects", obj)

Tag Objects <title>Page Title</title>


In [8]:
print("Type", type(obj))

Type <class 'bs4.element.Tag'>


In [9]:
obj= soup.h3
obj

<h3><b id="boldest">Lebron James</b></h3>

## Children , Parents, Siblings

In [10]:
child = soup.b
child

<b id="boldest">Lebron James</b>

In [11]:
parent_obj = child.parent
parent_obj

<h3><b id="boldest">Lebron James</b></h3>

In [14]:
sibling_1 = parent_obj.next_sibling  # may return '\n'
sibling_1


'\n'

In [15]:
sibling_2 = parent_obj.next_sibling.next_sibling  # actual <p> tag
sibling_2

<p> Salary: $ 92,000,000 </p>

## Html Atrributes

In [17]:
child['id']

'boldest'

In [18]:
child.attrs

{'id': 'boldest'}

In [19]:
child.get('id')

'boldest'

## Filter

In [47]:
%%html
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
   </tr>
  <tr> 
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a> </td>
    <td>80 kg</td>
  </tr>
</table>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg


In [21]:
table = """
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
   </tr>
  <tr> 
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a> </td>
    <td>80 kg</td>
  </tr>
</table> """

In [24]:
table_bs = BeautifulSoup(table , "html.parser")

In [25]:
table_rows=table_bs.find_all('tr')
table_rows

[<tr>
 <td id="flight">Flight No</td>
 <td>Launch site</td>
 <td>Payload mass</td>
 </tr>,
 <tr>
 <td>1</td>
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td>
 <td>300 kg</td>
 </tr>,
 <tr>
 <td>2</td>
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
 <td>94 kg</td>
 </tr>,
 <tr>
 <td>3</td>
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a> </td>
 <td>80 kg</td>
 </tr>]

In [26]:
first_row =table_rows[0]
first_row

<tr>
<td id="flight">Flight No</td>
<td>Launch site</td>
<td>Payload mass</td>
</tr>

In [27]:
print(type(first_row))

<class 'bs4.element.Tag'>


In [28]:
first_row.td

<td id="flight">Flight No</td>

In [29]:
for i,row in enumerate(table_rows):
    print("row",i,"is",row)
    

row 0 is <tr>
<td id="flight">Flight No</td>
<td>Launch site</td>
<td>Payload mass</td>
</tr>
row 1 is <tr>
<td>1</td>
<td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td>
<td>300 kg</td>
</tr>
row 2 is <tr>
<td>2</td>
<td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
<td>94 kg</td>
</tr>
row 3 is <tr>
<td>3</td>
<td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a> </td>
<td>80 kg</td>
</tr>


In [30]:
for i,row in enumerate(table_rows):
    print("row",i)
    cells=row.find_all('td')
    for j,cell in enumerate(cells):
        print('colunm',j,"cell",cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a> </td>
colunm 2 cell <td>80 kg</td>


## Attributes

In [39]:
table_bs.find_all(id = "flight")

[<td id="flight">Flight No</td>]

In [35]:
list_input=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

<h2 id="DSCW">Downloading And Scraping The Contents Of A Web Page</h2> 


In [42]:
url = "https://web.archive.org/web/20230224123642/https://www.ibm.com/us-en/"

In [43]:
data = requests.get(url).text

In [49]:
soup = BeautifulSoup(data, "html.parser")

### scrape links

In [46]:
for link in soup.find_all('a',href=True):  # in html anchor/link is represented by the tag <a>

    print(link.get('href'))


https://web.archive.org/web/20230224123642/https://www.ibm.com/reports/threat-intelligence/
https://web.archive.org/web/20230224123642/https://www.ibm.com/about
https://web.archive.org/web/20230224123642/https://www.ibm.com/consulting/?lnk=flathl
https://web.archive.org/web/20230224123642/https://www.ibm.com/consulting/strategy/?lnk=flathl
https://web.archive.org/web/20230224123642/https://www.ibm.com/consulting/ibmix?lnk=flathl
https://web.archive.org/web/20230224123642/https://www.ibm.com/consulting/technology/
https://web.archive.org/web/20230224123642/https://www.ibm.com/consulting/operations/?lnk=flathl
https://web.archive.org/web/20230224123642/https://www.ibm.com/strategic-partnerships
https://web.archive.org/web/20230224123642/https://www.ibm.com/employment/?lnk=flatitem
https://web.archive.org/web/20230224123642/https://www.ibm.com/impact
https://web.archive.org/web/20230224123642/https://research.ibm.com/
https://web.archive.org/web/20230224123642/https://www.ibm.com/


### scrape images

In [48]:
for link in soup.find_all('img'):# in html image is represented by the tag <img>
    print(link)
    print(link.get('src'))

<img alt="Person standing with arms crossed" aria-describedby="bx--image-1" class="bx--image__img" src="https://web.archive.org/web/20230224123642im_/https://1.dam.s81c.com/p/0a23e414312bcb6f/08196d0e04260ae5_cropped.jpg.global.sr_16x9.jpg"/>
https://web.archive.org/web/20230224123642im_/https://1.dam.s81c.com/p/0a23e414312bcb6f/08196d0e04260ae5_cropped.jpg.global.sr_16x9.jpg
<img alt="Team members at work in a conference room" aria-describedby="bx--image-2" class="bx--image__img" src="https://web.archive.org/web/20230224123642im_/https://1.dam.s81c.com/p/06655c075aa3aa29/CaitOppermann_2019_12_06_IBMGarage_DSC3304.jpg.global.m_16x9.jpg"/>
https://web.archive.org/web/20230224123642im_/https://1.dam.s81c.com/p/06655c075aa3aa29/CaitOppermann_2019_12_06_IBMGarage_DSC3304.jpg.global.m_16x9.jpg
<img alt="Coworkers looking at laptops" aria-describedby="bx--image-3" class="bx--image__img" src="https://web.archive.org/web/20230224123642im_/https://1.dam.s81c.com/p/08f951353c2707b8/052022_CaitOp

### Scrape data from html Tables

In [50]:
#The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [51]:
data = requests.get(url).text

In [52]:
soup = BeautifulSoup(data, "html.parser")

In [55]:
table = soup.find("table")

In [56]:
#Get all rows from the table
for row in table.find_all('tr'): # in html table row is represented by the tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td') # in html a column is represented by the tag <td>
    color_name = cols[2].string # store the value in column 3 as color_name
    color_code = cols[3].string # store the value in column 4 as color_code
    print("{}--->{}".format(color_name,color_code))

Color Name--->None
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF
