# Scraping the Web

In [18]:
from bs4 import BeautifulSoup

# Small HTML page used as the sample document for the cells below.
html = """
<html>
    <head>
        <title>Web title</title>
    </head>
    <body>
        <p id="author">Dong In Lee</p>
        <p id="Subject">Data Mining</p>
        <p class="price">10000 dolors</p>
    </body>
</html>"""
# Parse the markup once; `soup` refers to the resulting DOM tree object.
soup = BeautifulSoup(html, features='html5lib')

## DOM structure object

In [25]:
# Parse the same markup again so the cell displays the whole DOM tree.
BeautifulSoup(html, features='html5lib')

<html><head>
        <title>Web title</title>
    </head>
    <body>
        <p id="author">Dong In Lee</p>
        <p id="Subject">Data Mining</p>
        <p class="price">10000 dolors</p>
    
</body></html>

Find **title**

In [26]:
# First <title> element in the document.
soup.find('title')

<title>Web title</title>

In [27]:
# Text content of the <title> element.
soup.title.get_text()

'Web title'

Find all **p** in **body**

In [32]:
# Every <p> element found under <body>, as a list.
soup.body.find_all('p')

[<p id="author">Dong In Lee</p>,
 <p id="Subject">Data Mining</p>,
 <p class="price">10000 dolors</p>]

In [47]:
soup.find('p').get_text()  # only the first <p> is returned

'Dong In Lee'

Find the text of the second **p** in **body**

In [37]:
# Index 1 selects the second <p> under <body>.
soup.body.find_all('p')[1].get_text()

'Data Mining'

In [39]:
# Same lookup as the previous cell, printed instead of shown as a repr.
print(soup.body.find_all('p')[1].get_text())

Data Mining


Loop over all **p** of **body**

In [42]:
# Print every <p> under <body> together with its zero-based position.
paragraphs = soup.body.find_all('p')
for i, p in enumerate(paragraphs):
    print('paragragh {}: {}'.format(i, p.text))

paragragh 0: Dong In Lee
paragragh 1: Data Mining
paragragh 2: 10000 dolors


Find the value of the first **p**'s **id** attribute

In [49]:
# Subscripting a tag reads one of its attributes.
soup.find('p')['id']

'author'

Find all **p** whose attribute **id** is 'author'

In [55]:
soup.find_all('p', attrs={'id': 'author'})  # the attribute filter is passed as a dictionary

[<p id="author">Dong In Lee</p>]

Find all **p** whose attribute **class** is 'price'

In [56]:
# Same dictionary-style filter, this time on the class attribute.
soup.find_all('p', attrs={'class': 'price'})

[<p class="price">10000 dolors</p>]

Get all the **text** in the document

In [59]:
# Concatenated text of the whole document, whitespace included.
soup.get_text()

'\n        Web title\n    \n    \n        Dong In Lee\n        Data Mining\n        10000 dolors\n    \n'

## JSON

In [65]:
import json
# JSON text for one book record. Note that "publicationYear" is written as a
# JSON string, so it is parsed as a Python str rather than an int.
serialized = """{"title": "Data Mining Book",
                 "author": "Dong In Lee",
                 "publicationYear": "2022",
                 "topics": ["data", "science", "data science"]}"""

# Decode the JSON text into a Python dictionary and print it.
deserialized = json.loads(serialized)
print(deserialized)

{'title': 'Data Mining Book', 'author': 'Dong In Lee', 'publicationYear': '2022', 'topics': ['data', 'science', 'data science']}


In [64]:
# Look up the 'title' key of the parsed dictionary.
deserialized['title']

'Data Mining Book'

In [67]:
# Look up the 'author' key of the parsed dictionary.
deserialized['author']

'Dong In Lee'

In [69]:
# The year comes back as a str because the JSON value was quoted.
deserialized['publicationYear']

'2022'

In [70]:
# The JSON array is parsed into a Python list.
deserialized['topics']

['data', 'science', 'data science']

In [73]:
# Index 2 selects the third entry of the topics list.
deserialized['topics'][2]

'data science'

## XML

In [93]:
from bs4 import BeautifulSoup

# Sample XML document describing one book. It is kept well-formed: every
# element, including <Topics>, is closed by an end tag with the same name,
# so the text can be read by strict XML parsers as well as lenient ones.
xml_text = """
<Book>
    <Title>Data Mining Book</Title>
    <Author>Dong In Lee</Author>
    <PublicationYear>2022</PublicationYear>
    <Topics>
        <Topic>data</Topic>
        <Topic>science</Topic>
        <Topic>data science</Topic>
    </Topics>
</Book>
"""
# 'lxml' selects BeautifulSoup's HTML parser, which is why the printed tree
# has lower-cased tag names and is wrapped in <html><body>.
# NOTE: this rebinds `soup`, replacing the tree parsed from the HTML example.
soup = BeautifulSoup(xml_text, features='lxml')
print(soup)

<html><body><book>
<title>Data Mining Book</title>
<author>Dong In Lee</author>
<publicationyear>2022</publicationyear>
<topics>
<topic>data</topic>
<topic>science</topic>
<topic>data science</topic>
</topics></book>
</body></html>


Find **title** of **book**

In [94]:
# First <title> element under <book>.
soup.book.find('title')

<title>Data Mining Book</title>

In [95]:
# Text content of the book's <title>.
soup.book.title.get_text()

'Data Mining Book'

Find **author** of **book**

In [96]:
# First <author> element under <book>.
soup.book.find('author')

<author>Dong In Lee</author>

In [97]:
# Text content of the book's <author>.
soup.book.author.get_text()

'Dong In Lee'

Find all **topic** under **topics**

In [99]:
# Every <topic> element under <topics>, as a list.
soup.topics.find_all('topic')

[<topic>data</topic>, <topic>science</topic>, <topic>data science</topic>]

Find the last **topic** of **book**

In [101]:
# Index -1 selects the last <topic> in the list.
soup.book.topics.find_all('topic')[-1].get_text()

'data science'

Loop over all **topic** of **book**

In [105]:
# Print every <topic> under <topics> together with its zero-based position.
topic_tags = soup.topics.find_all('topic')
for i, topic in enumerate(topic_tags):
    print("topic= {} : {}".format(i, topic.text))

topic= 0 : data
topic= 1 : science
topic= 2 : data science
