# 3. BeautifulSoup 기초
* HTML과 XML 문서를 파싱하기 위한 파이썬 패키지
* https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [1]:
import requests
from bs4 import BeautifulSoup

In [111]:
# Sample HTML document shared by the examples below: a "title" paragraph
# plus a "story" paragraph holding three <a class="food"> links
# (ids link1, link2, link3).
html_doc = """
<html>
<head>
<title>My story</title>
</head>
<body>
<p class="title">My Story</p>
<p class="story">내가 좋아하는 음식
<a href="https://www.pizzahut.co.kr" class="food" id="link1">피자</a>
<a href="https://www.kyochon.com" class="food" id="link2">치킨</a>
<a href="https://www.momstouch.co.kr" class="food" id="link3">햄버거</a>
</p>
</body>
</html>
"""

In [112]:
# Parse html_doc into a BeautifulSoup tree with Python's built-in
# 'html.parser' backend; 'lxml' is the other parser name that can be
# passed here (it requires the lxml package to be installed).
soup = BeautifulSoup(html_doc, 'html.parser')

In [113]:
# Confirm that the parsed document is a bs4.BeautifulSoup object.
type(soup)

bs4.BeautifulSoup

In [114]:
# prettify() renders the parse tree as an indented string, with each tag
# and each piece of text on its own line.
print(soup.prettify())

<html>
 <head>
  <title>
   My story
  </title>
 </head>
 <body>
  <p class="title">
   My Story
  </p>
  <p class="story">
   내가 좋아하는 음식
   <a class="food" href="https://www.pizzahut.co.kr" id="link1">
    피자
   </a>
   <a class="food" href="https://www.kyochon.com" id="link2">
    치킨
   </a>
   <a class="food" href="https://www.momstouch.co.kr" id="link3">
    햄버거
   </a>
  </p>
 </body>
</html>



## find 함수
* 조건을 만족하는 첫 번째 tag만 검색

In [115]:
# find() returns only the first match: here, the first <p> in the document.
soup.find('p')

<p class="title">My Story</p>

In [116]:
# The first <a> tag in document order.
soup.find('a')

<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>

In [117]:
# Keyword arguments filter on attributes: the <a> whose id is "link2".
soup.find('a', id='link2')

<a class="food" href="https://www.kyochon.com" id="link2">치킨</a>

In [118]:
# Several attribute filters can be combined. The keyword is spelled class_
# (trailing underscore) because `class` is a reserved word in Python.
soup.find('a', class_='food', id='link3')

<a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>

In [119]:
# The same filters can be bundled into a dict and handed to find() through
# the attrs= parameter; the result matches the keyword-argument form.
attrs = {'class': 'food', 'id': 'link3'}
soup.find('a', attrs=attrs)

<a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>

## find_all 함수 
* 조건에 맞는 모든 tag를 리스트로 반환

In [120]:
# find_all() collects every <p> tag in the document, not just the first.
soup.find_all('p')

[<p class="title">My Story</p>,
 <p class="story">내가 좋아하는 음식
 <a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>
 <a class="food" href="https://www.kyochon.com" id="link2">치킨</a>
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>
 </p>]

In [121]:
# Walk through every <a> tag in document order and print each one on its
# own line.
for tag in soup.find_all('a'):
    print(tag)

<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="https://www.kyochon.com" id="link2">치킨</a>
<a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>


## get_text 함수
* tag 안의 텍스트를 추출
* 부모 tag의 경우 모든 자손 tag의 텍스트까지 함께 추출

In [122]:
# Locate the tag whose class is "title" and extract the text inside it.
soup.find(class_='title').get_text()

'My Story'

In [123]:
# Print only the text inside each <a> tag, one per line.
for tag in soup.find_all('a'):
    print(tag.get_text())

피자
치킨
햄버거


## attribute값 추출하기
* 검색한 tag에서 attribute값을 추출
* tag['attr명']

In [124]:
# .attrs exposes the tag's attributes as a dict; the class value comes back
# as a list.
soup.find('p').attrs

{'class': ['title']}

In [125]:
# Index a tag like a dict to read one attribute by name.
soup.find('p')['class']

['title']

In [126]:
# All attributes of the first <a> tag: href, class and id.
soup.find('a').attrs

{'href': 'https://www.pizzahut.co.kr', 'class': ['food'], 'id': 'link1'}

In [127]:
# For every <a> tag, print its href and id attributes separated by a space.
for tag in soup.find_all('a'):
    print(tag['href'], tag['id'])

https://www.pizzahut.co.kr link1
https://www.kyochon.com link2
https://www.momstouch.co.kr link3


## select 함수
* select는 CSS Selector로 조건에 맞는 모든 tag를 리스트로 반환 (select_one은 첫 번째 tag만 반환)
* 자손 태그 찾기 - tag1 tag2
* 직계 자식 태그 찾기 - tag1 > tag2
* id 선택자 - #id
* class 선택자 - .class
* 속성값 찾기 - \[name="value"]
  * 속성값 prefix 찾기 - \[name^="value"]
  * 속성값 suffix 찾기 - \[name$="value"]
  * 속성값 포함문자열 찾기 - \[name*="value"]

In [128]:
# select() takes a CSS selector and returns every match as a list.
soup.select('p')

[<p class="title">My Story</p>,
 <p class="story">내가 좋아하는 음식
 <a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>
 <a class="food" href="https://www.kyochon.com" id="link2">치킨</a>
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>
 </p>]

In [129]:
# select_one() returns only the first element matching the selector.
soup.select_one('p')

<p class="title">My Story</p>

In [130]:
# Descendant combinator (space): any <title> nested anywhere under <html>.
soup.select('html title')

[<title>My story</title>]

In [131]:
# Child combinator (>): only direct children match. <title> sits inside
# <head>, not directly under <html>, so the result is an empty list.
soup.select('html > title')

[]

In [132]:
# id selector: the <a> tag whose id is "link1".
soup.select('a#link1')

[<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>]

In [133]:
# Class selector: every tag whose class is "food".
soup.select('.food')

[<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="https://www.kyochon.com" id="link2">치킨</a>,
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>]

In [134]:
# Attribute selector: exact match on the href value.
soup.select('[href="https://www.pizzahut.co.kr"]')

[<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>]

In [136]:
# Prefix match (^=): tags whose href starts with "http".
soup.select('[href^="http"]')

[<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="https://www.kyochon.com" id="link2">치킨</a>,
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>]

In [137]:
# Suffix match ($=): tags whose href ends with "kr".
soup.select('[href$="kr"]')

[<a class="food" href="https://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>]

In [140]:
# Substring match (*=): tags whose href contains "ch".
soup.select('[href*="ch"]')

[<a class="food" href="https://www.kyochon.com" id="link2">치킨</a>,
 <a class="food" href="https://www.momstouch.co.kr" id="link3">햄버거</a>]