# download 함수 정의

In [1]:
import requests
import time
import json


def download(method, url, params=None, data=None, headers=None, maxretries=4):
    try:
        resp = requests.request(method,
                                url,
                                params=params,
                                data=data,
                                headers=headers)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and maxretries > 0:
            print(maxretries)
            resp = download(method, url, params, data, maxretries - 1)
        else:
            print(e.response.status_code)
            print(e.response.reason)
    return resp

# Naver 통합 검색 결과 중 뉴스 링크 및 제목 가져오기

In [2]:
import re
from html import unescape

In [3]:
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
url = 'https://search.naver.com/search.naver'
params = {'query': '아이린', 'where': 'nexearch'}
resp = download('get', url, params=params, headers=headers)
html = unescape(resp.text)

In [4]:
pattern = r'''<dt><a href="([^"]*news[^"]*)"(?:[^>]*)?title="([^"]*)"'''

# 첫 번째 그룹이 link, 두 번째 그룹이 title

In [5]:
re.findall(pattern, html)

[('http://www.chuksannews.co.kr/news/article.html?no=188624',
  "'올드스쿨' 모델 아이린, 韓 사랑이 다했다? "),
 ('http://www.economytalk.kr/news/articleView.html?idxno=191197',
  '레드벨벳 아이린, 걸그룹의 끝판왕 미모는 이런 것이다'),
 ('http://www.incheonilbo.com/news/articleView.html?idxno=961566',
  "'김창열의 올드스쿨' 하늘은 스무살 때 아이돌 연습생을 ...")]

---

# BeautifulSoup

In [6]:
import bs4
from bs4 import BeautifulSoup

## 예제

In [7]:
html = """
<html>
 <head></head>
 <body>
  <div id="result">
   <p class="row">
    <a class="red">go to page1</a>
    <a class="blue">go to page2</a>
    <div class="test">
     <a href="test.com">test</a>
    </div>
   </p>
  </div>
 </body>
</html>
"""

In [8]:
dom = BeautifulSoup(html, 'html.parser')

In [9]:
dom.select('p > a')

[<a class="red">go to page1</a>, <a class="blue">go to page2</a>]

In [10]:
div = dom.select('div')

In [11]:
div[0]['id']

'result'

In [12]:
[
    tag['class'][0] for tag in dom.p.children
    if type(tag) == bs4.element.Tag and tag.has_attr('class')
]

['red', 'blue', 'test']

In [13]:
dom.find_all('a')

[<a class="red">go to page1</a>,
 <a class="blue">go to page2</a>,
 <a href="test.com">test</a>]

In [14]:
dom.p

<p class="row">
<a class="red">go to page1</a>
<a class="blue">go to page2</a>
<div class="test">
<a href="test.com">test</a>
</div>
</p>

In [15]:
dom.p.find_all('a')

[<a class="red">go to page1</a>,
 <a class="blue">go to page2</a>,
 <a href="test.com">test</a>]

In [16]:
dom.p.find_all('a', recursive=False)

[<a class="red">go to page1</a>, <a class="blue">go to page2</a>]

In [17]:
dom.div.find_all('a')

[<a class="red">go to page1</a>,
 <a class="blue">go to page2</a>,
 <a href="test.com">test</a>]

In [18]:
dom.p.div.a

<a href="test.com">test</a>

In [19]:
p = dom.p

In [20]:
p.a.parent()

[<a class="red">go to page1</a>,
 <a class="blue">go to page2</a>,
 <div class="test">
 <a href="test.com">test</a>
 </div>,
 <a href="test.com">test</a>]

In [21]:
p.a.find_parents()

[<p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 <div class="test">
 <a href="test.com">test</a>
 </div>
 </p>, <div id="result">
 <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 <div class="test">
 <a href="test.com">test</a>
 </div>
 </p>
 </div>, <body>
 <div id="result">
 <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 <div class="test">
 <a href="test.com">test</a>
 </div>
 </p>
 </div>
 </body>, <html>
 <head></head>
 <body>
 <div id="result">
 <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 <div class="test">
 <a href="test.com">test</a>
 </div>
 </p>
 </div>
 </body>
 </html>, 
 <html>
 <head></head>
 <body>
 <div id="result">
 <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 <div class="test">
 <a href="test.com">test</a>
 </div>
 </p>
 </div>
 </body>
 </html>]

In [22]:
parent = dom.p.div.find_parent()

In [23]:
parent

<p class="row">
<a class="red">go to page1</a>
<a class="blue">go to page2</a>
<div class="test">
<a href="test.com">test</a>
</div>
</p>

## 실습용 사이트
http://pythonscraping.com/pages/page3.html

In [24]:
resp = download('get',
                'http://pythonscraping.com/pages/page3.html',
                headers=headers)
dom = BeautifulSoup(resp.text, 'lxml')

In [25]:
result = dom.find('div', {'id': 'footer'})

In [26]:
[tag.name for tag in result.find_parents()]

['div', 'body', 'html', '[document]']

In [27]:
dom.div.find_all('div')[1]

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [28]:
dom.select('div > div#footer')

[<div id="footer">
 © Totally Normal Gifts, Inc. <br/>
 +234 (617) 863-0736
 </div>]

In [29]:
[
    tag.name for tag in result.find_parent().children
    if type(tag) == bs4.element.Tag
]

['img', 'h1', 'div', 'table', 'div']

In [30]:
[tag.name for tag in dom.div.find_all(recursive=False)]

['img', 'h1', 'div', 'table', 'div']

In [31]:
[tag.name for tag in result.find_previous_siblings()]

['table', 'div', 'h1', 'img']

In [32]:
dom.table.find_all('img')[1]

<img src="../img/gifts/img2.jpg"/>

In [33]:
dom.find_all('img')[2]

<img src="../img/gifts/img2.jpg"/>

In [34]:
dom.select('img')[2]

<img src="../img/gifts/img2.jpg"/>

In [35]:
img = [tag for tag in result.find_previous_siblings() if tag.name == 'img'][0]

In [36]:
from requests.compat import urljoin

In [37]:
imgs = [
    urljoin(resp.url, tag['src'])
    for tag in result.find_previous_siblings()[0].find_all('img')
]

In [38]:
imgs

['http://pythonscraping.com/img/gifts/img1.jpg',
 'http://pythonscraping.com/img/gifts/img2.jpg',
 'http://pythonscraping.com/img/gifts/img3.jpg',
 'http://pythonscraping.com/img/gifts/img4.jpg',
 'http://pythonscraping.com/img/gifts/img6.jpg']

In [39]:
import os

if not os.path.exists('이미지/'):
    os.mkdir('이미지')

In [40]:
for i, img in enumerate(imgs):
    resp = download('get', img)
    ext = '.' + resp.headers['Content-Type'].split('/')[-1]
    with open('이미지/' + str(i) + ext, 'wb') as file:
        file.write(resp.content)

In [41]:
resp = download('get',
                'http://pythonscraping.com/pages/page3.html',
                headers=headers)

In [42]:
pattern = r'''<img src="([^"]*)"'''

In [43]:
re.findall(pattern, resp.text)

['../img/gifts/logo.jpg',
 '../img/gifts/img1.jpg',
 '../img/gifts/img2.jpg',
 '../img/gifts/img3.jpg',
 '../img/gifts/img4.jpg',
 '../img/gifts/img6.jpg']

In [44]:
[urljoin(resp.url, link) for link in re.findall(pattern, resp.text)]

['http://pythonscraping.com/img/gifts/logo.jpg',
 'http://pythonscraping.com/img/gifts/img1.jpg',
 'http://pythonscraping.com/img/gifts/img2.jpg',
 'http://pythonscraping.com/img/gifts/img3.jpg',
 'http://pythonscraping.com/img/gifts/img4.jpg',
 'http://pythonscraping.com/img/gifts/img6.jpg']

---

In [45]:
resp = download(
    'get',
    'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=%EC%95%84%EC%9D%B4%EB%A6%B0'
)

In [46]:
dom = BeautifulSoup(resp.text, 'lxml')

In [47]:
result = [(tag['href'], tag['title'])
          for tag in dom.select('li[id^=sp_nws_all] > dl > dt > a')]

In [48]:
result

[('http://www.chuksannews.co.kr/news/article.html?no=188624',
  '\'올드스쿨\' 모델 아이린, 韓 사랑이 다했다? "차세대 리더로 꼽힌 이유는.."'),
 ('http://www.economytalk.kr/news/articleView.html?idxno=191197',
  '레드벨벳 아이린, 걸그룹의 끝판왕 미모는 이런 것이다'),
 ('http://www.gamemeca.com/mv.php?inflow=naver_s&gid=1565982',
  '[공커 위클리] 킹 오브 파이터 올스타 7월 2주차, 쫄쫄이 용병단의 깜짝 습격'),
 ('http://www.wkorea.com/?p=133584', '아이돌이 사랑한 블레이즈 헤어!'),
 ('http://www.incheonilbo.com/news/articleView.html?idxno=961566',
  "'김창열의 올드스쿨' 하늘은 스무살 때 아이돌 연습생을 ...")]

In [49]:
resp = download('get',
                'https://search.naver.com/search.naver',
                params={
                    'where': 'image',
                    'sm': 'tab_jum',
                    'query': '아이린'
                },
                headers=headers)
dom = BeautifulSoup(resp.text, 'lxml')

In [50]:
urls = [
    tag['data-source'] for tag in dom.select('div.photo_grid._box img')
    if tag.has_attr('data-source')
]

In [51]:
for i, url in enumerate(urls):
    resp = download('get', url)
    ext = '.' + resp.headers['Content-Type'].split('/')[-1]
    with open('검색결과/' + str(i) + ext, 'wb') as file:
        file.write(resp.content)

## 정규식으로 네이버 이미지 검색결과 가져오기

In [52]:
pattern = r'''<img src="(?:[^"]*)" class="(?:[^"]*)" alt="(?:[^"]*)" onerror="(?:[^"]*)" data-source="([^"]*)"'''

---

## 네이버 통합 검색 결과 중 뉴스 링크 및 제목만 가져오기

In [54]:
url = 'https://search.naver.com/search.naver'
resp = download('get', url, params=params, headers=headers)
html = unescape(resp.text)

In [55]:
dom = BeautifulSoup(html, 'lxml')

In [56]:
[(tag['href'], tag['title'])
 for tag in dom.select('li[id^=sp_nws_all] dt > a')]

[('http://www.chuksannews.co.kr/news/article.html?no=188624',
  "'올드스쿨' 모델 아이린, 韓 사랑이 다했다? "),
 ('http://www.economytalk.kr/news/articleView.html?idxno=191197',
  '레드벨벳 아이린, 걸그룹의 끝판왕 미모는 이런 것이다'),
 ('http://www.gamemeca.com/mv.php?inflow=naver_s&gid=1565982',
  '[공커 위클리] 킹 오브 파이터 올스타 7월 2주차, 쫄쫄이 용병단의 깜짝 습격'),
 ('http://www.wkorea.com/?p=133584', '아이돌이 사랑한 블레이즈 헤어!'),
 ('http://www.incheonilbo.com/news/articleView.html?idxno=961566',
  "'김창열의 올드스쿨' 하늘은 스무살 때 아이돌 연습생을 ...")]

## 구글 검색결과 가져오기

In [57]:
resp = download('get',
                'https://www.google.com/search',
                params={'q': '아이린'},
                headers=headers)

In [58]:
[(tag['href'], tag.text)
 for tag in BeautifulSoup(resp.text, 'lxml').select('div.r > a')]

[('https://namu.wiki/w/%EC%95%84%EC%9D%B4%EB%A6%B0(%EB%A0%88%EB%93%9C%EB%B2%A8%EB%B2%B3)',
  '아이린(레드벨벳) - 나무위키https://namu.wiki/w/아이린(레드벨벳)'),
 ('https://ko.wikipedia.org/wiki/%EC%95%84%EC%9D%B4%EB%A6%B0_(1991%EB%85%84)',
  '아이린 (1991년) - 위키백과, 우리 모두의 백과사전https://ko.wikipedia.org/wiki/아이린_(1991년)'),
 ('https://twitter.com/search/%23%EC%95%84%EC%9D%B4%EB%A6%B0',
  '#아이린 - Twitter Searchhttps://twitter.com/search/%23아이린'),
 ('https://maeari33.tistory.com/5',
  '아이린 과거와 성형전 썰 - 울려퍼지는 메아리 - 티스토리https://maeari33.tistory.com/5'),
 ('https://www.pinterest.co.kr/dreamtree001/%EC%95%84%EC%9D%B4%EB%A6%B0/',
  '아이린 최고 인기 이미지 1478개 - 2019 | 레드벨벳 아이린, 레드벨벳 및 ...https://www.pinterest.co.kr/dreamtree001/아이린/'),
 ('https://www.pinterest.co.kr/charles780675/%EC%95%84%EC%9D%B4%EB%A6%B0/',
  '아이린 최고 인기 이미지 860개 - 2019 | 레드벨벳 아이린, Kpop 및 레드 ...https://www.pinterest.co.kr/charles780675/아이린/'),
 ('https://gall.dcinside.com/board/lists?id=irene',
  '아이린 갤러리https://gall.dcinside.com/board/lists?id=irene'),
 (