In [1]:
import requests

In [2]:
import time

def download(method, url, param=None, data=None, headers = None, timeout=1, maxretries=3):
    if headers == None:
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
    try:
        resp = requests.request(method, url, 
                                params=param, data=data, 
                                headers = headers)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and maxretries > 0:
            print(maxretries)
            time.sleep(timeout)
            resp = download(method, url, param, data, headers, timeout, maxretries-1)
        else:
            print(e.response.status_code)
            print(e.response.reason)
    return resp

In [3]:
# 실습할 url

url = "http://pythonscraping.com/pages/page3.html"

In [4]:
resp = download('get', url)

In [5]:
from bs4 import BeautifulSoup

In [6]:
dom = BeautifulSoup(resp.text, 'lxml')

In [7]:
dom

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [8]:
dom.find('div', {"id":'footer'})

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [9]:
footer = dom.find('div', {"id":'footer'})

In [10]:
# paser를 html.parser로 했을 때는 안되었다. -> 바꿀 수 있는 옵션은 lxml 밖에 없다.
# lxml이 제일 좋다. 
# lxml이 안될때 대안으로 html.parser를 써라.

[_.name for _ in footer.find_parents()]

['div', 'body', 'html', '[document]']

# 미션! footer의 형제 노드를 다 가져오기
- 2가지 방법이 있다. footer의 부모를 찾아서 자식들을 찾는 방법
- 또는 footer의 next/previous sibling을 찾는 방법

In [11]:
# footer의 parent를 이용해서 찾는 방법

[_.name for _ in footer.find_parent().find_all(recursive=False)]

['img', 'h1', 'div', 'table', 'div']

In [12]:
# footer의 sibling을 이용해서 찾는 방법
# footer 자기 자신을 제외하고 4개임

[_.name for _ in footer.find_previous_siblings()]

['table', 'div', 'h1', 'img']

In [13]:
# 나의 시도....
# 개행문자가 포함되어서 None이 껴있다.

[tag.name for tag in footer.find_parent().children]

[None, 'img', None, 'h1', None, 'div', None, 'table', None, 'div', None]

In [14]:
# 나의 시도...
# list도 쓸 수 있다.

footer.find_parent().find_all(['h1','img','div', 'p',], recursive=False)

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <h1>Totally Normal Gifts</h1>,
 <div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
 hand-curated by well-paid, free-range Tibetan monks.<p>
 We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
 123 Main St.<br/>
 Abuja, Nigeria
 We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>,
 <div id="footer">
 © Totally Normal Gifts, Inc. <br/>
 +234 (617) 863-0736
 </div>]

# 미션2: 러시안 인형 사진 골라오기

In [15]:
# 이런 방법도 있지만 이건 수업의 목적상 맞지 않음...
# 구조 기반으로 가져오는 방법 연습해야한다.
# footer를 출발점 삼아서 가져와바라.

dom.find_all('img')[2]

<img src="../img/gifts/img2.jpg"/>

In [16]:
# 구조 기반으로 가져오는 방법

footer.find_previous_sibling().find_all("tr")[2].find_all("td")[-1].find()

<img src="../img/gifts/img2.jpg"/>

In [17]:
# 이것저것 혼자 해보자...

len(footer.find_previous_sibling().find_all('tr'))

6

# 미션3: page 내에 있는 이미지를 다 가져와서 url 주소를 만들어라.

In [18]:
image1 = footer.find_previous_sibling().find_all("tr")[2].find_all("td")[-1].find()
image1

<img src="../img/gifts/img2.jpg"/>

In [19]:
image1["src"]

'../img/gifts/img2.jpg'

In [20]:
requests.compat.urljoin(resp.url, image1["src"])

'http://pythonscraping.com/img/gifts/img2.jpg'

In [21]:
# 정답
[requests.compat.urljoin(resp.url, _.find_all('td')[-1].find()["src"]) for _ in footer.find_previous_sibling().find_all("tr")[1:]]

['http://pythonscraping.com/img/gifts/img1.jpg',
 'http://pythonscraping.com/img/gifts/img2.jpg',
 'http://pythonscraping.com/img/gifts/img3.jpg',
 'http://pythonscraping.com/img/gifts/img4.jpg',
 'http://pythonscraping.com/img/gifts/img6.jpg']

In [22]:
images = [requests.compat.urljoin(resp.url, _.find_all('td')[-1].find()["src"]) for _ in footer.find_previous_sibling().find_all("tr")[1:]]

# 페이지 내의 이미지를 자동으로 다 다운받아보자

In [23]:
# download 함수를 써서 이미지를 resopnse 객체로 가져온다.

resp = download('get', images[1])

In [24]:
# url을 보고 확장자를 알 수 있는 경우는 거의 없다. 그래서 header 정보를 보는 법을 알아야한다.
# Content-Type을 보면 확장자를 알 수 있다.
# 이미지이고 확장자는 jpeg이다.

resp.headers

{'Date': 'Sat, 13 Jul 2019 16:18:10 GMT', 'Server': 'Apache', 'Last-Modified': 'Mon, 04 Aug 2014 00:49:04 GMT', 'ETag': '"412006b-e438-4ffc31b072000"', 'Accept-Ranges': 'bytes', 'Content-Length': '58424', 'Cache-Control': 'max-age=1209600', 'Expires': 'Sat, 27 Jul 2019 16:18:10 GMT', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'image/jpeg'}

In [25]:
# 컨텐트의 타입과 확장자 자동으로 확인하기

filetype = resp.headers["Content-Type"].split("/")
filetype

['image', 'jpeg']

In [26]:
# file 이름 자동으로 따오기

resp.url.split('/')[-1].split('.')[0]

'img2'

In [27]:
# filename 자동으로 생성하기

filename = "{0}.{1}".format(resp.url.split('/')[-1].split('.')[0],
                resp.headers["Content-Type"].split("/")[1])
filename

'img2.jpeg'

In [28]:
with open(filename, "wb") as f:
    f.write(resp.content)

In [None]:
# 반복문으로 다 다운받기

for i in images:
    resp = download('get', images[1])
    filetype = resp.headers["Content-Type"].split("/")
    filename = "{0}.{1}".format(resp.url.split('/')[-1].split('.')[0],
                resp.headers["Content-Type"].split("/")[1])
    with open(filename, "wb") as f:
        f.write(resp.content)

# 이제 정규식으로 이미지를 긁어와보자
- 위에 했던 것을 정규식을 이용하는 방법

In [30]:
url ="http://pythonscraping.com/pages/page3.html"

In [31]:
resp = download('get', url)

In [32]:
import re

In [33]:
pattern = r'''img\ssrc="([^"]*?)"'''

In [34]:
imageList = re.findall(pattern, resp.text)[1:]
imageList

['../img/gifts/img1.jpg',
 '../img/gifts/img2.jpg',
 '../img/gifts/img3.jpg',
 '../img/gifts/img4.jpg',
 '../img/gifts/img6.jpg']

In [35]:
for i in imageList:
    image_url = requests.compat.urljoin(resp.url, i)
    image_resp = download('get', image_url)
    filename = "%s.%s"%(i.split('/')[-1].split('.')[0], image_resp.headers["Content-Type"].split('/')[1])
    with open(filename, 'wb') as f:
        f.write(image_resp.content)

# 네이버 이미지 검색에서 긁어오기!

In [36]:
url = "https://search.naver.com/"
params= {'query':'수지'}

In [37]:
resp = download('get', url, param=params)

In [38]:
# 원하는 결과가 안나왔다. 
# 이유는 네이버에서는 where 파라미터가 필수이기 때문이었다. 통합검색 창에 대한 파라미터는 다음과 같다.{where:nexearch}

resp.text

'<script type="text/javascript">window.location.replace("http://www.naver.com/");</script>\n'

In [39]:
url = "https://search.naver.com/search.naver?where=image&sm=tab_jum&query=%EC%88%98%EC%A7%80"

In [40]:
resp = download('get', url, param=params)

In [41]:
resp.text

'<!doctype html> <html lang="ko"> <head> <meta charset="utf-8"> <meta name="referrer" content="always">  <meta name="format-detection" content="telephone=no,address=no,email=no"> <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=2.0"> <meta property="og:title" content="수지 : 네이버 이미지검색"/> <meta property="og:image" content="https://ssl.pstatic.net/sstatic/search/common/og_v3.png"> <meta property="og:description" content="\'수지\'의 네이버 이미지검색 결과입니다."> <meta name="description" lang="ko" content="\'수지\'의 네이버 이미지검색 결과입니다."> <title>수지 : 네이버 이미지검색</title> <link rel="shortcut icon" href="https://ssl.pstatic.net/sstatic/search/favicon/favicon_140327.ico">  <link rel="search" type="application/opensearchdescription+xml" href="https://ssl.pstatic.net/sstatic/search/opensearch-description.https.xml" title="Naver" /><link rel="stylesheet" type="text/css" href="https://ssl.pstatic.net/sstatic/search/pc/css/search1_190711.css"> <link rel="stylesheet" type="text/css" href="h

#### 정규식방식

In [42]:
pattern = r'(<div\sclass="photo_grid _box".*?<div\sclass="more_img">)'

In [43]:
image_box = re.findall(pattern, resp.text)[0]

In [44]:
image_box

'<div class="photo_grid _box"> <div class="img_area _item" data-id="post17664980_226209835"> <a href="#" class="thumb _thumb" onclick="tCR(\'a=img_noc*p.img&r=1&i=post_0d7bb02b8bb8ae24c633d37be0ce060d&g=593781827924496&u=https%3A%2F%2Fm.post.naver.com%2Fviewer%2FpostView.nhn%3FvolumeNo%3D17664980%26memberNo%3D24056119%26vType%3DVERTICAL\');" title="랑콤X수지 새 화보 공개! 맨얼굴도 투명하고 청초해♡ | 포스트" > ]*?alt="(.*?)".*?>'

In [46]:
results = re.findall(pattern2, image_box)

In [47]:
len(results)

50

In [48]:
# 브라우저가 렌더링하기 전 소스를 가져왔기 때문에, 브라우저 개발자도구에서 보는 것과 소스가 다르다.

results

[('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '랑콤X수지 새 화보 공개! 맨얼굴도 투명하고 청초해♡ | 포스트'),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '수지-김현주-효민-송해나, 화보 속 4인 4색 가을 립 메이크업♥ | 포스트'),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '따라하고 싶은 화보 속 여자 연예인 헤어스타일 | 포스트'),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  "[사진]수지, '실루엣도 아름답게' | 포토뉴스"),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '수능 끝난 예비 대학생에 선물을 한다면? | 포스트'),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '수지 머리색 알려주세요 | 지식iN'),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  "[포토]수지 '감탄 나오는 미모' | 포토뉴스"),
 ('data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7',
  '[스타캐스트] 2018년도 수지와 함께 ♥ 2018 시즌 그리팅 비하인드 | 포토뉴스'),
 ('data:image/gif;b

In [49]:
# 소스를 읽어보고 정규식을 다시 만들었다.

pattern3 = r'<img.*?alt="(.*?)".*?data-source="(.*?)(?:.jpg)'

In [50]:
re.findall(pattern3, image_box)

[('랑콤X수지 새 화보 공개! 맨얼굴도 투명하고 청초해♡ | 포스트',
  'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTAxMjNfMjk0%2FMDAxNTQ4MjIzODY0NTg2.cfRgfOKBK_K3bcJJngmxQhChIa19sug0a4X9dCS8-H8g.K-uv243AHElJnMSll0BnQfzSl6rJykqs5iTE6OLy1Z8g.JPEG%2FIf5KFmLIpId-x3RlY9aOfLDb5nTo'),
 ('수지-김현주-효민-송해나, 화보 속 4인 4색 가을 립 메이크업♥ | 포스트',
  'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxODEwMDJfMTEx%2FMDAxNTM4NDQzMTcwOTc3.irzEqni6Giwv-bjdm1iCZPipR6rnq4BwdEgq0b5vkmcg.JPCvpWdA-4D34s8Z1YdBgB1Qa0mTFpEiOidbNaYfeXEg.JPEG%2FIdYZDqEGr6zhQkM0EkEnsYt6jHFk'),
 ('따라하고 싶은 화보 속 여자 연예인 헤어스타일 | 포스트',
  'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTA2MTJfMTcg%2FMDAxNTYwMzIwMzYzNjI0.K24S8Ob5_rsI-NtGkoPUj-ZQ1vJdXGkcXWOwtndGy6Qg.ptXnkAB-u4lsmvhILfhOkxPqECTUAf8lt8wZbI9ydoog.JPEG%2FIMHCv8QlIFyVOoK_WbkxixLwKRaM'),
 ("[사진]수지, '실루엣도 아름답게' | 포토뉴스",
  'https://search.pstatic.net/common/?src=http%3A%2F%2Fimgnews.naver.net%2Fimage%2F109%2F2019%2F05%2F

#### bs4 방식

In [51]:
dom = BeautifulSoup(resp.text, 'lxml')

In [52]:
dom

<!DOCTYPE html>
<html lang="ko"> <head> <meta charset="utf-8"/> <meta content="always" name="referrer"/> <meta content="telephone=no,address=no,email=no" name="format-detection"/> <meta content="width=device-width,initial-scale=1.0,maximum-scale=2.0" name="viewport"/> <meta content="수지 : 네이버 이미지검색" property="og:title"/> <meta content="https://ssl.pstatic.net/sstatic/search/common/og_v3.png" property="og:image"/> <meta content="'수지'의 네이버 이미지검색 결과입니다." property="og:description"/> <meta content="'수지'의 네이버 이미지검색 결과입니다." lang="ko" name="description"/> <title>수지 : 네이버 이미지검색</title> <link href="https://ssl.pstatic.net/sstatic/search/favicon/favicon_140327.ico" rel="shortcut icon"/> <link href="https://ssl.pstatic.net/sstatic/search/opensearch-description.https.xml" rel="search" title="Naver" type="application/opensearchdescription+xml"/><link href="https://ssl.pstatic.net/sstatic/search/pc/css/search1_190711.css" rel="stylesheet" type="text/css"/> <link href="https://ssl.pstatic.net/sstatic/s

In [53]:
dom.find_all('div', {'class':"img_area _item"})

[<div class="img_area _item" data-id="post17664980_226209835"> <a class="thumb _thumb" href="#" onclick="tCR('a=img_noc*p.img&amp;r=1&amp;i=post_0d7bb02b8bb8ae24c633d37be0ce060d&amp;g=593781827924496&amp;u=https%3A%2F%2Fm.post.naver.com%2Fviewer%2FpostView.nhn%3FvolumeNo%3D17664980%26memberNo%3D24056119%26vType%3DVERTICAL');" title="랑콤X수지 새 화보 공개! 맨얼굴도 투명하고 청초해♡ | 포스트"> <img alt="랑콤X수지 새 화보 공개! 맨얼굴도 투명하고 청초해♡ | 포스트" class="_img" data-height="1200" data-source="https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTAxMjNfMjk0%2FMDAxNTQ4MjIzODY0NTg2.cfRgfOKBK_K3bcJJngmxQhChIa19sug0a4X9dCS8-H8g.K-uv243AHElJnMSll0BnQfzSl6rJykqs5iTE6OLy1Z8g.JPEG%2FIf5KFmLIpId-x3RlY9aOfLDb5nTo.jpg&amp;type=b400" data-width="947" onerror="var we=$Element(this); we.addClass('bg_nimg'); we.attr('alt','이미지준비중'); we.attr('src','data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7');" src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIB

In [54]:
len(dom.find_all('div', {'class':"img_area _item"}))

50

In [55]:
dom.find_all('div', {'class':"img_area _item"})[0].find_all('img')[0]['data-source']

'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTAxMjNfMjk0%2FMDAxNTQ4MjIzODY0NTg2.cfRgfOKBK_K3bcJJngmxQhChIa19sug0a4X9dCS8-H8g.K-uv243AHElJnMSll0BnQfzSl6rJykqs5iTE6OLy1Z8g.JPEG%2FIf5KFmLIpId-x3RlY9aOfLDb5nTo.jpg&type=b400'

In [85]:
# list comprehen

[tag.find_all('img')[0]['data-source'] for tag in dom.find_all('div', {'class':"img_area _item"})]

['https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTAxMjNfMjk0%2FMDAxNTQ4MjIzODY0NTg2.cfRgfOKBK_K3bcJJngmxQhChIa19sug0a4X9dCS8-H8g.K-uv243AHElJnMSll0BnQfzSl6rJykqs5iTE6OLy1Z8g.JPEG%2FIf5KFmLIpId-x3RlY9aOfLDb5nTo.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxODEwMDJfMTEx%2FMDAxNTM4NDQzMTcwOTc3.irzEqni6Giwv-bjdm1iCZPipR6rnq4BwdEgq0b5vkmcg.JPCvpWdA-4D34s8Z1YdBgB1Qa0mTFpEiOidbNaYfeXEg.JPEG%2FIdYZDqEGr6zhQkM0EkEnsYt6jHFk.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTA2MTJfMTcg%2FMDAxNTYwMzIwMzYzNjI0.K24S8Ob5_rsI-NtGkoPUj-ZQ1vJdXGkcXWOwtndGy6Qg.ptXnkAB-u4lsmvhILfhOkxPqECTUAf8lt8wZbI9ydoog.JPEG%2FIMHCv8QlIFyVOoK_WbkxixLwKRaM.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fimgnews.naver.net%2Fimage%2F109%2F2019%2F05%2F25%2F0004016315_001_20190525150509282.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.nave

In [78]:
# for문으로 다 찾아오기

results = []
for i in range(len(dom.find_all('div', {'class':'img_area _item'}))):
    results.append(dom.find_all('div', {"class":"img_area _item"})[i].find_all('img')[0]['data-source'])
results

['https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTAxMjNfMjk0%2FMDAxNTQ4MjIzODY0NTg2.cfRgfOKBK_K3bcJJngmxQhChIa19sug0a4X9dCS8-H8g.K-uv243AHElJnMSll0BnQfzSl6rJykqs5iTE6OLy1Z8g.JPEG%2FIf5KFmLIpId-x3RlY9aOfLDb5nTo.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxODEwMDJfMTEx%2FMDAxNTM4NDQzMTcwOTc3.irzEqni6Giwv-bjdm1iCZPipR6rnq4BwdEgq0b5vkmcg.JPCvpWdA-4D34s8Z1YdBgB1Qa0mTFpEiOidbNaYfeXEg.JPEG%2FIdYZDqEGr6zhQkM0EkEnsYt6jHFk.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.naver.net%2FMjAxOTA2MTJfMTcg%2FMDAxNTYwMzIwMzYzNjI0.K24S8Ob5_rsI-NtGkoPUj-ZQ1vJdXGkcXWOwtndGy6Qg.ptXnkAB-u4lsmvhILfhOkxPqECTUAf8lt8wZbI9ydoog.JPEG%2FIMHCv8QlIFyVOoK_WbkxixLwKRaM.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fimgnews.naver.net%2Fimage%2F109%2F2019%2F05%2F25%2F0004016315_001_20190525150509282.jpg&type=b400',
 'https://search.pstatic.net/common/?src=http%3A%2F%2Fpost.phinf.nave