# Requests and BeautifulSoup

- requests로 데이터를 요청 후 BeautifulSoup으로 파싱

In [5]:
from IPython.display import Image

In [6]:
import requests
from bs4 import BeautifulSoup

In [7]:
res = requests.get("https://www.naver.com/")

In [8]:
res.text

'\n<!doctype html>                          <html lang="ko" data-dark="false"> <head> <meta charset="utf-8"> <title>NAVER</title> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <meta name="viewport" content="width=1190"> <meta name="apple-mobile-web-app-title" content="NAVER"/> <meta name="robots" content="index,nofollow"/> <meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/> <meta property="og:title" content="네이버"> <meta property="og:url" content="https://www.naver.com/"> <meta property="og:image" content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png"> <meta property="og:description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/> <meta name="twitter:card" content="summary"> <meta name="twitter:title" content=""> <meta name="twitter:url" content="https://www.naver.com/"> <meta name="twitter:image" content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png"> <meta name="twitter:description" content="네이버

- BeautifulSoup을 통해서 원하는 text만 가져오려고 함.

## request

In [10]:
res = requests.get('https://www.python.org/')

In [11]:
res.text



In [12]:
res.status_code

200

In [13]:
res.ok

True

### requests 옵션

In [14]:
res = requests.get('https://www.python.org/', timeout = 1)
res.ok

True

In [17]:
res = requests.get('https://www.python.org/', timeout = 0.001)
res.ok

ConnectTimeout: HTTPSConnectionPool(host='www.python.org', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x107aef700>, 'Connection to www.python.org timed out. (connect timeout=0.001)'))

In [18]:
res = requests.get('https://www.python.org/', timeout = 1)
res.ok

True

In [19]:
res.status_code

200

In [20]:
res.encoding

'utf-8'

In [21]:
res.request

<PreparedRequest [GET]>

## BeautifulSoup 기초

In [22]:
from bs4 import BeautifulSoup

In [23]:
text = """
<html>
<head>
<title>Ma:deri</title>
</head>
<body>
<h1> maderi h1 </h1>
<h2> maderi h2 </h2>
<p class="title"><b>BeautifulSoup</b></p>
<p class="content">contents
<a href="http://www.maderi.co.kr/" class="m_main" id="maderi">Maderi</a>
<a href="http://datamarketing.co.kr/" class="d_main" id="dmk" val="home">DMK</a>
</p>
<p class="call">call me...</p>
<div>
    <table>
        <thead></thead>
        <tbody>
            <tr>
                <td>철수</td>
                <td>24</td>
            </tr>
            <tr>
                <td>영희</td>
                <td>31</td>
            </tr>
        </tbody>
    </table>
</div>
</body>
</html>
"""

In [24]:
soup = BeautifulSoup(text, 'html.parser')

In [25]:
soup


<html>
<head>
<title>Ma:deri</title>
</head>
<body>
<h1> maderi h1 </h1>
<h2> maderi h2 </h2>
<p class="title"><b>BeautifulSoup</b></p>
<p class="content">contents
<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>
<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>
</p>
<p class="call">call me...</p>
<div>
<table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>
</div>
</body>
</html>

In [27]:
print(soup.prettify())

<html>
 <head>
  <title>
   Ma:deri
  </title>
 </head>
 <body>
  <h1>
   maderi h1
  </h1>
  <h2>
   maderi h2
  </h2>
  <p class="title">
   <b>
    BeautifulSoup
   </b>
  </p>
  <p class="content">
   contents
   <a class="m_main" href="http://www.maderi.co.kr/" id="maderi">
    Maderi
   </a>
   <a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">
    DMK
   </a>
  </p>
  <p class="call">
   call me...
  </p>
  <div>
   <table>
    <thead>
    </thead>
    <tbody>
     <tr>
      <td>
       철수
      </td>
      <td>
       24
      </td>
     </tr>
     <tr>
      <td>
       영희
      </td>
      <td>
       31
      </td>
     </tr>
    </tbody>
   </table>
  </div>
 </body>
</html>



### body, head 접근

In [28]:
h1 = soup.html.body.h1
h2 = soup.html.body.h2.text

In [29]:
print(h1)
print(h2)

<h1> maderi h1 </h1>
 maderi h2 


In [31]:
p = soup.html.body.p
print(p)
print()
p2 = p.next_sibling.next_sibling
print(p2)
print(p2.next_elements)

<p class="title"><b>BeautifulSoup</b></p>

<p class="content">contents
<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>
<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>
</p>
<generator object PageElement.next_elements at 0x120c58f20>


In [32]:
p

<p class="title"><b>BeautifulSoup</b></p>

In [33]:
p2

<p class="content">contents
<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>
<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>
</p>

In [34]:
print(list(p2.next_elements))

['contents\n', <a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>, 'Maderi', '\n', <a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>, 'DMK', '\n', '\n', <p class="call">call me...</p>, 'call me...', '\n', <div>
<table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>
</div>, '\n', <table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>, '\n', <thead></thead>, '\n', <tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>, '\n', <tr>
<td>철수</td>
<td>24</td>
</tr>, '\n', <td>철수</td>, '철수', '\n', <td>24</td>, '24', '\n', '\n', <tr>
<td>영희</td>
<td>31</td>
</tr>, '\n', <td>영희</td>, '영희', '\n', <td>31</td>, '31', '\n', '\n', '\n', '\n', '\n', '\n', '\n']


In [35]:
from pprint import pprint

In [36]:
pprint(list(p2.next_elements))

['contents\n',
 <a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>,
 'Maderi',
 '\n',
 <a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>,
 'DMK',
 '\n',
 '\n',
 <p class="call">call me...</p>,
 'call me...',
 '\n',
 <div>
<table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>
</div>,
 '\n',
 <table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>,
 '\n',
 <thead></thead>,
 '\n',
 <tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>,
 '\n',
 <tr>
<td>철수</td>
<td>24</td>
</tr>,
 '\n',
 <td>철수</td>,
 '철수',
 '\n',
 <td>24</td>,
 '24',
 '\n',
 '\n',
 <tr>
<td>영희</td>
<td>31</td>
</tr>,
 '\n',
 <td>영희</td>,
 '영희',
 '\n',
 <td>31</td>,
 '31',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n']


### BeautifulSoup find

In [38]:
soup


<html>
<head>
<title>Ma:deri</title>
</head>
<body>
<h1> maderi h1 </h1>
<h2> maderi h2 </h2>
<p class="title"><b>BeautifulSoup</b></p>
<p class="content">contents
<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>
<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>
</p>
<p class="call">call me...</p>
<div>
<table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>
</div>
</body>
</html>

In [39]:
a = soup.find("a")
pprint(a)

<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>


In [40]:
a = soup.find("a")
print(a.text)

Maderi


In [42]:
print(a.get('href'))

http://www.maderi.co.kr/


In [43]:
a = soup.find_all("a")

In [44]:
a

[<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>,
 <a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>]

In [45]:
print(a[0].get('href'))

http://www.maderi.co.kr/


In [46]:
print(a[1].get('href'))

http://datamarketing.co.kr/


In [47]:
a2 = soup.find_all("a", class_ = 'm_main')
a2

[<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>]

In [48]:
a = soup.find("a")
print(a.text)
print(a.get('href'))

Maderi
http://www.maderi.co.kr/


In [49]:
a2 = soup.find('a', {'class' : 'd_main', 'val' : 'home'})
a2

<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>

### BeautifulSoup select

In [50]:
soup


<html>
<head>
<title>Ma:deri</title>
</head>
<body>
<h1> maderi h1 </h1>
<h2> maderi h2 </h2>
<p class="title"><b>BeautifulSoup</b></p>
<p class="content">contents
<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>
<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>
</p>
<p class="call">call me...</p>
<div>
<table>
<thead></thead>
<tbody>
<tr>
<td>철수</td>
<td>24</td>
</tr>
<tr>
<td>영희</td>
<td>31</td>
</tr>
</tbody>
</table>
</div>
</body>
</html>

In [51]:
soup.find('p')

<p class="title"><b>BeautifulSoup</b></p>

In [54]:
soup.select_one('p.title > b') # . ==> class

<b>BeautifulSoup</b>

In [55]:
soup.select('#dmk') # # ==> id

[<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>]

In [56]:
soup.select("a[val = 'home']")

[<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>]

In [57]:
url = soup.select('p.content > a')
url

[<a class="m_main" href="http://www.maderi.co.kr/" id="maderi">Maderi</a>,
 <a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>]

In [68]:
url2 = soup.select('p.content > a:nth-of-type(2)')[0]
url2

<a class="d_main" href="http://datamarketing.co.kr/" id="dmk" val="home">DMK</a>

In [69]:
url2.get('href')

'http://datamarketing.co.kr/'

In [71]:
soup.select('p.call')[0].text

'call me...'

## Requests and BeautifulSoup 실습

In [126]:
import requests 
from bs4 import BeautifulSoup

In [127]:
url = "https://www.smu.ac.kr/lounge/qna/faq.do"

In [128]:
res = requests.get(url)
res.ok

True

In [77]:
res.text

'\r\n\r\n\r\n<!doctype html>\n<html lang="ko" class="no-js">\n<head>\n\t\n<!-- Global site tag (gtag.js) - Google Analytics -->\n<script async src="https://www.googletagmanager.com/gtag/js?id=UA-129351146-1"></script>\n<script>\n  window.dataLayer = window.dataLayer || [];\n  function gtag(){dataLayer.push(arguments);}\n  gtag(\'js\', new Date());\n\n  gtag(\'config\', \'UA-129351146-1\');\n</script>\n\n\t\n<title>상명라운지 | 상명 Q&amp;A | 자주하는 질문 게시판목록 | 상명라운지</title>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\n<meta name="viewport" content="width=device-width,initial-scale=1.0,minimum-scale=1.0,maximum-scale=1.0,user-scalable=no">\n<link rel="canonical" href="https://www.smu.ac.kr:443/lounge/qna/faq.do" />\n<link href="https://fonts.googleapis.com/css?family=Cardo" rel="stylesheet">\n<link rel="stylesheet" type="text/css" href="/_common/cms.css" />\n<link rel="stylesheet" type="text/css" href="/_res/loung

In [129]:
soup = BeautifulSoup(res.text, 'html.parser')

In [80]:
table = soup.select('.board-thumb-wrap > li')

In [82]:
len(table)

7

In [83]:
table[0]

<li class="">
<dl class="board-thumb-content-wrap board-thumb-content-wrap-v01">
<dt class="board-thumb-content-title">
<a href="?mode=view&amp;articleNo=698890&amp;article.offset=0&amp;articleLimit=10" style="  " target="" title="Office365(학생메일) 사용법 자세히 보기">
						Office365(학생메일) 사용법
							
							</a>
</dt>
<dd class="board-thumb-content-info">
<ul>
<li class="board-thumb-content-writer">
<span class="hide">작성자</span>
							통합 관리자
						</li>
<li class="board-thumb-content-date">
<span class="hide">작성일</span>
							2019-04-03
						</li>
<li class="board-thumb-content-views">
<span class="hide">조회수</span>
							67081
						</li>
</ul>
</dd>
</dl>
<div class="list-file"></div>
</li>

In [86]:
title = table[0].select_one('a').text.replace('\n','').replace('\t','')
title

'Office365(학생메일) 사용법'

In [88]:
author = table[0].select_one('.board-thumb-content-writer').text.replace('\r','').replace('\t','').replace('\n','')
author

'작성자통합 관리자'

In [89]:
param = table[0].select_one('a').get('href')
param

'?mode=view&articleNo=698890&article.offset=0&articleLimit=10'

### 세부 게시물 크롤링

In [116]:
url = "https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=698890&article.offset=0&articleLimit=10"

In [117]:
res = requests.get(url)
res

<Response [200]>

In [118]:
soup = BeautifulSoup(res.text, 'html.parser')

In [94]:
soup


<!DOCTYPE html>

<html class="no-js" lang="ko">
<head>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-129351146-1"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-129351146-1');
</script>
<title>상명라운지 | 상명 Q&amp;A | 자주하는 질문 게시판읽기(Office365(학생메일) 사용법) | 상명라운지</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0,minimum-scale=1.0,maximum-scale=1.0,user-scalable=no" name="viewport"/>
<link href="https://www.smu.ac.kr:443/lounge/qna/faq.do" rel="canonical"/>
<link href="https://fonts.googleapis.com/css?family=Cardo" rel="stylesheet"/>
<link href="/_common/cms.css" rel="stylesheet" type="text/css">
<link href="/_res/lounge/_css/layout.css" rel="stylesheet" type="text/css"/>
<!--

In [97]:
soup.select_one('h4').text.replace('\r','').replace('\t','').replace('\n','').replace(' ', '')

'Office365(학생메일)사용법'

In [98]:
contents = ""

for s in soup.select('.fr-view > .fr-view >p'):
    contents += s.text

In [99]:
contents

'Office365는 계정 비밀번호 연동 후 사용이 가능합니다.Office365(학생메일) 비밀번호 연동 방법1. https://portal.smu.ac.kr샘물포털 사이트 접속 후 로그인2. Office365 버튼 클릭 \xa0(오른쪽 퀵메뉴에 Office365 클릭해도 접속 가능)3. 비밀번호 입력창이나오면 통합로그인 패스워드와 동일한 패스워드로 입력4. 패스워드 입력하면 자동으로 학번@sangmyung.kr 로 계정이 연동되며, 이 후 메일이 사용가능합니다.Office 설치형 제품군(pro plus)은 재학생만 사용가능합니다.이외의 기능은 전체 학생이 사용 가능합니다.'

### faq 전체 가져오기

In [130]:
table = soup.select('.board-thumb-wrap > li')

In [131]:
table

[<li class="">
 <dl class="board-thumb-content-wrap board-thumb-content-wrap-v01">
 <dt class="board-thumb-content-title">
 <a href="?mode=view&amp;articleNo=698890&amp;article.offset=0&amp;articleLimit=10" style="  " target="" title="Office365(학생메일) 사용법 자세히 보기">
 						Office365(학생메일) 사용법
 							
 							</a>
 </dt>
 <dd class="board-thumb-content-info">
 <ul>
 <li class="board-thumb-content-writer">
 <span class="hide">작성자</span>
 							통합 관리자
 						</li>
 <li class="board-thumb-content-date">
 <span class="hide">작성일</span>
 							2019-04-03
 						</li>
 <li class="board-thumb-content-views">
 <span class="hide">조회수</span>
 							67089
 						</li>
 </ul>
 </dd>
 </dl>
 <div class="list-file"></div>
 </li>,
 <li class="">
 <dl class="board-thumb-content-wrap board-thumb-content-wrap-v01">
 <dt class="board-thumb-content-title">
 <a href="?mode=view&amp;articleNo=684733&amp;article.offset=0&amp;articleLimit=10" style="  " target="" title="학사)복학 신청을 꼭 해야하나요? 복학은 어떻게 신청하나요? 자세히 보기

In [134]:
result_list = []
for t in table:
    _dict = {}
    param = t.select_one('a').get('href')
    base_url = 'https://www.smu.ac.kr/lounge/qna/faq.do'
    url = base_url + param
    res = requests.get(url)
    print(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select_one('h4').text.replace('\r','').replace('\t','').replace('\n','').replace(' ', '')
    contents = ""
    for s in soup.select('.fr-view > .fr-view >p'):
        contents += s.text
    _dict['title'] = title
    _dict['content'] = contents
    result_list.append(_dict)

https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=698890&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=684733&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=684732&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=264776&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=264774&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=264773&article.offset=0&articleLimit=10
https://www.smu.ac.kr/lounge/qna/faq.do?mode=view&articleNo=264772&article.offset=0&articleLimit=10


In [135]:
result_list

[{'title': 'Office365(학생메일)사용법',
  'content': 'Office365는 계정 비밀번호 연동 후 사용이 가능합니다.Office365(학생메일) 비밀번호 연동 방법1. https://portal.smu.ac.kr샘물포털 사이트 접속 후 로그인2. Office365 버튼 클릭 \xa0(오른쪽 퀵메뉴에 Office365 클릭해도 접속 가능)3. 비밀번호 입력창이나오면 통합로그인 패스워드와 동일한 패스워드로 입력4. 패스워드 입력하면 자동으로 학번@sangmyung.kr 로 계정이 연동되며, 이 후 메일이 사용가능합니다.Office 설치형 제품군(pro plus)은 재학생만 사용가능합니다.이외의 기능은 전체 학생이 사용 가능합니다.'},
 {'title': '학사)복학신청을꼭해야하나요?복학은어떻게신청하나요?',
  'content': "휴학 기간이 끝난 후 자동적으로 복학이 되지 않으며, 꼭 아래의 절차를 통해 복학 신청을 해야합니다.만약 휴학 기간이 끝난 후에도 복학을 하지 않으면, 수강신청이 불가하며 추후 미복학제적이 되니 꼭 복학신청을 하시기 바랍니다.복학을 신청하는 방법은 아래와 같습니다.1) 샘물통합정보시스템 로그인2) '학적정보' > '휴학/복학/자퇴/휴학취소신청' > 인적사항 및 복학 정보 입력 > 저장\xa0\xa0(군휴학 후 복학하는 경우 전역증 등의 증빙서류를 첨부하여 제출)* 복 신청 후 복학이 완료되기까지 3~4일 정도가 소요되오니, 수강신청을 위하여 꼭\xa0학적변동 기간 내에 복학신청을 완료하여주시기 바랍니다."},
 {'title': '학사)휴학은어떻게신청하나요?',
  'content': "휴학을 신청하는 방법은 아래와 같습니다.1) 샘물통합정보시스템 로그인2) '학생정보' > 학적정보 > '휴학/복학/자퇴/휴학취소신청' > 인적사항 및 휴학 정보 입력 > 저장(군휴학 및 질병휴학의 경우 입영통지서\xa0등의 증빙서류를 첨부하여 제출)* 휴학 신청 후 휴학이 완료되기까지 3~4일 정도가 소요됩니다."},
 {