# Libarary import & version check

In [21]:
import requests
import tensorflow as tf
import nltk
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver

In [13]:
requests.__version__

'2.22.0'

In [14]:
webdriver.__version__

'3.14.1'

In [15]:
nltk.__version__

'3.4.5'

In [16]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [22]:
bs4.__version__

'4.6.0'

# Scraping / Crawling 실습

## requests 패키지

In [28]:
def getDownload( url, param = None, retries = 3):
    resp = None
    
    try:
        resp = requests.get( url, params = param)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print( 'Retries : {0}.'.format( retries ) )
            return getDownload( url, param, retries - 1)
        else:
            print( resp.status_code)
            print( resp.reason)
            print( resp.request.headers)
        
    return resp

In [32]:
url = 'http://www.crawler-test.com/status_codes/status_500'
getDownload(url)

Retries : 3.
Retries : 2.
Retries : 1.
500
Internal Server Error
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}


<Response [500]>

In [41]:
def postDownload( url, data = None, param = None, retries = 3):
    resp = None
    
    try:
        resp = requests.post( url, data, params = param)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print( 'Retries : {0}.'.format( retries ) )
            return getDownload( url, param, retries - 1)
        else:
            print( resp.status_code)
            print( resp.reason)
            print( resp.request.headers)
        
    return resp

In [34]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = {'fristname':'테스트', 'lastname':1234}

In [37]:
html = postDownload( url, data)
print(html.request.body)
print("*"*40)
print(html.request.headers)
html.text

fristname=%ED%85%8C%EC%8A%A4%ED%8A%B8&lastname=1234
****************************************
{'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Length': '51', 'Content-Type': 'application/x-www-form-urlencoded'}


'Hello there,  1234!'

# Cookie 활용

In [82]:
def postDownloadCookie( url, data = None, param = None, cookie = None, retries = 3):
    resp = None
    
    try:
        resp = requests.post( url, data, cookies = cookie, params = param)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print( 'Retries : {0}.'.format( retries ) )
            return getDownload( url, param, cookie, retries - 1)
        else:
            print( resp.status_code)
            print( resp.reason)
            print( resp.request.headers)
        
    return resp

In [83]:
url = 'http://pythonscraping.com/pages/files/processing.php'
data = {'fristname':'테스트', 'lastname':1234}

In [84]:
html = postDownloadCookie( url, data)
cookie = html.cookies.get_dict()

In [85]:
html = postDownloadCookie( url, data, cookie)
html.text

'Hello there,  1234!'

In [86]:
session = requests.Session()

In [87]:
data : { 'username':'test', 'password' : 'password'}

In [88]:
html = session.post(url, data)

In [89]:
html= session.post(url)
html.text

'Hello there,  !'

# BeautifulSoup을 이용한 HTML 분석

In [54]:
html = '''
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to pasge 1</a>
                <a class="blue">Go to pasge 2</a>
                <a class="green">Go to pasge 3</a>
                <a class="red">Go to pasge 4</a>
                <b class="yellow">Go to pasge 5</a>
                <c id="gray">Go to pasge 6</a>
                <d id="red">Go to pasge 7</a>
            </p>
        </div>
    </body>
</html>
'''

In [55]:
dom = BeautifulSoup(html, 'lxml')

In [56]:
dom

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head>
<body>
<div id="result">
<p class="row">
<a class="red">Go to pasge 1</a>
<a class="blue">Go to pasge 2</a>
<a class="green">Go to pasge 3</a>
<a class="red">Go to pasge 4</a>
<b class="yellow">Go to pasge 5
                <c id="gray">Go to pasge 6
                <d id="red">Go to pasge 7
            </d></c></b></p>
</div>
</body>
</html>

## find() / find_all()을 이용한 원하는 tag 내용 추출

In [57]:
dom.find('a')

<a class="red">Go to pasge 1</a>

In [58]:
dom.find_all('a')

[<a class="red">Go to pasge 1</a>,
 <a class="blue">Go to pasge 2</a>,
 <a class="green">Go to pasge 3</a>,
 <a class="red">Go to pasge 4</a>]

In [59]:
dom.find('', {"id":"result"})

<div id="result">
<p class="row">
<a class="red">Go to pasge 1</a>
<a class="blue">Go to pasge 2</a>
<a class="green">Go to pasge 3</a>
<a class="red">Go to pasge 4</a>
<b class="yellow">Go to pasge 5
                <c id="gray">Go to pasge 6
                <d id="red">Go to pasge 7
            </d></c></b></p>
</div>

In [60]:
dom.find('', {'class': 'red'})

<a class="red">Go to pasge 1</a>

In [61]:
dom.find_all('', {'class': 'red'})

[<a class="red">Go to pasge 1</a>, <a class="red">Go to pasge 4</a>]

In [68]:
url = 'http://pythonscraping.com/pages/page3.html'
html = getDownload(url)
html.text

'<html>\n<head>\n<style>\nimg{\n\twidth:75px;\n}\ntable{\n\twidth:50%;\n}\ntd{\n\tmargin:10px;\n\tpadding:10px;\n}\n.wrapper{\n\twidth:800px;\n}\n.excitingNote{\n\tfont-style:italic;\n\tfont-weight:bold;\n}\n</style>\n</head>\n<body>\n<div id="wrapper">\n<img src="../img/gifts/logo.jpg" style="float:left;">\n<h1>Totally Normal Gifts</h1>\n<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is\nhand-curated by well-paid, free-range Tibetan monks.<p>\nWe haven\'t figured out how to make online shopping carts yet, but you can send us a check to:<br>\n123 Main St.<br>\nAbuja, Nigeria\n</br>We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</div>\n<table id="giftList">\n<tr><th>\nItem Title\n</th><th>\nDescription\n</th><th>\nCost\n</th><th>\nImage\n</th></tr>\n\n<tr id="gift1" class="gift"><td>\nVegetable Basket\n</td><td>\nThis vegetable basket is the perfec

In [69]:
dom = BeautifulSoup(html.text, 'lxml')
dom

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [70]:
footer = dom.find('div', {'id' : 'footer'})
footer

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [71]:
parent = footer.find_parent()
parent.name, parent.attrs

('div', {'id': 'wrapper'})

In [72]:
children = parent.find_all(recursive = False)
for row in children:
    print(row.name, row.attrs)

img {'src': '../img/gifts/logo.jpg', 'style': 'float:left;'}
h1 {}
div {'id': 'content'}
table {'id': 'giftList'}
div {'id': 'footer'}


In [81]:
aList = dom.find_all('tr')
for row in aList:
    print(row.find_all(recursive = False)[2].text.strip())

Cost
$15.00
$10,000.52
$10,005.00
$0.50
$1.50


## Select 함수

In [92]:
html = '''
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>BeautifulSoup Training</title>
    </head>
    <body>
        <div id="result">
            <p class="row">
                <a class="red">Go to pasge 1</a>
                <a class="blue">Go to pasge 2</a>
                <a class="green">Go to pasge 3</a>
                <a class="red">Go to pasge 4</a>
                <b class="yellow">Go to pasge 5</a>
                <c id="gray">Go to pasge 6</a>
                <d id="red">Go to pasge 7</a>
            </p>
        </div>
    </body>
</html>
'''

In [93]:
dom = BeautifulSoup(html, 'lxml')
dom

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>BeautifulSoup Training</title>
</head>
<body>
<div id="result">
<p class="row">
<a class="red">Go to pasge 1</a>
<a class="blue">Go to pasge 2</a>
<a class="green">Go to pasge 3</a>
<a class="red">Go to pasge 4</a>
<b class="yellow">Go to pasge 5
                <c id="gray">Go to pasge 6
                <d id="red">Go to pasge 7
            </d></c></b></p>
</div>
</body>
</html>

In [94]:
dom.select_one('a')

<a class="red">Go to pasge 1</a>

In [95]:
dom.select('a')

[<a class="red">Go to pasge 1</a>,
 <a class="blue">Go to pasge 2</a>,
 <a class="green">Go to pasge 3</a>,
 <a class="red">Go to pasge 4</a>]

In [96]:
dom.select_one('#gray')

<c id="gray">Go to pasge 6
                <d id="red">Go to pasge 7
            </d></c>

In [97]:
dom.select_one('.red')

<a class="red">Go to pasge 1</a>

In [98]:
dom.select("#gray")

[<c id="gray">Go to pasge 6
                 <d id="red">Go to pasge 7
             </d></c>]

In [99]:
url = 'https://www.daum.net'
html = requests.get(url)
html_text = html.text

In [102]:
dom = BeautifulSoup(html_text, 'lxml')

In [106]:
# id가 mArticle인 tag부분 전부
dom.select('#mArticle')    

[<article class="wrap_main" id="mArticle">
 <div class="feature_tmp">
 <div class="advert_tmp" id="adMain">
 <ins class="daum_ddn_area" data-ad-frame-id="adCgiMain" data-ad-frame-name="adCgiMain" data-ad-height="120" data-ad-type="P" data-ad-unit="DAN-1k1zi6towzodw" data-ad-width="655" style="display: inline-block;"></ins>
 <script charset="utf-8" src="//t1.daumcdn.net/adfit/static/ad.min.js" type="text/javascript"></script>
 </div>
 <div class="bg_login login_tmp #loginbox">
 <h2 class="screen_out" id="loginInfoTitle" tabindex="0">로그인 정보</h2>
 <div class="inner_login" id="inner_login">
 <a class="link_login" href="https://logins.daum.net/accounts/dsso.do?url=https%3A%2F%2Fwww.daum.net%2F"><span class="ico_pctop ico_daum"></span>Daum 아이디로 로그인</a>
 <a class="link_login link_kakaoid" href="https://logins.daum.net/accounts/ksso.do?url=https%3A%2F%2Fwww.daum.net%2F"><span class="ico_pctop ico_kakao"></span>카카오계정으로 로그인</a>
 <a class="link_join" href="https://accounts.kakao.com/weblogin/crea

In [107]:
# div tag에서 class가 cmain_tmp인 부분
dom.select("div.cmain_tmp")

[<div class="cmain_tmp">
 <div class="section_media">
 <h2 class="screen_out" id="mediaTitle">미디어</h2>
 <div class="panel_bloc news_on #newsbox" data-tab="news" id="mediaTab">
 <h3 class="tit_tabcont tit_news #title" id="newsTab"><a class="txt_pctop link_tit #news" href="http://media.daum.net/"><span class="ir_wa">뉴스</span></a></h3>
 <div class="page_tabcont">
 <strong class="screen_out">선택됨 뉴스 1탭</strong><span class="ico_pctop btn_page on"></span>
 <span class="ico_pctop btn_page"></span><span class="ico_pctop btn_page"></span>
 </div>
 <div class="wrap_tabcont wrap_news" id="news">
 <div class="news_prime news_tab1 #newsbox #news1">
 <div class="group_news">
 <div class="box_sector">
 <ul class="list_thumb">
 <li>
 <a class="link_item #img @1-1 ?c_id=" href="https://newslink.media.daum.net/p/20200222152233996">
 <span class="thumb_g">
 <img alt="" class="img_thumb" height="94" src="//img2.daumcdn.net/thumb/C308x188/?fname=https://t1.daumcdn.net/section/oc/17ed40bca12047b38242302577d6

In [109]:
#div tag에서 class가 section_media인 부분
dom.select("div.section_media")

[<div class="section_media">
 <h2 class="screen_out" id="mediaTitle">미디어</h2>
 <div class="panel_bloc news_on #newsbox" data-tab="news" id="mediaTab">
 <h3 class="tit_tabcont tit_news #title" id="newsTab"><a class="txt_pctop link_tit #news" href="http://media.daum.net/"><span class="ir_wa">뉴스</span></a></h3>
 <div class="page_tabcont">
 <strong class="screen_out">선택됨 뉴스 1탭</strong><span class="ico_pctop btn_page on"></span>
 <span class="ico_pctop btn_page"></span><span class="ico_pctop btn_page"></span>
 </div>
 <div class="wrap_tabcont wrap_news" id="news">
 <div class="news_prime news_tab1 #newsbox #news1">
 <div class="group_news">
 <div class="box_sector">
 <ul class="list_thumb">
 <li>
 <a class="link_item #img @1-1 ?c_id=" href="https://newslink.media.daum.net/p/20200222152233996">
 <span class="thumb_g">
 <img alt="" class="img_thumb" height="94" src="//img2.daumcdn.net/thumb/C308x188/?fname=https://t1.daumcdn.net/section/oc/17ed40bca12047b38242302577d6b8c3" width="154"/>
 <spa

In [110]:
url = 'https://media.daum.net/issue/5008621'
html = requests.get(url)

In [111]:
dom = BeautifulSoup(html.text, 'lxml')
dom

<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="utf-8"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="Daum 뉴스" property="og:author"/>
<meta content="다음뉴스" property="og:site_name"/>
<meta content="이슈 - 코로나바이러스감염증-19" property="og:title"/>
<meta content="https://t1.daumcdn.net/section/oc/aa01024536824d68a69435f25b3259e4" property="og:image"/>
<meta content="다음뉴스" property="og:description"/>
<meta content="http://media.daum.net/issue/5008621" property="og:url"/>
<meta content="always" name="referrer"/>
<meta content="media.daum.net" name="svcdomain"/>
<title>이슈 - 코로나바이러스감염증-19 | 다음뉴스</title>
<link href="//m2.daumcdn.net/img-media/2010ci/Daum_favicon.ico" rel="shortcut icon"/>
<link href="//t1.daumcdn.net/media/static/media-1358/dist/news/css/common.min.css" rel="stylesheet" type="text/css"/>
<link href="//t1.daumcdn.net/media/static/media-1358/dist/news/css/news.min.css" rel="stylesheet" type="text/css"/>
<!--[if lte IE 7]>
<script type="text/javascript" sr

In [124]:
news_list = dom.select('#cMain > div#mArticle > ul > li > div > strong.tit_thumb > a.link_txt')

In [125]:
print(news_list[0])

<a class="link_txt" href="http://v.media.daum.net/v/20200222154614379">속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'</a>


In [126]:
news_lists = []
for index in range(0, len(news_list)):
    news_lists.append(news_list[index].text)
    print(news_list[index].text)

속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'
청도대남병원 사망자 2명, 직접 사인은 코로나19
서초구서 코로나19 확진자 추가.."대구 출장 다녀온 30대"
코로나19 확진 청주여성 지역 식당·마트 들러..증평 긴장 고...
이스라엘 성지순례단 9명도 코로나19 확진..감염경로 오리무중
코로나19 덮친 대구 경제 직격탄.."앞이 안 보인다"
강원 동부전선 육군 코로나19 비상..장병 1명 등 확진
은평성모병원 확진자 1명 추가..접촉자 자가격리
통합당 "문대통령, 어느 나라 대통령?"..'코로나19 대응'...
전주 방화범 코로나 증세로 일선경찰서 '긴장'..음성' 결과에...


In [127]:
for text in news_lists:
    print(text)

속초서 군간부 아내·상근예비역 코로나19 확진..부대 '발칵'
청도대남병원 사망자 2명, 직접 사인은 코로나19
서초구서 코로나19 확진자 추가.."대구 출장 다녀온 30대"
코로나19 확진 청주여성 지역 식당·마트 들러..증평 긴장 고...
이스라엘 성지순례단 9명도 코로나19 확진..감염경로 오리무중
코로나19 덮친 대구 경제 직격탄.."앞이 안 보인다"
강원 동부전선 육군 코로나19 비상..장병 1명 등 확진
은평성모병원 확진자 1명 추가..접촉자 자가격리
통합당 "문대통령, 어느 나라 대통령?"..'코로나19 대응'...
전주 방화범 코로나 증세로 일선경찰서 '긴장'..음성' 결과에...


In [128]:
with open('/home/ai27/workspace/news_list.txt', 'w') as f:
    for text in news_lists:
        f.write(text +'\n')

# Selenium - Webdriver 사용

In [143]:
path = '/home/ai27/anaconda3/chromedriver'
driver = webdriver.Chrome(path)   #1단계 : chromedriver 등록

In [144]:
url = 'http://example.webscraping.com/places/default/search'
driver.get(url)    #2단계 : Url 주소 얻기

In [145]:
driver.find_element_by_id('search_term').clear()   # 3단계 : tag 찾아서 실행
driver.find_element_by_id('search_term').send_keys('korea')
driver.find_element_by_id('search').click()

In [147]:
results = driver.find_element_by_id('results')
for tag in results.find_elements_by_tag_name('a'):
    print(tag.text)
    print(tag.get_attribute('href'))

North Korea
http://example.webscraping.com/places/default/view/North-Korea-165
South Korea
http://example.webscraping.com/places/default/view/South-Korea-211


In [150]:
driver.get('https://www.google.co.kr')
search = driver.find_element_by_name('q')
search.send_keys("파이썬")
search.submit()  #Enter 치는 효과

In [156]:
user = 'facebook id'
pwd = 'facebook password'

driver.get('https://www.facebook.com')

element = driver.find_element_by_id('email')
element.send_keys(user)
element = driver.find_element_by_id('pass')
element.send_keys(pwd)
element.send_keys(Keys.RETURN)

NameError: name 'Keys' is not defined