## 웹 페이지 탐색 및 크롤링 계획

> robots.txt 로 크롤링 룰 정해 놓은 것들이 있다.
User-agent: 아래 기술되어 있는 내용에 적용되는 로봇의 이름이 표기되어 있음 ('*'는 모든 대상).
Disallow: 해당 사이트가 허용하고 있지 않는 페이지가 기술되어 있다. ('/'는 모든 페이지).
Allow: 해당 사이트가 허용하는 페이지가 기술되어 있다. ("/$"는 해당 루트 페이지)

url/robots.txt (eg. naver.com/robots.txt)

1. 클롤링 하려는 정보에 대한 정의(어떤 데이터를 수집할 것인가?)

2. 해당 데이터를 수집하기에 가장 적합한 웹 페이지는 무엇인가?

3. 어떤 방법으로 접근할 것인가?

### 웹에서 글 추출하기

In [1]:
from bs4 import BeautifulSoup

In [2]:
html_text = '''
<html>
<body>
<h1 id="title">자연어 처리 강의</h1>
<p id="content">코드 구현을 중심으로 진행합니다</p>
</body>
</html>
'''

In [3]:
soup = BeautifulSoup(html_text, 'html')

In [4]:
print(f'html :\n{soup.html}')
print('--------------------')
print(f'body: \n{soup.html.body}')
print('--------------------')
print(f'h1: {soup.html.body.h1}')
print(f'h1_content: {soup.html.body.h1.string}')

html :
<html>
<body>
<h1 id="title">자연어 처리 강의</h1>
<p id="content">코드 구현을 중심으로 진행합니다</p>
</body>
</html>
--------------------
body: 
<body>
<h1 id="title">자연어 처리 강의</h1>
<p id="content">코드 구현을 중심으로 진행합니다</p>
</body>
--------------------
h1: <h1 id="title">자연어 처리 강의</h1>
h1_content: 자연어 처리 강의


In [5]:
import urllib.request as req

In [6]:
url = 'https://learningspoons.com'

In [7]:
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, 'lxml')
    results = soup.select('li > a[href="https://learningspoons.com/account"]')
    for idx, result in enumerate(results):
        print(idx, result)

0 <a class="nav-top-link nav-top-not-logged-in is-small" href="https://learningspoons.com/account">
<span>
    로그인     / 회원가입  </span>
</a>
1 <a class="account-link-mobile is-small" href="https://learningspoons.com/account" title="내 계정">
<i class="icon-user"></i> </a>
2 <a class="nav-top-link nav-top-not-logged-in" href="https://learningspoons.com/account">
<span class="header-account-title">
    로그인  </span>
</a>


In [8]:
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, 'lxml')
    results = soup.select('h2[class="bb-course-title"] > a')
    for result in results:
        print(f'강의명: {result.string}')

In [9]:
from selenium import webdriver

In [10]:
PATH = './chromedriver.exe'
driver = webdriver.Chrome(PATH)
driver.get('https://online.learningspoons.com')

In [11]:
def click_login(driver):
    LINK = "https://online.learningspoons.com/register/"
    element = driver.find_element_by_css_selector(f'a[href="{LINK}"]')
    element.click()
    
    return driver

In [12]:
driver = click_login(driver)

In [13]:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

def write_login_form(driver, form):
    element_id = driver.find_element_by_css_selector('input[data-key="username"]')
    element_pw = driver.find_element_by_css_selector('input[data-key="user_password"]')
    element_id.send_keys(form['id'])
    element_pw.send_keys(form['pw'])
    ActionChains(driver).send_keys(Keys.RETURN).perform()
    
    return driver

In [14]:
#form = {'id': 'id_value', 'pw': 'password_value'}
#driver = write_login_form(driver, form)

In [15]:
from selenium import webdriver
import random as rd

In [16]:
def get_driver(path_engine, options=None):
    driver = webdriver.Chrome(path_engine, options=options)
    return driver

def get_url(driver, url):
    driver.get(url)
    return driver

In [17]:
def set_option():
    window_sizes = [(1280, 1024), (1600, 1200), (1920, 1440), (1920, 1080), (2560, 1600), (3840, 2400)]
    options = webdriver.ChromeOptions()
    options.add_argument('headless')  # 웹을 보이지 않게 실행
    options.add_argument('window-size={}x{}'.format(*window_sizes[rd.randint(0, len(window_sizes)-1)]))  # 창 크기 설정
    options.add_argument('disable-gpu')    # gpu 설정
    options.add_argument('lang=ko_KR')
    
    return options

In [18]:
def get_info(driver):
    output = []
    elements = driver.find_elements_by_css_selector('h3[class="Maw(160px)"]')
    for e in elements:
        _a = e.find_element_by_css_selector('a').text
        _span = e.find_element_by_css_selector('span').text
        output.append((_a, _span))
    
    elements2 = driver.find_elements_by_css_selector('tr[class="dt-row Bgc($hoverBgColor):h Bdb Bdbc($seperatorColor) H(44px) "]')
    for e2 in elements2:
        _item = e2.find_element_by_css_selector('td[class="Ya(t) Fz(14px) Whs(nw) Py(6px) Ta(Start) Start(0) Pend(10px)"] > p').text
        _value = e2.find_element_by_css_selector('td[class="Ya(t) Fz(14px) Whs(nw) Ta(end) Pstart(10px) Py(60px)"] > span').text
        output.append((_item, _value))
    
    return driver, output

In [19]:
CHROME = './chromedriver.exe'
URL = 'https://finance.yahoo.com/'
driver = get_driver(CHROME, options=set_option())
driver = get_url(driver, URL)
driver, info = get_info(driver)
for _info in info:
    print(_info[0], _info[1])

S&P Futures 3,771.00
Dow Futures 30,934.00
Nasdaq Futures 12,739.50
Russell 2000 Futures 2,078.10
Crude Oil 51.24
Gold 1,926.50


## 이미지에서 글 추출하기

pre-requisite: https://github.com/tesseract-ocr/tessdoc/blob/master/Installation.md

In [20]:
from pytesseract import pytesseract
from PIL import Image

In [21]:
def load_image(path, mode=''):
    return Image.open(path)

def run_pytesseract(image, path_engine, config='-l eng'):  # config를 만지면 여러 설정으로 다른 결과들을 얻을 수 있음
    pytesseract.tesseract_cmd = path_engine
    result = pytesseract.image_to_string(image, config=config)
    return result

In [22]:
ENGINE_PATH = "C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"
IMAGE_PATH = "./test.jpg"

In [23]:
run_pytesseract(load_image(IMAGE_PATH), ENGINE_PATH)

'Text Messaqo\nToday 1342\n\nGOV.UK CORONAVIRUS ALERT\nNew rules in force now: you\nmust stay at home More info &\n\nexemptions at ggvuj/\n\ncgronavrrus Stay ax home.\nProtect the NHS. Save lives.\n\nWe would like to inform you\nthat you have been recorded as\nleaving your home on 3\noccasions yesterday. A fine of\n£35 has been added to your\ngugg account, For further\ninformation please visit 9M?\n\ntrggking. Protect the NHS Save\nlives.\n\n \n\n'

In [26]:
import pandas as pd

def image2data(path_img):
    data = pytesseract.image_to_data(path_img)
    lines = data.split('\n')
    columns, df_data = None, None
    for i, line in enumerate(lines):
        line = line.split('\t')
        if i == 0:
            columns = line.copy()
            df_data = { col:[] for col in line }
            continue
        for col, l in zip(columns, line):
            df_data[col] += [l]
    return pd.DataFrame(df_data)

In [27]:
image2data(IMAGE_PATH)

ValueError: arrays must all be same length

In [28]:
def image_to_osd(path_image):
    lines = pytesseract.image_to_osd(path_image).split('\n')
    lines = [ line.strip().split(':') for line in lines ]
    return { k.strip():v.strip() for k, v in lines }

In [30]:
image_to_osd(IMAGE_PATH)

ValueError: not enough values to unpack (expected 2, got 1)

In [29]:
def get_pdf(path_image, path_output):
    assert path_output.split('.')[-1] == 'pdf', 'The output format should be the "pdf"'
    pdf = pytesseract.image_to_pdf_or_hocr(path_image, extension='pdf')
    with open(path_output, 'w+b') as f:
        f.write(pdf)
        print('saved successfully!')

In [31]:
get_pdf(IMAGE_PATH, './test.pdf')

saved successfully!
