## 인코딩과 디코딩

In [1]:
"한".encode("utf8")  # UTF8로 인코딩

b'\xed\x95\x9c'

In [2]:
"한".encode("cp949")

b'\xc7\xd1'

In [3]:
"한".encode("utf16")

b'\xff\xfe\\\xd5'

In [4]:
"한".encode("cp949").decode("cp949")

'한'

In [5]:
"한".encode("cp949").decode("utf8")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 0: invalid continuation byte

In [6]:
len("안녕하세요. 반갑습니다.")

13

In [7]:
"안녕하세요. 반갑습니다."[:3]

'안녕하'

In [9]:
"Hello, World."

print("안녕. 세상아.")  # 표준출력(화면)이라는 파일에 쓰는 것.

안녕. 세상아.


In [11]:
f = open("test1.txt", "wt", encoding="utf8")
f.write("안녕. 세상아\n")
f.write("Hello, World\n")
f.close()

In [12]:
f = open("test2.txt", "wb")  # binary는 != TEXT가 아닌 경우
f.write("안녕. 세상아\n".encode("utf8"))
f.write("Hello, World\n".encode("utf8"))
f.close()

In [14]:
"가".encode("utf8")

b'\xea\xb0\x80'

### with절

In [None]:
try:
    f = open("test1.txt", "wt", encoding="utf8")
    f.write("안녕. 세상아\n")
    f.write("Hello, World\n")
finally:
    f.close()

In [25]:
with open("test1.txt", "wt", encoding="utf8") as f:
    f.write("안녕. 세상아\n")
    f.write("Hello, World\n")

In [26]:
with open("test1.txt", "rt", encoding="utf8") as f:
    data = f.read()   # 파일의 내용을 한 방에 다 메모리로 올려요.
    print(data)

안녕. 세상아
Hello, World



In [36]:
# 사이트에 따라, 이미지 다운로드가 불가할 수도 있습니다.
image_url = "https://www.almanac.com/sites/default/files/styles/primary_image_in_article/public/birth_month_flowers-primary-1920x1280px_pixabay.jpg"

import requests

res = requests.get(image_url)
# res.content => 원본 데이터
# res.text => res.content를 requests가 디코딩한 것 => 어떤 디코딩? (응답에 적혀있어요.)
image_data = res.content

print(image_data[:20])  # 처음 20바이트만 살펴봅니다.

with open("flower.jpg", "wb") as f:
    f.write(image_data)
    print("file saved.")

b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'
file saved.


In [31]:
%ls *.jpg

flower.jpg


## 네이버 웹툰 크롤링

In [64]:
import os
import requests
from bs4 import BeautifulSoup

In [39]:
# FIXME: 경로명
episode_url = "https://comic.naver.com/webtoon/detail.nhn?titleId=715772&no=24&weekday=thu"

In [47]:
res = requests.get(episode_url)
# res.content.decode("utf8")
html = res.text
soup = BeautifulSoup(html, 'html.parser')

In [46]:
# res.headers

In [54]:
tag_list = soup.select('.wt_viewer img')
len(tag_list)

42

In [67]:
for tag in tag_list:
    image_url = tag['src']
    print(image_url)
    
    filename = os.path.basename(image_url)
#     filepath = 'data/좀비딸/24화/' + filename
    filepath = os.path.join('data', '좀비딸', '24화', filename)  # FIXME: 경로명
    dirpath = os.path.dirname(filepath)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Referer': episode_url,
    }
    
    image_data = requests.get(image_url, headers=headers).content
    
    with open(filepath, "wb") as f:
        print("write to {}".format(filepath))
        f.write(image_data)

https://image-comic.pstatic.net/webtoon/715772/24/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_1.jpg
write to data/좀비딸/24화/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_1.jpg
https://image-comic.pstatic.net/webtoon/715772/24/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_2.jpg
write to data/좀비딸/24화/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_2.jpg
https://image-comic.pstatic.net/webtoon/715772/24/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_3.jpg
write to data/좀비딸/24화/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_3.jpg
https://image-comic.pstatic.net/webtoon/715772/24/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_4.jpg
write to data/좀비딸/24화/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_4.jpg
https://image-comic.pstatic.net/webtoon/715772/24/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_5.jpg
write to data/좀비딸/24화/20190109233749_28e26d7648a54ac95606418eeebec066_IMAG01_5.jpg
https://image-comic.pstatic.ne

### 함수화

In [78]:
import os
import requests
from bs4 import BeautifulSoup


def 네이버웹툰_크롤링(title_id, no):
    episode_url = "https://comic.naver.com/webtoon/detail.nhn"
    params = {
        'titleId': title_id,
        'no': no,
    }

    res = requests.get(episode_url, params=params)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    
    # h2_tag = soup.select('.detail')[0]
#     h2_tag = soup.select_one('.detail')
    title = '좀비딸'  # FIXME
    
    episode_name = soup.select_one('.tit_area h3').text.strip()
    print(episode_name)  # FIXME:
    
    return

    tag_list = soup.select('.wt_viewer img')

    for tag in tag_list:
        image_url = tag['src']
        print(image_url)

        filename = os.path.basename(image_url)
    #     filepath = 'data/좀비딸/24화/' + filename
        filepath = os.path.join('data', '좀비딸', '24화', filename)  # FIXME: 경로명
        dirpath = os.path.dirname(filepath)
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'Referer': episode_url,
        }

        image_data = requests.get(image_url, headers=headers).content

        with open(filepath, "wb") as f:
            print("write to {}".format(filepath))
            f.write(image_data)

In [79]:
웹툰코드_사전 = {
    '좀비딸': '715772',
}

네이버웹툰_크롤링(웹툰코드_사전['좀비딸'], "24")

24화
