# 다음 뉴스 크롤링
- 최근 3년(2017.08.01 ~ 2020.08.02)간 분야별 상위랭킹 50개 기사의 {게시날짜, 제목, 본문, 요약본, 링크} 스크래핑
- news.db 데이터베이스 내 daumnews 테이블에 삽입
- 총 **109,419**건 수집

In [29]:
import requests
from requests import request
from requests.compat import urljoin, urlparse
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
import time
import re
import random
import pandas as pd
from datetime import datetime, timedelta 

## 함수

### download()

In [2]:
def download(url, params=None, headers=None, method='GET', limit=3):
    """Send an HTTP request, retrying when the server answers with a 5xx error.

    Args:
        url: Target URL.
        params: Mapping sent as the query string when ``method`` is ``'GET'``
            or as the form body when it is ``'POST'``. ``None`` means no
            parameters.
        headers: Mapping of request headers. ``None`` means no extra headers.
        method: HTTP method name handed to ``requests.request``.
        limit: Number of retries still allowed for 5xx responses.

    Returns:
        The response object. On a non-retryable HTTP error (or once the
        retries are used up) the failed response is still returned after its
        status, reason and headers have been printed, so callers must check
        ``status_code`` themselves.
    """
    # None sentinels instead of mutable default arguments.
    params = {} if params is None else params
    headers = {} if headers is None else headers

    try:
        resp = request(method, url,
                       params=params if method == 'GET' else {},
                       data=params if method == 'POST' else {},
                       headers=headers)
        resp.raise_for_status()
    except HTTPError as e:
        if limit > 0 and e.response.status_code >= 500:
            # Server-side failure: wait a random 0-5 seconds, then retry with
            # one fewer attempt left.
            print(limit)
            time.sleep(random.random() * 5)
            resp = download(url, params, headers, method, limit - 1)
        else:
            # Client error or retries exhausted: report and fall through so
            # the failed response is returned to the caller.
            print('[{}] '.format(e.response.status_code) + url)
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)
    return resp

### get_dateslist()

In [3]:
def get_dateslist(start_str, end_str):
    """Return every calendar day from ``start_str`` to ``end_str`` inclusive.

    Args:
        start_str: First day, formatted as ``'YYYYMMDD'``.
        end_str: Last day (included in the result), formatted as ``'YYYYMMDD'``.

    Returns:
        A list of ``'YYYYMMDD'`` strings in ascending order. The list is empty
        when ``end_str`` is earlier than ``start_str``.

    Raises:
        ValueError: If either argument does not match ``'%Y%m%d'``.
    """
    start = datetime.strptime(start_str, '%Y%m%d')
    end = datetime.strptime(end_str, '%Y%m%d')

    # Number of days in the closed interval; a negative value yields an empty
    # range instead of looping until the date overflows.
    day_count = (end - start).days + 1

    return [
        (start + timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range(day_count)
    ]

### get_newslink()

In [4]:
def get_newslink(url_base, params, headers, datelst):
    """Collect unique article links from the ranking pages of each date.

    For every date in ``datelst`` the ``news``, ``entertain`` and ``sports``
    ranking pages under ``url_base`` are requested with ``regDate`` set to
    that date, and the ``href`` of each listed article anchor is gathered.

    Args:
        url_base: Base URL of the ranking page; the category is appended as
            ``url_base + '/' + category``.
        params: Mapping of extra query parameters. It is copied per request
            and is not modified.
        headers: Request headers forwarded to ``download``.
        datelst: Iterable of date strings used as the ``regDate`` value.

    Returns:
        A list of article URLs in first-seen order, without duplicates.
    """
    categories = ['news', 'entertain', 'sports']
    newslink = []
    seen = set()  # O(1) duplicate check; the list keeps first-seen order

    for reg_date in datelst:
        for category in categories:
            # Work on a copy so the caller's dict is left untouched.
            query = dict(params)
            query['regDate'] = reg_date
            url = url_base + '/' + category

            resp = download(url, params=query, headers=headers)
            if resp.status_code != 200:
                # download() hands back failed responses; skip those pages.
                continue
            dom = BeautifulSoup(resp.content, 'lxml')

            for anchor in dom.select('.list_news2 .cont_thumb > strong > a'):
                href = anchor['href']
                if href not in seen:
                    seen.add(href)
                    newslink.append(href)
    return newslink

### get_newstext()
- 제목, 기사작성일, 본문, 요약 crawling
- 기사작성일: 수정일시는 없는 기사도 많았으므로 입력일시를 사용
- 기사 자동요약이 없는 기사는 데이터를 수집하지 않도록 구현

In [5]:
def get_newstext(url):
    """Fetch one article page and extract its title, date, body and summary.

    Args:
        url: URL of the article page.

    Returns:
        A ``(title, date, body, summary)`` tuple of strings, or ``None`` when
        the article should be skipped: the page has no summary paragraphs, the
        title or date element is missing, or the extracted body is empty.
    """
    # Build a DOM object from the downloaded page.
    html = requests.get(url).text
    dom = BeautifulSoup(html, 'html.parser')

    # Articles without summary paragraphs are not collected. The selection is
    # done once and reused below.
    summary_nodes = dom.select('.btn_summary + div p')
    if not summary_nodes:
        return None

    # Title and article date; a page missing either node is skipped rather
    # than raising AttributeError in the middle of a long crawl.
    title_node = dom.select_one('em + h3')
    date_node = dom.select_one('.info_view .num_date')
    if title_node is None or date_node is None:
        return None
    title = title_node.text.strip()
    date = date_node.text.strip()

    # Body paragraphs.
    body_list = [p.text.strip() for p in dom.select('.article_view > section p')]
    if len(body_list) > 1:
        body_list.pop()  # drop the trailing reporter e-mail line
    body = '\n'.join(body_list)

    if body == '':
        return None

    # Summary paragraphs joined into one string.
    summary = '\n'.join(node.text.strip() for node in summary_nodes)

    return title, date, body, summary

## DB 생성

news.db는 data 디렉터리 안에 위치

In [40]:
import sqlite3

# Open the SQLite database file; the relative path is resolved against the
# current working directory.
conn = sqlite3.connect('../data/news.db')
# Cursor used by the INSERT statements in the cells that follow.
cur = conn.cursor()

In [7]:
# cur.executescript('''
#     DROP TABLE IF EXISTS daumnews;
#     CREATE TABLE daumnews(
#         id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
#         title TEXT NOT NULL,
#         date TEXT,
#         body TEXT,
#         summary TEXT,
#         link TEXT NOT NULL
#     );
# ''')

## INSERT TO DB (뉴스링크, 내용)
20170801 ~ 20200802
### 20170801 - 20180731(총 365일)
- 2017년 03월 23일부터만 랭킹뉴스 데이터 존재

In [8]:
# Ranking page base URL; get_newslink() appends the category to it.
url_base = 'https://news.daum.net/ranking/popular'
params = dict()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

# Gather the article links for 2017-08-01 through 2018-07-31.
datelst = get_dateslist(start_str='20170801', end_str='20180731')
newslink = get_newslink(url_base, params, headers, datelst=datelst)

In [9]:
for link in newslink:
    # Fetch and parse each article exactly once; None marks an article that
    # must be skipped.
    article = get_newstext(link)
    if article is None:
        continue
    title, date, body, summary = article

    # INSERT TO DB (committed per row so earlier rows survive a later failure)
    cur.execute('INSERT INTO daumnews VALUES(NULL,?,?,?,?,?)',
                [title, date, body, summary, link])
    conn.commit()

### 20180801 - 20190731(총 365일)

In [20]:
# Ranking page base URL; get_newslink() appends the category to it.
url_base = 'https://news.daum.net/ranking/popular'
params = dict()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

# Gather the article links for 2018-08-01 through 2019-07-31.
datelst = get_dateslist(start_str='20180801', end_str='20190731')
newslink = get_newslink(url_base, params, headers, datelst=datelst)

In [21]:
for link in newslink:
    # Fetch and parse each article exactly once; None marks an article that
    # must be skipped.
    article = get_newstext(link)
    if article is None:
        continue
    title, date, body, summary = article

    # INSERT TO DB (committed per row so earlier rows survive a later failure)
    cur.execute('INSERT INTO daumnews VALUES(NULL,?,?,?,?,?)',
                [title, date, body, summary, link])
    conn.commit()

### 20190801 - 20200802(총 368일)

In [37]:
# Ranking page base URL; get_newslink() appends the category to it.
url_base = 'https://news.daum.net/ranking/popular'
params = dict()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

# Gather the article links for 2019-08-01 through 2020-08-02.
datelst = get_dateslist(start_str='20190801', end_str='20200802')
newslink = get_newslink(url_base, params, headers, datelst=datelst)

In [41]:
for link in newslink:
    # Fetch and parse each article exactly once; None marks an article that
    # must be skipped.
    article = get_newstext(link)
    if article is None:
        continue
    title, date, body, summary = article

    # INSERT TO DB (committed per row so earlier rows survive a later failure)
    cur.execute('INSERT INTO daumnews VALUES(NULL,?,?,?,?,?)',
                [title, date, body, summary, link])
    conn.commit()

In [42]:
# Release the cursor first, then the connection it belongs to.
cur.close()
conn.close()

navernews 테이블이 있던 news.db에 daumnews 테이블 데이터를 추가한 후 db 이름 변경  
-> **news_raw.db**

## 데이터프레임으로 바꿀 때

In [None]:
def make_crawed_df(url_list):
    """Crawl the given article URLs and return the results as a DataFrame.

    Args:
        url_list: Iterable of article URLs passed to ``get_newstext``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``date``, ``body``,
        ``summary`` and ``link``. URLs for which ``get_newstext`` returns
        ``None`` are left out.
    """
    col_names = ['title', 'date', 'body', 'summary', 'link']
    rows = []

    for url in url_list:
        # Fetch each article once instead of once for the check and once more
        # for the data.
        article = get_newstext(url)
        if article is None:
            continue
        # get_newstext returns a tuple, so unpack it into a new list; adding a
        # list to the tuple with "+" raises TypeError.
        rows.append([*article, url])

    return pd.DataFrame(rows, columns=col_names)

In [None]:
# Sample article URLs used to try out make_crawed_df().
url_list = [
    'https://news.v.daum.net/v/20200728064119204',
    'https://news.v.daum.net/v/20200728171755863',
    'https://news.v.daum.net/v/20200728194024562',
    'https://news.v.daum.net/v/20200728214416635',
]
make_crawed_df(url_list)