# Query News Crawler

## 0. imports

In [1]:
%load_ext lab_black

In [2]:
import sys

sys.path.append("..")

In [3]:
import os
import re
import time
import random
import dill
import requests
import urllib
import bs4

from collections import defaultdict
from datetime import datetime, timedelta
from typing import List, Dict, Union

from bs4 import BeautifulSoup
from bs4.element import NavigableString
from kss import split_sentences

from newspaper import Article
from tqdm import tqdm, trange
from crawler.utils import clean_text

## 1. QueryNewsCrawler class

### 1.1 line by line coding

In [4]:
# Fixed pieces of the Naver news-search URL.
main_url = "https://search.naver.com/search.naver"
query_url = "?where=news&sm=tab_pge&query="
# Naver news search is not paginated by page number: the trailing "start"
# parameter is the index of the first article shown, and results are
# listed 10 at a time for articles 1 through 4000.
page_url = (
    "&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=23&mynews=0&office_type=0"
    "&office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start="
)

# Percent-encode the raw search phrase so it is safe inside the query string.
query = urllib.parse.quote("P2E P&E")

# First article index for the sample URL, plus every possible page start.
start = 1
start_range = [*range(1, 4000, 10)]
url = "".join([main_url, query_url, query, page_url, str(start)])

In [5]:
# req = requests.get(url)
# soup = BeautifulSoup(req.text, "html.parser")

# # ul lists
# ul_lists = soup.find("ul", {"class": "list_news"})
# # news links
# links = ul_lists.find_all("a", {"class": "news_tit"})
# link_title_dict = {row["href"]: {"title": row["title"]} for row in links}


# # update news texts
# news_texts = ul_lists.find_all("a", {"class": "api_txt_lines dsc_txt_wrap"})
# for row in news_texts:
#     link_title_dict[row["href"]]["text"] = row.text

In [6]:
# # request and soup
# req = requests.get(url)
# soup = BeautifulSoup(req.text, "html.parser")

# # ul lists and li lists
# ul_lists = soup.find("ul", {"class": "list_news"})
# li_lists = ul_lists.findChildren("li", {"class": "bx"})

# news_data = []
# for li in li_lists:

#     # news link & title
#     link = li.find("a", {"class": "news_tit"})
#     href, title = link["href"], link["title"]

#     # news text
#     text = li.find("a", {"class": "api_txt_lines dsc_txt_wrap"}).text

#     # thumb nail if exist
#     thumb = ""
#     thumb_link = li.find("img", {"class": "thumb api_get"})
#     if thumb_link:
#         thumb = thumb_link["src"]

#     # update link_title_dict
#     news_data.append({"href": href, "title": title, "text": text, "thumb": thumb})

In [7]:
# li = li_lists[0]
# li.find("span", {"class": "info"}).text

'13시간 전'

In [9]:
# for news in news_data:
#     href = news["href"]

#     # The articles are in Korean, so set language="ko"
#     a = Article(href, language="ko")
#     a.download()
#     a.parse()

#     news["date"] = a.publish_date

In [10]:
# news_data

### 1.2 Class

In [11]:
class QueryNewsCrawler:
    """Crawl Naver news search results for a free-text query.

    Naver news search is not paginated by page number: the ``start`` query
    parameter is the 1-based index of the first article on the page, and
    results are listed 10 at a time for articles 1 through 4000.
    """

    # Number of articles listed per result page.
    PAGE_SIZE = 10
    # Highest article index the search listing exposes.
    MAX_RESULTS = 4000
    # Seconds to wait for a search page before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Store the fixed pieces of the search URL."""
        self.main_url = "https://search.naver.com/search.naver"
        self.query_url = "?where=news&sm=tab_pge&query="
        self.page_url = (
            "&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=23&mynews=0&office_type=0&"
            "office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start="
        )

    def crawl_news_by_query(self, query: str, count: int = 200) -> List[Dict]:
        """Collect news items for ``query`` from the search result pages.

        Args:
            query: Raw search phrase; it is percent-encoded before use.
            count: Maximum number of articles to return. A falsy value
                (``0`` or ``None``) means "as many as the listing exposes".

        Returns:
            A list of dicts with the keys ``href``, ``date``, ``title``,
            ``text`` and ``thumb``. ``date`` is ``None`` when no publish
            date could be determined; ``thumb`` is ``""`` when the item has
            no thumbnail.

        Raises:
            requests.exceptions.RequestException: If a search page cannot be
                fetched (including a timeout).
        """
        self.query = query
        parsed_query = urllib.parse.quote(query)

        news_data = []
        for s_idx in tqdm(self._page_starts(count)):
            url = f"{self.main_url}{self.query_url}{parsed_query}{self.page_url}{s_idx}"

            # request and soup
            req = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(req.text, "html.parser")

            # No result list on this page (past the last result, or an
            # unexpected response): later pages will not have one either.
            ul_lists = soup.find("ul", {"class": "list_news"})
            if ul_lists is None:
                break

            for li in ul_lists.find_all("li", {"class": "bx"}):
                item = self._parse_item(li)
                if item is not None:
                    news_data.append(item)

            # Randomized pause between pages to avoid hammering the server.
            time.sleep(random.uniform(0.6, 0.9))

        # Pages come in blocks of PAGE_SIZE; trim to what was asked for.
        if count and count > 0:
            news_data = news_data[:count]
        return news_data

    @classmethod
    def _page_starts(cls, count: int) -> List[int]:
        """Return the ``start`` indices of the pages needed for ``count`` items.

        The upper bound is inclusive of ``count`` so that, e.g., ``count=21``
        fetches a third page and ``count=1`` fetches the first one.
        """
        limit = min(count, cls.MAX_RESULTS) if count else cls.MAX_RESULTS
        return list(range(1, limit + 1, cls.PAGE_SIZE))

    def _parse_item(self, li: "bs4.element.Tag") -> Union[Dict, None]:
        """Turn one result ``<li>`` into a news dict, or ``None`` if it has no link."""
        # news link & title
        link = li.find("a", {"class": "news_tit"})
        if link is None or not link.get("href"):
            return None
        href = link["href"]
        title = link.get("title") or link.text

        # publish date
        publish_date = self._get_datetime(href, li)

        # news text (summary snippet); empty when the snippet is absent
        summary = li.find("a", {"class": "api_txt_lines dsc_txt_wrap"})
        text = summary.text if summary is not None else ""

        # thumbnail if it exists
        thumb_link = li.find("img", {"class": "thumb api_get"})
        thumb = thumb_link.get("src", "") if thumb_link is not None else ""

        return {
            "href": href,
            "date": publish_date,
            "title": title,
            "text": text,
            "thumb": thumb,
        }

    def _get_datetime(self, href: str, li: "bs4.element.Tag") -> Union[datetime, None]:
        """Best-effort publish date for one search result.

        First tries the date extracted by ``newspaper.Article`` from the
        article page itself; if that fails or yields nothing, falls back to
        the ``span.info`` texts shown in the search listing.

        NOTE: the article-derived value may be timezone-aware while the
        listing fallback is built from a naive ``datetime.now()``; callers
        comparing dates should normalize them first.

        Returns:
            The publish date, or ``None`` if neither source provides one.
        """
        try:
            article = Article(href, language="ko")
            article.download()
            article.parse()
            publish_date = article.publish_date
        except Exception:
            # Deliberately best-effort, but not a bare ``except`` so that
            # KeyboardInterrupt can still stop a long crawl.
            publish_date = None

        if publish_date is not None:
            return publish_date

        # An item can carry several ``span.info`` entries and not all of
        # them are dates, so take the first one that parses.
        for span in li.find_all("span", {"class": "info"}):
            parsed = self._parse_listing_date(span.text)
            if parsed is not None:
                return parsed
        return None

    @staticmethod
    def _parse_listing_date(
        text: str, now: Union[datetime, None] = None
    ) -> Union[datetime, None]:
        """Parse a listing date such as ``"13시간 전"`` or ``"2022.03.17."``.

        Args:
            text: Text of a ``span.info`` element; only its first
                whitespace-separated token is examined.
            now: Reference time for relative dates; defaults to
                ``datetime.now()``.

        Returns:
            The parsed datetime, or ``None`` if ``text`` is not a date.
        """
        tokens = text.split()
        if not tokens:
            return None
        token = tokens[0]
        if now is None:
            now = datetime.now()

        # Relative forms: "<n>분" minutes, "<n>시간" hours, "<n>일" days,
        # "<n>주" weeks.
        units = (("분", "minutes"), ("시간", "hours"), ("일", "days"), ("주", "weeks"))
        for marker, unit in units:
            if marker in token:
                digits = re.sub(r"\D", "", token)
                if not digits:
                    return None
                return now - timedelta(**{unit: int(digits)})

        # Absolute form, e.g. "2022.03.17." (note the trailing dot).
        try:
            return datetime.strptime(token, "%Y.%m.%d.")
        except ValueError:
            return None

## 2. Test

In [12]:
# Instantiate the crawler; construction only stores the search URL fragments.
crawler = QueryNewsCrawler()

In [17]:
# Smoke test: run a small live crawl for the query "코스피" (KOSPI).
query, count = "코스피", 20
news_data = crawler.crawl_news_by_query(query, count)

100%|██████████| 2/2 [00:08<00:00,  4.48s/it]


In [18]:
news_data

[{'href': 'http://news.tf.co.kr/read/economy/1925213.htm',
  'date': datetime.datetime(2022, 3, 17, 16, 28, 6, tzinfo=tzoffset(None, 32400)),
  'title': "[시황] 국내증시, 美 금리인상에도 '상승'…코스피 2700선 코앞",
  'text': '코스피, 1.33% 오른 2694.51 마감 17일 코스피지수가 1% 넘게 상승하며 2700선을 목전에 뒀다. 코스피와 코스닥 모두 외국인과 기관이 크게 매수했다. 코스피지수는 전일 대비 1.33%(35.28포인트) 상승한 2694.51에 마쳤다....',
  'thumb': 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F629%2F2022%2F03%2F17%2F138204.jpg&type=ff264_180&expire=2&refresh=true'},
 {'href': 'http://yna.kr/AKR20220317049551002?did=1195m',
  'date': datetime.datetime(2022, 3, 17, 16, 3, 56, tzinfo=tzoffset(None, 32400)),
  'title': "코스피, 미국 FOMC 불확실성 해소에 '안도 랠리'…1.33% 상승(종합)",
  'text': '코스닥지수 2.50% 급등…8거래일 만에 900대 회복 코스피가 17일 미국 연방준비제도(Fed·연준)의 금리 인상에도 안도 랠리를 펼쳤다. 이날 코스피는 전 거래일보다 35.28포인트(1.33%) 오른 2,694.51에 장을 마쳤다....',
  'thumb': 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F001%2F2022%2F03%2F17%2F