# Query News Crawler

## 0. imports

In [1]:
%load_ext lab_black

In [2]:
import sys

# Add the parent directory to the module search path. Presumably this is what
# lets the project-local `crawler` package (imported further down) resolve
# when the notebook is run from its own folder -- confirm against the repo
# layout.
sys.path.append("..")

In [130]:
import os
import re
import time
import random
import dill
import requests
import urllib

from collections import defaultdict
from datetime import datetime, timedelta
from typing import List, Dict, Union

from bs4 import BeautifulSoup
from bs4.element import NavigableString
from kss import split_sentences

from tqdm import tqdm, trange
from crawler.utils import clean_text

## 1. QueryNewsCrawler class

### 1.1 line by line coding

In [152]:
# Percent-encode the search keyword so it can be embedded in the URL.
query = urllib.parse.quote("코스피")

# Search endpoint and the query-string fragment that precedes the keyword.
main_url = "https://search.naver.com/search.naver"
query_url = "?where=news&sm=tab_pge&query="
# Naver news search is not paginated by page number; instead the `start`
# parameter is the index of the first article to show.
# Articles 1 to 4000 are available, listed 10 at a time.
page_url = (
    "&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=23&mynews=0"
    "&office_type=0&office_section_code=0&news_office_checked="
    "&nso=so:r,p:all,a:all&start="
)

# Every valid `start` value: 1, 11, 21, ..., 3991.
start_range = [*range(1, 4000, 10)]
start = 1
# URL of the first result page for the encoded keyword.
url = main_url + query_url + query + page_url + str(start)

In [124]:
# req = requests.get(url)
# soup = BeautifulSoup(req.text, "html.parser")

# # ul lists
# ul_lists = soup.find("ul", {"class": "list_news"})
# # news links
# links = ul_lists.find_all("a", {"class": "news_tit"})
# link_title_dict = {row["href"]: {"title": row["title"]} for row in links}


# # update news texts
# news_texts = ul_lists.find_all("a", {"class": "api_txt_lines dsc_txt_wrap"})
# for row in news_texts:
#     link_title_dict[row["href"]]["text"] = row.text

In [125]:
# Fetch one result page and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of as an obscure
# AttributeError further down when the expected markup is missing.
req = requests.get(url, timeout=10)
req.raise_for_status()
soup = BeautifulSoup(req.text, "html.parser")

# The results live in <ul class="list_news">; each <li class="bx"> inside it
# is one article. If the list is absent (e.g. no results), fall back to an
# empty list rather than dereferencing None.
ul_lists = soup.find("ul", {"class": "list_news"})
li_lists = ul_lists.find_all("li", {"class": "bx"}) if ul_lists is not None else []

news_data = []
for li in li_lists:

    # News link & title; skip list items that carry no title link.
    link = li.find("a", {"class": "news_tit"})
    if link is None:
        continue
    href, title = link["href"], link["title"]

    # News summary text; not every item is guaranteed to have one.
    desc = li.find("a", {"class": "api_txt_lines dsc_txt_wrap"})
    text = desc.text if desc is not None else ""

    # Thumbnail URL if the item has one, otherwise an empty string.
    thumb_link = li.find("img", {"class": "thumb api_get"})
    thumb = thumb_link["src"] if thumb_link else ""

    # Collect one record per article.
    news_data.append({"href": href, "title": title, "text": text, "thumb": thumb})

In [127]:
# news_data

### 1.2 Class

In [136]:
class QueryNewsCrawler:
    """Collect Naver news search results for a keyword.

    Naver news search is not paginated by page number: the ``start`` URL
    parameter is the 1-based index of the first article shown. Results are
    listed 10 at a time and only articles 1 to 4000 are available.
    """

    # Number of articles per result page and the highest article index served.
    PAGE_SIZE = 10
    MAX_RESULTS = 4000
    # Seconds to wait on a request before giving up, so a stalled connection
    # cannot block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # URL pieces; the encoded keyword goes between `query_url` and
        # `page_url`, and the start index is appended at the very end.
        self.main_url = "https://search.naver.com/search.naver"
        self.query_url = "?where=news&sm=tab_pge&query="
        self.page_url = (
            "&sort=0&photo=0&field=0&pd=0&ds=&de=&cluster_rank=23&mynews=0&office_type=0&"
            "office_section_code=0&news_office_checked=&nso=so:r,p:all,a:all&start="
        )

    @classmethod
    def _start_indices(cls, count: int) -> range:
        """Return the ``start`` values of the pages needed to cover ``count`` articles."""
        # A falsy count (0 / None) means "everything available"; larger
        # requests are capped because nothing past MAX_RESULTS is served.
        limit = min(count, cls.MAX_RESULTS) if count else cls.MAX_RESULTS
        # `limit + 1` so that the page containing article number `limit` is
        # included (e.g. count=1 -> [1], count=11 -> [1, 11]).
        return range(1, limit + 1, cls.PAGE_SIZE)

    def _build_url(self, parsed_query: str, s_idx: int) -> str:
        """Return the search URL for an already percent-encoded query and start index."""
        return f"{self.main_url}{self.query_url}{parsed_query}{self.page_url}{s_idx}"

    @staticmethod
    def _parse_item(li) -> Union[Dict, None]:
        """Turn one ``<li class="bx">`` element into a record, or None if it has no title link."""
        link = li.find("a", {"class": "news_tit"})
        if link is None:
            return None

        # Summary text and thumbnail are optional; default both to "".
        desc = li.find("a", {"class": "api_txt_lines dsc_txt_wrap"})
        thumb_link = li.find("img", {"class": "thumb api_get"})
        return {
            "href": link["href"],
            "title": link["title"],
            "text": desc.text if desc is not None else "",
            "thumb": thumb_link["src"] if thumb_link else "",
        }

    def crawl_news_by_query(self, query: str, count: int = 200) -> List[Dict]:
        """Crawl search result pages for ``query``.

        Args:
            query: Raw (unencoded) search keyword. Also stored on
                ``self.query``.
            count: Number of articles wanted. Whole pages are fetched, so the
                result may hold up to the next multiple of 10. A falsy value
                requests everything available (up to 4000).

        Returns:
            A list of dicts with the keys ``href``, ``title``, ``text`` and
            ``thumb`` (empty string when the article has no thumbnail).

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure or a non-success HTTP status.
        """
        self.query = query
        parsed_query = urllib.parse.quote(query)

        news_data = []
        for s_idx in tqdm(self._start_indices(count)):
            # request and soup
            req = requests.get(
                self._build_url(parsed_query, s_idx), timeout=self.REQUEST_TIMEOUT
            )
            req.raise_for_status()
            soup = BeautifulSoup(req.text, "html.parser")

            # A page without the result list means the results ran out;
            # stop and keep what has been collected so far.
            ul_lists = soup.find("ul", {"class": "list_news"})
            if ul_lists is None:
                break

            for li in ul_lists.find_all("li", {"class": "bx"}):
                item = self._parse_item(li)
                if item is not None:
                    news_data.append(item)

            # Randomized pause between pages to avoid hammering the server.
            time.sleep(random.uniform(0.6, 0.9))

        return news_data

## 2. Test

In [137]:
# Create the crawler; the constructor only stores the URL fragments and does
# not touch the network.
crawler = QueryNewsCrawler()

In [149]:
# Smoke test: crawl the keyword below and ask for 20 articles.
# This issues live HTTP requests to Naver, with a short random pause after
# each result page.
query = "코스피"
count = 20
news_data = crawler.crawl_news_by_query(query=query, count=count)

100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


In [150]:
news_data[:2]

[{'href': 'http://www.newsis.com/view/?id=NISX20220314_0001792931&cID=15001&pID=15000',
  'title': '美금리인상 임박…코스피 2500선 하락 우려도',
  'text': '5%p 인상 \'빅스텝\' 가능성도 열어둬 주중 선반영시 코스피 2500대 하락 우려 증권가 "0.25%p 올리고 정책조정... 코스피가 이 같은 연준의 빅스텝(0.5%포인트 인상)을 선반영할 경우 주중 2600선 아래로 떨어질 수 있다는...',
  'thumb': 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F003%2F2022%2F03%2F15%2F11062029.jpg&type=ff264_180&expire=2&refresh=true'},
 {'href': 'http://yna.kr/AKR20220315122700002?did=1195m',
  'title': '긴축 우려 등에 코스피 사흘째 하락…2,620대로 밀려',
  'text': '코스닥지수, 혼조 흐름 속 하락 마감 코스피가 사흘째 하락하며 2,620대까지 밀렸다. 15일 코스피는 전 거래일보다 24.12포인트(0.91%) 내린 2,621.53에 마감했다. 지수는 전장보다 15.34포인트(0.58%) 낮은 2,630....',
  'thumb': 'https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F001%2F2022%2F03%2F15%2F13052478.jpg&type=ff264_180&expire=2&refresh=true'}]