### 프로젝트 시작

In [None]:
!scrapy startproject naver_news

### items.py

In [None]:
%%writefile naver_news/naver_news/items.py
# %load naver_news/naver_news/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaverNewsItem(scrapy.Item):
    """Container for the data scraped from a single Naver news article.

    The spider assigns every field by key and the item pipeline reads them
    back by the same keys when building the stored document.
    """

    # Publication date as text; the spider extracts it with a YYYY.MM.DD regex.
    date = scrapy.Field()
    # Publisher name, taken from the press logo image's title/alt attribute.
    press_agency = scrapy.Field()
    # Human-readable section label looked up from the URL's sid1 value.
    category = scrapy.Field()
    # URL of the article page the item was scraped from.
    link = scrapy.Field()
    # Article headline with surrounding whitespace stripped.
    title = scrapy.Field()
    # Article body text fragments joined with newlines.
    content = scrapy.Field()


### url 저장하는 함수

In [None]:
%%writefile naver_news/naver_news/spiders/naver_articles.py
import requests
import scrapy
from scrapy.http import TextResponse
from datetime import datetime, timedelta

def get_urls(category='105'):
    """Build the list-page URLs covering yesterday's articles of one section.

    The function probes every tenth list page (1, 11, 21, ...) until it
    reaches a pager that has no "next" link. The highest page number shown
    by that pager is taken as the last page.

    Args:
        category: Naver section id, sent as the ``sid1`` query parameter.

    Returns:
        A list with one list-page URL per page from 1 to the last page.
        The list is empty when no page number could be read from the pager.

    Raises:
        requests.exceptions.RequestException: if a probe request fails or
            does not answer within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    list_url = ('https://news.naver.com/main/list.nhn?mode=LSD&mid=sec'
                '&listType=title&sid1={}&date={}&page={}')
    date = (datetime.today() - timedelta(1)).strftime('%Y%m%d')

    last_page = 0
    for page in range(1, 1000, 10):
        # A timeout keeps a stalled connection from hanging the whole crawl.
        req = requests.get(list_url.format(category, date, page),
                           headers=headers, timeout=10)
        resp = TextResponse(req.url, body=req.text, encoding='utf-8')

        # The union must live inside ONE XPath expression; applying ``|`` to
        # two Python strings raises TypeError. ``strong`` holds the current
        # page and needs ``text()`` so a number, not markup, is returned.
        labels = resp.xpath(
            '//a[@class="nclicks(fls.page)"]/text()'
            ' | //div[@class="paging"]/strong/text()'
        ).extract()
        numbers = [int(label) for label in map(str.strip, labels)
                   if label.isdigit()]
        # Keep the highest page seen so far, so a probe without readable
        # numbers never lowers the result.
        last_page = max([last_page, *numbers])

        has_next = bool(resp.xpath(
            '//div[@class="paging"]/a[@class="next nclicks(fls.page)"]'))
        if not has_next:
            # No "next" link: this pager group contains the final page.
            print(last_page)
            break

    return [list_url.format(category, date, number)
            for number in range(1, last_page + 1)]


### spider.py

In [None]:
%%writefile naver_news/naver_news/spiders/spider.py
import time
import re
import requests
import scrapy
from .naver_articles import *
from scrapy.http import TextResponse
from naver_news.items import NaverNewsItem

class NaverNewsSpider(scrapy.Spider):
    """Crawl Naver news list pages and scrape every linked article.

    Flow: ``start_requests`` asks ``get_urls`` for the list pages of each
    configured section, ``parse`` follows the article links found on a list
    page, and ``parse_content`` turns an article page into a
    ``NaverNewsItem``.
    """

    name = 'naver_news'
    # Scrapy reads ``allowed_domains`` and expects bare domain names (no
    # scheme); the former ``allow_domain`` attribute was silently ignored.
    allowed_domains = ['naver.com']
    # Section id (the ``sid1`` URL parameter) -> label stored on the item.
    # Only the sections listed here are crawled. Other ids noted by the
    # original author: 100 politics, 101 economy, 102 society, 104 world.
    categ = {
        '103': '생활/문화',
        '105': 'IT/과학',
    }
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'

    def start_requests(self):
        """Yield one request per list page of every configured section."""
        all_category_urls = [get_urls(category) for category in self.categ]

        for urls in all_category_urls:
            for url in urls:
                yield scrapy.Request(url, callback=self.parse)
            # Pause between sections.
            # NOTE(review): time.sleep blocks the whole crawler while it
            # waits; consider DOWNLOAD_DELAY / AutoThrottle in settings.
            time.sleep(10)

    def parse(self, resp):
        """Follow every article link found on a list page."""
        links = resp.xpath('//*[@id="main_content"]/div[2]/ul/li/a/@href').extract()
        for link in links:
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(resp.urljoin(link), callback=self.parse_content)

    def parse_content(self, resp):
        """Build a ``NaverNewsItem`` from an article page.

        Pages that lack a tracked ``sid1`` section in their URL (for example
        entertainment or sports articles) or that do not match the expected
        layout are skipped without raising.
        """
        title = resp.xpath(
            '//*[@id="articleTitle"]/text()'
            ' | //*[@id="content"]/div[1]/div/h2/text()'
        ).extract_first()
        date_text = resp.xpath(
            '//*[@id="main_content"]/div[1]/div[3]/div/span[@class="t11"]/text()'
            ' | //div[@class="article_info"]/span[@class="author"]/em'
        ).extract_first()
        press_agency = resp.xpath(
            '//a[@class="nclicks(atp_press)"]/img/@title'
            ' | //div[@class="press_logo"]/a/img/@alt'
        ).extract_first()
        fragments = resp.xpath(
            '//*[@id="articleBodyContents"]/text()'
            ' | //*[@id="articleBodyContents"]/strong/text()'
            ' | //*[@id="articleBodyContents"]/div/text()'
            ' | //*[@id="articleBodyContents"]/div/div/text()'
            ' | //*[@id="articleBodyContents"]/font/text()'
            ' | //*[@id="articleBodyContents"]/div[2]/ul/li/span/span/text()'
            ' | //*[@id="articeBody"]/text()'
        ).extract()
        # Non-breaking spaces are replaced so stripping and joining work on
        # ordinary whitespace.
        content = [text.replace('\xa0', ' ').strip() for text in fragments]

        # Section id is whatever follows "sid1=" up to the next "&"; an
        # empty string (parameter missing) is never a key of ``categ``.
        section_id = resp.url.partition('sid1=')[2].split('&')[0]
        category = self.categ.get(section_id)
        date_match = re.search(r'[0-9]{4}[.][0-9]{2}[.][0-9]{2}', date_text or '')

        if None in (title, press_agency, category, date_match):
            # Untracked section or unexpected layout: skip instead of
            # raising, and keep a trace of what was dropped.
            self.logger.debug('Skipping article (untracked section or '
                              'unexpected layout): %s', resp.url)
            return

        item = NaverNewsItem()
        item['date'] = date_match.group(0)
        item['category'] = category
        item['press_agency'] = press_agency
        item['link'] = resp.url
        item['title'] = title.strip()
        item['content'] = '\n'.join(content).strip()
        yield item

### mongo db 저장 함수

In [None]:
%%writefile naver_news/naver_news/mongodb.py
import pymongo
# Create the client for the MongoDB server on this machine
client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
# Pick the "news" database, then its "articles" collection
db = client['news']
collection = db['articles']

### pipelines.py

In [None]:
%%writefile naver_news/naver_news/pipelines.py
# %load naver_news/naver_news/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from .mongodb import collection
import time

class NaverNewsPipeline:
    """Item pipeline that stores each scraped article in MongoDB."""

    def process_item(self, item, spider):
        """Insert one article into the collection and pass the item on.

        Args:
            item: Scraped article; its ``date``, ``category``,
                ``press_agency``, ``link``, ``title`` and ``content`` keys
                are read.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged item, so later pipelines can keep processing it.
        """
        # The publication date is stored under "p_date"; every other field
        # keeps its item key.
        data = {
            'p_date': item['date'],
            'category': item['category'],
            'press_agency': item['press_agency'],
            'link': item['link'],
            'title': item['title'],
            'content': item['content'],
            }
        print('='*5)
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        collection.insert_one(data)
        return item


### settings.py

In [None]:
%%writefile naver_news/naver_news/settings.py
# %load naver_news/naver_news/settings.py
# Scrapy settings for naver_news project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot
BOT_NAME = 'naver_news'

# Modules where Scrapy looks for spider classes
SPIDER_MODULES = ['naver_news.spiders']
# Module where the `scrapy genspider` command creates new spiders
NEWSPIDER_MODULE = 'naver_news.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'naver_news (+http://www.yourdomain.com)'

# Whether to obey robots.txt rules (disabled here: robots.txt is ignored)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'naver_news.middlewares.NaverNewsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'naver_news.middlewares.NaverNewsDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'naver_news.pipelines.NaverNewsPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Active item pipelines: every scraped item is passed to NaverNewsPipeline.
# The integer is the pipeline's order value (lower values run first).
ITEM_PIPELINES = {
    'naver_news.pipelines.NaverNewsPipeline': 300,
}