In [1]:
!scrapy startproject naver_news_02

New Scrapy project 'naver_news_02', using template directory 'c:\programdata\anaconda3\lib\site-packages\scrapy\templates\project', created in:
    C:\Code_dss15\project\02_CRAWLING\naver_news_02

You can start your first spider with:
    cd naver_news_02
    scrapy genspider example example.com


In [3]:
%%writefile naver_news_02/naver_news_02/items.py
# %load naver_news_02/naver_news_02/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NaverNews02Item(scrapy.Item):
    """Scrapy item holding the fields scraped for a single news article."""
    # Publication date of the article.
    date = scrapy.Field()
    # Name of the press agency that published the article.
    press_agency = scrapy.Field()
    # Category label of the article.
    category = scrapy.Field()
    # URL of the article page.
    link = scrapy.Field()
    # Article headline.
    title = scrapy.Field()
    # Article body text.
    content = scrapy.Field()


Overwriting naver_news_02/naver_news_02/items.py


In [1]:
!ls naver_news_02/naver_news_02/

__init__.py
__pycache__
article_url_1.csv
article_url_2.csv
article_url_3.csv
article_url_4.csv
article_url_half.csv
article_urls.csv
article_urls.xlsx
data
items.py
middlewares.py
mongodb.py
pipelines.py
settings.py
spiders


In [2]:
%%writefile naver_news_02/naver_news_02/spiders/spider.py
# %load naver_news_02/naver_news_02/spiders/spider.py
import pandas as pd
import time
import re
import requests
import scrapy
from scrapy.http import TextResponse
from naver_news_02.items import NaverNews02Item

class NaverNews02Spider(scrapy.Spider):
    """Crawl Naver News list pages and scrape every article they link to.

    The pages to visit are read from a CSV file with the columns ``categ``
    (the ``sid1`` section code), ``date`` and ``last_p`` (number of list
    pages for that section and date). Each list page yields article links,
    and each article page yields one ``NaverNews02Item``.

    The CSV location can be overridden on the command line, e.g.
    ``scrapy crawl naver_news_02 -a url_csv=path/to/file.csv``.
    """

    name = 'naver_news_02'
    # Scrapy only reads ``allowed_domains``, and expects bare host names
    # (no scheme). 'naver.com' also covers its sub-domains.
    allowed_domains = ['naver.com']
    user_agent= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    # Maps the ``sid1`` section code to the category label stored on the item.
    # Sections 100 and 104 are deliberately not mapped, so articles from
    # them are skipped.
    categ = {
        '101': '경제',
        '102': '사회',
        '103': '생활/문화',
        '105': 'IT/과학',
    }

    # Default CSV of (categ, date, last_p) rows; resolved relative to the
    # directory the crawl is started from.
    url_csv = 'naver_news_02/article_soci.csv'
    list_url = ('https://news.naver.com/main/list.nhn'
                '?mode=LSD&mid=sec&listType=title&sid1={}&date={}&page={}')

    # XPath alternatives for the different article page layouts.
    title_xpath = ' | '.join((
        '//*[@id="articleTitle"]/text()',
        '//*[@id="content"]/div[1]/div/h2/text()',
        '//h4[@class="title"]/text()',
    ))
    date_xpath = ' | '.join((
        '//*[@id="main_content"]/div[1]/div[3]/div/span[@class="t11"]/text()',
        '//div[@class="article_info"]/span[@class="author"]/em/text()',
        '//div[@class="info"]/span[1]/text()',
    ))
    # NOTE(review): the id "articeBody" is kept exactly as originally
    # written; it presumably mirrors the site's own markup - confirm before
    # "correcting" the spelling.
    content_xpath = ' | '.join((
        '//*[@id="articleBodyContents"]/text()',
        '//*[@id="articleBodyContents"]/strong/text()',
        '//*[@id="articleBodyContents"]/div/text()',
        '//*[@id="articleBodyContents"]/div/div/text()',
        '//*[@id="articleBodyContents"]/font/text()',
        '//*[@id="articleBodyContents"]/div[2]/ul/li/span/span/text()',
        '//*[@id="newsEndContents"]/text()',
        '//*[@id="articeBody"]/text()',
    ))
    press_xpath = ' | '.join((
        '//a[@class="nclicks(atp_press)"]/img/@title',
        '//div[@class="press_logo"]/a/img/@alt',
        '//*[@id="pressLogo"]/a/img/@alt',
    ))
    date_pattern = '[0-9]{4}[.][0-9]{2}[.][0-9]{2}'

    def start_requests(self):
        """Yield one request per list page described in the CSV file.

        A two second pause is inserted whenever the year-month prefix of the
        ``date`` column changes between consecutive rows.
        """
        df = pd.read_csv(self.url_csv)
        prev_month = '202011'

        for row in df.itertuples(index=False):
            # First six characters of the date value are compared with the
            # previous row's prefix (same length as the initial '202011').
            month = str(row.date)[:6]
            if month != prev_month:
                # NOTE(review): time.sleep blocks Scrapy's event loop; the
                # DOWNLOAD_DELAY setting is the usual way to throttle.
                time.sleep(2)
            for page in range(1, int(row.last_p) + 1):
                url = self.list_url.format(row.categ, row.date, page)
                yield scrapy.Request(url, callback=self.parse)
            prev_month = month

    def parse(self, resp):
        """Follow every article link found on a list page."""
        hrefs = resp.xpath('//*[@id="main_content"]/div[2]/ul/li/a/@href').getall()
        for href in hrefs:
            # resp.follow resolves relative hrefs against the page URL.
            yield resp.follow(href, callback=self.parse_content)

    def _section_code(self, resp):
        """Return the ``sid1`` query value for this article, or None.

        The final URL is checked first; if the article was redirected, the
        URLs recorded in ``redirect_urls`` are checked as well.
        """
        meta = getattr(resp, 'meta', None) or {}
        candidates = [resp.url] + list(meta.get('redirect_urls', []))
        for url in candidates:
            found = re.search(r'sid1=([^&]*)', url)
            if found:
                return found.group(1)
        return None

    def parse_content(self, resp):
        """Build a ``NaverNews02Item`` from an article page.

        Pages where the title, date, press agency or a mapped category
        cannot be extracted are skipped with a warning instead of raising.
        """
        title = resp.xpath(self.title_xpath).get()
        date_text = resp.xpath(self.date_xpath).get()
        press_agency = resp.xpath(self.press_xpath).get()
        date_match = re.search(self.date_pattern, date_text) if date_text else None
        category = self.categ.get(self._section_code(resp))

        required = {
            'title': title,
            'date': date_match,
            'press_agency': press_agency,
            'category': category,
        }
        missing = [field for field, value in required.items() if value is None]
        if missing:
            self.logger.warning('Skipping %s: could not extract %s',
                                resp.url, ', '.join(missing))
            return

        paragraphs = [
            text.replace('\xa0', ' ').strip()
            for text in resp.xpath(self.content_xpath).getall()
        ]

        item = NaverNews02Item()
        item['date'] = date_match.group(0)
        item['category'] = category
        item['press_agency'] = press_agency
        item['link'] = resp.url
        item['title'] = title.strip()
        item['content'] = '\n'.join(paragraphs).strip()

        yield item

Overwriting naver_news_02/naver_news_02/spiders/spider.py


In [5]:
%%writefile naver_news_02/naver_news_02/pipelines.py
# %load naver_news_02/naver_news_02/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from .mongodb import collection

class NaverNews02Pipeline:
    """Item pipeline that stores each scraped article in MongoDB."""

    def process_item(self, item, spider):
        """Insert the item into the MongoDB collection and pass it on.

        Args:
            item: scraped item providing the keys ``date``, ``category``,
                ``press_agency``, ``link``, ``title`` and ``content``.
            spider: the spider that produced the item.

        Returns:
            The unchanged item, so that later pipelines can process it.
        """
        # The item's ``date`` is stored under the document key ``p_date``.
        data = {
            'p_date': item['date'],
            'category': item['category'],
            'press_agency': item['press_agency'],
            'link': item['link'],
            'title': item['title'],
            'content': item['content'],
            }
        # Collection.insert() is deprecated since PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        collection.insert_one(data)
        spider.logger.debug('Stored article %s', data['link'])
        return item


Overwriting naver_news_02/naver_news_02/pipelines.py


In [1]:
%%writefile naver_news_02/naver_news_02/mongodb.py
import pymongo
# Connect to the MongoDB server on the local machine.
client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
# Select the database and the collection used for the scraped articles.
db = client['news']
collection = db['articles_society']

Overwriting naver_news_02/naver_news_02/mongodb.py


In [7]:
%%writefile naver_news_02/naver_news_02/settings.py
# %load naver_news_02/naver_news_02/settings.py
# Scrapy settings for naver_news_02 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project's bot.
BOT_NAME = 'naver_news_02'

# Package searched for spiders, and the module where newly generated spiders are placed.
SPIDER_MODULES = ['naver_news_02.spiders']
NEWSPIDER_MODULE = 'naver_news_02.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'naver_news_02 (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed: this project disables the check.
# NOTE(review): confirm that crawling without honoring robots.txt is intended and permitted.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'naver_news_02.middlewares.NaverNews02SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'naver_news_02.middlewares.NaverNews02DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'naver_news_02.pipelines.NaverNews02Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Active item pipeline configuration (the commented-out template entry
# earlier in this file is not used). NaverNews02Pipeline is enabled with
# order value 300.
ITEM_PIPELINES = {
    'naver_news_02.pipelines.NaverNews02Pipeline': 300,
}

Overwriting naver_news_02/naver_news_02/settings.py
