#### 프로젝트 생성

In [None]:
!scrapy startproject naver_article

#### items.py

In [None]:
%%writefile naver_article/naver_article/items.py
import scrapy

class NaverArticleItem(scrapy.Item):
    """Container for one scraped Naver news article.

    The fields are filled in by ``ArticleSpider.parse_page_contents``.
    """
    # Article headline with newlines, tabs and non-breaking spaces removed.
    title = scrapy.Field()
    # Publication timestamp, parsed by the spider into a ``datetime``.
    date = scrapy.Field()
    # Name of the news outlet, read from the outlet logo's alt/title attribute.
    press = scrapy.Field()
    # Article body text, joined into one cleaned string.
    content = scrapy.Field()
    # Section label; "TV연예" for entertain.naver.com pages, "-" when none is found.
    category = scrapy.Field()
    # URL of the article page the item was scraped from.
    link = scrapy.Field()
    # List of image ``src`` values found in the article's photo blocks.
    photo_url = scrapy.Field()

#### spider.py

In [1]:
%%writefile naver_article/naver_article/spiders/spider.py
import datetime
from urllib.parse import quote

import scrapy
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException

from naver_article.items import NaverArticleItem

class ArticleSpider(scrapy.Spider):
    """Collect Naver news articles that match a keyword within a date range.

    The search result pages are driven with a headless Chrome instance
    (Selenium) because the press and period filters are set through the page's
    own controls. Each article link found there is then fetched through
    Scrapy and turned into a ``NaverArticleItem``.

    Spider arguments (``-a name=value``):
        keyword: search term.
        start_date: first day to search, formatted ``YYYY.MM.DD``.
        end_date: last day to search, formatted ``YYYY.MM.DD``.
    """

    name = 'NaverArticle'

    # Format of the spider's date arguments and of the search form's period inputs.
    SEARCH_DATE_FORMAT = "%Y.%m.%d"
    # Format of an article timestamp once 오전/오후 have been mapped to AM/PM.
    ARTICLE_DATE_FORMAT = "%Y.%m.%d. %p %I:%M"
    USER_AGENT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    )

    def __init__(self, keyword="자살", start_date="2015.01.01", end_date="2018.12.31", **kwargs):
        """Store the search URL, the outlets to search and the date range.

        Raises:
            ValueError: if ``start_date`` or ``end_date`` is not ``YYYY.MM.DD``.
        """
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot break the query string.
        self.start_urls = [
            "https://search.naver.com/search.naver?where=news&sm=tab_jum&query={}".format(
                quote(keyword, safe="")
            )
        ]
        self.search_press_ls = [
            # General daily newspapers
            "경향신문", "국민일보", "동아일보", "문화일보", "서울신문", "세계일보", "조선일보", "중앙일보", "한겨레", "한국일보",
            # Broadcasters / news agencies
            "JTBC", "KBS", "MBC", "MBN", "SBS CNBC", "SBS", "TV조선", "YTN",
            "뉴스1", "뉴시스", "연합뉴스", "연합뉴스TV", "채널A", "한국경제TV",
            # Business newspapers
            "매일경제", "머니투데이", "서울경제", "아시아경제", "이데일리", "조선비즈",
            "조세일보", "파이낸셜뉴스", "한국경제", "헤럴드경제",
            # Internet / IT outlets
            "ZDNet Korea", "노컷뉴스", "데일리안", "디지털데일리", "디지털타임스", "머니S",
            "미디어오늘", "블로터", "아이뉴스24", "오마이뉴스", "전자신문", "프레시안",
            # Magazines
            "뉴스위크", "매경이코노미", "시사IN", "시사저널", "신동아", "월간 산", "이코노미스트",
            "주간경향", "주간동아", "주간조선", "중앙SUNDAY", "한겨레21", "한경비즈니스",
            # Specialty / photo outlets
            "기자협회보", "뉴스타파", "동아사이언스", "여성신문", "일다", "참세상",
            "코리아헤럴드", "코메디닷컴", "헬스조선",
        ]
        self.start_date = datetime.datetime.strptime(start_date, self.SEARCH_DATE_FORMAT)
        self.end_date = datetime.datetime.strptime(end_date, self.SEARCH_DATE_FORMAT)

        super().__init__(**kwargs)

    ## Helper methods

    def get_press_dict(self, driver):
        """Return a mapping of outlet name to Naver outlet id, e.g. {"경향신문": "1032"}.

        Clicks the ``news_popup`` control, reads the ``title`` of every label
        and the ``value`` of every input under ``#order_cat``, then clicks the
        control again (presumably closing the popup it opened).
        """
        driver.find_element_by_xpath('//*[@id="news_popup"]').click()
        keys = [
            element.get_attribute("title")
            for element in driver.find_elements_by_css_selector('#order_cat li label')
        ]
        values = [
            element.get_attribute("value")
            for element in driver.find_elements_by_css_selector('#order_cat li input')
        ]
        driver.find_element_by_xpath('//*[@id="news_popup"]').click()
        return dict(zip(keys, values))

    def set_search_press(self, driver):
        """Restrict the search to the outlets listed in ``self.search_press_ls``.

        Outlets that are not present in the page's press list are skipped with
        a warning instead of aborting the crawl.
        """
        press_dict = self.get_press_dict(driver)
        driver.find_element_by_xpath('//*[@id="news_popup"]').click()

        # Reset whatever is currently selected: make sure each category box is
        # checked, then click it once more so it ends up unchecked.
        categories = driver.find_elements_by_css_selector('#order_cat .viewtit input')
        for category in categories:
            if category.get_attribute("checked") != "true":
                category.click()
            category.click()

        # Tick every outlet we want to search.
        for press in self.search_press_ls:
            press_id = press_dict.get(press)
            if press_id is None:
                self.logger.warning("Press %r is not in the Naver press list; skipping it", press)
                continue
            driver.find_element_by_xpath('//*[@id="ca_{}"]'.format(press_id)).click()
        driver.find_element_by_xpath('//*[@id="_nx_option_media"]/div[2]/div[3]/button[1]').click()

    def _fill_date_input(self, driver, xpath, value, max_attempts):
        """Type ``value`` into the input at ``xpath``, retrying until it sticks."""
        for _ in range(max_attempts):
            driver.find_element_by_xpath(xpath).clear()
            driver.find_element_by_xpath(xpath).send_keys(value)
            if driver.find_element_by_xpath(xpath).get_attribute("value") == value:
                return
        raise RuntimeError(
            "could not set {} to {!r} after {} attempts".format(xpath, value, max_attempts)
        )

    def set_search_date(self, driver, start_date, end_date, max_attempts=50):
        """Set the search period on the result page and apply it.

        The original author noted that typing into these inputs is unreliable,
        so each input is cleared and retyped until its value matches. The
        retry is bounded by ``max_attempts`` so a changed page cannot hang the
        crawl forever.

        Args:
            driver: Selenium driver positioned on the news search page.
            start_date: first day of the period, ``YYYY.MM.DD``.
            end_date: last day of the period, ``YYYY.MM.DD``.
            max_attempts: how many times to retry each input before giving up.

        Raises:
            RuntimeError: if an input never holds the requested value.
        """
        driver.find_element_by_xpath('//*[@id="snb"]/div/ul/li[2]').click()
        self._fill_date_input(driver, '//*[@id="news_input_period_begin"]', start_date, max_attempts)
        self._fill_date_input(driver, '//*[@id="news_input_period_end"]', end_date, max_attempts)
        driver.find_element_by_xpath('//*[@id="_nx_option_date"]/div[2]/span/button').click()

    def content_clear(self, content):
        """Return ``content`` without newlines, tabs or non-breaking spaces, stripped."""
        return content.replace("\n", "").replace("\t", "").replace("\xa0", "").strip()

    def _build_driver(self):
        """Create the headless Chrome driver used to operate the search page."""
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        # NOTE(review): Chrome documents this switch as "window-size=W,H";
        # confirm that the "x" separator is actually honoured.
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        # The switch name is required; a bare UA string is not a user-agent option.
        options.add_argument("user-agent={}".format(self.USER_AGENT))
        return webdriver.Chrome(options=options)

    def _iter_result_links(self, driver):
        """Yield article hrefs from the current result page and all following pages.

        Stops when the "next" paging link is missing (last page). Any other
        Selenium failure ends this listing with a warning so the crawl can
        carry on with the next date window.
        """
        while True:
            try:
                links = [
                    element.get_attribute("href")
                    for element in driver.find_elements_by_xpath('//*[@id="main_pack"]/div/ul/li/dl/dd/a')
                ]
            except WebDriverException as exc:
                self.logger.warning("Could not read the result links: %s", exc)
                return
            for link in links:
                if link:
                    yield link
            try:
                # Go to the next result page.
                driver.find_element_by_css_selector('#main_pack > div > div.paging > a.next').click()
            except NoSuchElementException:
                # No "next" link means this was the last page.
                return
            except WebDriverException as exc:
                self.logger.warning("Could not open the next result page: %s", exc)
                return

    def _extract_date(self, response, xpaths):
        """Return the first timestamp that parses from ``xpaths``, or ``None``."""
        for xpath in xpaths:
            raw = response.xpath(xpath).extract_first()
            if raw is None:
                continue
            raw = raw.replace("오전", "AM").replace("오후", "PM").strip()
            try:
                return datetime.datetime.strptime(raw, self.ARTICLE_DATE_FORMAT)
            except ValueError:
                continue
        return None

    ## Spider entry points

    def start_requests(self):
        """Request the keyword search page; ``parse`` takes over from there."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Walk the search results in 10-day windows and request every article.

        A fresh headless Chrome session opens the search page, applies the
        press filter once, then sets the period for each window and pages
        through the results. The driver is always shut down, including when
        an error interrupts the crawl.
        """
        driver = self._build_driver()
        try:
            driver.get(response.url)
            driver.find_element_by_xpath('//*[@id="_search_option_btn"]').click()
            driver.find_element_by_xpath('//*[@id="main_pack"]/div/div[1]/div[3]/ul/li[2]/a').click()
            self.set_search_press(driver)

            step = datetime.timedelta(days=10)
            temp_start_date = self.start_date
            # Each window covers 10 days inclusive: start .. start + 9.
            temp_end_date = temp_start_date + datetime.timedelta(days=9)

            while True:
                # Clamp the final window to the requested end date.
                if temp_end_date >= self.end_date:
                    temp_end_date = self.end_date

                self.set_search_date(
                    driver,
                    temp_start_date.strftime(self.SEARCH_DATE_FORMAT),
                    temp_end_date.strftime(self.SEARCH_DATE_FORMAT),
                )
                for link in self._iter_result_links(driver):
                    yield scrapy.Request(link, callback=self.parse_page_contents, dont_filter=True)

                if temp_end_date == self.end_date:
                    break
                temp_start_date += step
                temp_end_date += step
        finally:
            driver.quit()

    def parse_page_contents(self, response):
        """Build a ``NaverArticleItem`` from an article page.

        Pages whose URL contains "entertain" use a different layout from the
        regular news pages, so each layout has its own set of XPaths. A page
        without a title, outlet name or parseable timestamp is skipped with a
        warning.
        """
        if "entertain" in response.url:
            title_xpath = '//*[@id="content"]/div[1]/div/h2/text()'
            press_xpath = '//*[@id="content"]/div[1]/div/div[1]/a/img/@alt'
            date_xpaths = (
                '//*[@id="main_content"]/div[1]/div[3]/div/span/text()',
                '//*[@id="content"]/div[1]/div/div[2]/span/em/text()',
            )
            body_xpath = '//*[@id="articeBody"]/text()'
            category = "TV연예"
        else:
            title_xpath = '//*[@id="articleTitle"]/text()'
            press_xpath = '//*[@id="main_content"]/div[1]/div[1]/a/img/@title'
            date_xpaths = (
                '//*[@id="main_content"]/div[1]/div[3]/div/span/text()',
                '//*[@id="main_content"]/div[1]/div[3]/div/span[2]/text()',
            )
            body_xpath = '//*[@id="articleBodyContents"]/text()'
            category = response.xpath('//*[@id="articleBody"]/div[2]/a/em/text()').extract_first(default="-")

        title = response.xpath(title_xpath).extract_first()
        press = response.xpath(press_xpath).extract_first()
        date = self._extract_date(response, date_xpaths)
        if title is None or press is None or date is None:
            self.logger.warning("Skipping %s: title, press or date could not be read", response.url)
            return

        paragraphs = [cont.strip() for cont in response.xpath(body_xpath).extract()]

        item = NaverArticleItem()
        item["title"] = self.content_clear(title)
        item["link"] = response.url
        item["category"] = category
        item["press"] = press
        item["date"] = date
        item["content"] = self.content_clear(" ".join(paragraphs))
        item["photo_url"] = response.xpath('//*[@class="end_photo_org"]/img/@src').extract()
        yield item

Overwriting naver_article/naver_article/spiders/spider.py


#### robots.txt 설정 무시

In [20]:
# Turn off robots.txt checking in the project settings, keeping a settings.py.bak backup.
# The backup suffix is attached to -i so the command works with both BSD and GNU sed.
!sed -i.bak 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' naver_article/naver_article/settings.py

#### 실행 파일 제작

In [2]:
%%writefile run.sh
# Run from inside the Scrapy project directory created by `scrapy startproject`.
cd naver_article/
# Crawl with the NaverArticle spider: -o writes the items to naver_article.csv,
# and each -a passes one spider argument (keyword, start_date, end_date).
scrapy crawl NaverArticle -o naver_article.csv -a keyword="자살" -a start_date="2015.01.01" -a end_date="2018.12.31"

Overwriting run.sh


#### 실행

In [None]:
!source run.sh 

In [1]:
import pandas as pd

# Load the CSV produced by the crawl and show the last rows as a sanity check.
df = pd.read_csv("naver_article/naver_article.csv")
df.tail()

Unnamed: 0,category,content,date,link,photo_url,press,title
120374,생활,[오마이뉴스 오창균 기자] 어릴 때의 기억이다. 트랜지스터 라디오에서 흘러나오던 목...,2015-01-10 14:24:00,https://news.naver.com/main/read.nhn?mode=LSD&...,,오마이뉴스,어릴 적 추억 담긴 '씨아기'와 '문래'... 뭔지 아시나요
120375,TV연예,‘하트투하트’ 최강희가 천정명과 강렬한 첫 만남을 가졌다. 9일 첫 방송한 tvN...,2015-01-09 23:56:00,https://entertain.naver.com/read?oid=014&aid=0...,,파이낸셜뉴스,"‘하트투하트’ 최강희, 사람 찌르는 천정명 목격? ‘스릴넘치는 첫만남’"
120376,사회,"내 인생의 첫 나무는 낙엽송이었다. 나무심기가 한창이던 1960년대, 초록색 싹눈이...",2015-01-10 14:10:00,https://news.naver.com/main/read.nhn?mode=LSD&...,,조선비즈,[인터뷰] 30년 나무와 숲이 가르쳐 준 지혜
120377,TV연예,‘하트투하트’ 천정명이 첫 회부터 강렬한 시작을 알렸다. 지난 9일 첫 방송된 t...,2015-01-10 10:48:00,https://entertain.naver.com/read?oid=014&aid=0...,,파이낸셜뉴스,"‘하트투하트’ 천정명, 뻔뻔하고 능청스러운 정신과 의사로 ‘완벽빙의’"
120378,TV연예,[이데일리 스타in 강민정 기자] 케이블챈ㄹ tvN 금토드라마 ‘하트투하트’가 시청...,2015-01-10 08:57:00,https://entertain.naver.com/read?oid=018&aid=0...,,이데일리,"최강희표 로코 '하트투하트', 기분 좋은 출발..1040女 꽉 잡았다"


In [6]:
# Convert the date strings to datetimes. pd.to_datetime is used because the
# unit-less "datetime64" dtype is rejected by astype() in recent pandas.
df.date = pd.to_datetime(df.date)

In [8]:
# Build a "year-month-day" label (no zero padding) for every article timestamp.
temp = df.date.apply(
    lambda stamp: "-".join(str(part) for part in (stamp.year, stamp.month, stamp.day))
)

In [10]:
len(temp.unique())

1461

In [11]:
# Day count for four 365-day years, to compare with the number of distinct dates.
# The crawl range in run.sh (2015.01.01 to 2018.12.31) includes leap year 2016,
# so the full range actually spans 1461 days.
365*4

1460

In [4]:
len(df.content.unique())

112369