In [None]:
# 스크래피 프로젝트 만들기

In [None]:
!scrapy startproject stock

In [1]:
%%writefile stock/stock/items.py
import scrapy


class StockItem(scrapy.Item):
    title = scrapy.Field() # 해당 종목 뉴스 기사 타이틀
    news_link = scrapy.Field() # 뉴스 링크
    date = scrapy.Field() # 일자
    stock = scrapy.Field()
    stock_code = scrapy.Field() # 6.11 # 해당 종목 코드

Overwriting stock/stock/items.py


In [2]:
from datetime import datetime, timedelta
today = datetime.now().strftime('%Y.%m.%d %H:%M')
today = str(today).split(" ")[0].replace(".", "")
today

'20210628'

In [None]:
date = str(today).split(" ")[0].replace(".", "")
date

In [42]:
%%writefile stock/stock/spiders/spider.py
import scrapy
import re 
import pandas as pd
from stock.items import StockItem
from datetime import datetime, timedelta


class StockSpider(scrapy.Spider):
    name = "Stock"
    
    def start_requests(self):
        # 주식 코드 리스트화
        codes = pd.read_csv("/home/ubuntu/stock_code.csv")["ISU_SRT_CD"].tolist()
        # 해당 종목 링크 생성
        urls = [f"https://finance.naver.com/item/news_news.nhn?code={code}&page=&sm=title_entity_id.basic" for code in codes]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)
            
    def parse(self, response):
        # 해당 종목의 모든 페이지 url의 정보를 담는다.
        page_links = response.xpath('/html/body/div/table[2]/tr/td/a/@href').extract()
        last_page = 3
        stock_url = str(response.url)
        for page in range(1,int(last_page)+1):
            url = stock_url[:-25] + str(page) + stock_url[-25:]
            yield scrapy.Request(url, callback=self.parse_content1)  
    
    def parse_content1(self, response):
        
        today = datetime.now().strftime('%Y.%m.%d %H:%M')
        today = str(today).split(" ")[0].replace(".", "")
        
        # 모든 종목의 오늘에 해당하는 뉴스의 링크만 생성해서 넘기기
        
        links = response.xpath('/html/body/div/table[1]/tbody/tr/td[1]/a/@href').extract()
        link_date = response.xpath('/html/body/div/table[1]/tbody/tr/td[3]/text()').extract()
        
        
        request_links = []
        for date, link in zip(link_date, links):
            
            if date.replace(".", "")[1:-6] == today:
                print("="*100)
                print(date, link)
                print("="*100)
                request_links.append(link)
        
        for request_link in request_links:
            yield scrapy.Request("https://finance.naver.com/" + request_link, callback=self.parse_content2)
        
    def parse_content2(self, response):
        # 상세 페이지로 들어간 후 원하는 정보를 크롤링한다.
        item = StockItem()
        item["title"] = response.xpath('//*[@id="content"]/div[2]/table/tbody/tr[1]/th/strong/text()').extract()
        item["date"] = response.xpath('//*[@id="content"]/div[2]/table/tbody/tr[2]/th/span/span/text()').extract()
        item["news_link"] = response.url
        item['stock'] = response.xpath('//*[@id="middle"]/div[1]/div[1]/h2/a/text()').extract()
        item["stock_code"] = response.xpath('//*[@id="middle"]/div[1]/div[1]/div/span[1]/text()').extract()
        yield item    

Overwriting stock/stock/spiders/spider.py


In [45]:
%%writefile run.sh
cd stock
rm stock.csv
scrapy crawl Stock -o stock.csv

Overwriting run.sh


In [46]:
!source run.sh

2021-06-28 20:53:57 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: stock)
2021-06-28 20:53:57 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.5 (default, Mar 30 2021, 06:19:28) - [GCC 7.5.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Linux-5.4.0-1051-aws-x86_64-with-glibc2.27
2021-06-28 20:53:57 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-06-28 20:53:57 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'stock',
 'NEWSPIDER_MODULE': 'stock.spiders',
 'SPIDER_MODULES': ['stock.spiders']}
2021-06-28 20:53:57 [scrapy.extensions.telnet] INFO: Telnet Password: 313f01a2e5e94ddf
2021-06-28 20:53:57 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 

In [47]:
pd.read_csv("stock/stock.csv")

Unnamed: 0,date,news,news_link,stock,stock_code,title
0,2021.06.28 08:27,조선비즈,https://finance.naver.com//item/news_read.nhn?...,CJ ENM,,"NH투자증권 “CJ ENM, 2분기 미디어 중심 호실적 예상”"
1,2021.06.28 16:35,이데일리,https://finance.naver.com//item/news_read.nhn?...,CS,,"CS, 70억원 규모 전환사채 발행 결정"
2,2021.06.28 15:39,아시아경제,https://finance.naver.com//item/news_read.nhn?...,CJ제일제당,,"푸라닭치킨, CJ제일제당 '크레잇'과 함께 개발한 1호 제품 '텐더 치바로우' 선봬"
3,2021.06.28 10:51,조세일보,https://finance.naver.com//item/news_read.nhn?...,CJ제일제당,,"CJ제일제당, ‘술술 잘 풀리는 동치미 물냉면’ 프로모션"
4,2021.06.28 11:17,조세일보,https://finance.naver.com//item/news_read.nhn?...,CJ프레시웨이,,"CJ프레시웨이, 부산 대표 프랜차이즈 ‘정직유부’와 맞손"
...,...,...,...,...,...,...
424,2021.06.28 19:03,조선비즈,https://finance.naver.com//item/news_read.nhn?...,휴젤,,"GS그룹, 휴젤 인수전 뛰어드나... “바이오에 관심”"
425,2021.06.28 17:18,한국경제,https://finance.naver.com//item/news_read.nhn?...,휴온스글로벌,,"휴온스글로벌, 1회 접종 러시아 코로나 백신도 생산"
426,2021.06.28 10:12,머니투데이,https://finance.naver.com//item/news_read.nhn?...,휴비스,,SK케미칼·휴비스 손잡고 '친환경 원사' 새 장 열었다
427,2021.06.28 13:56,파이낸셜뉴스,https://finance.naver.com//item/news_read.nhn?...,휴온스글로벌,,"휴온스글로벌, 러 1회 접종 '스푸트니크 라이트' 생산도 맡는다"
