### 네이버TV 크롤링

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [68]:
def navertv():
    response = requests.get('https://tv.naver.com/r/')
    dom = BeautifulSoup(response.content, 'html.parser')
    top3 = dom.select('#container > div > div.top_main > div > div > div > ul > li')
    top100 = dom.select('#content > div > div > div > div')
    
    # top3
    datas = []
    for top in top3:
        datas.append({
            "title": top.select_one('[class="title"]').text.replace('\n', ''),
            "program": top.select_one('[class="ch"]').text,
            "count": top.select_one('[class="hit"]').text.replace('재생 수', ''),
            "like": top.select_one('[class="like"]').text.replace('좋아요 수', ''),
            "link": top.find('a').get('href')
            })
    top3_df = pd.DataFrame(datas, columns=['title', 'program', 'count', 'like', 'link'])
    
    #top100
    datas2 = []
    for top in top100:
        datas2.append({
            "title": top.select_one('tooltip').text,
            "program": top.select_one('.chn').text.strip(),
            "count": top.select_one('[class="hit"]').text.replace('재생 수', ''),
            "like": top.select_one('[class="like"]').text.replace('좋아요 수', ''),
            "link": top.find('a').get('href')
            })
    top100_df = pd.DataFrame(datas2, columns=['title', 'program', 'count', 'like', 'link'])

    return pd.concat([top3_df, top100_df]).reset_index(drop=True)

In [69]:
df = navertv()
df.iloc[:5]

Unnamed: 0,title,program,count,like,link
0,"[찐한 엔딩] 김혜수♡주지훈, 격정적인 KISS♥",하이에나,674987,6002,https://tv.naver.com/v/12870385/list/67096
1,"[선공개] 이상윤·육성재, 눈물과 함께한 집사부일체와의 마지막",집사부일체,92453,630,https://tv.naver.com/v/12873342/list/67096
2,"[선공개] 홍진영, ‘트롯퀸’ 김연자 드레스 보고 반짝이 의상 홀릭!",미운 우리 새끼,67006,221,https://tv.naver.com/v/12867654/list/67096
3,☞눈물주의☜ 임영웅 ‘배신자’♭ 얄밉게 떠난 님아☹,미스터트롯,1653334,19802,https://tv.naver.com/v/12840075/list/67096
4,[충격 엔딩] 나는… 네가 지금 미치도록 보고 싶다,이태원 클라쓰,444051,3715,https://tv.naver.com/v/12869932/list/67096


# Scrapy로 크롤링한 데이터 Mongodb에 저장하기

### 1. 프로젝트 생성

In [7]:
!scrapy startproject navertv

New Scrapy project 'navertv', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/notebook/크롤링프로젝트/navertv

You can start your first spider with:
    cd navertv
    scrapy genspider example example.com


In [9]:
!tree navertv

[01;34mnavertv[00m
├── [01;34mnavertv[00m
│   ├── __init__.py
│   ├── [01;34m__pycache__[00m
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── [01;34mspiders[00m
│       ├── __init__.py
│       └── [01;34m__pycache__[00m
└── scrapy.cfg

4 directories, 7 files


### 2. itmes.py 수정

In [10]:
! cat navertv/navertv/items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class NavertvItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


In [70]:
%%writefile navertv/navertv/items.py
import scrapy

class NavertvItem(scrapy.Item):
    title = scrapy.Field()
    program = scrapy.Field()
    count = scrapy.Field()
    like = scrapy.Field()
    link = scrapy.Field()

Overwriting navertv/navertv/items.py


### 3. spider.py 생성

In [101]:
%%writefile navertv/navertv/spiders/spider.py
import scrapy
from bs4 import BeautifulSoup

from navertv.items import NavertvItem

class spider(scrapy.Spider):
    
    name = "Navertv"
    allow_domain = ['tv.naver.com']
    #start_urls = ['https://tv.naver.com/r/']
    
    def start_requests(self):
        urls = 'https://tv.naver.com/r/'
        yield scrapy.Request(url=urls, callback=self.top3_content)
        
    def top3_content(self, response):
        dom = BeautifulSoup(response.body, 'html.parser')
        top3 = dom.select('#container > div > div.top_main > div > div > div > ul > li')
        top100 = dom.select('#content > div > div > div > div')
        
        for top in top3:
            title = top.select_one('[class="title"]').text.replace('\n', '')
            program = top.select_one('[class="ch"]').text
            count = top.select_one('[class="hit"]').text.replace('재생 수', '')
            like = top.select_one('[class="like"]').text.replace('좋아요 수', '')
            link = top.find('a').get('href')
                
            item = NavertvItem()
            item['title'] = title
            item['program'] = program
            item['count'] = count
            item['like'] = like
            item['link'] = link
            
            yield item
    
        for top in top100:
            title = top.select_one('tooltip').text
            program = top.select_one('.chn').text.strip()
            count = top.select_one('[class="hit"]').text.replace('재생 수', '')
            like = top.select_one('[class="like"]').text.replace('좋아요 수', '')
            link = top.find('a').get('href')
        
            item = NavertvItem()
            item['title'] = title
            item['program'] = program
            item['count'] = count
            item['like'] = like
            item['link'] = link
    
            yield item


Overwriting navertv/navertv/spiders/spider.py


In [1]:
%%writefile navertv/run.sh
cd navertv
#PATH = /home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages
#export PATH
scrapy crawl Navertv -o items.csv

Overwriting navertv/run.sh


In [15]:
!ls -al run.sh

-rw-rw-r-- 1 ubuntu ubuntu 45 Mar 15 14:32 run.sh


In [16]:
import os
os.chmod("run.sh", 0o764)

In [17]:
!ls -al run.sh

-rwxrw-r-- 1 ubuntu ubuntu 45 Mar 15 14:32 run.sh


In [4]:
! navertv/run.sh

navertv/run.sh: 2: navertv/run.sh: PATH: not found
2020-03-15 22:08:12 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: navertv)
2020-03-15 22:08:12 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Jan 17 2020, 08:43:56) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1060-aws-x86_64-with-debian-buster-sid
2020-03-15 22:08:12 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'navertv', 'FEED_FORMAT': 'csv', 'FEED_URI': 'items.csv', 'NEWSPIDER_MODULE': 'navertv.spiders', 'SPIDER_MODULES': ['navertv.spiders']}
2020-03-15 22:08:12 [scrapy.extensions.telnet] INFO: Telnet Password: 58c4bf00e2dd06ce
2020-03-15 22:08:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExpo

2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '13,905',
 'like': '42',
 'link': 'https://tv.naver.com/v/12881131/list/67096',
 'program': 'MBC뉴스',
 'title': '길게 늘어선 \'드라이브 스루\'…"한국 방식 배워야"'}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '6,782',
 'like': '365',
 'link': 'https://tv.naver.com/v/12879898/list/67096',
 'program': '집사부일체',
 'title': '“질투 나요” 육성재, 마지막 밤! 아쉬운 마음에 솔직 폭발하는 막내'}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '17,395',
 'like': '179',
 'link': 'https://tv.naver.com/v/12879192/list/67096',
 'program': '런닝맨',
 'title': '지석진, 유재석에 선 전화한 이광수에 삐짐↗ (ft. 유산슬)'}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '19,396',
 'like': '1,443',
 'link': 'https://tv.naver.com/v/12878214/list/67096',
 'program': 'SBS 인기가요',
 'title': '가요계 NEW 영웅 ‘NCT 127’의 폭발적인 스

2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '3,694',
 'like': '18',
 'link': 'https://tv.naver.com/v/12880005/list/67096',
 'program': '복면가왕',
 'title': "부드러운 목소리 '강변북로'의 정체는! 홍서범!"}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '5,819',
 'like': '36',
 'link': 'https://tv.naver.com/v/12879577/list/67096',
 'program': '복면가왕',
 'title': "매혹적인 음색 '컵라면'의 정체는! 리틀 혜은이 요요미!"}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '11,449',
 'like': '104',
 'link': 'https://tv.naver.com/v/12878321/list/67096',
 'program': '런닝맨',
 'title': '‘시청률 요정’ 임수향, 복학생 언니로 컴백! (ft. 프로게이머 소민)'}
2020-03-15 22:08:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '164,281',
 'like': '1,668',
 'link': 'https://tv.naver.com/v/12869504/list/67096',
 'program': '이태원 클라쓰',
 'title': '스윗♡하게 협박하는 박서준 ＂지금 움직이면, 너 해고야♨＂'}
20

### 4. settings.py 수정

In [26]:
! head -n 22 navertv/navertv/settings.py | tail -n 2

# Obey robots.txt rules
ROBOTSTXT_OBEY = True


In [28]:
! sed -i 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' navertv/navertv/settings.py

In [29]:
! head -n 22 navertv/navertv/settings.py | tail -n 2

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


In [102]:
! ./run.sh

2020-03-15 16:18:10 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: navertv)
2020-03-15 16:18:10 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Jan 17 2020, 08:43:56) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1060-aws-x86_64-with-debian-buster-sid
2020-03-15 16:18:10 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'navertv', 'FEED_FORMAT': 'csv', 'FEED_URI': 'items.csv', 'NEWSPIDER_MODULE': 'navertv.spiders', 'SPIDER_MODULES': ['navertv.spiders']}
2020-03-15 16:18:10 [scrapy.extensions.telnet] INFO: Telnet Password: 098c4b898d67281a
2020-03-15 16:18:10 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020

2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '2,356,721',
 'like': '21,171',
 'link': 'https://tv.naver.com/v/12584114/list/67096',
 'program': '미스터트롯',
 'title': '임영웅 ‘보라빛 엽서’♩ 심금을 울리는 목소리 ˘-˘乃'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '152,987',
 'like': '2,292',
 'link': 'https://tv.naver.com/v/12870179/list/67096',
 'program': '하이에나',
 'title': '“보고 싶어 왔다… 배알도 없이” 주지훈, 김혜수에 취중 고백♥'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '101,452',
 'like': '798',
 'link': 'https://tv.naver.com/v/12869991/list/67096',
 'program': '이태원 클라쓰',
 'title': '[메이킹] 단밤VS장가 웃음 터지는 ★최강포차★ 요리 대회 현장'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '363,572',
 'like': '8,307',
 'link': 'https://tv.naver.com/v/12839194/list/67096',
 'program': '미스터트롯',
 'title': '찬또한테 딱!딱! 붙어라♥ 이찬

2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '263,261',
 'like': '2,037',
 'link': 'https://tv.naver.com/v/12838823/list/67096',
 'program': '미스터트롯',
 'title': '장민호 ☆EDM★전사로 돌아오다!! ‘역쩐인생’♪'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '48,920',
 'like': '314',
 'link': 'https://tv.naver.com/v/12870383/list/67096',
 'program': '미스터트롯',
 'title': "♜6위·7위♜ '맏형' 장민호와 '해군병장' 김희재 ^‿^乃"}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '133,906',
 'like': '1,242',
 'link': 'https://tv.naver.com/v/12869418/list/67096',
 'program': '이태원 클라쓰',
 'title': '주주총회를 앞두고 쓰러진 김다미… 당황한 박서준..!'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '63,974',
 'like': '290',
 'link': 'https://tv.naver.com/v/12870322/list/67096',
 'program': '미스터트롯',
 'title': '장만호는 무효⚠ 유효 투표수는 ❛5,428,900표❜'}
20

2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '56,892',
 'like': '385',
 'link': 'https://tv.naver.com/v/12869124/list/67096',
 'program': '하이바이마마',
 'title': '뜻밖의 동병상련(?) 묘하게 통하는 김태희&고보결'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '107,130',
 'like': '588',
 'link': 'https://tv.naver.com/v/12869562/list/67096',
 'program': '하이바이마마',
 'title': '김태희X고보결과 마주친 이규형X이시우! 무슨 상황?! ㄴㅇ0ㅇㄱ'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '1,854,772',
 'like': '19,196',
 'link': 'https://tv.naver.com/v/12116952/list/67096',
 'program': '미스터트롯',
 'title': '임영웅 ‘일편단심 민들레야’♪ #안정 #섬세 #완벽 #✌'}
2020-03-15 16:18:11 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '15,324',
 'like': '20',
 'link': 'https://tv.naver.com/v/12872977/list/67096',
 'program': 'OSEN',
 'title': '[O!SPORT

### 5. pipeline.py 설정

In [103]:
%%writefile navertv/navertv/mongodb.py
import pymongo

client = pymongo.MongoClient('mongodb://minhye:dss@15.164.13.131:27017')
db = client.navertv
collection = db.data

Writing navertv/navertv/mongodb.py


In [104]:
!cat navertv/navertv/pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class NavertvPipeline(object):
    def process_item(self, item, spider):
        return item


In [116]:
%%writefile navertv/navertv/pipelines.py
import requests
from bs4 import BeautifulSoup
from .mongodb import collection

class NavertvPipeline(object):
    
    def process_item(self, item, spider):
        data = {
            "title": item["title"],
            "program": item["program"],
            "count": item["count"],
            "like": item["like"],
            "link": item["link"],
        }
        collection.insert(data)
        
        return item

Overwriting navertv/navertv/pipelines.py


In [106]:
# settings.py에서 파이프라인 연결

In [107]:
!echo "ITEM_PIPELINES = {" >> navertv/navertv/settings.py

In [110]:
!echo "    'navertv.pipelines.NavertvPipeline': 300," >> navertv/navertv/settings.py

In [111]:
!echo "}" >> navertv/navertv/settings.py

In [112]:
!tail -n 3 navertv/navertv/settings.py

ITEM_PIPELINES = {
    'navertv.pipelines.NavertvPipeline': 300,
}


In [2]:
! navertv/run.sh

2020-03-18 13:59:00 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: navertv)
2020-03-18 13:59:00 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Jan 17 2020, 08:43:56) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1060-aws-x86_64-with-debian-buster-sid
2020-03-18 13:59:00 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'navertv', 'FEED_FORMAT': 'csv', 'FEED_URI': 'items.csv', 'NEWSPIDER_MODULE': 'navertv.spiders', 'SPIDER_MODULES': ['navertv.spiders']}
2020-03-18 13:59:00 [scrapy.extensions.telnet] INFO: Telnet Password: e6941fb5480d7bff
2020-03-18 13:59:00 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020

2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '12,578',
 'like': '29',
 'link': 'https://tv.naver.com/v/12924117/list/67096',
 'program': '맛 좀 보실래요',
 'title': '[반전] 스윗남 안지훈, 한가림 돈 보고 들이댄 ‘속물!’'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '2,979,925',
 'like': '28,032',
 'link': 'https://tv.naver.com/v/12463458/list/67096',
 'program': '미스터트롯',
 'title': '임영웅 ‘어느 60대 노부부 이야기’♩ 첫 소절에 찡≋'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '9,760',
 'like': '1',
 'link': 'https://tv.naver.com/v/12925650/list/67096',
 'program': '뉴스는 YTN',
 'title': '[현장영상] "117만 가구에 최대 50만 원 지급 예정"'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '41,045',
 'like': '111',
 'link': 'https://tv.naver.com/v/12919888/list/67096',
 'program': '방법',
 'title': '악귀를 품은 정지소, 활활 타버린 성동일! 그리고 일상으로 

2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '21,876',
 'like': '41',
 'link': 'https://tv.naver.com/v/12919698/list/67096',
 'program': '방법',
 'title': '저주를 막기 위한 엄지원의 제안 ＂날 방법해＂'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '29,739',
 'like': '74',
 'link': 'https://tv.naver.com/v/12919892/list/67096',
 'program': '방법',
 'title': '악귀와 연결된 방법사! 엄지원에게 ＂저 언니 방법 못해요＂'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '172,947',
 'like': '118',
 'link': 'https://tv.naver.com/v/12865582/list/67096',
 'program': '아내의 맛',
 'title': '마마 vs 이웃집 악동들 자비없는 물총 전쟁! 하지만..._아내의 맛 89회 예고'}
2020-03-18 13:59:01 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tv.naver.com/r/>
{'count': '7,667',
 'like': '107',
 'link': 'https://tv.naver.com/v/12920893/list/67096',
 'program': '불타는 청춘',
 'title': '김형준, 18년 만에 슈가맨으로 돌아온 이유 고백☆'}
2020-