#### Nielsen
----

In [1]:
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup

#### 1. 프로젝트 생성

In [2]:
# Scaffold a new Scrapy project named `nielsen` under the current working directory.
!scrapy startproject nielsen

New Scrapy project 'nielsen', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/crawling/nielsen

You can start your first spider with:
    cd nielsen
    scrapy genspider example example.com


#### 2. items.py 수정

In [12]:
%%writefile ./nielsen/nielsen/items.py

import scrapy
class NielsenItem(scrapy.Item):
    """One row of a Nielsen Korea TV rating ranking table, as filled in by the spider."""

    rank = scrapy.Field()       # rank text taken from the ranking table
    broadcast = scrapy.Field()  # broadcasting channel (e.g. "SBS")
    program = scrapy.Field()    # programme title
    rate = scrapy.Field()       # rating value, stored as the stripped cell text

Overwriting ./nielsen/nielsen/items.py


#### 3. spider.py 생성

In [71]:
%%writefile ./nielsen/nielsen/spiders/spider.py

import scrapy
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

from nielsen.items import NielsenItem
from scrapy.http import HtmlResponse
from scrapy.selector import Selector


class Spider(scrapy.Spider):
    """Scrape the daily TV rating ranking tables of nielsenkorea.co.kr.

    Run with ``scrapy crawl Nielsen -a category=<n>``. ``category`` is inserted
    into the ``sub_menu`` query parameter of the ranking page URL. Category
    ``1`` is parsed as a 20-row ranking, every other category as a 10-row
    ranking.
    """

    name = 'Nielsen'

    # Ranking page URL; ``{}`` receives the ``category`` spider argument.
    URL_TEMPLATE = 'http://nielsenkorea.co.kr/tv_terrestrial_day.asp?menu=Tit_1&sub_menu={}_1'

    def __init__(self, **kwargs):
        """Build the start URL from the mandatory ``category`` spider argument.

        Raises:
            KeyError: if ``category`` was not supplied.
        """
        try:
            category = kwargs['category']
        except KeyError:
            raise KeyError(
                "missing spider argument 'category' (use: scrapy crawl Nielsen -a category=1)"
            ) from None

        # Kept privately as text so the callback choice does not depend on
        # comparing against a fully formatted URL.
        self._category = str(category)
        # Scrapy expects ``start_urls`` to be a list of URLs, not a bare string.
        self.start_urls = [self.URL_TEMPLATE.format(category)]

        super().__init__(**kwargs)

    def start_requests(self):
        """Request the ranking page and route it to the parser for its category."""
        callback = self.get_content1 if self._category == '1' else self.get_content2
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=callback)

    def get_content1(self, response):
        """Parse the 20-row ranking served for category 1."""
        yield from self._parse_ranking(response, row_count=20)

    def get_content2(self, response):
        """Parse the 10-row ranking served for every other category."""
        yield from self._parse_ranking(response, row_count=10)

    def _parse_ranking(self, response, row_count):
        """Yield one ``NielsenItem`` per ranking row, up to ``row_count`` rows."""
        dom = BeautifulSoup(response.body, 'html.parser')

        # `.tb_txt_center` cells alternate rank, broadcaster, rank, broadcaster, ...
        centered_cells = dom.select('.tb_txt_center')[:row_count * 2]
        ranks = centered_cells[0::2]
        broadcasts = centered_cells[1::2]
        programs = dom.select('table.ranking_tb .tb_txt')[:row_count]

        # Ratings are taken as the first ten `.percent` cells followed by the
        # first ten `.percent_g` cells.
        # NOTE(review): presumably rows 1-10 and 11-20 use different CSS
        # classes; confirm against the live page markup.
        rates = (dom.select('table.ranking_tb .percent')[:10]
                 + dom.select('table.ranking_tb .percent_g')[:10])

        for rank, broadcast, program, rate in zip(ranks, broadcasts, programs, rates):
            # Build a fresh item per row: re-yielding one shared, mutated item
            # lets later rows overwrite earlier ones for any consumer that
            # keeps a reference to it.
            yield NielsenItem(
                rank=rank.text.split('\t')[0],
                broadcast=broadcast.text.split('\t')[0],
                program=program.text.split('\t')[0],
                rate=rate.text.strip(),
            )

Overwriting ./nielsen/nielsen/spiders/spider.py


In [77]:
%%writefile nielsen/run.sh
#!/usr/bin/env bash
# Crawl the three Nielsen ranking categories, writing one CSV per category.
set -euo pipefail

# Work from the Scrapy project directory (the directory holding this script)
# so the script behaves the same regardless of the caller's working directory.
cd "$(dirname "$0")"

# `scrapy crawl -o` appends to an existing feed file, so remove the outputs of
# any previous run first; otherwise re-running duplicates rows and headers.
rm -f nielsen_ground.csv nielsen_general.csv nielsen_cabletv.csv

scrapy crawl Nielsen -o nielsen_ground.csv -a category=1
scrapy crawl Nielsen -o nielsen_general.csv -a category=2
scrapy crawl Nielsen -o nielsen_cabletv.csv -a category=3

Overwriting nielsen/run.sh


In [7]:
# Make run.sh executable for its owner (mode 764 = rwxrw-r--).
!chmod 764 ./nielsen/run.sh

In [62]:
# Confirm the permission bits that were just set on run.sh.
!ls -al ./nielsen/run.sh

-rwxrw-r-- 1 ubuntu ubuntu 184 Mar 18 14:31 ./nielsen/run.sh


In [78]:
%%writefile ./nielsen/nielsen/mongodb.py
import os

import pymongo

# The connection string (which carries the credentials) is read from the
# NIELSEN_MONGO_URI environment variable so that real secrets never have to be
# written into this file. The fallback is the original placeholder value.
MONGO_URI = os.environ.get('NIELSEN_MONGO_URI', 'mongodb://id:pw@ip')

client = pymongo.MongoClient(MONGO_URI)
db = client.nielsen_server
# Collection the item pipeline inserts scraped rows into.
collection = db.items

Overwriting ./nielsen/nielsen/mongodb.py


In [79]:
%%writefile ./nielsen/nielsen/pipelines.py
from .mongodb import collection

class NielsenPipeline(object):
    """Item pipeline that stores every scraped ranking row in MongoDB."""

    def process_item(self, item, spider):
        """Insert the item's fields as one document and pass the item on.

        Args:
            item: scraped item providing ``rank``, ``program``, ``broadcast``
                and ``rate``.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipeline stages and feed exports
            still receive it.
        """
        columns = ["rank", "program", "broadcast", "rate"]
        data = {column: item[column] for column in columns}
        # `Collection.insert` is deprecated in PyMongo 3 and removed in
        # PyMongo 4; `insert_one` is the supported single-document call.
        collection.insert_one(data)

        return item

Overwriting ./nielsen/nielsen/pipelines.py


In [65]:
# Enable the MongoDB pipeline in the project's settings.py.
# Done in Python rather than with chained `echo >>` calls: the shell quoting was
# fragile, and appending unconditionally added a duplicate ITEM_PIPELINES block
# every time the cell was re-run.
settings_path = "nielsen/nielsen/settings.py"
pipeline_block = (
    "\nITEM_PIPELINES = {\n"
    "    'nielsen.pipelines.NielsenPipeline': 300,\n"
    "}\n"
)

with open(settings_path) as settings_file:
    # Only an uncommented assignment counts; lines starting with `#` do not
    # match this prefix test.
    already_enabled = any(line.startswith("ITEM_PIPELINES") for line in settings_file)

if not already_enabled:
    with open(settings_path, "a") as settings_file:
        settings_file.write(pipeline_block)

In [67]:
# Show the last three lines of settings.py to check the appended pipeline setting.
!tail -n 3 nielsen/nielsen/settings.py

ITEM_PIPELINES = { 
 'nielsen.pipelines.NielsenPipeline' : 300, 
}


In [80]:
# Run the crawl script: one `scrapy crawl Nielsen` per category, each writing its own CSV.
!./nielsen/run.sh

2020-03-18 14:44:33 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: nielsen)
2020-03-18 14:44:33 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Jan 17 2020, 13:17:41) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1060-aws-x86_64-with-debian-buster-sid
2020-03-18 14:44:33 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'nielsen', 'FEED_FORMAT': 'csv', 'FEED_URI': 'nielsen_ground.csv', 'NEWSPIDER_MODULE': 'nielsen.spiders', 'SPIDER_MODULES': ['nielsen.spiders']}
2020-03-18 14:44:33 [scrapy.extensions.telnet] INFO: Telnet Password: 8529dafb31f1a0f2
2020-03-18 14:44:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogSta

2020-03-18 14:44:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020-03-18 14:44:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddle

In [81]:
# Maps each `category` spider argument to the suffix of the CSV written for it.
categories = dict(enumerate(('ground', 'general', 'cabletv'), start=1))
categories

{1: 'ground', 2: 'general', 3: 'cabletv'}

In [82]:
# Load each category's crawl output into its own DataFrame, in `categories` order.
dfs = list(map(
    pd.read_csv,
    ("nielsen/nielsen_{}.csv".format(suffix) for suffix in categories.values()),
))

In [83]:
# Pair every category name with the number of rows scraped for it.
list(zip(categories.values(), map(len, dfs)))

[('ground', 20), ('general', 10), ('cabletv', 10)]

In [76]:
# Delete the three CSV files written by run.sh.
# NOTE(review): this cell's execution count (76) is lower than its neighbours',
# so it was presumably run before the crawl to clear stale output; confirm the
# intended position.
!rm -rf nielsen/nielsen_cabletv.csv
!rm -rf nielsen/nielsen_general.csv
!rm -rf nielsen/nielsen_ground.csv

In [86]:
# Stack the per-category frames into one table with a fresh 0..n-1 index.
result_df = pd.concat(objs=dfs, axis=0, ignore_index=True)
# Rows 18-21 straddle the boundary between the first and second category.
result_df.iloc[18:22]

Unnamed: 0,broadcast,program,rank,rate
18,SBS,생활의달인,18,5.9
19,SBS,불타는청춘,20,5.8
20,TV CHOSUN,세상어디에도없는아내의맛2부,1,10.566
21,TV CHOSUN,세상어디에도없는아내의맛1부,2,6.019


In [87]:
import os

import pymongo

# SECURITY: the connection string used to be written here with a real username,
# password and host. It is now read from the MONGO_URI environment variable
# (format: mongodb://<user>:<password>@<host>:<port>); a KeyError is raised if
# it is not set. The previously committed password should be rotated.
client = pymongo.MongoClient(os.environ['MONGO_URI'])
# NOTE(review): this reads `wavve_server`, while the pipeline's mongodb.py
# writes to `nielsen_server` - confirm which database is intended here.
db = client.wavve_server
collection = db.items

In [91]:
# Query every document in the collection, leaving out MongoDB's `_id` field.
datas = collection.find(filter={}, projection={"_id": False})

In [92]:
# Materialise the query result as a DataFrame and show its last two rows.
pd.DataFrame(datas).iloc[-2:]

Unnamed: 0,count,date,title
598,11회,2004-01-17(토),X맨 1
599,11회,2020-01-08(수),팔로우 미 시즌12
