## 티빙VOD 크롤링

### 프로젝트 생성

In [79]:
# Generate the Scrapy project skeleton in ./tving_vod.
!scrapy startproject tving_vod

New Scrapy project 'tving_vod', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/notebook/Project_crawling/tving_vod

You can start your first spider with:
    cd tving_vod
    scrapy genspider example example.com


### items.py 수정

In [80]:
# Print the generated items.py template before it is overwritten.
!cat tving_vod/tving_vod/items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TvingVodItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


In [1]:
% % writefile tving_vod/tving_vod/items.py
import scrapy

class TvingVodItem(scrapy.Item):
    """Container for one VOD episode record produced by the TvingVod spider.

    Each field is filled from the episode API's JSON by the spider's
    ``get_content`` callback; the trailing comments name the JSON path used.
    """
    program = scrapy.Field()         # data['program']['name']['ko']
    title = scrapy.Field()           # data['vod_name']['ko']
    genre = scrapy.Field()           # data['program']['category1_name']['ko']
    broadcast_date = scrapy.Field()  # data['episode']['broadcast_date']
    channel = scrapy.Field()         # data['channel']['name']['ko']
    view_day = scrapy.Field()        # data['view_count']['day']
    view_week = scrapy.Field()       # data['view_count']['week']
    sale_day = scrapy.Field()        # data['sale_count']['day']
    sale_week = scrapy.Field()       # data['sale_count']['week']
    p_url = scrapy.Field()           # player URL built from data['vod_code']

Overwriting tving_vod/tving_vod/items.py


### spider.py 생성

In [2]:
% % writefile tving_vod/tving_vod/spiders/spider.py
from tving_vod.items import TvingVodItem
import json
import scrapy

class Spider(scrapy.Spider):
    """Crawl TVING's popular-VOD ranking through its JSON episode API.

    The landing page in ``start_urls`` is fetched only to extract the
    ``apiKey`` embedded in its markup; the ranking itself comes from a single
    API request ordered by daily views.
    """

    name = "TvingVod"
    # Scrapy's offsite filtering reads ``allowed_domains``. The previous
    # ``allow_domain = ['tiving.com']`` used the wrong attribute name and a
    # misspelled domain, so it had no effect. 'tving.com' also covers the
    # www. and api. subdomains requested below.
    allowed_domains = ['tving.com']
    start_urls = ['http://www.tving.com/vod/popular']

    # Number of episodes requested from the API (only page 1 is fetched).
    page_size = 100

    def parse(self, response):
        """Extract the API key from the landing page and request the ranking.

        Yields a single ``scrapy.Request`` for the episode API, handled by
        ``get_content``. If the page contains no ``apiKey=`` marker, an error
        is logged and nothing is yielded (instead of raising ``IndexError``).
        """
        marker = 'apiKey='
        if marker not in response.text:
            self.logger.error("apiKey not found in %s; nothing to crawl", response.url)
            return

        # The key is the text between the first 'apiKey=' and the next '&'.
        api_key = response.text.split(marker, 1)[1].split("&")[0]
        url = "https://api.tving.com/v2/media/episodes?pageNo=1&pageSize={}&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey={}".format(
            self.page_size, api_key)
        yield scrapy.Request(url, callback=self.get_content)

    def get_content(self, response):
        """Turn every entry of the API's ``body.result`` list into an item.

        Yields one ``TvingVodItem`` per episode. A ``KeyError`` is raised if
        the payload lacks any of the keys read here.
        """
        # ``response.text`` replaces the deprecated ``body_as_unicode()``.
        payload = json.loads(response.text)

        for data in payload['body']['result']:
            program_info = data['program']
            views = data['view_count']
            sales = data['sale_count']

            # The unused ``actor`` lookup was dropped: it was never stored and
            # could only fail on records lacking that key.
            yield TvingVodItem(
                program=program_info['name']['ko'],
                title=data['vod_name']['ko'],
                genre=program_info['category1_name']['ko'],
                broadcast_date=data['episode']['broadcast_date'],
                channel=data['channel']['name']['ko'],
                view_day=views['day'],
                view_week=views['week'],
                sale_day=sales['day'],
                sale_week=sales['week'],
                p_url="http://www.tving.com/vod/player/" + data['vod_code'],
            )

Overwriting tving_vod/tving_vod/spiders/spider.py


### models.py 작성

In [8]:
% % writefile tving_vod/tving_vod/models.py
from scrapy.utils.project import get_project_settings
from sqlalchemy import Integer, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData

DeclarativeBase = declarative_base()


def db_connect():
    """Build a SQLAlchemy engine from the project's ``CONNECTION_STRING`` setting.

    The connection URL is looked up in the Scrapy project settings
    (settings.py) and handed to ``create_engine``; the resulting engine is
    returned.
    """
    project_settings = get_project_settings()
    connection_url = project_settings.get("CONNECTION_STRING")
    return create_engine(connection_url)


def create_table(engine):
    """Create every table registered on ``DeclarativeBase`` using ``engine``."""
    schema = DeclarativeBase.metadata
    schema.create_all(engine)


class TvingVodDB(DeclarativeBase):
    """ORM mapping of one crawled VOD episode to a row of the ``TvingVod`` table."""
    __tablename__ = "TvingVod"

    id = Column(Integer, primary_key=True)  # surrogate primary key
    program = Column('program', Text)
    title = Column('title', Text)
    genre = Column('genre', Text)
    broadcast_date = Column('broadcast_date', Text)
    channel = Column('channel', Text)
    view_day = Column('view_day', Integer)
    view_week = Column('view_week', Integer)
    sale_day = Column('sale_day', Integer)
    sale_week = Column('sale_week', Integer)
    # The pipeline assigns ``p_url`` on every row; without a mapped column the
    # value was set as a plain Python attribute and never written to the
    # database.
    # NOTE(review): ``create_all`` does not alter tables that already exist,
    # so an existing ``TvingVod`` table needs this column added manually.
    p_url = Column('p_url', Text)

Overwriting tving_vod/tving_vod/models.py


### pipeline 연결

In [11]:
% % writefile tving_vod/tving_vod/pipelines.py
from tving_vod.models import TvingVodDB, db_connect, create_table
from sqlalchemy.orm import sessionmaker

class TvingVodPipeline(object):
    """Item pipeline that stores each scraped item as a ``TvingVodDB`` row."""

    def __init__(self):
        """Open the database engine, make sure the tables exist, and keep a
        session factory bound to that engine on ``self.Session``.
        """
        db_engine = db_connect()
        create_table(db_engine)
        self.Session = sessionmaker(bind=db_engine)

    def process_item(self, item, spider):
        """Insert ``item`` into the database and pass it on unchanged.

        A fresh session is used per item. If adding or committing fails, the
        transaction is rolled back and the exception is re-raised; the session
        is closed in every case.
        """
        db_session = self.Session()
        row = TvingVodDB()

        # Copy each item value onto the same-named attribute of the row.
        copied_keys = (
            "program",
            "title",
            "genre",
            "broadcast_date",
            "channel",
            "view_day",
            "view_week",
            "sale_day",
            "sale_week",
            "p_url",
        )
        for key in copied_keys:
            setattr(row, key, item[key])

        try:
            db_session.add(row)
            db_session.commit()
        except BaseException:
            # Catch-all so any failure undoes the pending transaction before
            # the error propagates.
            db_session.rollback()
            raise
        finally:
            db_session.close()

        return item

Overwriting tving_vod/tving_vod/pipelines.py


### settings.py 수정: ROBOTSTXT_OBEY = False로 변경

In [122]:
!sed - i 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' tving_vod/tving_vod/settings.py

In [123]:
!head - n 22 tving_vod/tving_vod/settings.py | tail - n 2

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


In [13]:
# 파이프라인 열기
#ITEM_PIPELINES = {
#    'tving_vod.pipelines.TvingVodPipeline': 300,
#}

# MySQL 설정
#CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8mb4".format(
#     drivername="mysql",
#     user="root",
#     passwd="password",
#     host="**.***.**.***",
#     port="****",
#     db_name="OTT",
#)

### run.sh 

In [150]:
%%writefile tving_vod/run.sh
#!/bin/sh
# Run the TvingVod spider from inside the project directory and export the
# scraped items to tving_vod_04.csv (created inside tving_vod/).
# Stop if the project directory is missing, so scrapy never runs elsewhere.
cd tving_vod || exit 1
scrapy crawl TvingVod -o tving_vod_04.csv

Overwriting tving_vod/run.sh


In [92]:
# Change permissions so run.sh can be executed (0o764 = rwxrw-r--).
import os
os.chmod(path="tving_vod/run.sh", mode=0o764)

In [93]:
!ls - al tving_vod/run.sh

-rwxrw-r-- 1 ubuntu ubuntu 51 Apr  8 12:10 tving_vod/run.sh


### 크롤링 실행하고 확인하기

In [None]:
# 크롤링 실행하고 확인하기

!tving_vod/run.sh

df = pd.read_csv("./tving_vod/TvingVod.csv")
df.head()

In [12]:
# Execute the crawl script; the Scrapy log is printed below.
!tving_vod/run.sh

2020-04-23 10:31:47 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: tving_vod)
2020-04-23 10:31:47 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Jan 17 2020, 08:43:56) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1065-aws-x86_64-with-debian-buster-sid
2020-04-23 10:31:47 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'tving_vod', 'FEED_FORMAT': 'csv', 'FEED_URI': 'TvingVod_test.csv', 'NEWSPIDER_MODULE': 'tving_vod.spiders', 'SPIDER_MODULES': ['tving_vod.spiders']}
2020-04-23 10:31:47 [scrapy.extensions.telnet] INFO: Telnet Password: ba4b62da01c3b482
2020-04-23 10:31:47 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats

2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200419,
 'channel': 'tvN',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E003034978',
 'program': '하이바이, 마마!',
 'sale_day': 13,
 'sale_week': 209,
 'title': '하이바이마마 16화',
 'view_day': 5360,
 'view_week': 45179}
2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200419,
 'channel': 'JTBC',
 'genre': '예능',
 'p_url': 'http://www.tving.com/vod/pla

2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200418,
 'channel': 'JTBC',
 'genre': '예능',
 'p_url': 'http://www.tving.com/vod/player/E003034052',
 'program': '트래블러 - 아르헨티나',
 'sale_day': 3,
 'sale_week': 27,
 'title': '트래블러-아르헨티나 10화',
 'view_day': 1138,
 'view_week': 9470}
2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200419,
 'channel': 'JTBC',
 'genre': '예능',
 'p_url': 'http://www.tving.com/vod

2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20180118,
 'channel': 'tvN',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E001316300',
 'program': '슬기로운 감빵생활',
 'sale_day': 1,
 'sale_week': 3,
 'title': '슬기로운 감빵생활 16화',
 'view_day': 602,
 'view_week': 3841}
2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200421,
 'channel': '채널A',
 'genre': '교양',
 'p_url': 'http://www.tving.com/vod/player

2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200412,
 'channel': 'OCN',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E003031520',
 'program': '루갈 무삭제판',
 'sale_day': 2,
 'sale_week': 10,
 'title': '루갈 무삭제판 6화',
 'view_day': 303,
 'view_week': 5855}
2020-04-23 10:31:48 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20190725,
 'channel': 'tvN',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E0

2020-04-23 10:31:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200422,
 'channel': 'CH.DIA',
 'genre': '디지털오리지널',
 'p_url': 'http://www.tving.com/vod/player/E003042218',
 'program': '입짧은햇님',
 'sale_day': 0,
 'sale_week': 0,
 'title': '입짧은햇님 104화',
 'view_day': 230,
 'view_week': 169}
2020-04-23 10:31:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20200422,
 'channel': 'Olive',
 'genre': '예능',
 'p_url': 'http://www.tving.com/vod/playe

2020-04-23 10:31:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20131228,
 'channel': 'tvN',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E000568449',
 'program': '응답하라 1994',
 'sale_day': 1,
 'sale_week': 4,
 'title': '90년대에게',
 'view_day': 163,
 'view_week': 704}
2020-04-23 10:31:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://api.tving.com/v2/media/episodes?pageNo=1&pageSize=100&order=viewDay&adult=all&free=all&guest=all&scope=all&lastFrequency=y&personal=N&screenCode=CSSD0100&networkCode=CSND0900&osCode=CSOD0900&teleCode=CSCD0900&apiKey=1e7952d0917d6aab1f0293a063697610>
{'broadcast_date': 20181120,
 'channel': 'JTBC',
 'genre': '드라마',
 'p_url': 'http://www.tving.com/vod/player/E0019