#### Wavve

---

In [2]:
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup

#### 1. 프로젝트 생성

In [4]:
!scrapy startproject wavve

New Scrapy project 'wavve', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/crawling/wavve

You can start your first spider with:
    cd wavve
    scrapy genspider example example.com


In [6]:
!tree wavve

/bin/sh: 1: tree: not found


#### 2. item.py 수정

In [10]:
%%writefile wavve/wavve/items.py
import scrapy

class WavveItem(scrapy.Item):
    title = scrapy.Field()
    episode = scrapy.Field()
    date = scrapy.Field()

Overwriting wavve/wavve/items.py


#### 3. spider.py 생성

In [32]:
%%writefile ./wavve/wavve/spiders/spider.py

import scrapy
import json

from wavve.items import WavveItem

class Spider(scrapy.Spider):
    
    name = 'Wavve'
    
    def __init__(self, **kwargs):
        self.base_url = "https://apis.pooq.co.kr/cf/vod/popularcontents?WeekDay=all&broadcastid=6339&came=broadcast&contenttype=vod&genre={}".format(kwargs["category"])
        self.start_urls = []
        for page in range(1,11):
            offset = (page - 1) *20
            
            self.start_urls.append(self.base_url + f"&limit=20&offset={offset}&orderby=viewtime&page={page}&uiparent=GN2-VN2&uirank=2&uitype=VN2&apikey=apikey&credential=none&device=pc&drm=wm&partner=pooq&pooqzone=none&region=kor&targetage=auto")

        super().__init__(**kwargs)
        
        
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url = url, callback=self.get_content)
            
    def get_content(self, response):
        for data in json.loads(response.body_as_unicode())['cell_toplist']['celllist']:
            title = data['title_list'][0]['text']
            count = data['title_list'][1]['text'].split('$')[0]
            try :
                date = data['title_list'][1]['text'].split('$')[2]
            except:
                count = 0
                date = data['title_list'][1]['text'].split('$')[0]
            
            item = WavveItem()
            item['title'] = title
            item['episode'] = count
            item['date'] = date
            
            yield item
        

Overwriting ./wavve/wavve/spiders/spider.py


In [39]:
%%writefile wavve/run.sh
cd wavve
scrapy crawl Wavve -o wavve_all.csv -a category=all
scrapy crawl Wavve -o wavve_drama.csv -a category=01
scrapy crawl Wavve -o wavve_ent.csv -a category=02

Overwriting wavve/run.sh


In [14]:
!chmod 764 wavve/run.sh

In [15]:
!ls -al wavve/run.sh

-rwxrw-r-- 1 ubuntu ubuntu 165 Mar 18 12:08 wavve/run.sh


In [40]:
%%writefile ./wavve/wavve/mongodb.py
import pymongo

client = pymongo.MongoClient('mongodb://id:pw@ip')
db = client.wavve_server
collection = db.items

Overwriting ./wavve/wavve/mongodb.py


In [41]:
%%writefile ./wavve/wavve/pipelines.py
from .mongodb import collection

class WavvePipeline(object):
    def process_item(self, item, spider):
        data = {
            
            'title' : item['title'],
            'count' : item['episode'],
            'date' : item['date'],

        }
        
        collection.insert(data)
        
        return item

Overwriting ./wavve/wavve/pipelines.py


In [23]:
!echo "ITEM_PIPELINES = {\n" >> wavve/wavve/settings.py '\n wavve.pipelines.WavvePipeline' : 300, "}" >> wavve/wavve/settings.py

In [25]:
!tail -n 5 wavve/wavve/settings.py

#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
ITEM_PIPELINES = {
 'wavve.pipelines.WavvePipeline' : 300, }


In [None]:
!./wavve/run.sh

In [54]:
categories = {'all' : 'all' , '01' : 'drama' ,'02' : 'ent'}

In [55]:
dfs = [pd.read_csv("wavve/wavve_{}.csv".format(category)) for category in categories.values()]

In [56]:
[(category, len(df)) for category, df in zip(categories.values(), dfs)]

[('all', 200), ('drama', 200), ('ent', 200)]

In [57]:
result_df = pd.concat(dfs, ignore_index=True)
result_df.tail(2)

Unnamed: 0,date,episode,title
598,2004-01-17(토),11회,X맨 1
599,2020-01-08(수),11회,팔로우 미 시즌12


In [58]:
import pymongo

client = pymongo.MongoClient('mongodb://id:pw@ip')
db = client.wavve_server
collection = db.items

In [59]:
datas = collection.find({}, {"_id": False})

In [60]:
pd.DataFrame(datas).tail(2)

Unnamed: 0,count,date,title
598,11회,2004-01-17(토),X맨 1
599,11회,2020-01-08(수),팔로우 미 시즌12
