# 1. 프로젝트 생성

In [1]:
# Start from a clean slate: delete any existing `dailyhotel` directory, then
# generate a fresh Scrapy project skeleton with the same name.
# Note that `rm -rf` also discards every file written into that directory before.
!rm -rf dailyhotel
!scrapy startproject dailyhotel

New Scrapy project 'dailyhotel', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/project/codes/crawling/dailyhotel

You can start your first spider with:
    cd dailyhotel
    scrapy genspider example example.com


# 2. xpath 찾기

In [1]:
# hotel key 얻는 모듈 생성 -> dailyhotel_keys.py

In [2]:
%%writefile dailyhotel/dailyhotel_keys.py
def get_hotel_keys():
    """Collect the hotel index numbers listed by the Dailyhotel stay-sales API.

    Pages through the ``stays/sales`` endpoint (50 entries per page) for a
    one-night stay for one person checking in today (Korean time) and gathers
    the ``hotelIdx`` of every entry found in sections whose
    ``sectionViewType`` equals ``'NORMAL_섹션뷰타입'``.

    Paging stops at the first page that yields no such entries. That covers an
    empty section list as well as a page that only contains other section
    types, so the loop can neither spin on the same page nor reuse the entries
    of a previous page.

    Returns:
        list: ``hotelIdx`` values in the order the API returned them.

    Raises:
        requests.RequestException: if a request fails or times out.
        KeyError: if a response lacks the expected ``data`` structure.
    """
    import datetime

    import requests

    KST = datetime.timezone(datetime.timedelta(hours=9))
    # Check-in date defaults to today in Korean time.
    start_date = datetime.datetime.now(tz=KST).strftime("%Y-%m-%d")
    persons, staydays = 1, 1  # one guest, one night
    headers = {'app-version': '2.2.14'}

    hotelidx = []
    page = 1
    while True:
        parent_url = f"https://www.dailyhotel.com/newdelhi/goodnight/api/v4/stays/sales?dateCheckIn={start_date}&regionIdx=30070&shortCutType=hotel_resort&persons={persons}&stays={staydays}&details=true&regionStayType=hotel_resort&stayType=hotel,resort&page={page}&limit=50"
        # A timeout keeps a stalled connection from blocking the crawl forever.
        req = requests.get(parent_url, headers=headers, timeout=30)
        sections = req.json()['data']['staySaleSections']

        # NOTE(review): the section-type literal is kept exactly as it was;
        # confirm it matches the value the API actually returns.
        page_sales = [
            sale
            for section in sections
            if section['sectionViewType'] == 'NORMAL_섹션뷰타입'
            for sale in section['staySales']
        ]
        if not page_sales:
            print('All registered hotels have been searched.')  # progress message for manual checking
            break

        print('page=', str(page))
        hotelidx.extend(sale['hotelIdx'] for sale in page_sales)
        page += 1  # advance exactly once per fetched page

    return hotelidx

Writing dailyhotel/dailyhotel_keys.py


# 3. item.py

In [3]:
%%writefile dailyhotel/dailyhotel/items.py
import scrapy


class DailyhotelItem(scrapy.Item):
    """One scraped room offer: a single room type of a single hotel.

    The spider emits one item per room, so the hotel-level fields repeat
    across all rooms of the same hotel.
    """
    # Crawl timestamp string, formatted "%Y/%m/%d %H:%M" in Korean time.
    date = scrapy.Field()
    # Name of the booking platform the data came from (a constant set by the spider).
    platform = scrapy.Field()
    # Hotel name.
    name = scrapy.Field()
    # Hotel grade label (the API's `stayGradeName`).
    level = scrapy.Field()
    # Rating rescaled by the spider as value / 100 * 5, rounded to one decimal; 0.0 when unavailable.
    score = scrapy.Field()
    # Total number of reviews (the spider stores it as a float).
    review_count = scrapy.Field()
    # Hotel address.
    location = scrapy.Field()
    # Room name of this offer.
    room_type = scrapy.Field()
    # Discounted average price of the room, as an integer.
    price = scrapy.Field()
    # URL of the hotel's public page (not the API URL that was crawled).
    link = scrapy.Field()

Overwriting dailyhotel/dailyhotel/items.py


# 4. spider.py

In [4]:
%%writefile dailyhotel/dailyhotel/spiders/spider.py
import scrapy, requests
import json
import datetime
from dailyhotel_keys import get_hotel_keys
from dailyhotel.items import DailyhotelItem


class DailyhotelSpider(scrapy.Spider):
    """Scrape per-room prices for every hotel returned by ``get_hotel_keys``.

    Scrapy first fetches ``start_urls`` and hands the response to ``parse``,
    which schedules one detail-API request per hotel. Each detail response is
    turned into one ``DailyhotelItem`` per room by ``parse_content``.
    """
    name = "Dailyhotel"
    # Scrapy only reads the attribute named ``allowed_domains``; the earlier
    # ``allow_domain`` spelling was ignored, so no domain restriction applied.
    allowed_domains = ["dailyhotel.com"]
    KST = datetime.timezone(datetime.timedelta(hours=9))
    # The three date attributes below are evaluated once, when the class is
    # defined; the callbacks recompute the date so long runs stay current.
    today = datetime.datetime.now(tz=KST)
    start_date = today.strftime("%Y-%m-%d")  # default check-in: today
    end_date = (today + datetime.timedelta(days=1)).strftime("%Y-%m-%d")  # default check-out: tomorrow
    persons, staydays = 2, 1
    start_urls = ["https://www.dailyhotel.com/"]

    def parse(self, response):
        """Schedule a detail-API request for every hotel index.

        Args:
            response: response for the start URL; its content is not used.

        Yields:
            scrapy.Request: one request per hotel, handled by ``parse_content``.
        """
        start_date = datetime.datetime.now(tz=self.KST).strftime("%Y-%m-%d")
        headers = {'app-version': '2.2.14',
                   'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36',
                  }
        # get_hotel_keys() performs its own blocking HTTP calls and returns the
        # complete list before the first request is yielded.
        for idx in get_hotel_keys():
            hotel_url = f"https://www.dailyhotel.com/newdelhi/goodnight/api/v9/hotel/{idx}?stays={self.staydays}&dateCheckIn={start_date}&regionStayType=all"
            yield scrapy.Request(hotel_url, headers=headers, callback=self.parse_content)

    def parse_content(self, response):
        """Convert one hotel detail response into one item per room.

        Args:
            response: JSON response of the hotel detail API.

        Yields:
            DailyhotelItem: a fresh item for each entry of ``data.rooms``.

        Raises:
            KeyError: if a required key is missing from the payload.
        """
        now = datetime.datetime.now(tz=self.KST)
        start_date = now.strftime("%Y-%m-%d")
        crawled_at = now.strftime("%Y/%m/%d %H:%M")
        hotel = response.json()['data']

        name = hotel['name']
        level = hotel['stayGradeName']
        # The rating is rescaled as value / 100 * 5; a missing or null rating
        # falls back to 0.0 instead of failing the whole hotel.
        try:
            score = float(round(hotel['rating']['values'] / 100 * 5, 1))
        except (KeyError, TypeError):
            score = float(0)
        location = hotel['address']
        # Read every room up front so a malformed room aborts the hotel before
        # any item has been emitted.
        rooms = [(room['roomName'], int(room['amount']['discountAverage']))
                 for room in hotel['rooms']]
        review = float(hotel['statistic']['reviewScoreTotalCount'])
        # The public hotel page differs from the API URL that was crawled.
        hotel_desc_url = f"https://www.dailyhotel.com/stays/{hotel['idx']}?dateCheckIn={start_date}&stays={self.staydays}"
        platform = "데일리호텔"

        for room_type, price in rooms:
            # A new item per room: reusing one mutable instance would let later
            # rooms overwrite an item that is still being processed.
            yield DailyhotelItem(
                date=crawled_at,
                platform=platform,
                name=name,
                level=level,
                score=score,
                review_count=review,
                location=location,
                room_type=room_type,
                price=price,
                link=hotel_desc_url,
            )

Writing dailyhotel/dailyhotel/spiders/spider.py


# 5. settings.py: robots.txt 여부 파악

# 6. 프로젝트 실행

In [5]:
%%writefile run.sh
#!/bin/bash
# Run the Dailyhotel spider and export the scraped items to dailyhotel/dailyhotel.csv.

# Locate the project relative to this script rather than the caller's working
# directory, and stop if it is missing so the cleanup and the crawl never run
# in the wrong place (e.g. when started from a scheduler).
cd "$(dirname "$0")/dailyhotel" || exit 1

# Remove the previous export; -f keeps the very first run (no file yet) quiet.
rm -f dailyhotel.csv

scrapy crawl Dailyhotel -o dailyhotel.csv

Overwriting run.sh


# 7. pipeline.py: 크롤링데이터 mongodb에 저장

In [2]:
# scrapy project에 pymongo module 생성

In [6]:
%%writefile dailyhotel/dailyhotel/mongodb.py
import os

import pymongo


# Connection string for the MongoDB server. Set the MONGODB_URI environment
# variable to override it so credentials do not have to live in the source.
# NOTE(review): the fallback still embeds a username and password and exists
# only so current deployments keep working; remove it (and rotate the
# password) once MONGODB_URI is set wherever the crawler runs.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://dss:dss@15.164.48.207:27017")

client = pymongo.MongoClient(MONGODB_URI)
# Collection `hotel_info` of database `hotel`; the item pipeline inserts into it.
collection = client.hotel.hotel_info

Writing dailyhotel/dailyhotel/mongodb.py


In [26]:
# pipeline.py 설정

In [7]:
%%writefile dailyhotel/dailyhotel/pipelines.py
from itemadapter import ItemAdapter
from .mongodb import collection

class DailyhotelPipeline:
    """Item pipeline that stores every scraped item as a MongoDB document."""

    def process_item(self, item, spider):
        """Insert ``item`` into the MongoDB collection and pass it on.

        Args:
            item: scraped item exposing the ``DailyhotelItem`` fields by key.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines and feed exports still
            receive it.
        """
        # Copy into a plain dict; note that ``review_count`` is stored under
        # the document key ``review``.
        data = {"date": item['date'], "platform": item['platform'], "name": item['name'],
                "level": item['level'], "score": item['score'], "review": item['review_count'],
                "location": item['location'], "room_type": item['room_type'],
                "price": item['price'], "link": item['link']}
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        collection.insert_one(data)
        return item

Overwriting dailyhotel/dailyhotel/pipelines.py


# 8. settings.py -> pipeline 설정변경

In [8]:
# Enable the item pipeline in the generated settings.py by editing it in place.
# First command: uncomment the `ITEM_PIPELINES = {` line.
!sed -i 's/#ITEM_PIPELINES/ITEM_PIPELINES/' dailyhotel/dailyhotel/settings.py
# Second command: uncomment the DailyhotelPipeline entry and close the dict on
# that same line; the template's own `#}` line is left commented out.
!sed -i "s/#    'dailyhotel.pipelines.DailyhotelPipeline': 300,/    'dailyhotel.pipelines.DailyhotelPipeline': 300,}/" dailyhotel/dailyhotel/settings.py

In [9]:
# Show the ITEM_PIPELINES setting and the two lines after it to confirm the edit.
# Matching on the setting name keeps this check independent of the line
# numbering of the settings.py template.
!grep -A 2 '^ITEM_PIPELINES' dailyhotel/dailyhotel/settings.py

ITEM_PIPELINES = {
    'dailyhotel.pipelines.DailyhotelPipeline': 300,}
#}


# 9. 프로젝트 실행!

In [None]:
!/bin/bash run.sh

In [None]:
# 결과 정상 여부 확인

In [11]:
import pandas as pd

# Load the CSV written by the crawl and take a quick look at its size and its
# last rows to confirm the export looks sane.
csv_path = 'dailyhotel/dailyhotel.csv'
data = pd.read_csv(csv_path, encoding='utf-8-sig')
print(data.shape)  # (row count, column count)
data.tail()

(1617, 10)


Unnamed: 0,date,level,link,location,name,platform,price,review_count,room_type,score
1612,2021/03/18 23:21,특1급,https://www.dailyhotel.com/stays/36159?dateChe...,서울특별시 강남구 논현로 854,안다즈 서울 강남,데일리호텔,359700,288,[조식2인 PKG] Deluxe King,4.5
1613,2021/03/18 23:21,특1급,https://www.dailyhotel.com/stays/36159?dateChe...,서울특별시 강남구 논현로 854,안다즈 서울 강남,데일리호텔,359700,288,[조식2인 PKG] Deluxe Twin,4.5
1614,2021/03/18 23:21,특1급,https://www.dailyhotel.com/stays/36159?dateChe...,서울특별시 강남구 논현로 854,안다즈 서울 강남,데일리호텔,399000,288,[1+1 패키지 트윈객실 하나 더!] Standard King,4.5
1615,2021/03/18 23:21,특1급,https://www.dailyhotel.com/stays/36159?dateChe...,서울특별시 강남구 논현로 854,안다즈 서울 강남,데일리호텔,476900,288,[스위트 특가] Andaz Suite,4.5
1616,2021/03/18 23:21,특1급,https://www.dailyhotel.com/stays/36159?dateChe...,서울특별시 강남구 논현로 854,안다즈 서울 강남,데일리호텔,599000,288,[1+1 패키지 트윈객실 하나 더!] Andaz Suite,4.5


In [None]:
# 10. crontab 실행