In [10]:
import logging
import requests

# Number of movies requested per index page (the API's `limit` query parameter).
LIMIT = 10
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')

# Template for the paginated movie-list endpoint; `limit` and `offset` are filled in per page.
# Must not carry leading/trailing whitespace, because it is formatted directly into request URLs.
INDEX_URL = 'https://spa1.scrape.center/api/movie?limit={limit}&offset={offset}'

In [11]:
def scrape_api(url, timeout=10):
    """Fetch ``url`` with a GET request and return the decoded JSON body.

    Args:
        url: Address of the API endpoint to request.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get`` so that a stalled
            connection cannot hang the crawl indefinitely.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, request error, or a body that
        is not valid JSON). Failures are logged rather than raised.
    """
    logging.info('start scraping %s', url)
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        logging.error('get invalid status code %s while scraping %s', response.status_code, url)
    except (requests.RequestException, ValueError):
        # ValueError covers a 200 response whose body cannot be decoded as
        # JSON; not every requests version reports that as RequestException.
        logging.error('error occurred while scraping %s', url, exc_info=True)
    return None

In [12]:
def scrape_index(page):
    """Fetch one page of the movie index.

    ``page`` is 1-based: page 1 maps to offset 0, page 2 to offset
    ``LIMIT``, and so on. Returns whatever ``scrape_api`` returns for the
    resulting index URL.
    """
    offset = LIMIT * (page - 1)
    index_url = INDEX_URL.format(limit=LIMIT, offset=offset)
    return scrape_api(index_url)

In [20]:
# URL template for a single movie's detail endpoint; `{id}` is the movie id.
DETAIL_URL = 'https://spa1.scrape.center/api/movie/{id}'

def scrape_detail(id):
    """Fetch the detail record of the movie identified by ``id``.

    Returns whatever ``scrape_api`` returns for the detail URL.
    """
    return scrape_api(DETAIL_URL.format(id=id))

In [34]:
# Number of index pages to walk; pages are numbered from 1.
TOTAL_PAGE = 10

for i in range(1, TOTAL_PAGE + 1):
    index_data = scrape_index(i)
    if not index_data:
        # scrape_api yields None when a request fails; skip the page
        # instead of crashing on None['results'].
        logging.error('no index data for page %s, skipping', i)
        continue
    for item in index_data['results']:
        detail_id = item['id']
        detail_data = scrape_detail(detail_id)
        logging.info('get detail data %s', detail_data)

2023-01-26 13:13:15,159 - INFO : start scraping  https://spa1.scrape.center/api/movie?limit=10&offset=0
2023-01-26 13:13:17,029 - INFO : start scraping https://spa1.scrape.center/api/movie/1
2023-01-26 13:13:17,848 - INFO : get detail data {'id': 1, 'name': '霸王别姬', 'alias': 'Farewell My Concubine', 'cover': 'https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c', 'categories': ['剧情', '爱情'], 'regions': ['中国内地', '中国香港'], 'actors': [{'name': '张国荣', 'role': '程蝶衣', 'image': 'https://p0.meituan.net/movie/5de69a492dcbd3f4b014503d4e95d46c28837.jpg@128w_170h_1e_1c'}, {'name': '张丰毅', 'role': '段小楼', 'image': 'https://p0.meituan.net/movie/35e74707f69da838d7ba3422b8f6579840705.jpg@128w_170h_1e_1c'}, {'name': '巩俐', 'role': '菊仙', 'image': 'https://p1.meituan.net/moviemachine/b650dcb00c40356934a275515217850f191104.jpg@128w_170h_1e_1c'}, {'name': '吕齐', 'role': '关师傅', 'image': 'https://p1.meituan.net/movie/30e12a78b5e61916edb1e33ce6fec19b34794.jpg@128w_170h_1e_1c'}, {'na

In [41]:
import pymongo

In [45]:
# MongoDB settings.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'  # connection URI (local server, port 27017)
MONGO_DB_NAME = 'crawlerdb'  # database name
MONGO_COLLECTION_NAME = 'movies'  # collection name

In [59]:
# Connect to MongoDB and select the target collection. Database and
# collection names come from the MONGO_* constants rather than repeated
# literals, so they are configured in a single place.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

In [64]:
def save_data(data):
    """Upsert one movie record into the module-level MongoDB ``collection``.

    The record is matched on its ``name`` field: an existing document with
    the same name has its fields overwritten via ``$set``; otherwise a new
    document is inserted (``upsert=True``).

    Args:
        data: Mapping of movie fields. ``None`` or an empty mapping (for
            example the result of a failed scrape) is skipped with a
            warning instead of raising.

    Returns:
        None.
    """
    if not data:
        logging.warning('no data to save, skipping')
        return
    collection.update_one(
        {'name': data.get('name')},
        {'$set': data},
        upsert=True,
    )

In [65]:

def main():
    """Crawl every index page, fetch each movie's detail record and store it.

    Walks pages 1..``TOTAL_PAGE``. A page or movie whose request produced
    no data (``scrape_index`` / ``scrape_detail`` returned nothing) is
    logged and skipped, so a single failed response neither aborts the
    crawl nor gets reported as saved.
    """
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            logging.error('no index data for page %s, skipping', page)
            continue
        for item in index_data.get('results') or []:
            detail_id = item['id']
            detail_data = scrape_detail(detail_id)
            if not detail_data:
                logging.error('no detail data for id %s, skipping', detail_id)
                continue
            logging.info('get detail data %s', detail_data)
            save_data(detail_data)
            logging.info('data saved successfully')

In [66]:
# Start the crawl only when this code runs as the top-level module
# (__name__ == '__main__'), not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

2023-01-26 19:07:43,391 - INFO : start scraping  https://spa1.scrape.center/api/movie?limit=10&offset=0
2023-01-26 19:07:46,724 - INFO : start scraping https://spa1.scrape.center/api/movie/1
2023-01-26 19:07:48,709 - INFO : get detail data {'id': 1, 'name': '霸王别姬', 'alias': 'Farewell My Concubine', 'cover': 'https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c', 'categories': ['剧情', '爱情'], 'regions': ['中国内地', '中国香港'], 'actors': [{'name': '张国荣', 'role': '程蝶衣', 'image': 'https://p0.meituan.net/movie/5de69a492dcbd3f4b014503d4e95d46c28837.jpg@128w_170h_1e_1c'}, {'name': '张丰毅', 'role': '段小楼', 'image': 'https://p0.meituan.net/movie/35e74707f69da838d7ba3422b8f6579840705.jpg@128w_170h_1e_1c'}, {'name': '巩俐', 'role': '菊仙', 'image': 'https://p1.meituan.net/moviemachine/b650dcb00c40356934a275515217850f191104.jpg@128w_170h_1e_1c'}, {'name': '吕齐', 'role': '关师傅', 'image': 'https://p1.meituan.net/movie/30e12a78b5e61916edb1e33ce6fec19b34794.jpg@128w_170h_1e_1c'}, {'na