In [3]:
import pandas as pd
# Load the "读过" sheet of the workbook into a DataFrame
book = pd.read_excel("豆伴(208462734).xlsx",sheet_name="读过")

In [5]:
# Dump the sheet to book.csv; with pandas' defaults the DataFrame index is written as the first column
book.to_csv("book.csv")

In [18]:
# -*- coding: utf-8 -*-
# @Author  : yocichen
# @Email   : yocichen@126.com
# @File    : doubanBooks.py
# @Software: PyCharm
# @Time    : 2019/11/9 11:38


'''
任务：
爬取书籍图片链接/
爬取内容简介/
爬取作者简介/
爬取标签（需要判断标签的类型，最好给一个筛选列表）- 改爬具体标签下的书籍src了
'''

import re
import openpyxl
import requests
from requests import RequestException
from bs4 import BeautifulSoup
import lxml
import time
import random

def get_one_page(url, timeout=10):
    '''
    Get the html of a page by requests module
    :param url: page url
    :param timeout: seconds to wait for the server before giving up
    :return: html text when the server answers 200; None for any other
             status or when the request itself fails (timeouts included)
    '''
    # One of these strings is picked at random as the user-agent of each request
    user_agents = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
    headers = {'user-agent': random.choice(user_agents)}
    try:
        # Without a timeout a stalled connection would block forever;
        # requests' timeout errors derive from RequestException
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

def get_request_res(pattern_text, html):
    '''
    Pull one field out of the page with a regular expression
    :param pattern_text: re pattern (compiled with re.S)
    :param html: page's html text
    :return: the first match, cut at its first '<' and with its first
             character dropped; 'NULL' when the pattern matches nothing
    '''
    matches = re.compile(pattern_text, re.S).findall(html)
    if not matches:
        return 'NULL'
    # Keep only the text in front of the first tag, then drop the leading character
    text_before_tag = matches[0].partition('<')[0]
    return text_before_tag[1:]
    

def get_request_press(pattern_text, html):
    '''
    Get the text that follows the first tag inside a regex match
    :param pattern_text: re pattern (compiled with re.S)
    :param html: page's html text
    :return: for the first match, the text between its first and second
             '>'; if the match holds no '>' at all, the match itself with
             surrounding whitespace removed; 'NULL' when nothing usable
             is found
    '''
    pattern = re.compile(pattern_text, re.S)
    res = pattern.findall(html)
    if not res:
        return 'NULL'
    pieces = res[0].split(">")
    if len(pieces) > 1:
        return pieces[1]
    # No tag in the capture: indexing [1] would raise IndexError, so fall
    # back to the bare text instead of crashing
    return pieces[0].strip() or 'NULL'

def get_bs_res(selector, html):
    '''
    Look up one element with a CSS selector and return its text
    :param selector: info selector
    :param html: page's html text
    :return: `.string` of the first element the selector matches (bs4 gives
             None there when the element does not wrap exactly one string);
             'NULL' when the selector matches nothing
    '''
    matches = BeautifulSoup(html, 'lxml').select(selector)
    if not matches:
        return 'NULL'
    return matches[0].string

def get_bs_img_res(selector, html):
    '''
    Look up one element with a CSS selector and return its markup
    :param selector: info selector
    :param html: page's html text
    :return: the first matching element converted to a string;
             'NULL' when the selector matches nothing
    '''
    found = BeautifulSoup(html, 'lxml').select(selector)
    return str(found[0]) if found else 'NULL'

def parse_one_page(url, headers):
    '''
    Download a book page and parse its useful info
    :param url: url of the book page
    :param headers: HTTP request headers handed to requests.get
    :return: all of book info(dict). A field that cannot be located is
             'NULL'; fields read through get_bs_res may also be None when
             the matched element has no single text node
    '''
    # Redirects are not followed, so whatever body the first response
    # carries is what gets parsed; the timeout keeps a stalled server from
    # blocking the caller forever
    res = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
    html = res.content.decode("utf-8")

    book_info = {}
    book_info['Book_name'] = get_bs_res('div > h1 > span', html)

    # Try the second selector when the first matched an element without a
    # single text node
    author = get_bs_res('div > span:nth-child(1) > a', html)
    if author is None:
        author = get_bs_res('#info > a:nth-child(2)', html)
    if author is None:
        # Both lookups came back empty-handed; keep the not-found marker
        # instead of failing on None.replace
        author = 'NULL'
    book_info['Author'] = author.replace(" ", "").replace("\n", "")

    # Raw strings: '\s' inside a plain literal is an invalid escape sequence
    book_info['publisher'] = get_request_press(r'出版社:</span>([\s\S]*?)</a>', html)
    book_info['publish_time'] = get_request_res(r'出版年:</span>(.*?)<br/>', html)
    book_info['ISBN'] = get_request_res(r'ISBN:</span>(.*?)<br/>', html)

    img_label = get_bs_img_res('#mainpic > a > img', html)
    img = re.findall(r'src="(.*?)"', img_label, re.S)
    book_info['img_src'] = img[0] if img else 'NULL'

    book_info['book_intro'] = get_bs_res('#link-report > div:nth-child(1) > div > p', html)
    book_info['author_intro'] = get_bs_res('#content > div > div.article > div.related_info > div:nth-child(4) > div > div > p', html)

    # The rating text is surrounded by whitespace. Stripping it (rather
    # than slicing off one character) also keeps the 'NULL' marker intact
    # when the element is missing, and tolerates a None result
    grade = get_bs_res('div > div.rating_self.clearfix > strong', html)
    score = (grade or '').strip()
    book_info['Score'] = score if score else 'NULL'

    book_info['commments'] = get_bs_res('#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span', html)

    # Share of 5..1 star ratings, each read from its own span position
    for stars, child in ((5, 5), (4, 9), (3, 13), (2, 17), (1, 21)):
        book_info['%d_stars' % stars] = get_bs_res('#interest_sectl > div > span:nth-child(%d)' % child, html)

    return book_info

def write_bookinfo_excel(book_info, file):
    '''
    Add one book as a new row below the last used row of the first sheet
    :param book_info: a dict; its values fill the row left to right in
                      the dict's iteration order
    :param file: excel file, loaded and then saved back in place
    :return: how many rows the sheet grew by
    '''
    workbook = openpyxl.load_workbook(file)
    sheet = workbook.worksheets[0]
    rows_before = sheet.max_row
    target_row = rows_before + 1
    for column, value in enumerate(book_info.values(), start=1):
        sheet.cell(target_row, column).value = value
    added = sheet.max_row - rows_before
    workbook.save(file)
    return added

def read_booksrc_get_info(src_file, info_file, start_row=868, headers=None):
    '''
    Read the src file and access each src, parse html and write info into file
    :param src_file: src file; book urls are read from column 1 of its first sheet
    :param info_file: excel file the parsed rows are appended to
    :param start_row: first sheet row to process (the default keeps the
                      previously hard-coded row 868)
    :param headers: HTTP request headers for the page requests; a minimal
                    user-agent is used when omitted
    :return: the num of successful item
    '''
    if headers is None:
        headers = {'user-agent': 'Mozilla/5.0'}
    wb = openpyxl.load_workbook(src_file)
    ws = wb.worksheets[0]
    done = 0
    for i in range(start_row, ws.max_row + 1):
        src = ws.cell(i, 1).value
        if src is None:
            continue
        # parse_one_page takes (url, headers) and downloads the page itself;
        # handing it already-fetched html as the only argument raised TypeError
        book_info = parse_one_page(str(src), headers)
        done += write_bookinfo_excel(book_info, info_file)
        if done % 10 == 0:
            print(done, 'done')
    return done


In [2]:
# Request headers shared by the page requests below.
# NOTE(review): the Cookie value looks like a logged-in session cookie committed
# in plain text — consider loading it from a local config or environment
# variable and invalidating this one.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Cookie': 'll="108288"; bid=LxuBFcq903Y; _pk_id.100001.8cb4=d9ac29ae35fdb232.1687791343.; _pk_ses.100001.8cb4=1; __utma=30149280.453255204.1687791344.1687791344.1687791344.1; __utmc=30149280; __utmz=30149280.1687791344.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; dbcl2="208462734:1M1epG+m4oE"; ck=O7sa; push_noty_num=0; push_doumail_num=0; __utmv=30149280.20846; __utmb=30149280.5.9.1687791368060; ap_v=0,6.0',
    # 'Host': 'www.douban.com',
    'Content-type': 'text/html; charset=utf-8',
    # 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Upgrade-Insecure-Requests': '1',
    # 'Accept-Encoding': 'gzip, deflate, br'
}

In [63]:
import requests
# Fetch one sample book page with the shared headers, without following redirects
res = requests.get("https://book.douban.com/subject/35596032/",headers=headers,allow_redirects=False)

In [64]:
# parse_one_page takes (url, headers) and downloads the page itself, so it is
# given the page url rather than already-decoded html
parse_one_page("https://book.douban.com/subject/35596032/", headers=headers)

{'Book_name': '山',
 'Author': '刘慈欣',
 'publisher': '中信出版集团',
 'publish_time': '2021-11',
 'ISBN': '9787521735345',
 'img_src': 'https://img9.doubanio.com/view/subject/s/public/s34342444.jpg',
 'book_intro': '在登上海山顶峰的时候，冯帆感觉此生足矣，那时他可以从容地去死。但现在，他突然变成了世界上最怕死的人。他攀登过岩石的世界屋脊，这次又登上了海水构成的世界最高峰，下次会登什么样的山呢？这无论如何得活下去才能知道。几年前在珠峰雪暴中的感觉又回来了，那感觉曾使他割断了连接同伴和恋人的登山索，将他们送进了死亡世界，现在他知道自己做对了。如果现在真有什么可背叛的东西来拯救自己的生命，他会背叛的。 他必须活下去，因为山无处不在。',
 'author_intro': 'NULL',
 'Score': '8.4 ',
 'commments': '1880',
 '5_stars': '44.7%',
 '4_stars': '42.5%',
 '3_stars': '11.6%',
 '2_stars': '0.9%',
 '1_stars': '0.3%'}

In [19]:
# Decode the raw response body as UTF-8 text
info = res.content.decode("utf-8")

In [26]:
# Save the page text for inspection; the encoding is spelled out so the write
# does not depend on the platform's default codec
with open("test.txt", "w+", encoding="utf-8") as f:
    f.write(info)

https://www.notion.so/sparklingdeng/e030d9d8768f42f59adf191e53b07d3b?v=bf35e9a5b12c4c9d99f3beb2a6d3fc2c&pvs=4

In [11]:
import time
from notion_database.properties import Properties
from notion_database.database import Database
from notion_database.page import Page
import requests
from bs4 import BeautifulSoup
import re
def updateBookUrl(database_id):
    '''
    Update every page of a Notion database with the info parsed from the
    url stored in its "影片链接" property
    :param database_id: id of the Notion database to walk through
    :return: None
    '''
    # NOTE(review): the integration token is hard-coded; consider loading it
    # from a local config or environment variable and rotating this one.
    token = "secret_wa5Mmdai45S2vBpXXr1Hkx8eATxKQVGTydscWBstPsG"
    P = Page(integrations_token=token)
    D = Database(integrations_token=token)
    D.find_all_page(database_id=database_id)
    while True:
        # Handle the current result page first, so the last (or only) page
        # of results is not skipped
        for i in D.result["results"]:
            time.sleep(1)
            P.retrieve_page(page_id=i["id"])
            url = P.result["properties"]["影片链接"]["url"]
            if not url:
                # Nothing to download for a row without a link
                print("skip")
                continue
            book_info = parse_one_page(url, headers=headers)
            print(book_info)
            if book_info:
                PROPERTY = Properties()
                # parse_one_page stores the cover under "img_src"
                PROPERTY.set_files("Cover", files_list=[book_info["img_src"]])
                PROPERTY.set_select("Publisher",text=book_info["publisher"])
                PROPERTY.set_select("Author",text=book_info["Author"])
                # NOTE(review): ISBN arrives as a string (possibly 'NULL');
                # confirm the target column really is a number property
                PROPERTY.set_number("ISBN",text=book_info["ISBN"])
                P.update_page(page_id=i["id"], properties=PROPERTY)
                print("update success   " + book_info["Book_name"])
            else:
                print("skip")
        if not D.result["has_more"]:
            break
        # Ask for the next result page only after the current one is done;
        # 100 is the largest page size the Notion API documents
        D.find_all_page(database_id=database_id, start_cursor=D.result["next_cursor"], page_size=100)

In [29]:
# Update the Notion pages of one database from the Douban url each page stores.
# Only the first page of query results is handled here (no cursor is followed).
# NOTE(review): the integration token is hard-coded; consider loading it from a
# local config or environment variable and rotating this one.
database_id="e030d9d8768f42f59adf191e53b07d3b"
P = Page(integrations_token="secret_wa5Mmdai45S2vBpXXr1Hkx8eATxKQVGTydscWBstPsG")
D = Database(integrations_token="secret_wa5Mmdai45S2vBpXXr1Hkx8eATxKQVGTydscWBstPsG")
D.find_all_page(database_id=database_id)
print(D.result)

books = D.result["results"]
for i in books:
    # Pause between pages to space out the requests
    time.sleep(1)
    P.retrieve_page(page_id=i["id"])
    print(P.result["properties"]["链接"])
    url = P.result["properties"]["链接"]["url"]
    book_info = parse_one_page(url, headers=headers)
    print(book_info)
    if book_info:
        # Copy cover, publisher, author and ISBN from the parsed page into the Notion page
        PROPERTY = Properties()
        PROPERTY.set_files("Cover", files_list=[book_info["img_src"]])
        PROPERTY.set_select("Publisher",text=book_info["publisher"])
        PROPERTY.set_select("Author",text=book_info["Author"])
        PROPERTY.set_rich_text("ISBN",text=book_info["ISBN"])
        P.update_page(page_id=i["id"], properties=PROPERTY)
        print("update success   " + book_info["Book_name"])
    else:
        print("skip")

{'object': 'list', 'results': [{'object': 'page', 'id': '08771248-6091-4344-b8df-7e7dc1e48efc', 'created_time': '2023-07-07T14:10:00.000Z', 'last_edited_time': '2023-07-08T02:12:00.000Z', 'created_by': {'object': 'user', 'id': 'f2ce332b-8a03-4c31-adba-86f0b763dbb1'}, 'last_edited_by': {'object': 'user', 'id': '35634d5a-9fc9-470e-a221-bb0acfacd884'}, 'cover': None, 'icon': None, 'parent': {'type': 'database_id', 'database_id': 'e030d9d8-768f-42f5-9adf-191e53b07d3b'}, 'archived': False, 'properties': {'链接': {'id': 'CEvK', 'type': 'url', 'url': 'https://book.douban.com/subject/7163389/'}, 'Genre': {'id': 'E%3EuZ', 'type': 'relation', 'relation': [], 'has_more': False}, '豆瓣评分': {'id': 'FYYT', 'type': 'number', 'number': 6.6}, 'Publisher': {'id': 'O%5BuU', 'type': 'select', 'select': {'id': '80e095e1-b097-46c3-a350-9830f926622d', 'name': '中信出版集团', 'color': 'green'}}, 'Status': {'id': 'S%5ECb', 'type': 'status', 'status': {'id': '0ebf5e90-b2cf-4eda-ba5a-dddf62073857', 'name': 'Read', 'color'

In [None]:
https://www.notion.so/sparklingdeng/e030d9d8768f42f59adf191e53b07d3b?v=bf35e9a5b12c4c9d99f3beb2a6d3fc2c&pvs=4

In [19]:

import feedparser
from config import *
# Download and parse the RSS feed.
# NOTE(review): rss_address is not defined in this notebook — presumably it
# comes from the star import of config; that import could also rebind
# `headers`, verify which value is used here.
rss_movietracker = feedparser.parse(rss_address,
                                    request_headers=headers)
print(rss_movietracker)

{'bozo': False, 'entries': [{'title': '读过漂亮朋友', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://www.douban.com/feed/people/208462734/interests', 'value': '读过漂亮朋友'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://book.douban.com/subject/1893972/'}], 'link': 'https://book.douban.com/subject/1893972/', 'summary': '<table><tr>\n    <td width="80px"><a href="https://book.douban.com/subject/1893972/" title="漂亮朋友">\n    <img alt="漂亮朋友" src="https://img1.doubanio.com/view/subject/s/public/s6224189.jpg" /></a></td>\n    <td>\n        <p>推荐: 力荐</p>\n    </td></tr></table>', 'summary_detail': {'type': 'text/html', 'language': None, 'base': 'https://www.douban.com/feed/people/208462734/interests', 'value': '<table><tr>\n    <td width="80px"><a href="https://book.douban.com/subject/1893972/" title="漂亮朋友">\n    <img alt="漂亮朋友" src="https://img1.doubanio.com/view/subject/s/public/s6224189.jpg" /></a></td>\n    <td>\n        <p>推荐: 力荐</p>\n    </td></tr>

In [3]:
rss_movietracker["entries"][0]["summary"]

'<table><tr>\n    <td width="80px"><a href="https://book.douban.com/subject/25837863/" title="安徒生童话故事集">\n    <img alt="安徒生童话故事集" src="https://img2.doubanio.com/view/subject/s/public/s33645941.jpg" /></a></td>\n    <td>\n        <p>推荐: 还行</p>\n    </td></tr></table>'

In [20]:
def DataBase_item_query(query_database_id):
    '''
    Query a Notion database and collect the rows of every result page
    :param query_database_id: id of the database to query
    :return: list with the 'results' entries of all result pages
    '''
    headers = {
        "accept": "application/json",
        "Notion-Version": "2022-06-28",
        "content-type": "application/json",
        "authorization": notion_api,
    }
    url_notion_block = 'https://api.notion.com/v1/databases/' + query_database_id + '/query'

    res_travel = []
    body = None  # the first request carries no cursor
    while True:
        # Certificate verification stays on for every request, follow-ups included
        res_notion = requests.post(url_notion_block, headers=headers, json=body, timeout=30)
        print(res_notion)
        # Fail with the HTTP error instead of a KeyError on a payload without 'results'
        res_notion.raise_for_status()
        page = res_notion.json()
        res_travel.extend(page['results'])
        # Follow the cursor the API hands back rather than guessing from the
        # number of rows received
        if not page.get('has_more'):
            break
        body = {'start_cursor': page['next_cursor']}
    return res_travel

In [22]:
import NotionAPI

# Query the database and keep the url stored in each row's "链接" property
notion_books = DataBase_item_query("e030d9d8768f42f59adf191e53b07d3b")
books = [item['properties']['链接']['url'] for item in notion_books]

<bound method Response.json of <Response [200]>>


In [23]:
books

['https://book.douban.com/subject/7163389/',
 'https://book.douban.com/subject/35236944/',
 'https://book.douban.com/subject/35947257/',
 'https://book.douban.com/subject/10554308/',
 'https://book.douban.com/subject/3266609/',
 'https://book.douban.com/subject/1068920/',
 'https://book.douban.com/subject/30259153/',
 'https://book.douban.com/subject/4913064/',
 'https://book.douban.com/subject/26264953/',
 'https://book.douban.com/subject/1192090/',
 'https://book.douban.com/subject/6434543/',
 'https://book.douban.com/subject/25976333/',
 'https://book.douban.com/subject/35022388/',
 'https://book.douban.com/subject/1041482/',
 'https://book.douban.com/subject/11444474/',
 'https://book.douban.com/subject/6518605/',
 'https://book.douban.com/subject/1034282/',
 'https://book.douban.com/subject/1060068/',
 'https://book.douban.com/subject/4908879/',
 'https://book.douban.com/subject/6890100/',
 'https://book.douban.com/subject/35236945/',
 'https://book.douban.com/subject/1012611/',
 

In [40]:
def DataBase_additem(database_id, body_properties, station):
    '''
    Create one page in a Notion database and print whether it worked
    :param database_id: id of the database the new page is added to
    :param body_properties: dict merged into the request body (the caller
                            supplies the 'properties' entry)
    :param station: label printed in the success message
    :return: None
    '''
    body = {
        'parent': {'type': 'database_id', 'database_id': database_id},
    }
    body.update(body_properties)
    # NOTE(review): the bearer token is hard-coded; consider loading it from a
    # local config or environment variable and rotating this one.
    headers = {
        "accept": "application/json",
        "Notion-Version": "2022-06-28",
        "content-type": "application/json",
        "authorization": "Bearer secret_4aA1JcVQKjF1RlunDeKhlaUWCrkO4By4BUY8jJynckw",
    }
    url_notion_additem = 'https://api.notion.com/v1/pages'
    # The timeout keeps a stalled connection from blocking the caller's loop
    notion_additem = requests.post(url_notion_additem, headers=headers, json=body, timeout=30)
    print(notion_additem)
    if notion_additem.status_code == 200:
        print(station + '·更新成功')
    else:
        print(str(notion_additem.status_code) + '·更新失败')
        # The response body carries the API's explanation of the failure
        print(notion_additem.text)

In [46]:
import NotionAPI
import time
from email.utils import parsedate_to_datetime

# Notion database that receives the books; it is queried first so that a link
# already stored there is not added again
book_database_id = "e030d9d8768f42f59adf191e53b07d3b"
# Rating words mapped to stars: 很差=1, 较差=2, 还行=3, 推荐=4, 力荐=5
scoredict = {'很差': '⭐', '较差': '⭐⭐', '还行': '⭐⭐⭐', '推荐': '⭐⭐⭐⭐', '力荐': '⭐⭐⭐⭐⭐', }
# Compiled once instead of on every entry
cover_pattern = re.compile(r'src="([^"]+)"', re.I)   # value of the first src attribute
score_pattern = re.compile(r'推荐: (\S+?)\s*</p>')    # rating word of the "推荐: ..." paragraph

notion_books = DataBase_item_query(book_database_id)
books = [item['properties']['链接']['url'] for item in notion_books]
for item in rss_movietracker["entries"]:
    print(item)
    print(item["published"])
    # Only "读过" and "最近在读" entries are imported. The marker is split off
    # once, so a title that contains the marker again is not truncated
    if "读过" in item["title"]:
        title = item["title"].split("读过", 1)[1]
    elif "最近在读" in item["title"]:
        title = item["title"].split("最近在读", 1)[1]
    else:
        continue
    book_url = item["link"]

    cover_match = cover_pattern.search(item["summary"])
    if cover_match is None:
        # No image in the summary: skip this entry instead of stopping the whole run
        print("skip (no cover): " + title)
        continue
    cover_url = cover_match.group(1).replace("s_ratio_poster", "r")

    # Calendar date exactly as written in the feed's timestamp, as YYYY-MM-DD
    watch_time = parsedate_to_datetime(item["published"]).strftime("%Y-%m-%d")

    # Read the rating from its own paragraph, so any other paragraph in the
    # summary cannot be mistaken for it; unrated or unknown falls back to three stars
    score_match = score_pattern.search(item["summary"])
    if score_match:
        score = scoredict.get(score_match.group(1), "⭐⭐⭐")
    else:
        score = "⭐⭐⭐"

    if book_url not in books:
        book_info = parse_one_page(book_url, headers=headers)
        body = {
                    'properties': {
                        '标题': {
                            'title': [{'type': 'text', 'text': {'content': str(title)}}]
                        },
                        '阅读时间': {'date': {'start': str(watch_time)}},
                        '评分': {'type': 'select', 'select': {'name': str(score)}},
                        'Author': {'type': 'select', 'select': {'name': str(book_info["Author"])}},
                        'Publisher': {'type': 'select', 'select': {'name': str(book_info["publisher"])}},
                        'Cover': {
                            'files': [{'type': 'external', 'name': '封面', 'external': {'url': str(cover_url)}}]
                        },
                        '链接': {'type': 'url', 'url': str(book_url)},
                        'ISBN':
                            {'type': 'rich_text', 'rich_text': [{'type': 'text', 'text': {'content': str(book_info["ISBN"])},  'plain_text': str(book_info["ISBN"])}]},

                        }

                    }
        print(body)
        DataBase_additem(book_database_id, body, title)
        # Pause between inserts to space out the API calls
        time.sleep(3)
    

<bound method Response.json of <Response [200]>>
{'title': '读过漂亮朋友', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'https://www.douban.com/feed/people/208462734/interests', 'value': '读过漂亮朋友'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://book.douban.com/subject/1893972/'}], 'link': 'https://book.douban.com/subject/1893972/', 'summary': '<table><tr>\n    <td width="80px"><a href="https://book.douban.com/subject/1893972/" title="漂亮朋友">\n    <img alt="漂亮朋友" src="https://img1.doubanio.com/view/subject/s/public/s6224189.jpg" /></a></td>\n    <td>\n        <p>推荐: 力荐</p>\n    </td></tr></table>', 'summary_detail': {'type': 'text/html', 'language': None, 'base': 'https://www.douban.com/feed/people/208462734/interests', 'value': '<table><tr>\n    <td width="80px"><a href="https://book.douban.com/subject/1893972/" title="漂亮朋友">\n    <img alt="漂亮朋友" src="https://img1.doubanio.com/view/subject/s/public/s6224189.jpg" /></a></td>\n    <td>\n        <p>推荐: 力