diff --git a/Pipfile b/Pipfile
index 6210575..8691fa8 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,7 +7,7 @@ name = "pypi"
 
 [packages]
 uvloop = {version = "*",sys_platform = "!= 'win32'"}
-torequests = ">=4.8.13"
+torequests = ">=5.0.10"
 starlette = "*"
 uvicorn = "*"
 aiomysql = "*"
diff --git a/README.md b/README.md
index 71e474d..f2ea71d 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@
 | 17 | [机器之心](https://www.jiqizhixin.com/search/article?keywords=python&search_internet=true&sort=time) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%9C%BA%E5%99%A8%E4%B9%8B%E5%BF%83) | 知名公众号 |
 | 18 | [依云's Blog](https://blog.lilydjwg.me/tag/python?page=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E4%BE%9D%E4%BA%91%27s+Blog) | 文章质量很高 |
 | 19 | [DEV Community](https://dev.to/t/python/latest) | 3 | EN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=DEV+Community) | 算是个挺好的社区, post 也都不太水 |
-| 20 | [Python猫](https://juejin.im/user/57b26118a341310060fa74da/posts?sort=newest) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%8C%AB) | 2018 年末比较热情的博主, 原创 + 优质译文 |
+| 20 | [Python猫](https://zhuanlan.zhihu.com/pythonCat) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E7%8C%AB) | 2018 年末比较热情的博主, 原创 + 优质译文 |
 | 21 | [Python之美](https://zhuanlan.zhihu.com/python-cn) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=Python%E4%B9%8B%E7%BE%8E) | 早期文章较多, 创业以后更新不太多了 |
 | 22 | [静觅](https://cuiqingcai.com/category/technique/python) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E9%9D%99%E8%A7%85) | 崔庆才的个人博客, 保持更新的原创博主 |
 | 23 | [推酷(中文)](https://www.tuicool.com/topics/11130000?st=0&lang=1) | 3 | CN | [√](https://www.clericpy.top/newspaper/articles.query.html?source=%E6%8E%A8%E9%85%B7%28%E4%B8%AD%E6%96%87%29) | 推文类站点. 按热门排序 |
diff --git a/newspaper/crawler/sources.py b/newspaper/crawler/sources.py
index 3e1e7b8..f9b1af2 100644
--- a/newspaper/crawler/sources.py
+++ b/newspaper/crawler/sources.py
@@ -159,7 +159,7 @@
     },
     {
         "title": "Python猫",
-        "url": "https://juejin.im/user/57b26118a341310060fa74da/posts?sort=newest",
+        "url": "https://zhuanlan.zhihu.com/pythonCat",
         "level": 3,
         "lang": "CN",
         "status": "√",
diff --git a/newspaper/crawler/spiders.py b/newspaper/crawler/spiders.py
index af96413..54caa4c 100644
--- a/newspaper/crawler/spiders.py
+++ b/newspaper/crawler/spiders.py
@@ -239,9 +239,14 @@ async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None):
     proxy = None
     for page in range(0, max_page):
         # st 参数: 0 是按时间顺序, 1 是热门文章
-        api: str = f'https://www.tuicool.com/topics/11130000?st=1&lang={lang_num}&pn={page}'
-        r = await req.get(
-            api, ssl=False, proxy=proxy, retry=1, timeout=5, headers=headers)
+        api: str = f'https://www.tuicool.com/ah/0?st=1&lang={lang_num}&pn={page}'
+        r = await req.get(api,
+                          ssl=False,
+                          proxy=proxy,
+                          retry=1,
+                          timeout=5,
+                          headers=headers)
+        # print(r.text)
         if not r:
             logger.info(f'crawl tuicool {lang} page={page} failed: {r}')
             return articles
@@ -255,12 +260,12 @@ async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None):
             break
         for item in items:
             article: dict = {'source': source}
-            url = null_tree.css(item, '.aricle_item_info>.title>a').get(
-                'href', '')
+            url = null_tree.css(item,
+                                '.aricle_item_info>.title>a').get('href', '')
             url = add_host(url, host)
             title = null_tree.css(item, '.aricle_item_info>.title>a').text
-            cover = null_tree.css(item, '.article_thumb_image>img').get(
-                'src', '')
+            cover = null_tree.css(item,
+                                  '.article_thumb_image>img').get('src', '')
             cover = cover.replace(
                 'https://static0.tuicool.com/images/abs_img_no_small.jpg', '')
             time_span = null_tree.css(item,
@@ -273,6 +278,8 @@ async def common_spider_tuicool(lang, source, max_page=1, ignore_descs=None):
             desc = null_tree.css(
                 item,
                 '.aricle_item_info>div.tip>span:nth-of-type(1)').text.strip()
+            if not re.search('Python|python', f'{title}{desc}'):
+                continue
             if desc in ignore_descs:
                 continue
             article['cover'] = cover
@@ -300,13 +307,12 @@ async def common_spider_juejin(user, source, max_page=1):
     for page in range(max_page):
         try:
             params['before'] = now
-            r = await req.get(
-                api,
-                ssl=False,
-                params=params,
-                retry=1,
-                timeout=5,
-                headers={"User-Agent": CHROME_PC_UA})
+            r = await req.get(api,
+                              ssl=False,
+                              params=params,
+                              retry=1,
+                              timeout=5,
+                              headers={"User-Agent": CHROME_PC_UA})
             if not r:
                 logger.info(f'crawl juejin page={page} failed: {r}')
                 return articles
@@ -365,8 +371,8 @@ async def python_news() -> list:
             # 兼容下没有 desc 的情况
             node = item.cssselect('.post-body.entry-content') or [null_tree]
             desc = node[0].text_content()
-            article['desc'] = desc.split('\n\n\n', 1)[0].strip().replace(
-                '\n', ' ')
+            article['desc'] = desc.split('\n\n\n',
+                                          1)[0].strip().replace('\n', ' ')
             article['url'] = item.cssselect(
                 '.post-title.entry-title>a')[0].get('href', '')
             article['url_key'] = get_url_key(article['url'])
@@ -407,8 +413,8 @@ async def python_news_history() -> list:
             # 兼容下没有 desc 的情况
             node = item.cssselect('.post-body.entry-content') or [null_tree]
             desc = node[0].text_content()
-            article['desc'] = desc.split('\n\n\n', 1)[0].strip().replace(
-                '\n', ' ')
+            article['desc'] = desc.split('\n\n\n',
+                                          1)[0].strip().replace('\n', ' ')
             article['url'] = item.cssselect(
                 '.post-title.entry-title>a')[0].get('href', '')
             article['url_key'] = get_url_key(article['url'])
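Note on the common_spider_tuicool hunks above: besides switching the listing to /ah/0, the spider now drops any entry whose title and description never mention Python. A minimal standalone sketch of that filter (the function name and sample items are invented for illustration; only the regex test mirrors the diff):

```python
import re

def looks_like_python(title: str, desc: str) -> bool:
    # Same test as the diff: keep the item only when "Python" or "python"
    # appears somewhere in the title or the description.
    return bool(re.search('Python|python', f'{title}{desc}'))

# A case-insensitive spelling of the same check would be:
#     re.search('python', f'{title}{desc}', re.I)

items = [
    {'title': 'Async crawling in Python', 'desc': ''},
    {'title': 'CSS grid tricks', 'desc': 'nothing about crawlers'},
]
print([i['title'] for i in items if looks_like_python(i['title'], i['desc'])])
# -> ['Async crawling in Python']
```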
@@ -598,12 +604,10 @@ async def importpython() -> list:
     # 一周一更, 所以只取第一个就可以了
     limit = 1
     seed = 'https://importpython.com/newsletter/archive/'
-    r = await req.get(
-        seed,
-        retry=1,
-        timeout=20,
-        ssl=False,
-        headers={"User-Agent": CHROME_PC_UA})
+    r = await req.get(seed,
+                      timeout=15,
+                      ssl=False,
+                      headers={"User-Agent": CHROME_PC_UA})
     if not r:
         logger.error(f'{source} crawl failed: {r}, {r.text}')
         return articles
@@ -617,8 +621,10 @@ async def importpython() -> list:
         url = add_host(href, 'https://importpython.com/')
         title = item.cssselect('div.caption>.well-add-card>h4')[0].text
         desc_node = item.cssselect('div.caption>div[class="col-lg-12"]')[0]
-        desc = tostring(
-            desc_node, method='html', with_tail=0, encoding='unicode')
+        desc = tostring(desc_node,
+                        method='html',
+                        with_tail=0,
+                        encoding='unicode')
         day, month, year = re.findall(r'- (\d+) (\S+) (\d+)', title)[0]
         month = month[:3]
         raw_time = f'{year}-{month}-{day}'
@@ -659,8 +665,8 @@ async def awesome_python() -> list:
         try:
             article: dict = {'source': source}
             url = add_host(href, 'https://python.libhunt.com/')
-            r = await req.get(
-                url, retry=2, timeout=15, headers={"User-Agent": CHROME_PC_UA})
+            r = await req.get(url,
+                              retry=2,
+                              timeout=15,
+                              headers={"User-Agent": CHROME_PC_UA})
             if not r:
                 logger.error(f'fetch {url} failed: {r}')
                 break
@@ -699,8 +707,10 @@ async def real_python() -> list:
     articles: list = []
     limit = 20
     seed = 'https://realpython.com/'
-    r = await req.get(
-        seed, retry=1, timeout=20, headers={"User-Agent": CHROME_PC_UA})
+    r = await req.get(seed,
+                      retry=1,
+                      timeout=20,
+                      headers={"User-Agent": CHROME_PC_UA})
     if not r:
         logger.error(f'{source} crawl failed: {r}, {r.text}')
         return articles
@@ -804,12 +814,13 @@ async def julien_danjou() -> list:
     source: str = 'Julien Danjou'
     articles: list = []
     seed = 'https://julien.danjou.info/page/1/'
-    scode = await outlands_request({
-        'method': 'get',
-        'timeout': 5,
-        'retry': 2,
-        'url': seed,
-    }, 'u8')
+    scode = await outlands_request(
+        {
+            'method': 'get',
+            'timeout': 5,
+            'retry': 2,
+            'url': seed,
+        }, 'u8')
     items = fromstring(scode).cssselect('.post-feed>article.post-card')
     # 判断发布时间如果是 1 小时前就 break
     break_time = ttime(time.time() - 60 * 60)
@@ -827,20 +838,21 @@ async def julien_danjou() -> list:
                      [null_tree])[0].text
             if not (title and url):
                 raise ValueError(f'{source} no title {url}')
-            detail_scode = await outlands_request({
-                'method': 'get',
-                'timeout': 5,
-                'retry': 2,
-                'url': url,
-            }, 'u8')
+            detail_scode = await outlands_request(
+                {
+                    'method': 'get',
+                    'timeout': 5,
+                    'retry': 2,
+                    'url': url,
+                }, 'u8')
             if not detail_scode:
                 raise ValueError(f'{source} has no detail_scode {url}')
             raw_pub_time = find_one(
                 'property="article:published_time" content="(.+?)"',
                 detail_scode)[1]
             # 2019-05-06T08:58:00.000Z
-            ts_publish = ttime(
-                ptime(raw_pub_time, fmt='%Y-%m-%dT%H:%M:%S.000Z'))
+            ts_publish = ttime(ptime(raw_pub_time,
+                                     fmt='%Y-%m-%dT%H:%M:%S.000Z'))
             cover_item = item.cssselect('img.post-card-image')
             if cover_item:
                 cover = cover_item[0].get('src', '')
@@ -872,11 +884,10 @@ async def doughellmann() -> list:
     max_page: int = 1
     seed = 'https://doughellmann.com/blog/page/{page}/'
     for page in range(1, max_page + 1):
-        r = await req.get(
-            seed.format(page=page),
-            retry=1,
-            timeout=20,
-            headers={"User-Agent": CHROME_PC_UA})
+        r = await req.get(seed.format(page=page),
+                          retry=1,
+                          timeout=20,
+                          headers={"User-Agent": CHROME_PC_UA})
         if not r:
             logger.error(f'{source} crawl failed: {r}, {r.text}')
             return articles
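Aside on the julien_danjou hunk above: the published time is scraped as an ISO-8601 string like 2019-05-06T08:58:00.000Z and parsed with torequests' ptime helper. A stdlib-only sketch of the same parse, for reference only (this is not the project's helper, just an equivalent illustration):

```python
from datetime import datetime, timezone

raw = '2019-05-06T08:58:00.000Z'  # sample value from the diff's comment
# '%f' consumes the fractional seconds ('000'); the trailing 'Z' is matched literally.
dt = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
print(int(dt.timestamp()))               # epoch seconds, UTC
print(dt.strftime('%Y-%m-%d %H:%M:%S'))  # '2019-05-06 08:58:00'
```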
@@ -924,12 +935,13 @@ async def mouse_vs_python() -> list:
     seed = 'https://www.blog.pythonlibrary.org/page/{page}/'
     for page in range(1, max_page + 1):
         api = seed.format(page=page)
-        scode = await outlands_request({
-            'method': 'get',
-            'timeout': 5,
-            'retry': 2,
-            'url': api,
-        }, 'u8')
+        scode = await outlands_request(
+            {
+                'method': 'get',
+                'timeout': 5,
+                'retry': 2,
+                'url': api,
+            }, 'u8')
         items = fromstring(scode).cssselect('#content>article')
         if max_page > 1:
             logger.info(
@@ -1044,12 +1056,11 @@ async def hn_python() -> list:
     }
     for page in range(max_page):
         params['page'] = page
-        r = await req.get(
-            api,
-            params=params,
-            retry=2,
-            timeout=10,
-            headers={"User-Agent": CHROME_PC_UA})
+        r = await req.get(api,
+                          params=params,
+                          retry=2,
+                          timeout=10,
+                          headers={"User-Agent": CHROME_PC_UA})
         if not r:
             logger.error(f'{source} crawl failed: {r}, {r.text}')
             return articles
@@ -1207,8 +1218,9 @@ async def jiqizhixin() -> list:
             article['ts_publish'] = ttime(
                 ptime(item['published_at'], fmt='%Y/%m/%d %H:%M'))
             title = item.get('title') or ''
-            title = title.replace('Python', 'Python').replace(
-                'python', 'Python')
+            title = title.replace('Python',
+                                  'Python').replace('python',
+                                                    'Python')
             article['title'] = title
             article['cover'] = item.get('cover_image_url') or ''
             article['desc'] = f'「{item["author"]}」 {shorten_desc(desc)}'
@@ -1234,11 +1246,10 @@ async def lilydjwg() -> list:
     max_page: int = 1
     seed = 'https://blog.lilydjwg.me/tag/python?page={page}'
     for page in range(1, max_page + 1):
-        r = await req.get(
-            seed.format(page=page),
-            retry=1,
-            timeout=20,
-            headers={"User-Agent": CHROME_PC_UA})
+        r = await req.get(seed.format(page=page),
+                          retry=1,
+                          timeout=20,
+                          headers={"User-Agent": CHROME_PC_UA})
         if not r:
             logger.error(f'{source} crawl failed: {r}, {r.text}')
             return articles
@@ -1263,8 +1274,8 @@ async def lilydjwg() -> list:
             cover = (item.cssselect('img') or [null_tree])[0].get('src', '')
             month, day, year = item.cssselect(
                 '.date')[0].text_content().strip().split()
-            month = f'0{month}' [-2:]
-            day = f'0{day}' [-2:]
+            month = f'0{month}'[-2:]
+            day = f'0{day}'[-2:]
             article['ts_publish'] = ttime(
                 ptime(f'{year}/{month}/{day}', fmt='%Y/%m/%d'))
             article['title'] = title
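Small aside on the lilydjwg hunk above: the change only removes the stray space before the slice (a formatter fix). The idiom itself left-pads a one- or two-digit month/day to two characters; a quick sketch showing it behaves like str.zfill(2) (sample values are made up):

```python
for value in ('5', '11'):
    padded = f'0{value}'[-2:]        # idiom used in the spider
    assert padded == value.zfill(2)  # stdlib equivalent
    print(value, '->', padded)       # 5 -> 05, 11 -> 11
```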
@@ -1290,20 +1301,21 @@ async def dev_io() -> list:
     source: str = "DEV Community"
     articles: list = []
     max_page: int = 1
-    per_page: int = 15
+    per_page: int = 30
     filt_score: int = 10
-    curl_string1 = r'''curl 'https://ye5y9r600c-3.algolianet.com/1/indexes/ordered_articles_production/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.20.3&x-algolia-application-id=YE5Y9R600C&x-algolia-api-key=YWVlZGM3YWI4NDg3Mjk1MzJmMjcwNDVjMjIwN2ZmZTQ4YTkxOGE0YTkwMzhiZTQzNmM0ZGFmYTE3ZTI1ZDFhNXJlc3RyaWN0SW5kaWNlcz1zZWFyY2hhYmxlc19wcm9kdWN0aW9uJTJDVGFnX3Byb2R1Y3Rpb24lMkNvcmRlcmVkX2FydGljbGVzX3Byb2R1Y3Rpb24lMkNDbGFzc2lmaWVkTGlzdGluZ19wcm9kdWN0aW9uJTJDb3JkZXJlZF9hcnRpY2xlc19ieV9wdWJsaXNoZWRfYXRfcHJvZHVjdGlvbiUyQ29yZGVyZWRfYXJ0aWNsZXNfYnlfcG9zaXRpdmVfcmVhY3Rpb25zX2NvdW50X3Byb2R1Y3Rpb24lMkNvcmRlcmVkX2NvbW1lbnRzX3Byb2R1Y3Rpb24%3D' -H 'accept: application/json' -H 'Referer: https://dev.to/' -H 'Origin: https://dev.to' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36' -H 'DNT: 1' --data '{"params":"query=*&hitsPerPage=''' + str(
-        per_page)
-    curl_string3 = r'''&attributesToHighlight=%5B%5D&tagFilters=%5B%22python%22%5D"}' --compressed'''
     for page in range(0, max_page):
-        curl_string2 = f'&page={page}'
-        curl_string = f'{curl_string1}{curl_string2}{curl_string3}'
-        request_args = curlparse(curl_string)
-        r = await req.request(retry=1, timeout=20, **request_args)
+        r = await req.get(
+            f'https://dev.to/search/feed_content?per_page={per_page}&page={page}&tag=python&sort_by=published_at&sort_direction=desc&tag_names%5B%5D=python&approved=&class_name=Article',
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
+                'Referer': 'https://dev.to/t/python/latest'
+            },
+            retry=1,
+            timeout=20)
         if not r:
             logger.error(f'{source} crawl failed: {r}, {r.text}')
             return articles
-        items = r.json().get('hits') or []
+        items = r.json().get('result') or []
         if not items:
             break
         host = 'https://dev.to/'
@@ -1313,7 +1325,8 @@ async def dev_io() -> list:
         )
         for item in items:
             try:
-                if item['score'] < filt_score:
+                if item['public_reactions_count'] + item[
+                        'comments_count'] < filt_score:
                     # filt by min score
                     continue
                 article: dict = {'source': source}
@@ -1336,7 +1349,7 @@ async def dev_io() -> list:
     return articles
 
 
-@register_online
+# @register_online
 # @register_history
 # @register_test
 async def pythoncat() -> list:
@@ -1368,6 +1381,22 @@ async def zhihu_zhuanlan_python_cn() -> list:
     return articles
 
 
+@register_online
+# @register_history
+# @register_test
+async def zhihu_zhuanlan_python_cat() -> list:
+    """Python猫"""
+    source: str = "Python猫"
+    name: str = 'pythonCat'
+    articles: list = []
+    limit = 10
+    articles = await common_spider_zhihu_zhuanlan(name, source, limit=limit)
+    logger.info(
+        f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
+    )
+    return articles
+
+
 @register_online
 # @register_history
 # @register_test
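For context on the dev_io hunks above: the hard-coded Algolia curl string is replaced by dev.to's own search/feed_content endpoint, and the score filter now sums public reactions and comments. A rough synchronous sketch of the same request using requests instead of the project's torequests wrapper (endpoint, parameters, headers, and field names are copied from the diff; the response is assumed to stay a JSON object with a `result` list):

```python
import requests

PER_PAGE = 30
FILT_SCORE = 10  # same threshold as the spider
API = ('https://dev.to/search/feed_content'
       f'?per_page={PER_PAGE}&page=0&tag=python'
       '&sort_by=published_at&sort_direction=desc'
       '&tag_names%5B%5D=python&approved=&class_name=Article')
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Referer': 'https://dev.to/t/python/latest',
}

resp = requests.get(API, headers=HEADERS, timeout=20)
for item in resp.json().get('result') or []:
    score = (item.get('public_reactions_count') or 0) + (item.get('comments_count') or 0)
    if score < FILT_SCORE:
        continue  # drop low-engagement posts, as the spider does
    print(score, item.get('title'))
```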
@@ -1529,15 +1558,17 @@ async def tuicool_en() -> list:
     source: str = "推酷(英文)"
     articles: list = []
     max_page: int = 1
-    articles = await common_spider_tuicool(
-        'EN', source, max_page=max_page, ignore_descs={'Real Python'})
+    articles = await common_spider_tuicool('EN',
+                                           source,
+                                           max_page=max_page,
+                                           ignore_descs={'Real Python'})
     logger.info(
         f'crawled {len(articles)} articles in {timeago(time.time() - START_TIME, 1, 1)} -> [{source}]. {" ?????????" if not articles else ""}'
     )
     return articles
 
 
-@register_online
+# @register_online
 # @register_history
 # @register_test
 async def kf_toutiao() -> list:
@@ -1559,18 +1590,19 @@ async def kf_toutiao() -> list:
     ignore_usernames: set = {'豌豆花下猫'}
     for page in range(0, max_page):
         params['page'] = page
-        scode = await outlands_request({
-            'method': 'get',
-            'params': params,
-            'url': api,
-            'ssl': False,
-            'retry': 1,
-            'headers': {
-                'Referer': 'https://juejin.im/tag/Python?sort=popular',
-                'Origin': 'https://juejin.im',
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
-            }
-        }, 'u8')
+        scode = await outlands_request(
+            {
+                'method': 'get',
+                'params': params,
+                'url': api,
+                'ssl': False,
+                'retry': 1,
+                'headers': {
+                    'Referer': 'https://juejin.im/tag/Python?sort=popular',
+                    'Origin': 'https://juejin.im',
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
+                }
+            }, 'u8')
         if not scode:
             logger.error(f'{source} crawl failed: {scode}')
             return articles
@@ -1588,8 +1620,8 @@ async def kf_toutiao() -> list:
             if item.get('user', {}).get('username', '') in ignore_usernames:
                 continue
             # 2019-05-05T03:51:12.886Z
-            gmt_time = re.sub(r'\..*', '', item['createdAt']).replace(
-                'T', ' ')
+            gmt_time = re.sub(r'\..*', '',
+                              item['createdAt']).replace('T', ' ')
             ts_publish = ttime(ptime(gmt_time, tzone=0))
             article['ts_publish'] = ts_publish
             article['lang'] = 'en' if item['english'] else 'CN'
@@ -1823,15 +1855,16 @@ async def nedbatchelder() -> list:
     limit: int = 5
     api: str = 'https://nedbatchelder.com/blog/tag/python.html'
     host: str = 'https://nedbatchelder.com/'
-    scode = await outlands_request({
-        'method': 'get',
-        'timeout': 5,
-        'headers': {
-            'Referer': api,
-            'User-Agent': CHROME_PC_UA,
-        },
-        'url': api,
-    }, 'u8')
+    scode = await outlands_request(
+        {
+            'method': 'get',
+            'timeout': 5,
+            'headers': {
+                'Referer': api,
+                'User-Agent': CHROME_PC_UA,
+            },
+            'url': api,
+        }, 'u8')
     container_html = null_tree.tostring(
         null_tree.css(fromstring(scode), '.category')).decode('utf-8')
     if not container_html:
@@ -1923,8 +1956,8 @@ async def the5fire() -> list:
             if ':' not in raw_time:
                 if 'm' in raw_time:
                     raw_time = re.sub('m.*', 'm', raw_time)
-                    ts_publish = ttime(
-                        ptime(raw_time, fmt='%Y-%m-%d %I %p'))
+                    ts_publish = ttime(ptime(raw_time,
+                                             fmt='%Y-%m-%d %I %p'))
                 else:
                     raw_time = raw_time[:10]
                     ts_publish = ttime(ptime(raw_time, fmt='%Y-%m-%d'))
@@ -2197,18 +2230,27 @@ async def reddit() -> list:
     """Reddit"""
     source: str = "Reddit"
     articles: list = []
-    limit: int = 10
+    limit: int = 22
     # 有 20 赞以上的才收录
     min_ups: int = 20
+    # 或者 10 评论的才收录
+    min_cmts: int = 10
     # api doc: https://www.reddit.com/dev/api/#GET_top
     api: str = f'https://api.reddit.com/r/Python/top/?t=day&limit={limit}'
     host: str = 'https://www.reddit.com/'
-
-    scode = await outlands_request({
-        'method': 'get',
-        'url': api,
-    }, 'u8')
-    if not scode:
+    for _ in range(2):
+        scode = await outlands_request(
+            {
+                'method': 'get',
+                'url': api,
+                'headers': {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
+                }
+            }, 'u8')
+        # print(scode)
+        if scode:
+            break
+    else:
         logger.error(f'{source} crawl failed')
         return articles
     rj: dict = json.loads(scode)
@@ -2218,7 +2260,8 @@ async def reddit() -> list:
         if item['kind'] != 't3':
             continue
         data = item['data']
-        if (data.get('ups') or 0) < min_ups:
+        if (data.get('ups') or data.get('score') or
+                0) < min_ups and (data.get('num_comments') or 0) < min_cmts:
             continue
         article: dict = {'source': source}
         title: str = data['title']
@@ -2446,8 +2489,8 @@ async def medium_python() -> list:
         'method': 'get',
         'url': seed,
     }, 'u8')
-    items = fromstring(
-        scode.encode('utf-8'), parser=XMLParser()).xpath('//channel/item')
+    items = fromstring(scode.encode('utf-8'),
+                       parser=XMLParser()).xpath('//channel/item')
     now = ttime()
     for item in items[:limit]:
         try:
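Finally, a note on the Reddit hunks above: the listing is now fetched at most twice (the for...else handles the case where both attempts fail), and a post is kept when it clears either threshold, at least 20 upvotes or at least 10 comments. A self-contained sketch of that acceptance rule (thresholds and field names follow the diff; the sample posts are invented):

```python
MIN_UPS = 20   # 有 20 赞以上的才收录 (at least 20 upvotes)
MIN_CMTS = 10  # 或者 10 评论的才收录 (or at least 10 comments)

def keep_post(data: dict) -> bool:
    # Negation of the spider's skip test: keep the post if it reaches
    # either the upvote threshold or the comment threshold.
    ups = data.get('ups') or data.get('score') or 0
    comments = data.get('num_comments') or 0
    return ups >= MIN_UPS or comments >= MIN_CMTS

posts = [
    {'ups': 35, 'num_comments': 2},    # kept: enough upvotes
    {'score': 5, 'num_comments': 14},  # kept: enough comments
    {'ups': 3, 'num_comments': 1},     # dropped
]
print([keep_post(p) for p in posts])   # [True, True, False]
```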