In [2]:
# 瘦身代码,先用存入本地的html测试
from bs4 import BeautifulSoup

#with open("nature.html") as f:
with open("science.html") as f:
    html = f.read()

localhtml = html
souphtml = BeautifulSoup(html, "html.parser")
for i in souphtml.find_all(True):
    for name in list(i.attrs):
        if i[name] and name not in ["class"]:
            del i[name]

for i in souphtml.find_all(["svg", "img", "video", "audio"]):
    i.decompose()

with open("science-slim.html", "w") as f:
    f.write(str(souphtml))

你是一个精通python的爬虫工程师，需要使用aiohttp爬取网页，然后用BeautifulSoup解析出列表中的几个字段：几个字段：文章名，摘要内容, url，doi, 发表日期, 发表杂志

目标html代码有如下结构:
```html
```

In [12]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup

async def fetch_page(session, url):
    async with session.get(url) as response:
        return await response.text()

async def scrape_article_data(url):
    async with aiohttp.ClientSession() as session:
        html = await fetch_page(session, url)
        soup = BeautifulSoup(html, 'html.parser')
        articles_data = []

        # Find all article cards
        cards = soup.find_all('div', class_='card-content')

        for card in cards:
            # Extracting required fields
            title_tag = card.find('h3', class_='article-title')

            if title_tag and title_tag.find('a'):
                article_title = title_tag.get_text(strip=True)
                article_url = title_tag.find('a')['href']
                article_doi = article_url.split('/doi/')[-1] if '/doi/' in article_url else 'DOI not found'

                body_tag = card.find('div', class_='card-body')
                article_abstract = body_tag.get_text(strip=True) if body_tag else 'Abstract not found'

                time_tag = card.find('time')
                publication_date = time_tag.get_text(strip=True) if time_tag else 'Date not found'

                # Assuming the magazine title comes after "In" within the article title
                publishing_magazine = article_title.split(' In ')[-1] if ' In ' in article_title else 'Magazine not found'

                articles_data.append({
                    'Article Name': article_title,
                    'URL': article_url,
                    'DOI': article_doi,
                    'Abstract Content': article_abstract,
                    'Publishing Date': publication_date,
                    'Publishing Magazine': publishing_magazine
                })

        return articles_data

In [13]:
# The URL of the target webpage (adjust to the actual URL you want to scrape)
target_url = "https://www.science.org/toc/science/current"

# Set up and run the event loop
#loop = asyncio.get_event_loop()
articles = await scrape_article_data(target_url)
for article in articles:
    print(article)

{'Article Name': 'Time to support Indigenous science', 'URL': '/doi/10.1126/science.ado0684', 'DOI': '10.1126/science.ado0684', 'Abstract Content': 'Abstract not found', 'Publishing Date': '18 Jan 2024', 'Publishing Magazine': 'Magazine not found'}
{'Article Name': 'News at a glance', 'URL': '/doi/10.1126/science.ado0966', 'DOI': '10.1126/science.ado0966', 'Abstract Content': 'Abstract not found', 'Publishing Date': '18 Jan 2024', 'Publishing Magazine': 'Magazine not found'}
{'Article Name': 'Uprooted Ukrainian academics reboot in exile', 'URL': '/doi/10.1126/science.ado0967', 'DOI': '10.1126/science.ado0967', 'Abstract Content': 'Many institutions and scientists displaced from occupied territories may never return', 'Publishing Date': '18 Jan 2024', 'Publishing Magazine': 'Magazine not found'}
{'Article Name': 'Bacteria stitch exotic building blocks into novel proteins', 'URL': '/doi/10.1126/science.ado0968', 'DOI': '10.1126/science.ado0968', 'Abstract Content': 'Efficient method for 

In [7]:
import aiohttp

async def fetch(url):
    async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                assert response.status == 200
                return await response.text()


In [11]:
xml_nt = await fetch("https://www.nature.com/nature.rss")
xml_ntb = await fetch("https://www.nature.com/nbt.rss")
xml_nm = await fetch("https://www.nature.com/nmeth.rss")
xml_nc = await fetch("https://www.nature.com/ncomms.rss")
#xml_cell = await fetch("https://www.cell.com/cell/current.rss")
#xml_cr = await fetch("http://www.cell.com/cell-reports/current.rss")
xml_sci = await fetch("https://www.science.org/action/showFeed?type=etoc&feed=rss&jc=science")

In [59]:
# save one to local
with open("nbt.rss.xml", "w") as f:
    f.write(str(xml_ntb))


询问LLM写一个爬虫代码. Prompt:

你是一个精通python的爬虫工程师，需要使用aiohttp获取rss订阅源，然后解析出榜单中的几个字段：文章名，摘要内容,对应的url，doi, 发表日期,来源(杂志名)等.
该订阅源的xml代码如下:
```xml

```

In [91]:
import aiohttp
import asyncio
from lxml import etree
from lxml.html import fromstring

def clean_html(html_content):
    # Parse the HTML content and get the text without tags
    tree = fromstring(html_content)
    text = tree.text_content().strip()  # Remove leading/trailing whitespace
    return text

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def parse_rss(feed_url):
    async with aiohttp.ClientSession() as session:
        xml_data = await fetch(session, feed_url)
        root = etree.fromstring(xml_data.encode('utf-8'))

        # XML中使用的命名空间
        ns_map = {
            'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            'prism': "http://prismstandard.org/namespaces/basic/2.0/",
            'dc': "http://purl.org/dc/elements/1.1/",
            'content': "http://purl.org/rss/1.0/modules/content/",
            # 默认命名空间没有前缀，在XPath中必须给它一个前缀
            'default': "http://purl.org/rss/1.0/",
        }

        articles = []
        for item in root.xpath('//default:item', namespaces=ns_map):
            
            summary_html = item.findtext('content:encoded', namespaces=ns_map)
            # Clean HTML tags from the summary
            summary_clean = clean_html(summary_html)

            # Splitting into published journal info and description parts
            split_summary = summary_clean.split('; ', 1)
            journal_info = split_summary[0].strip() if len(split_summary) > 0 else ''
            description = split_summary[1].strip() if len(split_summary) > 1 else ''

            article = {
                'title': item.findtext('dc:title', namespaces=ns_map),
                'journal_info': journal_info,
                'abstract': description,
                'url': item.findtext('prism:url', namespaces=ns_map),
                'doi': item.findtext('dc:identifier', namespaces=ns_map),
                'publish_date': item.findtext('dc:date', namespaces=ns_map),
                'source': item.findtext('prism:publicationName', namespaces=ns_map),
            }
            articles.append(article)

        return articles

async def main():
    feed_url = 'http://feeds.nature.com/nbt/rss/current'
    articles = await parse_rss(feed_url)
    for article in articles:
        print(article)


#asyncio.run(main())
articles = await parse_rss('http://feeds.nature.com/nbt/rss/current')

In [94]:
root = etree.fromstring(xml_ntb.encode('utf-8'))


# XML中使用的命名空间
ns_map = {
    'rdf': "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    'prism': "http://prismstandard.org/namespaces/basic/2.0/",
    'dc': "http://purl.org/dc/elements/1.1/",
    'content': "http://purl.org/rss/1.0/modules/content/",
    # 默认命名空间没有前缀，在XPath中必须给它一个前缀
    'default': "http://purl.org/rss/1.0/",
}

items = root.xpath('//default:item', namespaces=ns_map)

item = items[0]

summary_html = item.findtext('content:encoded', namespaces=ns_map)
print(summary_html)
# Clean HTML tags from the summary
summary_clean = clean_html(summary_html)

print(summary_clean)


article = articles[0]

print(article["abstract"])


                <p>Nature Biotechnology, Published online: 18 January 2024; <a href="https://www.nature.com/articles/s41587-023-02110-1">doi:10.1038/s41587-023-02110-1</a></p>Mapping higher-order RNA structures and intermolecular RNA–RNA interactions throughout the transcriptome is critical for understanding RNA functions. We developed KARR-seq, a chemical-assisted RNA proximity capture and sequencing technology that enables sensitive and accurate detection of the RNA structurome and functional RNA–RNA interactions.
Nature Biotechnology, Published online: 18 January 2024; doi:10.1038/s41587-023-02110-1Mapping higher-order RNA structures and intermolecular RNA–RNA interactions throughout the transcriptome is critical for understanding RNA functions. We developed KARR-seq, a chemical-assisted RNA proximity capture and sequencing technology that enables sensitive and accurate detection of the RNA structurome and functional RNA–RNA interactions.
doi:10.1038/s41587-023-02110-1Mapping highe

In [40]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup

async def fetch_rss(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

def parse_rss(xml_data):
    soup = BeautifulSoup(xml_data, 'xml')
    items = soup.find_all("item")

    articles = []

    for item in items:
        article = {
            "title": item.title.text,
            "content": item.title.text,
            "url": item.link.text,
            "doi": item.find("dc:identifier").text if item.find("dc:identifier") else None,
            "date_published": item.find("dc:date").text if item.find("dc:date") else None
        }
        articles.append(article)

    return articles

In [26]:

async def main():
    rss_url = "https://www.nature.com/nbt.rss"  # Replace with your actual RSS feed URL
    xml_data = await fetch_rss(rss_url)
    articles = parse_rss(xml_data)

    for article in articles:
        print("Title:", article["title"])
        print("URL:", article["url"])
        print("DOI:", article["doi"])
        print("Published Date:", article["date_published"])
        print("---" * 10)

if __name__ == "__main__":
    await main()

Title: KARR-seq maps higher-order RNA structures and RNA–RNA interactions across the transcriptome
URL: https://www.nature.com/articles/s41587-023-02110-1
DOI: doi:10.1038/s41587-023-02110-1
Published Date: 2024-01-18
------------------------------
Title: KARR-seq reveals cellular higher-order RNA structures and RNA–RNA interactions
URL: https://www.nature.com/articles/s41587-023-02109-8
DOI: doi:10.1038/s41587-023-02109-8
Published Date: 2024-01-18
------------------------------
Title: First self-amplifying mRNA vaccine approved
URL: https://www.nature.com/articles/s41587-023-02101-2
DOI: doi:10.1038/s41587-023-02101-2
Published Date: 2024-01-17
------------------------------
Title: <i>N</i><sup>1</sup>-methylpseudouridylation affects the fidelity of mRNA translation
URL: https://www.nature.com/articles/s41587-023-02113-y
DOI: doi:10.1038/s41587-023-02113-y
Published Date: 2024-01-17
------------------------------
Title: Denmark invests in mucosal vaccines
URL: https://www.nature.com/

In [27]:
from typing import Any
from metagpt.actions.action import Action

TRENDING_ANALYSIS_PROMPT = """# 任务要求
你是一位文献调研员. 你被要求基于各大杂志发布的擅长领域和文献标题,汇总生成一篇报告,向用户提供其中的亮点和个性化推荐. 

内容风格请参考以下大纲:
# 今日主刊趋势的标题 (标题应生动并体现今日内容的亮点)
## 今日趋势：揭秘今日主刊热研究！探索研究热点领域，并发现吸引开发者注意的关键领域。从**到**，以前所未有的方式见证顶级项目。
## 列表亮点：聚焦今日文献标题, 为用户提供独特且引人注目的内容。


报告内容请严格按照以下格式生成:
```
# 今日主刊趋势
## 今日趋势
今日，**和**继续作为最受欢迎的编程语言占据主导地位。关键兴趣领域包括**、**和**。
最受欢迎的项目有Project1和Project2。
## 趋势类别
1. 生成式AI
    - [title1](url)：[项目详情，例如星标总数和今日新增，编程语言，...]
    - [title2](url)：...
...
## 列表亮点
1. [title1](url)：[提供推荐此项目的具体原因]。
2. [title1](url)：[提供推荐此项目的具体原因]。
3. ...
```

---
[主刊最新趋势文本]:
{trending}
"""

class AnalysisOSSTrending(Action):

    async def run(
        self,
        trending: Any
    ):
        return await self._aask(TRENDING_ANALYSIS_PROMPT.format(trending=trending))

2024-01-20 17:47:30.384 | INFO     | metagpt.const:get_metagpt_package_root:32 - Package root set to /Users/ciao/repo/MetaGPT
  class AsyncSpooledTemporaryFile(AsyncBase):
2024-01-20 17:47:30.576 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-01-20 17:47:30.576 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI


In [28]:
test_act = AnalysisOSSTrending()

articles = parse_rss(xml)
resp = await test_act.run(articles)

2024-01-20 17:48:54.651 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-01-20 17:48:54.652 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI


# 今日主刊趋势的标题
## 今日趋势：揭秘今日主刊热研究！探索研究热点领域，并发现吸引开发者注意的关键领域。从RNA结构映射到疫苗创新，以前所未有的方式见证顶级项目。

## 今日趋势
今日，生物科技和医药研究继续作为最受关注的领域占据主导地位。关键兴趣领域包括RNA结构分析、mRNA疫苗技术和可持续生物制药。
最受欢迎的项目有KARR-seq技术和自我扩增mRNA疫苗。

## 趋势类别
1. RNA技术与互作研究
    - [KARR-seq maps higher-order RNA structures and RNA–RNA interactions across the transcriptome](https://www.nature.com/articles/s41587-023-02110-1)：[揭示细胞内复杂RNA结构和RNA之间的相互作用，为基因表达调控提供新视角。]
    - [KARR-seq reveals cellular higher-order RNA structures and RNA–RNA interactions](https://www.nature.com/articles/s41587-023-02109-8)：[同上，可能是系列研究的一部分，进一步深化了对RNA结构的理解。]
2. mRNA疫苗技术
    - [First self-amplifying mRNA vaccine approved](https://www.nature.com/articles/s41587-023-02101-2)：[标志着mRNA疫苗技术的一个重大突破，有望改善疫苗的有效性和生产效率。]
    - [N1-methylpseudouridylation affects the fidelity of mRNA translation](https://www.nature.com/articles/s41587-023-02113-y)：[探讨了mRNA翻译过程中的一种修饰对翻译准确性的影响，对mRNA疫苗设计有重要意义。]
3. 生物制药与可持续性
    - [Denmark invests in mucosal vaccines](https://www.nature.com/articles/s41587-

2024-01-20 17:50:08.701 | INFO     | metagpt.utils.cost_manager:update_cost:48 - Total running cost: $0.045 | Max budget: $10.000 | Current cost: $0.045, prompt_tokens: 1068, completion_tokens: 1159


# Test OSS assistant

In [3]:
from metagpt.team import Team
from bigpt.roles.OSS import SubscriptionAssistant, CrawlerEngineer

team = Team()
team.hire([SubscriptionAssistant(), CrawlerEngineer()])
#team.hire([CrawlerEngineer()])
team.run_project("从36kr创投平台https://pitchhub.36kr.com/financing-flash爬取所有初创企业融资的信息，获取标题，链接， 时间，总结今天的融资新闻，然后在11:40发送给我")
#team.run_project("从Nature期刊(https://www.nature.com/nature/research-articles)爬取所有微生物学领域中与生物信息学、新方法、新发现相关的信息，获取标题，链接， 时间，总结今天的科研新进展，在14:20分发送给我")
#asyncio.run(team.run())


2024-02-04 14:17:43.149 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-02-04 14:17:43.150 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI
2024-02-04 14:17:43.167 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-02-04 14:17:43.168 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI


In [4]:
from pycallgraph2 import Config,PyCallGraph, GlobbingFilter
from pycallgraph2.output import GraphvizOutput 

from bigpt.actions.OSSs import WriteCrawlerCode, ParseSubRequirement, RunSubscription 


config = Config()
#config = Config(max_depth=1)
config.trace_filter = GlobbingFilter(exclude=[
    'pycallgraph.*',
    '*.secret_function',
    '__*'
])

CrawAct = WriteCrawlerCode()

with PyCallGraph(output=GraphvizOutput(output_file='OSS.png',output_type='png'), config=config):
    await team.run()
    
    #await CrawAct.run("从Nature期刊(https://www.nature.com/nature/research-articles)爬取所有微生物学领域中与生物信息学、新方法、新发现相关的信息，获取标题，链接， 时间，总结今天的科研新进展")

2024-02-04 14:17:45.986 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-02-04 14:17:45.988 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI
2024-02-04 14:17:46.006 | INFO     | bigpt.roles.OSS:_act:62 - Grace(Subscription Assistant): ready to ParseSubRequirement


[CONTENT]
{
    "Language": "zh_cn",
    "Cron Expression": "40 11 * * *",
    "Crawler URL List": [
        "https://pitchhub.36kr.com/financing-flash"
    ],
    "Page Content Extraction": "获取所有初创企业当天融资的标题，链接和时间。",
    "Crawl Post Processing": "总结今天的融资新闻。",
    "Information Supplement

2024-02-04 14:17:52.513 | INFO     | metagpt.utils.cost_manager:update_cost:48 - Total running cost: $0.126 | Max budget: $10.000 | Current cost: $0.008, prompt_tokens: 490, completion_tokens: 109
2024-02-04 14:17:52.541 | INFO     | metagpt.roles.role:_act:360 - John(Crawling Engineer): to do WriteCrawlerCode(WriteCrawlerCode)


": ""
}
[/CONTENT]
To achieve the user requirement of extracting all startup financing titles, links, and times from the given HTML outline, we can write a `parse` function that looks for the specific HTML structure where this information is contained. Based on the provided outline, it seems that each financing news item is contained within a `div` with the class `css-xle9x`. The title and link are within an `a` tag with the class `title`, and the time is within a `span` tag with the class `time`.

Here's a complete `parse` function that extracts this information:

```python
from bs4 import BeautifulSoup
from typing import List, Dict

def parse(soup: BeautifulSoup) -> List[Dict[str, str]]:
    # Find all the divs that contain financing news
    news_items = soup.find_all('div', class_='css-xle9x')
    
    # List to hold all the extracted financing news
    financing_news = []
    
    # Iterate over each news item and extract the title, link, and time
    for item in news_items:
     

2024-02-04 14:18:21.651 | INFO     | metagpt.utils.cost_manager:update_cost:48 - Total running cost: $0.236 | Max budget: $10.000 | Current cost: $0.110, prompt_tokens: 9387, completion_tokens: 524
2024-02-04 14:18:21.657 | INFO     | bigpt.roles.OSS:_act:62 - Grace(Subscription Assistant): ready to RunSubscription
2024-02-04 14:18:21.751 | INFO     | metagpt.config:get_default_llm_provider_enum:124 - LLMProviderEnum.OPENAI Model: gpt-4-1106-preview
2024-02-04 14:18:21.752 | INFO     | metagpt.config:get_default_llm_provider_enum:126 - API: LLMProviderEnum.OPENAI


_all` methods accordingly.


CancelledError: 