In [1]:
import requests

def get_top_hackernews_stories(top_n=5, timeout=10):
    """Fetch details for the first ``top_n`` entries of the Hacker News top-stories list.

    Args:
        top_n: Number of stories to fetch. Zero or negative values yield an
            empty list without touching the network.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``score``, ``by``
        and ``time``. A value is ``None`` when the item payload lacks that
        field.

    Raises:
        requests.RequestException: On connection errors, timeouts, or
            non-success HTTP status codes.
    """
    # A negative slice bound would silently fetch almost the entire ID list
    # (one request per story), so treat non-positive counts as "nothing".
    if top_n <= 0:
        return []

    api_base = "https://hacker-news.firebaseio.com/v0"
    top_stories = []

    # One session reuses the underlying connection across the per-story requests.
    with requests.Session() as session:
        # Fetch the list of top story IDs.
        response = session.get(f"{api_base}/topstories.json", timeout=timeout)
        response.raise_for_status()
        story_ids = response.json() or []

        # Fetch the details of the first top_n stories.
        for story_id in story_ids[:top_n]:
            response = session.get(f"{api_base}/item/{story_id}.json", timeout=timeout)
            response.raise_for_status()
            story_details = response.json()

            # Skip items whose payload came back empty/null.
            if story_details:
                top_stories.append({
                    'title': story_details.get('title'),
                    'url': story_details.get('url'),
                    'score': story_details.get('score'),
                    'by': story_details.get('by'),
                    'time': story_details.get('time')
                })

    return top_stories

if __name__ == "__main__":
    # Fetch the default number of top stories and print a three-line summary
    # (rank/title/score, author/timestamp, URL) for each, followed by a blank line.
    top_stories = get_top_hackernews_stories()
    for rank, item in enumerate(top_stories, start=1):
        headline = f"{rank}. {item['title']} (Score: {item['score']})"
        byline = f"   By: {item['by']}, Time: {item['time']}"
        link_line = f"   URL: {item['url']}\n"
        print(headline, byline, link_line, sep="\n")


1. Box64 and RISC-V in 2024: What It Takes to Run the Witcher 3 on RISC-V (Score: 72)
   By: pabs3, Time: 1724732607
   URL: https://box86.org/2024/08/box64-and-risc-v-in-2024/

2. Notes on Buttondown.com (Score: 18)
   By: luu, Time: 1724736352
   URL: https://jmduke.com/posts/microblog/buttondown-dot-com/

3. FreeBSD-rustdate, a reimplementation of FreeBSD-update (Score: 30)
   By: ggm, Time: 1724730395
   URL: https://rustdate.over-yonder.net/

4. Erasure Coding for Distributed Systems (Score: 193)
   By: eatonphil, Time: 1724702625
   URL: https://transactional.blog/blog/2024-erasure-coding

5. Dokku: My favorite personal serverless platform (Score: 685)
   By: tosh, Time: 1724685716
   URL: https://hamel.dev/blog/posts/dokku/



In [5]:
import scrapy
from scrapy.crawler import CrawlerProcess

class HackernewsSpider(scrapy.Spider):
    """Spider that collects story titles and links from the Hacker News listing pages.

    Starts at the front page and keeps following the "more" link for as long
    as one is present on the page.
    """

    name = 'hackernews_spider'
    allowed_domains = ['news.ycombinator.com']
    start_urls = ['https://news.ycombinator.com/']

    def parse(self, response):
        """Yield one ``{'title', 'link'}`` item per story row, then follow pagination.

        Args:
            response: The Scrapy response for a listing page.

        Yields:
            Dicts with the story ``title`` and ``link`` (the raw ``href``
            value), followed by a request for the next page when a
            ``a.morelink`` element exists.
        """
        for story in response.css('tr.athing'):
            # Scrapy selectors expose .css()/.xpath(); BeautifulSoup-style
            # .find() does not exist on them and raised AttributeError.
            title_link = story.css('span.titleline > a')
            if not title_link:
                continue

            yield {
                'title': title_link.css('::text').get(),
                'link': title_link.css('::attr(href)').get(),
            }

        # Follow the link to the next page, if any.
        next_page = response.css('a.morelink::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

# Configure the crawler: export scraped items to a pretty-printed JSON file.
process = CrawlerProcess(settings={
    'FEEDS': {
        'hackernews_hotspots.json': {
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'indent': 4,
            # Local-file feeds append by default; appending a second JSON
            # array on a re-run would leave the file as invalid JSON.
            'overwrite': True,
        },
    },
    'LOG_LEVEL': 'INFO',  # Log level for the crawl.
})

# Start the crawl. process.start() blocks until the crawl finishes.
# NOTE: the underlying Twisted reactor can only be started once per Python
# process, so executing this cell a second time in the same kernel raises
# ReactorNotRestartable -- restart the kernel before re-running.
process.crawl(HackernewsSpider)
process.start()

# Done



2024-08-27 16:38:10 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-08-27 16:38:10 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.10.14 (main, Apr 27 2024, 21:17:55) [GCC 13.2.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.1 4 Jun 2024), cryptography 43.0.0, Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
2024-08-27 16:38:10 [scrapy.addons] INFO: Enabled addons:
[]
2024-08-27 16:38:10 [scrapy.extensions.telnet] INFO: Telnet Password: 4eab7062a39f3a54
2024-08-27 16:38:10 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2024-08-27 16:38:10 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
2024-08-27 16:38:10 [scrapy.middleware] INFO: Enabled downlo

ReactorNotRestartable: 