In [8]:
import csv
import datetime
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

Key improvements to the script:

1. Added pagination to fetch multiple pages until we reach 100 titles.
2. The script now loops through pages (using the `?p=N` parameter in the URL).
3. Added a 1-second delay between requests to be respectful to the server.
4. Added progress messages to show which page is being fetched.
5. Added a check to stop if we reach the end of available stories.

The script will now collect titles across multiple pages until it reaches 100 titles or runs out of content. It prints progress updates so you can see what's happening as it works.

You can adjust the number of titles by changing the num_titles variable in the main() function if you ever need a different amount.

In [9]:
def scrape_hackernews(num_titles=100, delay=1, timeout=10):
    """Collect story titles from Hacker News, following pagination as needed.

    Pages are fetched one at a time (the first page has no query parameter,
    later pages use ``?p=N``) until ``num_titles`` titles have been gathered,
    a page yields no titles, or a request fails.

    Args:
        num_titles: Maximum number of titles to return. Zero or a negative
            value performs no requests and returns an empty list.
        delay: Seconds to wait between consecutive page requests.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of at most ``num_titles`` title strings, in page order. The
        list may be shorter (or empty) if the site ran out of stories or a
        request failed part-way through.
    """
    titles = []
    page = 1

    # Keep fetching pages until we have enough titles or run out of pages
    while len(titles) < num_titles:
        # First page has no parameter, subsequent pages use p=N
        url = "https://news.ycombinator.com/" if page == 1 else f"https://news.ycombinator.com/news?p={page}"

        print(f"Fetching page {page}...")

        # A timeout keeps a stalled connection from hanging the notebook, and
        # catching the request error preserves the titles gathered so far.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the webpage: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage: Status code {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Story titles live in <span class="titleline"> elements
        title_elements = soup.find_all('span', class_='titleline')

        if not title_elements:
            print("No more titles found.")
            break

        for title_element in title_elements:
            # The title is the first <a> tag inside the span
            a_tag = title_element.find('a')
            if a_tag:
                titles.append(a_tag.text)
                if len(titles) >= num_titles:
                    break

        # Done: leave before the courtesy delay, which only matters when
        # another request is about to follow.
        if len(titles) >= num_titles:
            break

        page += 1

        # Be nice to the server - pause before the next request
        time.sleep(delay)

    return titles[:num_titles]  # Never return more than requested

def save_to_csv(titles):
    """Write the titles to a timestamped CSV file in the working directory.

    The file has a single ``Title`` header row followed by one row per title.

    Args:
        titles: Iterable of title strings.

    Returns:
        The name of the CSV file that was written.
    """
    # The current date and time make each run's filename distinct
    filename = f"hackernews_titles_{datetime.datetime.now():%Y%m%d_%H%M%S}.csv"

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        rows = csv.writer(out)
        rows.writerow(['Title'])  # Header row
        rows.writerows([entry] for entry in titles)

    return filename

In [4]:
def main(num_titles=100):
    """Scrape Hacker News titles and save them to a CSV file.

    Args:
        num_titles: How many titles to request from ``scrape_hackernews``.

    Returns:
        The name of the CSV file written by ``save_to_csv``, or ``None`` when
        no titles were collected.
    """
    print(f"Scraping {num_titles} Hacker News titles...")

    titles = scrape_hackernews(num_titles)

    if not titles:
        print("No titles were found or there was an error.")
        return None

    print(f"Found {len(titles)} titles.")
    filename = save_to_csv(titles)
    print(f"Titles saved to {filename}")
    # Returning the filename lets a caller load the file just written.
    return filename

if __name__ == "__main__":
    main()

Scraping 100 Hacker News titles...
Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Found 100 titles.
Titles saved to hackernews_titles_20250407_104704.csv


In [5]:
# Load the most recent scrape rather than a hard-coded timestamped file, so this
# cell still works after a fresh run writes a CSV with a new name. The
# YYYYMMDD_HHMMSS suffix sorts chronologically, so the last match is the newest.
csv_candidates = sorted(Path(".").glob("hackernews_titles_*.csv"))
if not csv_candidates:
    raise FileNotFoundError("No hackernews_titles_*.csv file found; run main() first.")
df = pd.read_csv(csv_candidates[-1])

In [10]:
# Show the DataFrame loaded from the CSV as this cell's output.
df

Unnamed: 0,Title
0,Rsync replaced with openrsync on macOS Sequoia
1,AI masters Minecraft: DeepMind program finds d...
2,Glamorous Toolkit
3,Dark Mirror Ideologies
4,We asked camera companies why their RAW format...
...,...
95,Database Protocols Are Underwhelming
96,"Serving Vector Tiles, Fast"
97,Federal cuts disrupt repairs to iconic U.S. tr...
98,Emulating an iPhone in QEMU
