In [2]:
import sys
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from dateutil.parser import parse as date_parse
from urllib.parse import urljoin
import time

In [14]:
# Listing URL for TechCrunch's "latest" feed; the `{}` placeholder is filled
# with the page number via str.format.
BASE_URL = "https://techcrunch.com/latest/page/{}/"

# Browser-like User-Agent header sent with the scraper's HTTP requests.
# The literal is split across lines only for readability; adjacent string
# literals are concatenated into the same single value.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

In [6]:
def scrape_article(url, timeout=30):
    """Fetch one article page and extract its title, body text and internal links.

    Args:
        url: Absolute URL of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which can stall a long
            scraping run on a single dead connection.

    Returns:
        dict with three keys:
            'title': text of the first ``<h1>`` on the page, or ``None`` when
                the page has no ``<h1>``.
            'hyperlinks': list of ``href`` values found inside the body
                paragraphs that start with ``https://techcrunch.com``, in
                document order (repeated links are kept).
            'content': text of every ``<p class="wp-block-paragraph">``
                element, joined with single spaces.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-success HTTP status (so an error page is never stored
            as if it were an article).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    hyperlinks = []
    paragraphs = []
    for paragraph in soup.find_all('p', class_='wp-block-paragraph'):
        paragraphs.append(paragraph.get_text(" ", strip=True))
        for anchor in paragraph.find_all('a'):
            href = anchor.get('href')
            # Keep only links that stay on techcrunch.com; anchors without an
            # href (href is None) are skipped.
            if href and href.startswith('https://techcrunch.com'):
                hyperlinks.append(href)

    # find() returns None when there is no <h1>; avoid calling get_text() on it.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else None
    return {'title': title, 'hyperlinks': hyperlinks, 'content': " ".join(paragraphs)}
    

In [8]:
# Smoke test: scrape a single known article and display the resulting dict.
url = 'https://techcrunch.com/2025/09/26/last-day-to-save-668-on-techcrunch-disrupt-2025-tickets/'
scrape_article(url)

{'title': 'Today is the last day to save up to $668 on TechCrunch Disrupt 2025 tickets',
 'hyperlinks': ['https://techcrunch.com/events/tc-disrupt-2025/?utm_source=tc&utm_medium=post&utm_campaign=disrupt2025&utm_content=ticketsales&promo=post_1dayrbplus_09262025&display=',
  'https://techcrunch.com/events/tc-disrupt-2025/tickets/?promo=post_1dayrbplus_09262025&utm_campaign=disrupt2025&utm_content=ticketsales&utm_medium=post&utm_source=tc',
  'https://techcrunch.com/events/tc-disrupt-2025/speakers/?promo=post_1dayrbplus_09262025&utm_campaign=disrupt2025&utm_content=ticketsales&utm_medium=post&utm_source=tc',
  'https://techcrunch.com/events/tc-disrupt-2025/agenda/?promo=post_1dayrbplus_09262025&utm_campaign=disrupt2025&utm_content=ticketsales&utm_medium=post&utm_source=tc',
  'https://techcrunch.com/events/tc-disrupt-2025/exhibit/?promo=post_1dayrbplus_09262025&utm_campaign=disrupt2025&utm_content=ticketsales&utm_medium=post&utm_source=tc',
  'https://techcrunch.com/events/tc-disrupt-20

In [10]:
def articles(base_url, pages, timeout=30):
    """Collect article URLs from consecutive listing pages.

    Pages 1 through `pages` (inclusive) are requested in order. Crawling
    stops at the first page that cannot be fetched; URLs gathered up to that
    point are still returned.

    Args:
        base_url: Listing URL template containing one ``{}`` placeholder that
            is filled with the page number.
        pages: Number of listing pages to request, starting at page 1.
        timeout: Seconds to wait for each listing page before treating the
            request as failed.

    Returns:
        list of absolute article URLs in the order they were found. The list
        is not de-duplicated, so the same URL may appear more than once;
        callers that need unique URLs should de-duplicate.
    """
    all_urls = []
    for page in range(1, pages + 1):
        page_url = base_url.format(page)
        print(f"\nScraping category page {page}: {page_url}")
        try:
            response = requests.get(page_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f'[!] Error fetching category page {page_url}: {e}')
            break
        soup = BeautifulSoup(response.text, 'lxml')
        posts = soup.select("a.loop-card__title-link")
        print(f"Found {len(posts)} article links on page {page}")

        for post in posts:
            href = post.get('href')
            if not href:
                # An anchor without an href has nothing to scrape; skip it
                # instead of raising KeyError.
                continue
            # urljoin leaves absolute hrefs unchanged and resolves relative
            # ones against the listing page.
            all_urls.append(urljoin(page_url, href))
    return all_urls

In [16]:
# Quick check of the listing crawler on the first two pages only.
articles(BASE_URL, pages=2)


Scraping category page 1: https://techcrunch.com/latest/page/1/
Found 27 article links on page 1

Scraping category page 2: https://techcrunch.com/latest/page/2/
Found 27 article links on page 2


['https://techcrunch.com/2025/10/05/waffles-eat-bluesky/',
 'https://techcrunch.com/2025/10/05/suspect-arrested-after-threats-against-tiktoks-culver-city-headquarters/',
 'https://techcrunch.com/2025/10/05/californias-new-ai-safety-law-shows-regulation-and-innovation-dont-have-to-clash/',
 'https://techcrunch.com/2025/10/05/openai-and-jony-ive-may-be-struggling-to-figure-out-their-ai-device/',
 'https://techcrunch.com/2025/10/05/techcrunch-mobility-toyota-makes-a-1-5b-bet-on-the-startup-ecosystem/',
 'https://techcrunch.com/2025/10/05/the-reinforcement-gap-or-why-some-ai-skills-improve-faster-than-others/',
 'https://techcrunch.com/2025/10/05/the-young-minds-app-wants-to-protect-and-educate-children-online-and-will-show-its-tech-at-techcrunch-disrupt-2025/',
 'https://techcrunch.com/2025/10/04/a-breach-every-month-raises-doubts-about-south-koreas-digital-defenses/',
 'https://techcrunch.com/2025/10/04/newsom-signs-bill-giving-uber-and-lyft-drivers-in-california-the-right-to-unionize/',

In [22]:
# Gather listing URLs, then scrape every distinct article exactly once.
all_urls = articles(BASE_URL, 300)
all_data = {}
failed_urls = []  # URLs that could not be scraped, kept for a later retry

# dict.fromkeys drops repeated URLs while preserving first-seen order, so an
# article that shows up on more than one listing page is fetched only once.
for article_url in dict.fromkeys(all_urls):
    try:
        all_data[article_url] = scrape_article(article_url)
    except (requests.exceptions.RequestException, AttributeError) as e:
        # One bad article (network error, or a page without the expected
        # <h1>) must not throw away everything scraped so far.
        print(f'[!] Error scraping article {article_url}: {e}')
        failed_urls.append(article_url)


Scraping category page 1: https://techcrunch.com/latest/page/1/
Found 27 article links on page 1

Scraping category page 2: https://techcrunch.com/latest/page/2/
Found 27 article links on page 2

Scraping category page 3: https://techcrunch.com/latest/page/3/
Found 27 article links on page 3

Scraping category page 4: https://techcrunch.com/latest/page/4/
Found 27 article links on page 4

Scraping category page 5: https://techcrunch.com/latest/page/5/
Found 27 article links on page 5

Scraping category page 6: https://techcrunch.com/latest/page/6/
Found 27 article links on page 6

Scraping category page 7: https://techcrunch.com/latest/page/7/
Found 27 article links on page 7

Scraping category page 8: https://techcrunch.com/latest/page/8/
Found 27 article links on page 8

Scraping category page 9: https://techcrunch.com/latest/page/9/
Found 27 article links on page 9

Scraping category page 10: https://techcrunch.com/latest/page/10/
Found 27 article links on page 10

Scraping categor

In [23]:
# Number of distinct article URLs that were scraped.
print(len(all_data))

5991


In [24]:
# Persist the scraped articles as pretty-printed UTF-8 JSON.
# NOTE(review): the filename says 200 pages, but the scrape cell in this
# notebook requests 300 pages - confirm which is intended before renaming.
output_filename = 'techcrunch_200_latest_pages.json'
with open(output_filename, mode='w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_data, f, ensure_ascii=False, indent=4)

In [77]:
# Spot check: title of the article behind the last URL that was collected.
all_data[all_urls[-1]]['title']

'Spotify to label AI music, filter spam and more in AI policy change'

In [83]:
# Read the saved JSON back from disk to verify the round trip.
# NOTE(review): the recorded outputs show 5991 entries in `all_data` but only
# 352 after reloading - the file on disk was presumably written by a
# different run; re-run the save cell before relying on `dat`.
with open(output_filename, mode='r', encoding='utf-8') as f:
    dat = json.loads(f.read())

In [85]:
# Number of articles present in the reloaded file.
len(dat)

352

ModuleNotFoundError: No module named 'pyvis'