In [1]:
import json
import time
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager


tag = "latest"

# Request headers are optional: read them from a local JSON file when it
# exists, otherwise fall back to an empty mapping and warn.
headers_file = "headers.json"
if not os.path.exists(headers_file):
    print("Warning: No headers.json file found. Request headers will be empty.")
    request_headers = {}
else:
    with open(headers_file, encoding="utf-8") as f:
        request_headers = json.load(f)

In [2]:
import json
import os
import random
import time
from collections import OrderedDict, deque
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# User-defined parameters
# Category names available to choose from.
categories = [
    "topic",
    "person",
    "place",
    "source",
]

# Change `category` to switch categories; it is passed, together with `tag`,
# to collect_topics() at the bottom of this cell.
category = "topic"
tag = "latest"

class Topic:
    """A topic identified by a display name and the href that points to it."""

    def __init__(self, topic_name, topic_href):
        self.href = topic_href
        self.name = topic_name

    def get_dict(self):
        """Return a single-entry dict mapping this topic's name to its href."""
        mapping = {}
        mapping[self.name] = self.href
        return mapping

    def get_tuple(self):
        """Return the topic as a ``(name, href)`` pair."""
        return (self.name, self.href)

    @classmethod
    def create_list(cls, names, hrefs):
        """Build one instance per positional ``(name, href)`` pair.

        Pairing stops at the end of the shorter input.
        """
        topics = []
        for label, link in zip(names, hrefs):
            topics.append(cls(label, link))
        return topics

def _save_topic_snapshot(topic_list, path):
    """Write ``topic_list`` to ``path`` as key-sorted, indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(topic_list, f, indent=4, ensure_ascii=False, sort_keys=True)


def collect_topics(category, tag, max_topics=3000, timeout=30):
    """Crawl topics breadth-first, starting from a discover page.

    The seed topic names are scraped from
    ``https://ground.news/my/discover/{category}``; each topic's page is then
    fetched via ``get_related_topics`` and the topics it returns are appended
    to the queue. After every newly recorded topic the full name -> href
    mapping is re-written to
    ``topic_collection/{tag}_topic_list_{category}.json`` so an interrupted
    run keeps its progress.

    Args:
        category: Path segment appended to the discover URL; also part of the
            output file name.
        tag: Prefix of the output file name.
        max_topics: Stop once this many distinct topic names are recorded.
        timeout: Seconds to wait for the seed-page HTTP request before
            ``requests`` gives up.

    Returns:
        None. Results are written to the JSON file and progress is printed.
    """
    url = f'https://ground.news/my/discover/{category}'
    print(f'Starting to collect topics from {url}...')

    # Get the webpage content using requests. Without a timeout a stalled
    # connection would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    divs = soup.find_all('div', class_='flex flex-grow text-18 items-center justify-between')
    # Skip any matching div that carries no <span> instead of crashing on None.
    seed_spans = (div.find('span') for div in divs)
    seed_topics = [span.text for span in seed_spans if span is not None]
    print(f'Collected seed topics from {category}: {len(seed_topics)}')

    # Create the output directory up front so a missing folder fails fast
    # here rather than on every write inside the crawl loop.
    output_dir = 'topic_collection'
    os.makedirs(output_dir, exist_ok=True)
    output_path = f'{output_dir}/{tag}_topic_list_{category}.json'

    # BFS on the topics; deque gives O(1) pops from the front of the queue.
    topic_list = {}
    queue = deque(Topic(topic_name, name2href(topic_name)) for topic_name in seed_topics)
    print(f'Queue hrefs: {[_.href for _ in queue]}')
    while queue and len(topic_list) < max_topics:
        # Pop first, so error messages always name the topic actually being
        # processed and a failure can never drop a different queue entry.
        current = queue.popleft()
        if current.name in topic_list:
            continue
        topic_list.update(current.get_dict())

        time.sleep(random.random())  # Random sleep to avoid getting blocked
        try:
            queue.extend(get_related_topics(current))
        except Exception as e:
            # Best effort: one unreachable or unparsable page must not end the crawl.
            print(f'Error getting related topics for {current.name}: {e}')

        # Checkpoint separately from the fetch so a write failure is reported
        # as such and does not disturb the queue.
        try:
            _save_topic_snapshot(topic_list, output_path)
        except OSError as e:
            print(f'Error writing {output_path}: {e}')

        if len(topic_list) % 50 == 0:
            print(f'Collected {len(topic_list)} topics from {category}.')
    print(f'{category} finished')

def get_related_topics(topic: Topic, timeout: float = 30) -> List[Topic]:
    """Fetch a topic's page and return the related topics listed on it.

    Args:
        topic: The topic whose page is fetched; its href is resolved against
            ``https://ground.news``.
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up.

    Returns:
        One ``Topic`` per related-topic card that has both a name element and
        a link. Cards missing either are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status, so a blocked or missing page is not mistaken for a
            page with no related topics.
    """
    _, href = topic.get_tuple()
    # urljoin resolves site-relative hrefs and leaves absolute ones intact.
    url = urljoin('https://ground.news', href)

    print(f'Getting related topics for {url}...')
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    cards = soup.find_all('div', class_='col-span-12 tablet:col-span-6 desktop:col-span-3')

    names = []
    hrefs = []
    for card in cards:
        name_tag = card.find('p', class_='text-18 truncate')
        link_tag = card.find('a', href=True)
        # A single malformed card should not discard the rest of the page.
        if name_tag is None or link_tag is None:
            continue
        names.append(name_tag.text)
        hrefs.append(link_tag['href'])
    return Topic.create_list(names, hrefs)

def name2href(topic_name):
    """Build the ``/interest/<slug>`` path for a topic name.

    The slug is the lowercased name with every space replaced by a hyphen;
    all other characters are kept as they are.
    """
    slug = '-'.join(topic_name.lower().split(' '))
    return f'/interest/{slug}'

# Start the crawl for the category and tag configured at the top of this cell
collect_topics(category=category, tag=tag)

Starting to collect topics from https://ground.news/my/discover/topic...
Collected seed topics from topic: 24
Queue hrefs: ['/interest/european-union', '/interest/artificial-intelligence', '/interest/us-politics', '/interest/daytona-500', '/interest/bafta', "/interest/valentine's-day", '/interest/france-politics', '/interest/nato', '/interest/ukraine-war', '/interest/usaid', '/interest/wrestling', '/interest/baseball', '/interest/politics', '/interest/tech', '/interest/international', '/interest/business', '/interest/science', '/interest/environment', '/interest/offbeat', '/interest/entertainment', '/interest/sports', '/interest/europe', '/interest/middle-east', '/interest/lifestyle']
Getting related topics for https://ground.news/interest/european-union...
Getting related topics for https://ground.news/interest/artificial-intelligence...
Getting related topics for https://ground.news/interest/us-politics...
Getting related topics for https://ground.news/interest/daytona-500...
Getting

KeyboardInterrupt: 