# Simple SRF News Json File Extractor

Dieses Notebook dient als Experimentierfeld für die Extraktion von Daten aus dem SRF News zu einem JSON-File.   
Die Funktionen werden in der Datei `srf_news.py` gespeichert und können dann in einem anderen Notebook importiert werden.

# 1 Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import json
import random
import os
os.chdir("../")


# define url and topic
url = "https://www.srf.ch"
topic = "news"

# 2 Topic Extraction

In [2]:
def extract_possible_topics(url):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all anchor tags (a) with an 'href' attribute
        links = soup.find_all("a", href=True)

        # Initialize a set to store unique topics
        possible_topics = set()

        # Extract and categorize topics from the URLs
        for link in links:
            formatted_url = urljoin(url, link["href"])
            # Extract potential topic keywords from the URL
            topics = re.findall(r'/([^/]+)/', formatted_url)
            for topic in topics:
                possible_topics.add(topic)

        return possible_topics

    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

# call function
print(extract_possible_topics("https://www.srf.ch"))

{'resultcenter', 'www.srf.ch', 'www.rts.ch', 'wirtschaft', 'www.rtr.ch', 'gesundheit', 'startseite', 'literatur', 'www.rsi.ch', 'www.swissinfo.ch', 'woerterbuch', 'www.radioswisspop.ch', 'meteo-stories', 'dialog', 'www.srgssr.ch', 'live', 'decodar-nossa-cultura', 'jobs.srf.ch', 'musik', 'ski-alpin', 'www.radioswissclassic.ch', 'website-und-apps', 'radio', 'kunst', 'arbeitsrecht', 'mehr-sport', 'kassensturz-espresso', 'themen', 'tv', 'www.radioswissjazz.ch', 'gesellschaft', 'fussball', 'eishockey', 'wetter', 'video', 'schweiz', 'school', 'tennis', 'international'}


# 3 Topic all Link Extraction

In [3]:
def scrape_srf_links(url, topic):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all anchor tags (a) with an 'href' attribute
        links = soup.find_all("a", href=True)

        # Initialize a list to store links for the specified topic
        topic_links = []

        # Categorize and format the URLs into the list for the specified topic
        for link in links:
            formatted_url = urljoin(url, link["href"])
            if f"/{topic}" in formatted_url:
                topic_links.append(formatted_url)

        return topic_links

    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

# call function
print(scrape_srf_links("https://www.srf.ch", "sport"))



['https://www.srf.ch/sport', 'https://www.srf.ch/sport', 'https://www.srf.ch/sport/resultcenter/results#/live/ski/1792634', 'https://www.srf.ch/sport', 'https://www.srf.ch/sport/fussball/europa-league/kehrauspartie-fuer-lugano-servette-kann-slavia-nur-noch-den-gruppensieg-streitig-machen', 'https://www.srf.ch/sport/fussball/champions-league/abschluss-der-cl-gruppenphase-ein-rueckblick-zwischen-1-und-10-000', 'https://www.srf.ch/sport/ski-alpin/weltcup-frauen/news-aus-dem-skisport-kein-frauen-training-am-donnerstag-in-val-d-isere', 'https://www.srf.ch/sport/eishockey/nationalmannschaft/nl-topskorer-vor-heimturnier-thuerkauf-wir-wissen-was-auf-dem-spiel-steht', 'https://www.srf.ch/sport/mehr-sport/rad/news-aus-dem-radsport-tour-de-suisse-2024-in-rueschlikon-ambri-cari-und-blatten', 'https://www.srf.ch/sport/fussball/champions-league/umfrage-nach-gruppenphase-wer-hat-s-am-besten-gemacht-waehlen-sie-den-schoensten-cl-treffer', 'https://www.srf.ch/sport/mehr-sport/basketball/aufregung-in-de

# 4 Combine Topic Extraction and Link Extraction into one funtion

In [4]:
def scrape_srf_links(url, topic=None):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all anchor tags (a) with an 'href' attribute
        links = soup.find_all("a", href=True)

        # Initialize a list to store links for the specified topic
        topic_links = []

        # Initialize a set to store unique topics
        possible_topics = set()

        # Categorize and format the URLs into the list for the specified topic
        for link in links:
            formatted_url = urljoin(url, link["href"])
            topics = re.findall(r'/([^/]+)/', formatted_url)
            for t in topics:
                possible_topics.add(t)
            if topic and f"/{topic}" in formatted_url:
                topic_links.append(formatted_url)

        if topic:
            return topic_links
        else:
            return possible_topics

    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

# To get all possible topics
print(list(scrape_srf_links("https://www.srf.ch")))

# To get links related to a specific topic (e.g., "sport")
print(scrape_srf_links("https://www.srf.ch", "sport"))


['resultcenter', 'www.srf.ch', 'www.rts.ch', 'wirtschaft', 'www.rtr.ch', 'gesundheit', 'startseite', 'literatur', 'www.rsi.ch', 'www.swissinfo.ch', 'woerterbuch', 'www.radioswisspop.ch', 'meteo-stories', 'dialog', 'www.srgssr.ch', 'live', 'decodar-nossa-cultura', 'jobs.srf.ch', 'musik', 'ski-alpin', 'www.radioswissclassic.ch', 'website-und-apps', 'radio', 'kunst', 'arbeitsrecht', 'mehr-sport', 'kassensturz-espresso', 'themen', 'tv', 'www.radioswissjazz.ch', 'gesellschaft', 'fussball', 'eishockey', 'wetter', 'video', 'schweiz', 'school', 'tennis', 'international']
['https://www.srf.ch/sport', 'https://www.srf.ch/sport', 'https://www.srf.ch/sport/resultcenter/results#/live/ski/1792634', 'https://www.srf.ch/sport', 'https://www.srf.ch/sport/fussball/europa-league/kehrauspartie-fuer-lugano-servette-kann-slavia-nur-noch-den-gruppensieg-streitig-machen', 'https://www.srf.ch/sport/fussball/champions-league/abschluss-der-cl-gruppenphase-ein-rueckblick-zwischen-1-und-10-000', 'https://www.srf.c

# 5 Extract all Topics and Links export to Json file

In [5]:
def create_json_for_intent(topic, filename, data_directory="data/"):
    # Call the existing function to scrape links
    topic_links = scrape_srf_links(url, topic="news")

    # Check if links were successfully retrieved
    if topic_links:
        # Prepare the JSON structure
        data = {
            "intents": [
                {
                    "tag": topic.replace("_", " "),
                    "patterns": [f"Get me the {topic.replace('_', ' ')}"],
                    "responses": []
                }
            ]
        }

        # Add a response for each link
        for link in topic_links:
            response_variations = [
                f"Here is a {topic.replace('_', ' ')}: {link}",
                f"Here's some {topic.replace('_', ' ')} for you: {link}",
                f"Check out this {topic.replace('_', ' ')}: {link}",
                f"Here's the latest {topic.replace('_', ' ')}: {link}",
                f"Sure, here's a {topic.replace('_', ' ')}: {link}",
                f"{topic.replace('_', ' ')} coming right up: {link}",
                f"Here you go, the {topic.replace('_', ' ')} you requested: {link}",
                f"Enjoy this {topic.replace('_', ' ')}: {link}"
            ]
            response = random.choice(response_variations)
            data["intents"][0]["responses"].append(response)

        # Specify the full path to the JSON file
        full_path = os.path.join(data_directory, filename)

        # Write the data to the JSON file
        with open(full_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)

        print(f"JSON file '{full_path}' created successfully.")
    else:
        print("No links found for the topic. JSON file not created.")

# call function
create_json_for_intent(topic="news", filename="news.json")

JSON file 'data/news.json' created successfully.


# 6 Extract all Topics and Links export to Json file

In [6]:
def create_json_for_topics(topics, data_directory="data/"):
    for topic in topics:
        # Call the existing function to scrape links
        topic_links = scrape_srf_links("https://www.srf.ch", topic)

        # Check if links were successfully retrieved
        if topic_links:
            # Prepare the JSON structure
            data = {
                "intents": [
                    {
                        "tag": topic.replace("_", " "),
                        "patterns": [f"Get me {topic.replace('_', ' ')} news"],
                        "responses": []
                    }
                ]
            }

            # Add a response for each link
            for link in topic_links:
                response_variations = [
                    f"Here is a {topic.replace('_', ' ')} news article: {link}",
                    f"Here's some {topic.replace('_', ' ')} news for you: {link}",
                    f"Check out this {topic.replace('_', ' ')} news: {link}",
                    f"Here's the latest {topic.replace('_', ' ')} news: {link}",
                    f"Sure, here's a {topic.replace('_', ' ')} news link: {link}",
                    f"{topic.replace('_', ' ')} news coming right up: {link}",
                    f"Here you go, the {topic.replace('_', ' ')} news you requested: {link}",
                    f"Enjoy this {topic.replace('_', ' ')} news article: {link}"
                ]
                response = random.choice(response_variations)
                data["intents"][0]["responses"].append(response)

            # Specify the full path to the JSON file
            filename = f"{topic}.json"
            full_path = os.path.join(data_directory, filename)

            # Write the data to the JSON file
            with open(full_path, 'w') as json_file:
                json.dump(data, json_file, indent=4)

            print(f"JSON file '{full_path}' created successfully for {topic} news.")
        else:
            print(f"No links found for the topic {topic}. JSON file not created.")

# List of topics you want to create JSON files for
topics_list = (list(scrape_srf_links("https://www.srf.ch")))

# Call the function to create JSON files for the specified topics
create_json_for_topics(topics_list)


JSON file 'data/resultcenter.json' created successfully for resultcenter news.
JSON file 'data/www.srf.ch.json' created successfully for www.srf.ch news.
JSON file 'data/www.rts.ch.json' created successfully for www.rts.ch news.
JSON file 'data/wirtschaft.json' created successfully for wirtschaft news.
JSON file 'data/www.rtr.ch.json' created successfully for www.rtr.ch news.
JSON file 'data/gesundheit.json' created successfully for gesundheit news.
JSON file 'data/startseite.json' created successfully for startseite news.
JSON file 'data/literatur.json' created successfully for literatur news.
JSON file 'data/www.rsi.ch.json' created successfully for www.rsi.ch news.
JSON file 'data/www.swissinfo.ch.json' created successfully for www.swissinfo.ch news.
JSON file 'data/woerterbuch.json' created successfully for woerterbuch news.
JSON file 'data/www.radioswisspop.ch.json' created successfully for www.radioswisspop.ch news.
JSON file 'data/meteo-stories.json' created successfully for met

# 7 Extract all Topics and Links export to One Json file

In [7]:
def create_combined_json_for_topics(topics, data_directory="data/"):
    # Initialize a dictionary to store data for all topics
    combined_data = {
        "intents": []
    }

    for topic in topics:
        # Call the existing function to scrape links
        topic_links = scrape_srf_links("https://www.srf.ch", topic)

        # Check if links were successfully retrieved
        if topic_links:
            # Prepare the JSON structure for the current topic
            topic_data = {
                "tag": topic.replace("_", " "),
                "patterns": [
                    f"Get me the latest {topic.replace('_', ' ')}",
                    f"Tell me about {topic.replace('_', ' ')}",
                    f"What's happening in {topic.replace('_', ' ')}",
                    f"Give me updates on {topic.replace('_', ' ')}",
                    f"I'm interested in {topic.replace('_', ' ')}",
                    f"Can you provide {topic.replace('_', ' ')} ",
                    f"Tell me more about {topic.replace('_', ' ')}",
                    f"Share {topic.replace('_', ' ')}",
                    f"{topic.replace('_', ' ')}",
                ],
                "responses": []
            }

            # Add a response for each link
            for link in topic_links:
                response_variations = [
                    f"Here is a {topic.replace('_', ' ')} news article: {link}",
                    f"Here's some {topic.replace('_', ' ')} news for you: {link}",
                    f"Check out this {topic.replace('_', ' ')} news: {link}",
                    f"Here's the latest {topic.replace('_', ' ')} news: {link}",
                    f"Sure, here's a {topic.replace('_', ' ')} news link: {link}",
                    f"{topic.replace('_', ' ')} news coming right up: {link}",
                    f"Here you go, the {topic.replace('_', ' ')} news you requested: {link}",
                    f"Enjoy this {topic.replace('_', ' ')} news article: {link}"
                ]
                response = random.choice(response_variations)
                topic_data["responses"].append(response)

            # Add the data for the current topic to the combined_data
            combined_data["intents"].append(topic_data)
        else:
            print(f"No links found for the topic {topic}. Skipping.")

    # Specify the full path to the combined JSON file
    combined_filename = "intents_news.json"
    combined_full_path = os.path.join(data_directory, combined_filename)

    # Write the combined data to the JSON file
    with open(combined_full_path, 'w') as json_file:
        json.dump(combined_data, json_file, indent=4)

    print(f"Combined JSON file '{combined_full_path}' created successfully for all topics.")

topics_list = (list(scrape_srf_links("https://www.srf.ch")))

create_combined_json_for_topics(topics_list)

Combined JSON file 'data/intents_news.json' created successfully for all topics.
