In [1]:
import json
import re
import time
from urllib.parse import urlencode, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Scrapes a single search-result page and returns the article links found on it
def scrape_page(page_url, timeout=10):
    """Fetch one search-result page and extract its article links.

    Args:
        page_url: Full URL of the search-result page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of dicts, one per result item that contains a link, each with
        the keys ``"url"`` (absolute article URL) and ``"date"`` (the first
        ``YYYY-MM-DD`` substring found in the URL, or ``"Unknown date"`` when
        the URL contains none). An empty list is returned when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred with URL: {page_url}")
        print(e)
        return []  # Best effort: a failed page simply contributes no links

    soup = BeautifulSoup(response.content, 'html.parser')

    articles = []

    # Every search hit is an <li> carrying this class
    for item in soup.find_all('li', class_='tc_searchresults__item__item'):
        link_tag = item.find('a', href=True)  # First <a> with an href inside the <li>
        if not link_tag:
            continue

        # urljoin leaves absolute URLs untouched and correctly resolves
        # root-relative ("/x"), slash-less ("x") and protocol-relative
        # ("//host/x") hrefs, which plain string concatenation does not.
        article_url = urljoin('https://tv2.dk', link_tag['href'].strip())

        # The publication date is embedded in the URL as YYYY-MM-DD
        date_match = re.search(r'\d{4}-\d{2}-\d{2}', article_url)
        article_date = date_match.group(0) if date_match else "Unknown date"

        articles.append({"url": article_url, "date": article_date})

    return articles

In [3]:
def scrape_all_pages(base_url, search_query, max_pages, delay=2):
    """Collect article links from consecutive search-result pages.

    Pages 1 through ``max_pages`` (inclusive) are requested, sorted by date.

    Args:
        base_url: URL of the search endpoint, without a query string.
        search_query: The raw (not percent-encoded) search term; it is
            URL-encoded here so characters such as ``&``, ``#`` or spaces
            cannot corrupt the query string.
        max_pages: Number of result pages to fetch.
        delay: Seconds to pause between page requests to avoid being blocked.

    Returns:
        A single list with the link dicts returned by ``scrape_page`` for
        every page, in page order.
    """
    all_article_links = []

    for page_num in range(1, max_pages + 1):
        query_string = urlencode(
            {"query": search_query, "sort": "date", "page": page_num}
        )
        page_url = f"{base_url}?{query_string}"
        print(f"Scraping page: {page_num} - {page_url}")

        all_article_links.extend(scrape_page(page_url))

        # Pause between requests only; no need to wait after the final page
        if page_num < max_pages:
            time.sleep(delay)

    return all_article_links

In [4]:
# Main function: scrape all articles and save them to a CSV file
def main(
    base_url='https://search.tv2.dk/',
    search_query="c25",
    max_pages=11,
    output_path='tv2_c25_articles.csv',
):
    """Scrape the search results and every linked article, then write a CSV.

    Args:
        base_url: URL of the search endpoint.
        search_query: Search term to look up.
        max_pages: Number of search-result pages to walk through.
        output_path: Destination of the CSV file.

    The CSV has the columns ``url``, ``text`` and ``date``. Nothing is
    returned; progress is printed while scraping.
    """
    # Collect all article links from the search-result pages
    all_article_links = scrape_all_pages(base_url, search_query, max_pages)

    # Paged search results can list the same article more than once; keep
    # the first occurrence only so it is not downloaded and stored twice.
    seen_urls = set()
    unique_links = []
    for article_data in all_article_links:
        if article_data['url'] not in seen_urls:
            seen_urls.add(article_data['url'])
            unique_links.append(article_data)

    # Fetch the text of each article
    all_articles_text = []
    total = len(unique_links)

    for idx, article_data in enumerate(unique_links, start=1):
        article_url = article_data['url']
        article_date = article_data['date']

        print(f"Scraping article {idx}/{total}: {article_url}")

        # scrape_article_text expects the URL string, not the whole dict
        article_text = scrape_article_text(article_url)

        all_articles_text.append({
            "url": article_url,
            "text": article_text,
            "date": article_date
        })

        # Pause between requests to avoid being blocked
        time.sleep(2)

    # Explicit columns keep the CSV header intact even when nothing was scraped
    df = pd.DataFrame(all_articles_text, columns=["url", "text", "date"])
    df.to_csv(output_path, index=False)

    print(f"Scraping completed. Total articles scraped: {len(all_articles_text)}")

In [5]:
# Scrapes the text content of a single article
def scrape_article_text(article_url, timeout=10):
    """Download an article page and return the text of its paragraphs.

    Args:
        article_url: URL of the article as a string (not the link dict).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The text of every ``<p>`` element on the page joined by newlines.
        ``"Article not found"`` is returned when the status code is not 200,
        and ``"Error"`` when the request itself fails (including a timeout).
    """
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while scraping the article {article_url}: {e}")
        return "Error"

    if response.status_code != 200:
        print(f"Article not found (status code {response.status_code}): {article_url}")
        return "Article not found"

    soup = BeautifulSoup(response.content, 'html.parser')

    # All <p> elements are taken as article text; narrow the selector if the
    # page structure calls for it
    paragraphs = soup.find_all('p')
    return '\n'.join(para.get_text() for para in paragraphs)

In [6]:
# Run the main function. The guard passes both when the code is executed as a
# script and when this cell is run in the notebook, so executing the cell
# starts the full scrape.
if __name__ == "__main__":
    main()

Scraping page: 1 - https://search.tv2.dk/?query=c25&sort=date&page=1
Scraping page: 2 - https://search.tv2.dk/?query=c25&sort=date&page=2
Scraping page: 3 - https://search.tv2.dk/?query=c25&sort=date&page=3
Scraping page: 4 - https://search.tv2.dk/?query=c25&sort=date&page=4
Scraping page: 5 - https://search.tv2.dk/?query=c25&sort=date&page=5
Scraping page: 6 - https://search.tv2.dk/?query=c25&sort=date&page=6
Scraping page: 7 - https://search.tv2.dk/?query=c25&sort=date&page=7
Scraping page: 8 - https://search.tv2.dk/?query=c25&sort=date&page=8
Scraping page: 9 - https://search.tv2.dk/?query=c25&sort=date&page=9
Scraping page: 10 - https://search.tv2.dk/?query=c25&sort=date&page=10
Scraping page: 11 - https://search.tv2.dk/?query=c25&sort=date&page=11
Scraping article 1/100: https://nyheder.tv2.dk/penge/2024-08-14-inflationen-i-usa-overrasker-det-kan-betyde-lavere-renter-i-hele-europa
Scraping article 2/100: https://nyheder.tv2.dk/live/kort-nyt/ejere-bekraefter-forhandlinger-om-salg-a

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Scraping article 23/100: https://nyheder.tv2.dk/business/2024-02-26-medicinalfirmas-aktier-brager-i-vejret-efter-testresultater-paa-fedtlever
Scraping article 24/100: https://nyheder.tv2.dk/business/2024-01-31-novo-nordisk-smadrer-egne-rekorder-tjener-halvanden-milliard-om-ugen
Scraping article 25/100: https://nyheder.tv2.dk/penge/2024-01-11-skatterabat-skjuler-den-virkelige-vaerdi-af-dronningens-loen
Scraping article 26/100: https://nyheder.tv2.dk/live/2023-12-29-kort-nyt
Scraping article 27/100: https://nyheder.tv2.dk/live/2023-12-27-kort-nyt
Scraping article 28/100: https://nyheder.tv2.dk/live/2023-12-14-kort-nyt
Scraping article 29/100: https://nyheder.tv2.dk/2023-10-24-c25-virksomhed-indgaar-milliardaftale-om-saudisk-fremtidsby
Scraping article 30/100: https://nyheder.tv2.dk/business/2023-10-24-dansk-virksomhed-skal-spille-noeglerolle-i-kontroversielt-megaprojekt-vi-foerer-ikke-politik
Scraping article 31/100: https://nyheder.tv2.dk/business/2023-10-20-flere-elitevirksomheder-vael

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Scraping article 59/100: https://nyheder.tv2.dk/samfund/2022-09-05-danskere-advares-om-stor-ekstraregning-efter-russisk-gasstop
Scraping article 60/100: https://nyheder.tv2.dk/samfund/2022-09-01-danske-aktier-falder-markant-efter-meldinger-om-stigende-renter
Scraping article 61/100: https://nyheder.tv2.dk/business/2022-08-10-overblik-hvad-betyder-inflationen-for-danmarks-oekonomi
Scraping article 62/100: https://nyheder.tv2.dk/samfund/2022-08-02-flere-virksomheder-vil-betale-maend-for-at-gaa-hjemme-i-24-uger-vi-taenker-at-det-kan-blive-en-god-forretning
Scraping article 63/100: https://nyheder.tv2.dk/penge/2022-07-31-efter-raedselsmaaneder-ser-det-danske-aktiemarked-ud-til-at-vende-men-pas-paa-med-investeringerne
Scraping article 64/100: https://nyheder.tv2.dk/business/2022-07-28-vestas-aktien-brager-op-stiger-159-procent
Scraping article 65/100: https://vejr.tv2.dk/video/TWloMkY5X1Y0SmpfekZuRTRlWFhHM1VmRkVYckg5dVc
Article not found (status code 404): https://vejr.tv2.dk/video/TWloMkY5