# Data generation

## Scraping the Wayback Machine

In [None]:
import csv
from datetime import datetime, timedelta
from pathlib import Path

# Root data directory; the index of Wayback Machine snapshot URLs is written
# to output_file inside it.
dataLoc = Path("/folder")
output_file = dataLoc / "output.csv"

# NOTE: the generation loop walks *backwards* in time, so `start_date` is the
# most recent day and `end_date` is the earliest day (both inclusive).
end_date = datetime(2015, 1, 1)
start_date = datetime(2024, 12, 31)

# Site identifiers; each one is interpolated into https://www.<source>.dk/
# when the snapshot URLs are built.
sources = ["dr", "tv2"]

In [None]:
# Build the index of Wayback Machine snapshot URLs: one row per (day, source),
# walking backwards from start_date down to end_date (both inclusive).
# The encoding is stated explicitly because the scraping step reads this file
# back as UTF-8; relying on the platform default could disagree with that.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row. The scraping step reads the URL from column index 3 and the
    # ID from column index 4.
    writer.writerow(['Index', 'Source', 'Date', 'Archive_URL', 'ID'])

    current_date = start_date
    i = 1  # running row number, shared across sources so every ID is unique

    while current_date >= end_date:
        formatted_date = current_date.strftime('%Y%m%d')
        for source in sources:
            # The timestamp is YYYYMMDD followed by "120000" (presumably noon;
            # the Wayback Machine is expected to resolve it to a nearby capture
            # -- TODO confirm against the archive's URL documentation).
            archive_url = f"https://web.archive.org/web/{formatted_date}120000/https://www.{source}.dk/"
            id_value = f"{source}_{formatted_date}_{i:05d}"
            writer.writerow([i, source, formatted_date, archive_url, id_value])
            i += 1
        current_date -= timedelta(days=1)

print(f"CSV file saved successfully at '{output_file}'.")

In [None]:
from pathlib import Path
import requests
import csv
import requests
import os
import time
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse

dataLoc = Path("/folder")
# Index of Wayback Machine snapshot URLs to visit (passed to process_links).
csv_file_path = dataLoc / "output.csv"
# Destination folder: process_links writes one CSV of article links per
# snapshot ID into it.
csv_output_folder = dataLoc / "folder"

def is_article_link(url):
    """Decide whether *url* looks like a news article on tv2.dk or dr.dk.

    A URL is accepted when it belongs to one of the two sites, contains none
    of that site's excluded path fragments, contains one of the site's news
    section markers, and its final path segment contains a hyphen (article
    slugs are hyphenated). Any other URL is rejected. The tv2.dk rules take
    precedence when a URL mentions both sites.
    """
    slug_is_hyphenated = '-' in url.rsplit('/', 1)[-1]

    if 'tv2.dk' in url:
        tv2_excluded = ('/reel/', '/live/')
        tv2_sections = (
            'nyheder.', 'nyhederne.', 'politik.',
            'Nyheder.', 'Nyhederne.', 'Politik.',
            'finans.', 'Finans', 'vejret', 'Vejret',
        )
        if any(fragment in url for fragment in tv2_excluded):
            return False
        if not any(marker in url for marker in tv2_sections):
            return False
        return slug_is_hyphenated

    if 'dr.dk' in url:
        dr_excluded = (
            '/reel/', '/seneste/', '/ultra/', '/p3/', '/tv-guide/',
            '/etik-og-rettelser/', '/det-bedste-fra-dr/', '/om-dr/',
        )
        dr_sections = ('/nyheder/', 'Nyheder')
        if any(fragment in url for fragment in dr_excluded):
            return False
        if not any(marker in url for marker in dr_sections):
            return False
        return slug_is_hyphenated

    return False


In [None]:
def scrape_links(base_url, timeout=60):
    """Fetch *base_url* and return the de-duplicated article links found on it.

    Args:
        base_url: Page to download (here: a Wayback Machine snapshot URL).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection,
            which would freeze the whole crawl.

    Returns:
        A list (in arbitrary order) of unique URLs accepted by
        is_article_link, or an empty list when the request fails.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best effort: report and let the caller move on to the next snapshot.
        print(f"Error accessing {base_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Non-absolute hrefs are deliberately appended to base_url verbatim (not
    # resolved with urljoin): clean_url strips exactly the artefacts this
    # concatenation produces, so the two must stay in step.
    all_links = [
        link['href'] if link['href'].startswith('http') else base_url + link['href']
        for link in soup.find_all('a', href=True)
    ]

    return list(set(filter(is_article_link, all_links)))
      
def clean_url(url):
    """Reduce a scraped (possibly Wayback-wrapped) link to a scheme-less URL.

    Steps, in order:
      1. If the link goes through web.archive.org, keep only what follows the
         archive prefix and its timestamp segment.
      2. For www.dr.dk links, remove the artefacts left behind when a
         site-relative href was appended to the snapshot URL (an embedded
         "//web/<digits>/http(s)://", a leading "https:/", a doubled host).
      3. Strip a leading "http://" or "https://".
    """
    _, archive_marker, after_marker = url.partition("web.archive.org/web/")
    if archive_marker:
        # Drop the timestamp segment (everything up to the first slash), if any.
        _, slash, after_timestamp = after_marker.partition("/")
        url = after_timestamp if slash else after_marker

    if "www.dr.dk" in url:
        dr_fixups = (
            (r'//web/\d+/https?://', ''),
            (r'^https:/+', ''),
            (r'www\.dr\.dkwww\.dr\.dk', 'www.dr.dk'),
        )
        for pattern, replacement in dr_fixups:
            url = re.sub(pattern, replacement, url)

    return re.sub(r'^https?://', '', url)
    
def process_links(csv_file_path, csv_output_folder, delay_seconds=13):
    """Scrape article links for every snapshot listed in the index CSV.

    Each data row of *csv_file_path* must carry the snapshot URL in column
    index 3 and a unique ID in column index 4. For every row the snapshot page
    is fetched, its article links are cleaned and de-duplicated, and the result
    is written to ``<csv_output_folder>/<ID>.csv`` (one link per row).

    Args:
        csv_file_path: Path of the index CSV (UTF-8).
        csv_output_folder: Folder for the per-snapshot CSVs; created if missing.
        delay_seconds: Pause before every request, to avoid hammering the
            archive. Defaults to the previously hard-coded 13 seconds.
    """
    expected_header = ['Index', 'Source', 'Date', 'Archive_URL', 'ID']

    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as infile:
        rows = list(csv.reader(infile, delimiter=','))

    # The index file starts with a header row. Treating it as data would cost
    # one full delay plus a doomed request to "https://Archive_URL".
    if rows and rows[0][:len(expected_header)] == expected_header:
        rows = rows[1:]

    os.makedirs(csv_output_folder, exist_ok=True)

    for row in rows:
        if not row:
            continue
        if len(row) < 5:
            # Not enough columns to hold both the URL and the ID.
            print(f"Skipping malformed row (expected at least 5 columns): {row}")
            continue

        full_url = row[3]
        # Ensure the URL starts with exactly one "https://" prefix.
        base_url = "https://" + full_url.split("https://", 1)[-1]
        unique_id = row[4]
        print(f"Scraping links from: {base_url} (ID: {unique_id})")

        time.sleep(delay_seconds)

        scraped_links = scrape_links(base_url)

        if not scraped_links:
            print(f"Skipping ID {unique_id} due to error or no links found.")
            continue

        # Different raw links can collapse to the same cleaned URL.
        cleaned_links = list({clean_url(link) for link in scraped_links})

        output_path = os.path.join(csv_output_folder, f"{unique_id}.csv")

        with open(output_path, mode='w', encoding='utf-8', newline='') as outfile:
            writer = csv.writer(outfile, delimiter=',')
            writer.writerows([link] for link in cleaned_links)

        print(f"Scraped links for ID {unique_id} saved to {output_path}")

# Run the link scrape: one request per row of the index CSV, with a pause
# before each request, so this call is long-running.
process_links(csv_file_path, csv_output_folder)

## Remove duplicates

In [None]:
import os
import pandas as pd
from datetime import datetime, timedelta
import re

# All relative paths below are resolved against this working directory.
os.chdir('/folder')
# Directory and file name of the log that records every removed duplicate link
# (remove_duplicates joins the two and appends to the resulting file).
duplicates_folder = './'
DUPLICATES_FILE = './output.csv'
# Folder holding the per-snapshot link CSVs; they are de-duplicated in place.
folder_path = './folder'


In [None]:
def extract_date_from_filename(filename):
    """Return the date encoded in *filename* as a datetime, or None.

    The first run of eight consecutive digits is interpreted as YYYYMMDD.
    None is returned both when there is no such run and when the digits do not
    form a valid calendar date, so a stray file name cannot abort the whole
    de-duplication pass with a ValueError.
    """
    match = re.search(r'(\d{8})', filename)
    if not match:
        return None
    try:
        return datetime.strptime(match.group(1), "%Y%m%d")
    except ValueError:
        # Eight digits that are not a real date (e.g. month 99).
        return None

def get_recent_csv_files(folder_path, current_file):
    """List the CSV files dated within the four days before *current_file*.

    Dates come from extract_date_from_filename; files without a date are
    ignored. The window is [date - 4 days, date), so files carrying the same
    date as *current_file* (including itself) are excluded. Returns an empty
    list when *current_file* is not a dated CSV in *folder_path*. Results are
    in reverse-sorted file name order.
    """
    csv_names = sorted(
        (name for name in os.listdir(folder_path) if name.endswith('.csv')),
        reverse=True,
    )

    dated = {}
    for name in csv_names:
        parsed = extract_date_from_filename(name)
        if parsed:
            dated[name] = parsed

    reference = dated.get(current_file)
    if reference is None:
        return []

    window_start = reference - timedelta(days=4)
    return [name for name, day in dated.items() if window_start <= day < reference]

def normalize_url(url):
    """Trim surrounding whitespace, then drop any trailing slashes."""
    trimmed = url.strip()
    return trimmed.rstrip('/')

def remove_duplicates(folder_path, current_file):
    """Drop links from *current_file* that already appear in recent files.

    The comparison set is built from the files returned by
    get_recent_csv_files. Links are compared after normalize_url. The current
    file is rewritten in place without the duplicates, and every removed link
    is appended to the duplicates log (duplicates_folder / DUPLICATES_FILE)
    together with the file it was removed from and the file it was seen in.
    Empty or unreadable files are reported and skipped.
    """
    read_options = dict(header=None, encoding='utf-8', skip_blank_lines=True, on_bad_lines='skip')

    earlier_files = get_recent_csv_files(folder_path, current_file)
    print(f"\nProcessing {current_file} - Checking against: {earlier_files}")

    # normalized link -> name of the (last read) earlier file containing it
    seen_in = {}
    for earlier in earlier_files:
        earlier_path = os.path.join(folder_path, earlier)

        if os.path.getsize(earlier_path) == 0:
            print(f"⚠️ Skipping empty file: {earlier}")
            continue

        try:
            earlier_df = pd.read_csv(earlier_path, **read_options)
        except pd.errors.EmptyDataError:
            print(f"⚠️ Skipping corrupt or empty file: {earlier}")
            continue

        if earlier_df.empty:
            print(f"⚠️ Skipping empty or invalid file: {earlier}")
            continue

        normalized = earlier_df.iloc[:, 0].dropna().astype(str).apply(normalize_url)
        seen_in.update((link, earlier) for link in normalized)

    target_path = os.path.join(folder_path, current_file)

    if os.path.getsize(target_path) == 0:
        print(f"⚠️ Skipping empty file: {current_file}")
        return

    try:
        current_df = pd.read_csv(target_path, **read_options)
    except pd.errors.EmptyDataError:
        print(f"⚠️ Skipping corrupt or empty file: {current_file}")
        return

    if current_df.empty:
        print(f"⚠️ No valid data in {current_file}. Skipping.")
        return

    current_df['normalized_link'] = current_df.iloc[:, 0].astype(str).apply(normalize_url)

    already_seen = current_df['normalized_link'].isin(seen_in.keys())
    removed = current_df[already_seen]
    kept = current_df[~already_seen].drop(columns=['normalized_link'])

    # Overwrite the current file with only the links not seen recently.
    kept.to_csv(target_path, index=False, header=False, encoding='utf-8')

    if not removed.empty:
        removed = removed.copy()
        removed.loc[:, 'source_file'] = removed['normalized_link'].map(seen_in)
        removed = removed[[0, 'source_file']]
        removed.insert(0, 'removed_from', current_file)

        os.makedirs(duplicates_folder, exist_ok=True)
        log_path = os.path.join(duplicates_folder, DUPLICATES_FILE)
        # Only the first write to the log gets a header line.
        write_header = not os.path.exists(log_path)
        removed.to_csv(log_path, index=False, mode='a', header=write_header, encoding='utf-8')

    print(f"✅ Processed {current_file}: Removed {len(removed)} duplicates.")

def process_all_files(folder_path):
    """Run remove_duplicates on every CSV in *folder_path*.

    Files are visited in reverse-sorted name order.
    """
    csv_names = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
    csv_names.sort(reverse=True)

    for csv_name in csv_names:
        remove_duplicates(folder_path, csv_name)

# Entry point: de-duplicate every CSV in folder_path when this code runs as
# the main module.
if __name__ == "__main__":
    process_all_files(folder_path)


## Scrape media

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import shutil 
import random
from PIL import Image
import io
from pathlib import Path

dataLoc = Path('/folder')
# NOTE: scrape_articles_from_folder deletes and recreates this directory on
# every run, so anything already stored in it is lost.
image_directory = dataLoc / "image_folder/"
# Folder of per-snapshot link CSVs (one article URL per row, without scheme;
# the scraper prepends "https://").
input_folder = dataLoc / "folder"
# Destination CSV for the scraped article metadata and text.
output_csv = dataLoc / "output.csv"


In [None]:
# Section names accepted as a "theme" for each site; anything else becomes "".
_DR_THEMES = frozenset({
    "naturvidenskab", "miljoe", "kroppen", "tech", "teknologi", "klima",
    "detektor", "politik", "udland", "vejret", "indland", "viden", "penge",
    "kultur", "webfeature", "regionale",
})
_TV2_THEMES = frozenset({
    "klima", "finans", "krimi", "udland", "samfund", "business", "politik",
    "penge", "trafik", "tech", "erhverv", "lokalt",
})


def _extract_article_text(soup):
    """Return the article text: paragraphs of the DR article body if present, else all <p> tags."""
    article_body = soup.find('div', class_='dre-article-body__content')
    if article_body:
        paragraphs = article_body.find_all('p')
        return "\n".join(p.get_text(strip=True) for p in paragraphs)

    paragraphs = soup.find_all('p')
    if not paragraphs:
        return "No text found"
    return "\n".join(p.get_text(strip=True) for p in paragraphs)


def _extract_theme(article_id, url):
    """Return the whitelisted section name that follows the site's news marker in *url*, or ""."""
    if "dr" in article_id:
        marker, allowed = "nyheder/", _DR_THEMES
    elif "tv2" in article_id:
        marker, allowed = "nyheder.tv2.dk/", _TV2_THEMES
    else:
        return ""

    if marker not in url:
        return ""
    theme = url.split(marker)[1].split("/")[0].lower()
    return theme if theme in allowed else ""


def _save_lead_image(image_url, image_path, timeout):
    """Download *image_url*, shrink it to fit 1280x720 and store it as a quality-10 JPEG."""
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()

    img = Image.open(io.BytesIO(response.content))
    img.thumbnail((1280, 720), Image.Resampling.LANCZOS)

    # JPEG cannot store alpha or palette images (e.g. many PNG/WebP files);
    # without this conversion every such image failed to save.
    if img.mode not in ("RGB", "L", "CMYK"):
        img = img.convert("RGB")

    # Encode in memory first so a failed encode never leaves a partial file.
    img_bytes = io.BytesIO()
    img.save(img_bytes, format='JPEG', quality=10, optimize=False)
    with open(image_path, 'wb') as img_file:
        img_file.write(img_bytes.getvalue())


def scrape_articles_from_folder(input_folder, output_csv, image_directory, request_timeout=60):
    """Scrape every article listed in the CSVs of *input_folder*.

    For each link the page is fetched over HTTPS, its title, text, theme and
    lead image (the ``og:image`` meta tag) are extracted, the image is stored
    in *image_directory* and one row is appended to *output_csv*. Articles
    with no text, no lead image or a GIF lead image are skipped. A failure on
    one URL is reported and does not stop the run.

    Fixes relative to the previous version:
      * image state is reset per article, so an article without ``og:image``
        no longer reuses the previous article's image;
      * the "ID" column holds the article ID (it used to hold the image ID,
        which became "Image download failed" on errors);
      * non-RGB images are converted before JPEG encoding;
      * blank rows in the input CSVs are ignored;
      * all HTTP requests use a timeout.

    Args:
        input_folder: Folder of link CSVs named ``<source>_<YYYYMMDD>_<n>.csv``
            with one scheme-less URL per row.
        output_csv: Path of the CSV to (over)write with the scraped articles.
        image_directory: Folder for the images. It is DELETED and recreated
            at the start of every run.
        request_timeout: Seconds to wait for each HTTP response.
    """
    if os.path.exists(image_directory):
        shutil.rmtree(image_directory)
    os.makedirs(image_directory)

    with open(output_csv, mode='w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(["ID", "Title", "Theme", "Date", "Link", "Image ID", "Text"])

        for input_csv in sorted(os.listdir(input_folder)):
            if not input_csv.endswith(".csv"):
                continue

            input_csv_path = os.path.join(input_folder, input_csv)
            print(f"Processing file: {input_csv_path}")

            with open(input_csv_path, mode='r', encoding='utf-8', newline='') as input_file:
                links = [row[0] for row in csv.reader(input_file) if row]

            # Per-file running number; only advanced for articles that are written.
            count = 1

            for url in links:
                # Top-level boundary for one article: report and carry on.
                try:
                    print(f"Scraping: {url}")

                    response = requests.get("https://" + url, timeout=request_timeout)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.content, 'html.parser')

                    heading = soup.find('h1')
                    title = heading.get_text(strip=True) if heading else "No title found"

                    article_text = _extract_article_text(soup)
                    if not article_text:
                        continue

                    article_id = f"{input_csv[:-4]}_{str(count).zfill(4)}"
                    print(article_id)

                    image_url = None
                    lead_image_tag = soup.find('meta', property='og:image')
                    if lead_image_tag and 'content' in lead_image_tag.attrs:
                        image_url = lead_image_tag['content']

                    if not image_url:
                        print("Skipping this article as no image was found.")
                        continue
                    if image_url.lower().endswith('.gif'):
                        print(f"Skipping GIF: {image_url}")
                        continue

                    image_id = article_id + ".jpeg"
                    image_path = os.path.join(image_directory, image_id)
                    try:
                        _save_lead_image(image_url, image_path, request_timeout)
                        print(f"Image downloaded, resized, and saved as {image_path}")
                    except Exception as e:
                        # Keep the article but mark the image as unavailable.
                        print(f"Failed to download the image: {e}")
                        image_id = "Image download failed"

                    theme = _extract_theme(article_id, url)

                    # File names look like <source>_<YYYYMMDD>_<n>.csv.
                    raw_date = input_csv.split('_')[1]
                    date = f"{raw_date[:4]}-{raw_date[4:6]}-{raw_date[6:8]}"

                    writer.writerow([article_id, title, theme, date, url, image_id, article_text])
                    count += 1

                except Exception as e:
                    print(f"Failed to process the URL {url}: {e}")

# Run the article scrape. WARNING: this first wipes image_directory, then
# issues one HTTP request per article link (plus one per lead image).
scrape_articles_from_folder(input_folder, output_csv, image_directory)
