In [1]:
%%capture
!pip install --upgrade selenium
!apt update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

"""# Packages & Function to Get Data"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException  # Ensure this is imported
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import shutil
import re
import pandas as pd
from selenium.common.exceptions import TimeoutException

In [6]:
def extract_bond_links(soup):
    """Return the href of the first-column link of each row in the bond table.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a listing page.

    Returns
    -------
    list of str
        The ``href`` values found in the first ``<td>`` of every ``<tr>`` inside
        the first ``<div class="table-responsive">``. Empty when that div is
        absent. Rows without ``<td>`` cells, or whose first cell holds no
        ``<a href=...>``, contribute nothing.
    """
    table_wrapper = soup.find("div", {"class": "table-responsive"})
    if not table_wrapper:
        return []

    hrefs = []
    for table_row in table_wrapper.find_all("tr"):
        cells = table_row.find_all("td")
        if not cells:
            # No <td> cells in this row, so there is no first cell to inspect.
            continue
        anchor = cells[0].find("a", href=True)
        if anchor:
            hrefs.append(anchor["href"])
    return hrefs

import pandas as pd

def separate_text(df):
    """Split a quoted, comma-separated string into its unquoted parts.

    Despite its name, ``df`` is the text to split (it is passed straight to
    ``re.split``). The text is cut at every ``',`` (a closing single quote
    followed by a comma) that is not preceded by a backslash and not followed
    by another single quote. Each piece is then stripped of surrounding
    whitespace and, after that, of leading/trailing single quotes.

    Returns
    -------
    list of str
        The cleaned pieces, in their original order.
    """
    # Delimiter: "'," not preceded by a backslash and not followed by a quote.
    pieces = re.split(r"(?<!\\)',(?!')", df)

    cleaned = []
    for piece in pieces:
        # Whitespace first, then quotes: blanks inside the quotes are kept.
        cleaned.append(piece.strip().strip("'"))
    return cleaned

def format_value(value):
    """Normalise a single scraped cell value.

    * ``None`` and the empty string both become ``None``.
    * A value containing ``%`` has every comma replaced by a dot
      (e.g. ``"1,5%"`` -> ``"1.5%"``).
    * Anything else is returned unchanged.
    """
    is_blank = value is None or value == ''
    if is_blank:
        return None

    if "%" not in value:
        return value

    # Percentages use a decimal comma; switch it to a decimal point.
    return value.replace(",", ".")

def process_bond_dataframe(df, data_column=None):
    """Flatten the scraped cell lists into one positional column per cell.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per bond, as built from the dicts returned by
        ``extract_bond_data``: a ``"URL"`` column plus one column per scraped
        XPath, each holding a list of cell texts.
    data_column : str, optional
        Name of the column holding the list of cell texts. Defaults to the
        XPath key that ``extract_bond_data`` uses for all table cells.

    Returns
    -------
    pandas.DataFrame
        One row per input row whose ``data_column`` value is a list, with
        columns ``column_0 .. column_N`` (non-empty cell texts passed through
        ``format_value``) and ``URL``. Rows whose value is not a list (for
        example NaN because the page failed to load) are reported and skipped.

    Notes
    -----
    Empty cells are dropped before numbering, so positions are not guaranteed
    to line up between bonds that expose a different number of cells.
    """
    if data_column is None:
        data_column = "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]"

    processed_data = []

    for index, row in df.iterrows():
        # .get() rather than []: the column does not exist at all when every
        # page failed before any cell was scraped, and a KeyError here would
        # abort the export that calls this function.
        raw_data = row.get(data_column)

        if not isinstance(raw_data, list):
            print(f"Row {index} has invalid data format. Skipping.")
            continue

        # Keep only non-empty text entries, trimmed.
        cleaned_data = [
            entry.strip()
            for entry in raw_data
            if isinstance(entry, str) and entry.strip()
        ]

        # Map the cleaned data to positional columns.
        row_data = {f"column_{i}": format_value(value) for i, value in enumerate(cleaned_data)}

        # Add the URL for traceability.
        row_data["URL"] = row.get("URL")

        processed_data.append(row_data)

    return pd.DataFrame(processed_data)


def extract_bond_data(driver, url):
    """Scrape the table-cell texts of one bond detail page.

    Parameters
    ----------
    driver : selenium.webdriver.remote.webdriver.WebDriver
        An already started browser session.
    url : str
        Absolute URL of the bond detail page.

    Returns
    -------
    dict
        Always contains ``"URL"``. When the page loaded far enough, it also
        contains one entry per XPath in ``data_points`` (the XPath string is
        the key) holding the list of stripped cell texts, or ``[]`` when
        nothing matched or the lookup failed.

    Notes
    -----
    Any error, including a failed navigation, is printed and swallowed so a
    single bad page does not abort a multi-page scrape; the partially filled
    dict is returned instead.
    """
    bond_data = {"URL": url}  # Start with URL for traceability

    try:
        # Navigation lives inside the try so that a failed page load is
        # reported for this URL only instead of propagating to the caller.
        driver.get(url)
        wait = WebDriverWait(driver, 5)

        # Cookie handler: decline the banner if it shows up within the timeout.
        try:
            cookie_button = wait.until(
                EC.element_to_be_clickable((By.ID, "cookie-hint-btn-decline"))
            )
            cookie_button.click()
            print("Cookie banner handled successfully (Declined).")
        except TimeoutException:
            print("Cookie banner not found. Skipping...")

        # Wait for the two overlay wrappers to disappear.
        try:
            wait.until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, ".wrapper[_ngcontent-boerse-frankfurt-c97]"))
            )
            print("Overlay disappeared before 100 button")

            wait.until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, ".wrapper[_ngcontent-boerse-frankfurt-c98]"))
            )
            print("Overlay c98 disappeared - Loading Table element")
        except TimeoutException:
            print(" Table did not load fully")
            driver.save_screenshot("error_loading_initial_table.png")  # test

        # Wait for the data table to load
        wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c174]"))
        )

        # Fixed pause after the container appears, before reading the cells.
        time.sleep(2)

        # XPaths to read; the second one matches a subset of the first
        # (cells whose class also contains 'text-right').
        data_points = [
            "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]",
            "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell text-right')]"
        ]

        for xpath in data_points:
            try:
                # find_elements returns [] when nothing matches, so the
                # comprehension yields an empty list in that case.
                elements = driver.find_elements(By.XPATH, xpath)
                bond_data[xpath] = [element.text.strip() for element in elements]
            except Exception as inner_e:
                bond_data[xpath] = []  # Handle missing data gracefully
                print(f"Error extracting data for XPath {xpath}: {inner_e}")

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")

    return bond_data

In [10]:
# Check if Chromedriver is available on system path
# (the result is not stored or displayed here).
shutil.which("chromedriver")

# Selenium WebDriver configuration
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Driver initialisation
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"

# Initialised before the try block so the export in `finally` always has
# something to work with, even when the scrape fails at the very first step.
all_bond_links = []
static_bonds_data = []

# Main script
try:
    # start driver
    driver.get(url)
    wait = WebDriverWait(driver, 5)

    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # 100 button: switch the listing to 100 rows per page
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # Get total number of pages from the last enabled page button
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    page_limit = 10 # Limit to 1000 bonds for now
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(5)

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)

        except Exception as e:
            # Stop paginating on the first failing page; links gathered so far are kept.
            print(f"Error at the page {page}: {e}")
            break

    # Collect the static data for each bond
    print(f"preview of bond links: {all_bond_links}")

    # Base URL that the relative bond links are appended to
    base_url = "https://www.boerse-frankfurt.de"

    # Iterate through each bond link
    for link in all_bond_links[:1000]:
        full_url = base_url + link
        print(f"Processing bond URL: {full_url}")
        bond_data = extract_bond_data(driver, full_url)
        static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # add screenshot
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export whatever was collected to CSV even if errors occurred
finally:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        df = pd.DataFrame(static_bonds_data)
        # Write the raw data first so it survives a failure during processing.
        df.to_csv("raw_static_data.csv", index=False)
        processed_df = process_bond_dataframe(df)
        csv_file = f"Static_bond_data{timestamp}.csv"
        processed_df.to_csv(csv_file, index=False)
        print(f"Data saved to {csv_file}")
    finally:
        # Always close the WebDriver, even when the export itself fails.
        driver.quit()

In [8]:
# Show the processed bond table as this cell's output.
processed_df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_100,column_101,column_102,column_103,column_104,column_105,column_106,URL,column_107,column_108
0,24.01.25 17:00:49,Letzter Preis,99905,Veränderung zum Vortag,0.00%,24.01.25 17:30:00,Geld,Brief,000,000,...,Nachrangig,Kleinste handelbare Einheit,0,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/de0001...,,
1,24.01.25 17:00:49,Letzter Preis,10059,Veränderung zum Vortag,-0.02%,24.01.25 17:30:00,Geld,Brief,000,000,...,Nachrangig,Kleinste handelbare Einheit,0,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/de000b...,,
2,24.01.25 17:00:50,Letzter Preis,10008,Veränderung zum Vortag,-0.01%,24.01.25 17:30:00,Geld,Brief,000,000,...,Nachrangig,Kleinste handelbare Einheit,0,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/de000b...,,
3,24.01.25 17:00:49,Letzter Preis,10037,Veränderung zum Vortag,-0.02%,24.01.25 17:30:00,Geld,Brief,000,000,...,Nachrangig,Kleinste handelbare Einheit,0,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/de000b...,,
4,24.01.25 17:00:51,Letzter Preis,99963,Veränderung zum Vortag,0.00%,24.01.25 17:30:00,Geld,Brief,000,000,...,Nachrangig,Kleinste handelbare Einheit,0,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/de000b...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,24.01.25 17:00:16,Letzter Preis,10088,Veränderung zum Vortag,-0.02%,24.01.25 17:30:00,Geld,Brief,000,000,...,,,,,,,,https://www.boerse-frankfurt.de/anleihe/xs2579...,,
996,24.01.25 08:17:22,Letzter Preis,103965,Veränderung zum Vortag,-0.12%,24.01.25 17:30:00,Geld,Brief,000,000,...,,,,,,,,https://www.boerse-frankfurt.de/anleihe/xs2729...,,
997,24.01.25 17:25:06,Letzter Preis,99771,Veränderung zum Vortag,-0.45%,24.01.25 17:30:00,Geld,Brief,000,000,...,Sonderkündigung,Nachrangig,Kleinste handelbare Einheit,100.000,Spezialist,ICF BANK AG WERTPAPIERHANDELSBANK,Handelsmodell,https://www.boerse-frankfurt.de/anleihe/xs2760...,Fortlaufende Auktion,
998,24.01.25 17:00:13,Letzter Preis,10115,Veränderung zum Vortag,-0.17%,24.01.25 17:30:00,Geld,Brief,000,000,...,Sonderkündigung,Nachrangig,Kleinste handelbare Einheit,100.000,Spezialist,WALTER LUDWIG GMBH WERTPAPIERHANDELSBANK,Handelsmodell,https://www.boerse-frankfurt.de/anleihe/xs2776...,Fortlaufende Auktion,
