In [6]:
%%capture
# Environment setup cell. `%%capture` (which must stay on the first line of the
# cell) silences the installer output below.
# Upgrade the Selenium Python bindings to the latest release (no version pin).
!pip install --upgrade selenium
# Refresh the apt package index, then install Chromium's chromedriver package.
!apt update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin -- presumably so it is found on
# PATH by Selenium; confirm the source path exists on the target image.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

"""# Packages & Function to Get Data"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException  # Ensure this is imported
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import shutil
import re
import pandas as pd
from selenium.common.exceptions import TimeoutException

In [12]:
def extract_bond_links(soup):
    """Collect bond detail links from a parsed listing page.

    Looks up the ``div`` with class ``table-responsive`` via ``soup.find`` and,
    for every ``tr`` inside it, takes the ``href`` of the anchor found in the
    row's first ``td``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all`` API.

    Returns:
        list: The ``href`` values in row order. Empty when the container is
        missing or no row has a linked first cell.
    """
    container = soup.find("div", {"class": "table-responsive"})
    if not container:
        return []

    hrefs = []
    for table_row in container.find_all("tr"):
        cells = table_row.find_all("td")
        # Rows without any <td> (e.g. header rows) carry no link.
        if len(cells) < 1:
            continue
        anchor = cells[0].find("a", href=True)
        if anchor:
            hrefs.append(anchor["href"])
    return hrefs


def extract_bond_data(driver, url, timeout=10):
    """Scrape the widget-table cell texts from one bond detail page.

    The function is best-effort: any failure (navigation, wait timeout,
    element lookup) is printed and the record is returned with whatever was
    collected so far, so a single bad page never aborts a batch run.

    Args:
        driver: Selenium WebDriver (anything exposing ``get`` and
            ``find_elements``).
        url: Absolute URL of the bond detail page.
        timeout: Seconds to wait for the data container to appear.

    Returns:
        dict: ``{"URL": url, <xpath>: [cell text, ...], ...}`` with one key per
        XPath in ``data_points``. Every XPath key is always present; its value
        is an empty list when nothing matched or the page could not be loaded.
    """
    # NOTE(review): `_ngcontent-boerse-frankfurt-c172` looks like a
    # framework-generated attribute; if the site is rebuilt it may change and
    # these XPaths will silently match nothing -- confirm / prefer a stable hook.
    # Any cell matched by the second XPath is also matched by the first, since
    # its class filter is a longer substring of the same attribute.
    data_points = [
        "//div[@_ngcontent-boerse-frankfurt-c172]//td[contains(@class, 'widget-table-cell')]",
        "//div[@_ngcontent-boerse-frankfurt-c172]//td[contains(@class, 'widget-table-cell text-right')]"
    ]

    # Start with the URL for traceability and pre-fill every data key so that
    # failed and successful records share the same schema.
    bond_data = {"URL": url}
    for xpath in data_points:
        bond_data[xpath] = []

    try:
        # Navigation is inside the try block so a page that fails to load is
        # reported like any other per-bond error instead of propagating.
        driver.get(url)

        # Wait for the data container to be present in the DOM.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c172]"))
        )

        for xpath in data_points:
            try:
                # find_elements returns an empty list when nothing matches.
                elements = driver.find_elements(By.XPATH, xpath)
                bond_data[xpath] = [element.text.strip() for element in elements]
            except Exception as inner_e:
                bond_data[xpath] = []  # Keep the key, drop partial data
                print(f"Error extracting data for XPath {xpath}: {inner_e}")

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")

    return bond_data

In [13]:
# Look up chromedriver on the system PATH. The result is neither stored nor
# displayed here (Jupyter only echoes the last expression of a cell).
shutil.which("chromedriver")

# Selenium WebDriver configuration
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Driver initialisation
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"

# Tunables for a quick run.
page_limit = 2  # Maximum number of listing pages to walk; falsy means no limit
bond_limit = 2  # Maximum number of bond detail pages to scrape; None means all

# Initialised before the `try` so the `finally` clause can always build and
# save the DataFrame, even when the scrape fails before any bond is processed.
all_bond_links = []
static_bonds_data = []

# Main script
try:
    # Open the listing page
    driver.get(url)
    wait = WebDriverWait(driver, 5)

    # Dismiss the cookie banner when it shows up; its absence is not an error.
    try:
        cookie_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # Click the "100" button of the page bar (presumably the rows-per-page
    # selector), then give the table time to re-render.
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # The last enabled page button carries the total number of pages.
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    # Walk the listing pages and collect the bond detail links.
    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(5)

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)

        except Exception as e:
            print(f"Error at the page {page}: {e}")
            break

    # Collect the static data for each bond
    print(f"preview of bond links: {all_bond_links}")

    # The collected links are site-relative; prefix them with the base URL.
    base_url = "https://www.boerse-frankfurt.de"

    for link in all_bond_links[:bond_limit]:
        full_url = base_url + link
        print(f"Processing bond URL: {full_url}")
        bond_data = extract_bond_data(driver, full_url)
        static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # Keep a screenshot for debugging. The browser session may itself be the
    # thing that failed, so a screenshot error must not mask the original one.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    try:
        driver.save_screenshot(screenshot_path)
    except Exception as screenshot_error:
        print(f"Could not save screenshot: {screenshot_error}")

# Export whatever was collected to CSV even if errors occurred
finally:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        df = pd.DataFrame(static_bonds_data)
        csv_file = f"Static_bond_data{timestamp}.csv"
        df.to_csv(csv_file, index=False)
        print(f"Data saved to {csv_file}")
    finally:
        # Always release the browser, even when the export itself fails.
        driver.quit()

Cookie banner not found. Skipping...
Total pages varaible: 2
Total pages shown on website: 355
preview of bond links: ['/anleihe/de000bu22007-deutschland-bundesrepublik-2-5-23-25', '/anleihe/de000bu22015-deutschland-bundesrepublik-2-8-23-25', '/anleihe/de0001102382-deutschland-bundesrepublik-1-15-25', '/anleihe/de0001141810-deutschland-bundesrepublik-0-000-20-25', '/anleihe/fr001400qmf9-frankreich-republik-3-23-34', '/anleihe/de0001102374-deutschland-bundesrepublik-0-5-15-25', '/anleihe/de000a3lhk72-traton-finance-luxembourg-s-a-4-23-25', '/anleihe/us900123cg37-tuerkei-republik-6-625-14-45', '/anleihe/de000bu22031-deutschland-bundesrepublik-3-1-23-25', '/anleihe/de0001141828-deutschland-bundesrepublik-0-000-20-25', '/anleihe/de000bu22072-deutschland-bundesrepublik-2-24-26', '/anleihe/de0001141869-deutschland-bundesrepublik-1-3-22-27', '/anleihe/de000bu22023-deutschland-bundesrepublik-3-1-23-25', '/anleihe/de000hlb23t7-landesbank-hessen-thueringen-girozentrale-0-4-21-29', '/anleihe/xs29

In [15]:
# Display the DataFrame built from the scraped bond records in the cell above.
df

Unnamed: 0,URL,"//div[@_ngcontent-boerse-frankfurt-c172]//td[contains(@class, 'widget-table-cell')]","//div[@_ngcontent-boerse-frankfurt-c172]//td[contains(@class, 'widget-table-cell text-right')]"
0,https://www.boerse-frankfurt.de/anleihe/de000b...,"[, -, Letzter Preis, -, Veränderung zum Vortag...","[99,945, 0,00%, 14.01.25 17:30:00, Brief, für ..."
1,https://www.boerse-frankfurt.de/anleihe/de000b...,"[, -, Letzter Preis, -, Veränderung zum Vortag...",[]
