In [1]:
%%capture
!pip install --upgrade selenium
!apt update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

"""# Packages & Function to Get Data"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException  # Ensure this is imported
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import shutil
import re
import pandas as pd
from selenium.common.exceptions import TimeoutException

In [3]:
def extract_bond_links(soup):
    """Collect bond detail hrefs from the listing table of a parsed page.

    Looks up the first ``<div class="table-responsive">`` in ``soup`` and, for
    every ``<tr>`` inside it, takes the ``href`` of the first anchor carrying
    one inside the row's first ``<td>``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            interface.

    Returns:
        list: The collected hrefs in document order; empty when the table
        container is not found.
    """
    table_div = soup.find("div", {"class": "table-responsive"})
    if not table_div:
        return []

    hrefs = []
    for table_row in table_div.find_all("tr"):
        cells = table_row.find_all("td")
        if len(cells) < 1:
            # Rows without any <td> (e.g. header rows) carry no link to collect.
            continue
        anchor = cells[0].find("a", href=True)
        if anchor:
            hrefs.append(anchor["href"])
    return hrefs

import pandas as pd

def separate_text(df):
    """Split a quoted, comma-separated string into its individual items.

    The text is cut at every ``',`` sequence (a single quote followed by a
    comma), except where that quote is preceded by a backslash or the comma is
    immediately followed by another single quote. Each resulting piece is then
    stripped of surrounding whitespace and of leading/trailing single quotes.

    Args:
        df: The text to split. Despite the parameter name this must be a
            string, since it is passed straight to ``re.split``.

    Returns:
        list[str]: The cleaned pieces, in their original order.
    """
    delimiter = r"(?<!\\)',(?!')"
    return [piece.strip().strip("'") for piece in re.split(delimiter, df)]

def format_value(value):
    """Normalise a single scraped cell value.

    Args:
        value: A cell text, or ``None``.

    Returns:
        ``None`` when ``value`` is ``None`` or the empty string. When the value
        contains ``%``, the value with every comma replaced by a dot. Any other
        value is returned unchanged.
    """
    is_blank = value is None or value == ''
    if is_blank:
        return None
    # Only percentage values get their commas turned into dots.
    return value.replace(",", ".") if "%" in value else value

def process_bond_dataframe(df, data_column=None):
    """Flatten the scraped cell lists into one wide row per bond.

    For every row of ``df`` the list stored in ``data_column`` is cleaned
    (blank entries dropped, whitespace stripped) and spread over positional
    columns ``column_0``, ``column_1``, ... with each value passed through
    ``format_value``. The row's ``URL`` is carried over for traceability.

    NOTE: blank cells are removed *before* numbering, so ``column_i`` is purely
    positional; the same index may hold different fields for different bonds.

    Args:
        df: DataFrame of raw scrape records, one record per bond.
        data_column: Name of the column holding the list of cell texts.
            Defaults to the XPath key under which the widget-table cells are
            stored in the raw records.

    Returns:
        pandas.DataFrame: One row per bond whose ``data_column`` held a list.
        Rows where the column is missing or is not a list are reported with a
        message and skipped, so the result is empty if no row qualifies.
    """
    if data_column is None:
        data_column = "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]"

    processed_data = []

    for index, row in df.iterrows():
        # Series.get yields None instead of raising KeyError when the column is
        # absent altogether (e.g. every page failed before any cells were read).
        raw_data = row.get(data_column)

        if not isinstance(raw_data, list):
            print(f"Row {index} has invalid data format. Skipping.")
            continue

        # Keep only non-blank text entries; non-string items cannot be stripped.
        cleaned_data = [
            entry.strip()
            for entry in raw_data
            if isinstance(entry, str) and entry.strip()
        ]

        # Map the cleaned values onto positional columns.
        row_data = {f"column_{i}": format_value(value) for i, value in enumerate(cleaned_data)}

        # Keep the source URL so each output row can be traced back.
        row_data["URL"] = row.get("URL")

        processed_data.append(row_data)

    return pd.DataFrame(processed_data)


def extract_bond_data(driver, url):
    """Load one bond detail page and collect the text of its table cells.

    Sequence: open ``url``; try to decline the cookie banner (waiting up to
    5 s); wait for two loading overlays to become invisible; wait for the data
    widget to be present; pause 2 s; then read the stripped text of every
    element matched by each configured XPath.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: URL of the bond detail page.

    Returns:
        dict: ``{"URL": url}`` plus, for each XPath that was processed, an entry
        keyed by the XPath string holding the list of stripped cell texts (an
        empty list when nothing matched or the lookup failed). Exceptions raised
        after the initial ``driver.get`` are printed, not propagated, so the
        XPath entries may be missing altogether.
    """
    driver.get(url)
    record = {"URL": url}  # the URL is kept for traceability

    # XPaths whose matches are harvested; each one also serves as the result key.
    cell_xpaths = (
        "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]",
        "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell text-right')]",
    )
    # Overlay selectors, each paired with the message printed once it is gone.
    overlays = (
        (".wrapper[_ngcontent-boerse-frankfurt-c97]", "Overlay disappeared before 100 button"),
        (".wrapper[_ngcontent-boerse-frankfurt-c98]", "Overlay c98 disappeared - Loading Table element"),
    )

    try:
        # Cookie banner: decline it when it shows up, otherwise carry on.
        try:
            wait = WebDriverWait(driver, 5)
            decline_button = wait.until(
                EC.element_to_be_clickable((By.ID, "cookie-hint-btn-decline"))
            )
            decline_button.click()
            print("Cookie banner handled successfully (Declined).")
        except TimeoutException:
            print("Cookie banner not found. Skipping...")

        # Loading overlays: a timeout is logged with a screenshot, then we continue.
        try:
            for selector, message in overlays:
                wait.until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, selector))
                )
                print(message)
        except TimeoutException:
            print(" Table did not load fully")
            driver.save_screenshot(f"error_loading_initial_table.png")  # test

        # Block until the data widget exists in the DOM.
        wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c174]"))
        )

        # Fixed pause before reading the cells.
        time.sleep(2)

        for xpath in cell_xpaths:
            try:
                # find_elements returns every match, possibly none.
                cells = driver.find_elements(By.XPATH, xpath)
                record[xpath] = [cell.text.strip() for cell in cells] if cells else []
            except Exception as inner_e:
                record[xpath] = []  # fall back to an empty list on lookup errors
                print(f"Error extracting data for XPath {xpath}: {inner_e}")

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")

    return record

# Getting Static Data for bonds

In [None]:
# Look up chromedriver on the system PATH; the result is not used further below.
shutil.which("chromedriver")

# Selenium WebDriver configuration
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Driver initialisation
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
# url = "https://www.boerse-frankfurt.de/anleihen/green-bonds"

# Result containers are created BEFORE the try block so the `finally` clause can
# always export them, even when the scrape fails before any data was collected.
all_bond_links = []
static_bonds_data = []

# Main script
try:
    # Open the listing page
    driver.get(url)
    wait = WebDriverWait(driver, 5)

    # Decline the cookie banner when it is shown
    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # Click the '100' page-bar button (presumably the rows-per-page selector)
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # The label of the last enabled page button is taken as the page count
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    page_limit = 2  # maximum number of listing pages to visit; a falsy value disables the cap
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    # Walk the listing pages and collect the bond links of each one
    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(5)

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)

        except Exception as e:
            # Stop paging on the first failure but keep the links gathered so far
            print(f"Error at the page {page}: {e}")
            break

    print(f"preview of bond links: {all_bond_links}")

    # Collect the static data for each bond (at most the first 1000 links)
    base_url = "https://www.boerse-frankfurt.de"

    for link in all_bond_links[:1000]:
        full_url = base_url + link
        print(f"Processing bond URL: {full_url}")
        bond_data = extract_bond_data(driver, full_url)
        static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # Keep a screenshot of the browser state for debugging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export whatever was collected, even if errors occurred
finally:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        df = pd.DataFrame(static_bonds_data)
        # Write the raw scrape first so it survives a failure during processing
        df.to_csv("raw_static_data.csv", index=False)
        processed_df = process_bond_dataframe(df)
        csv_file = f"Static_bond_data{timestamp}.csv"
        processed_df.to_csv(csv_file, index=False)
        print(f"Data saved to {csv_file}")
    finally:
        # Always release the browser, including when the export itself fails
        driver.quit()

Cookie banner not found. Skipping...
Total pages varaible: 5
Total pages shown on website: 5
preview of bond links: ['/anleihe/xs2694874533-volkswagen-leasing-gmbh-4-75-23-31', '/anleihe/xs1702729275-e-on-international-finance-b-v-1-25-17-27', '/anleihe/xs2482887879-rwe-ag-2-75-22-30', '/anleihe/de000a3lh6t7-mercedes-benz-international-finance-b-v-3-5-23-26', '/anleihe/xs2694872594-volkswagen-leasing-gmbh-4-625-23-29', '/anleihe/xs2482936247-rwe-ag-2-125-22-26', '/anleihe/xs2181689659-ungarn-republik-1-75-20-35', '/anleihe/xs2498154207-kreditanstalt-fuer-wiederaufbau-2-22-29', '/anleihe/xs2584685387-rwe-ag-4-125-23-35', '/anleihe/xs2152899584-e-on-se-1-20-25', '/anleihe/xs2673536541-e-on-se-3-75-23-29', '/anleihe/xs2288097640-international-bank-for-reconstruction-and-development-5-21-26', '/anleihe/xs2338004497-european-investment-bank-eib-1-25-21-27', '/anleihe/au3cb0258739-international-bank-for-reconstruction-and-development-2-9-18-25', '/anleihe/au3cb0295764-european-investment-ban

In [None]:
# Display the processed bond table built by the scraping cell above.
processed_df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_102,column_103,column_104,column_105,column_106,column_107,column_108,column_109,URL,column_110
0,07.02.25 15:26:59,Letzter Preis,10730,Veränderung zum Vortag,0.70%,07.02.25 17:30:00,Geld,Brief,000,000,...,Sonderkündigung,Nachrangig,Kleinste handelbare Einheit,1.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/xs2694...,
1,07.02.25 17:00:15,Letzter Preis,96315,Veränderung zum Vortag,-0.05%,07.02.25 17:30:00,Geld,Brief,000,000,...,Kleinste handelbare Einheit,1.000,Spezialist,WALTER LUDWIG GMBH WERTPAPIERHANDELSBANK,Handelsmodell,Fortlaufende Auktion,,,https://www.boerse-frankfurt.de/anleihe/xs1702...,
2,07.02.25 12:53:03,Letzter Preis,9961,Veränderung zum Vortag,1.09%,07.02.25 17:30:00,Geld,Brief,000,000,...,Sonderkündigung,Nachrangig,Kleinste handelbare Einheit,1.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/xs2482...,
3,07.02.25 12:02:23,Letzter Preis,10078,Veränderung zum Vortag,-0.14%,07.02.25 17:30:00,Geld,Brief,000,000,...,Kleinste handelbare Einheit,1.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,,,https://www.boerse-frankfurt.de/anleihe/de000a...,
4,07.02.25 17:19:27,Letzter Preis,10544,Veränderung zum Vortag,0.00%,07.02.25 17:30:00,Geld,Brief,000,000,...,Kleinste handelbare Einheit,1.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,,,https://www.boerse-frankfurt.de/anleihe/xs2694...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,07.02.25 17:00:17,Letzter Preis,10324,Veränderung zum Vortag,0.01%,07.02.25 17:30:00,Geld,Brief,000,000,...,,,,,,,,,https://www.boerse-frankfurt.de/anleihe/xs2552...,
419,07.02.25 09:21:51,Letzter Preis,10721,Veränderung zum Vortag,0.15%,07.02.25 17:30:00,Geld,Brief,000,000,...,Nein,Kleinste handelbare Einheit,100.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,,https://www.boerse-frankfurt.de/anleihe/xs2592...,
420,07.02.25 09:21:51,Letzter Preis,10088,Veränderung zum Vortag,-0.02%,07.02.25 17:30:00,Geld,Brief,000,000,...,Kleinste handelbare Einheit,100.000,Spezialist,WOLFGANG STEUBING AG,Handelsmodell,Fortlaufende Auktion,,,https://www.boerse-frankfurt.de/anleihe/xs2604...,
421,07.02.25 17:00:08,Letzter Preis,10190,Veränderung zum Vortag,-0.19%,07.02.25 17:30:00,Geld,Brief,000,000,...,Sonderkündigung,Nachrangig,Kleinste handelbare Einheit,100.000,Spezialist,BAADER BANK AG,Handelsmodell,Fortlaufende Auktion,https://www.boerse-frankfurt.de/anleihe/xs2613...,


# Extracting only bond links from all pages

In [None]:
# Look up chromedriver on the system PATH; the result is not used further below.
shutil.which("chromedriver")

# Selenium WebDriver configuration
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Driver initialisation
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
# url = "https://www.boerse-frankfurt.de/anleihen/green-bonds"

# Result containers are created BEFORE the try block so the `finally` clause can
# always export them, even when the scrape fails before any link was collected.
all_bond_links = []
full_bond_links = []

# Main script
try:
    # Open the listing page
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # Decline the cookie banner when it is shown
    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # Click the '100' page-bar button (presumably the rows-per-page selector)
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # The label of the last enabled page button is taken as the page count
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    page_limit = 356  # maximum number of listing pages to visit; a falsy value disables the cap
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    # Walk the listing pages and collect the bond links of each one
    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(10)
            # Wait until the listing container is present before reading the page
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c151]")))

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)
            print(f"Page {page} done")

        except Exception as e:
            # Stop paging on the first failure but keep the links gathered so far
            print(f"Error at the page {page}: {e}")
            break

    static_bonds_data = []
    print(f"preview of bond links: {all_bond_links}")

    # Turn the relative links into absolute URLs.
    # Per-bond detail scraping is intentionally not done in this cell; it only
    # gathers the links.
    base_url = "https://www.boerse-frankfurt.de"

    for link in all_bond_links:
        full_url = base_url + link
        full_bond_links.append(full_url)

except Exception as e:
    print(f"Critical error: {e}")
    # Keep a screenshot of the browser state for debugging
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export whatever was collected, even if errors occurred
finally:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        df = pd.DataFrame(full_bond_links)
        df.to_csv("raw_all_bond_links.csv", index=False)
    finally:
        # Always release the browser, including when the export itself fails
        driver.quit()

Cookie banner not found. Skipping...
Total pages varaible: 1
Total pages shown on website: 356
Page 1 done
preview of bond links: ['/anleihe/de0001030716-deutschland-bundesrepublik-0-000-20-25', '/anleihe/no0010757925-norwegen-koenigreich-1-5-16-26', '/anleihe/de0001102382-deutschland-bundesrepublik-1-15-25', '/anleihe/de000bu22023-deutschland-bundesrepublik-3-1-23-25', '/anleihe/us91282ckb62-united-states-of-america-4-625-24-26', '/anleihe/no0012530965-mutares-se-co-kgaa-11-183-23-27', '/anleihe/de0001102408-deutschland-bundesrepublik-0-000-16-26', '/anleihe/xs0222524372-suedzucker-international-finance-b-v-5-783', '/anleihe/de000a4dfds9-pcc-se-5-75-25-30', '/anleihe/xs2152061904-volkswagen-financial-services-ag-3-375-20-28', '/anleihe/de000bu0e246-deutschland-bundesrepublik-0-000-25-26', '/anleihe/de0001141828-deutschland-bundesrepublik-0-000-20-25', '/anleihe/xs2187646901-virgin-media-vendor-financing-notes-iii-dac-4-875-20-28', '/anleihe/de000bu22080-deutschland-bundesrepublik-2-2-

In [5]:
# Quick sanity check of the links frame written by the previous cell.
print(df.head())
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()

                                                   0
0  https://www.boerse-frankfurt.de/anleihe/de0001...
1  https://www.boerse-frankfurt.de/anleihe/no0010...
2  https://www.boerse-frankfurt.de/anleihe/de0001...
3  https://www.boerse-frankfurt.de/anleihe/de000b...
4  https://www.boerse-frankfurt.de/anleihe/us9128...
(100, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       100 non-null    object
dtypes: object(1)
memory usage: 932.0+ bytes
None
