<a href="https://colab.research.google.com/github/Fijiman001/EGR-Empirical-Project/blob/main/code/static_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install --upgrade selenium
!apt update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

"""# Packages & Function to Get Data"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException  # Ensure this is imported
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import shutil
import os
from multiprocessing import Pool
import re
import pandas as pd
from selenium.common.exceptions import TimeoutException
import requests

In [2]:
def extract_bond_links(soup):
    bond_table = soup.find("div", {"class": "table-responsive"})
    links = []
    if bond_table:
        rows = bond_table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            if len(cols) >= 1:
                link_element = cols[0].find("a", href=True)
                if link_element:
                    links.append(link_element["href"])
    return links

import pandas as pd

def separate_text(df):

    text = df
    # Regular expression to split by commas, except when inside single quotes
    pattern = r"(?<!\\)',(?!')"

    # Split the text by commas, respecting quotes
    separated_list = re.split(pattern, text)

    # Clean up: Remove extra spaces and quotes
    separated_list = [item.strip().strip("'") for item in separated_list]

    return separated_list

def format_value(value):
    # Helper function to format values as needed (e.g., percentages, empty strings, etc.)
    if value is None or value == '':
        return None
    # Handle percentage formatting
    if "%" in value:
        return value.replace(",", ".")  # Convert to decimal format
    return value

def process_bond_dataframe(df):
    processed_data = []

    for index, row in df.iterrows():
        # Extract the raw data
        raw_data = row["//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]"]

        # Ensure the data is cleaned and in list format
        if isinstance(raw_data, list):
            cleaned_data = [entry.strip() for entry in raw_data if entry.strip()]
        else:
            print(f"Row {index} has invalid data format. Skipping.")
            continue

        # Directly map the cleaned data to columns
        row_data = {f"column_{i}": format_value(value) for i, value in enumerate(cleaned_data)}

        # Add the URL for traceability
        row_data["URL"] = row["URL"]

        # Append the row data
        processed_data.append(row_data)

    # Create a DataFrame from the processed data
    processed_df = pd.DataFrame(processed_data)
    return processed_df


def extract_bond_data(driver, url):
    # Open the bond detail page
    driver.get(url)
    bond_data = {"URL": url}  # Start with URL for traceability

    try:
        # Cookie Handler
        try:
            wait = WebDriverWait(driver, 5)
            cookie_button = wait.until(
                EC.element_to_be_clickable((By.ID, "cookie-hint-btn-decline"))
            )
            cookie_button.click()
            # print("Cookie banner handled successfully (Declined).")
        except TimeoutException:
            # print("Cookie banner not found. Skipping...")
            None
        try:
            wait.until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, ".wrapper[_ngcontent-boerse-frankfurt-c97]"))
            )
            # print("Overlay disappeared before 100 button")

            wait.until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, ".wrapper[_ngcontent-boerse-frankfurt-c98]"))
            )
            # print("Overlay c98 disappeared - Loading Table element")
        except TimeoutException:
            print(" Table did not load fully")
            driver.save_screenshot(f"error_loading_initial_table.png") # test

        # Wait for the data table to load
        wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c174]"))
        )

        # To ensure we get static Data
        time.sleep(2)

        # Define the data points and their XPaths
        data_points = [
            "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]",
            "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell text-right')]"
        ]

        # Loop through each XPath in data_points and extract the text
        for xpath in data_points:
            try:
                # Use find_elements to handle multiple matches and extract all values
                elements = driver.find_elements(By.XPATH, xpath)
                if elements:
                    # Extract and clean the text from all matched elements
                    extracted_values = [element.text.strip() for element in elements]
                    bond_data[xpath] = extracted_values  # Save all extracted values under the XPath as the key
                else:
                    bond_data[xpath] = []  # Handle missing elements with an empty list

            except Exception as inner_e:
                bond_data[xpath] = []  # Handle missing data gracefully
                print(f"Error extracting data for XPath {xpath}: {inner_e}")

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")

    return bond_data

# Getting Static Data for bonds

In [None]:
# Check if Chromedriver is available on system path
shutil.which("chromedriver")

# Configuration de Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialisation de driver
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
# url = "https://www.boerse-frankfurt.de/anleihen/green-bonds"

# Script principal
try:
    # start driver
    driver.get(url)
    wait = WebDriverWait(driver, 5)

    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # 100 button
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # Variable to store bond links
    all_bond_links = []
    # Get total number of pages
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    page_limit = 2 # Limit to 1000 bonds for now
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(5)

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)

        except Exception as e:
            print(f"Error at the page {page}: {e}")
            break

    # Collecter les static data pour chaque obligation
    static_bonds_data = []
    print(f"preview of bond links: {all_bond_links}")

    # getting bond data now
    # Base URL
    base_url = "https://www.boerse-frankfurt.de"

    # Iterate through each bond link
    for link in all_bond_links[:1000]:  # all_bonds_links should already be collected earlier
        full_url = base_url + link
        print(f"Processing bond URL: {full_url}")
        bond_data = extract_bond_data(driver, full_url)
        static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # add screenshot
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export the DataFrame to a CSV file even if errors occur
finally:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Close the WebDriver
    # save data
    df = pd.DataFrame(static_bonds_data)
    processed_df = process_bond_dataframe(df)
    csv_file = f"Static_bond_data{timestamp}.csv"
    processed_df.to_csv(csv_file, index=False)
    df.to_csv("raw_static_data.csv", index=False)
    print(f"Data saved to {csv_file}")
    driver.quit()

In [None]:
processed_df

# Extracting only bond links from all pages

In [None]:
# Check if Chromedriver is available on system path
shutil.which("chromedriver")

# Configuration de Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialisation de driver
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
# url = "https://www.boerse-frankfurt.de/anleihen/green-bonds"

# Script principal
try:
    # start driver
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # 100 button
    hundred_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'page-bar-type-button btn btn-lg ng-star-inserted') and text()='100']"))
    )
    hundred_button.click()
    time.sleep(5)

    # Variable to store bond links
    all_bond_links = []
    # Get total number of pages
    page_buttons = driver.find_elements(
        By.XPATH,
        "//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and not(@disabled)]",
    )
    page_limit = 356 # Limit to 1000 bonds for now
    total_pages = min(
        int(page_buttons[-1].text.strip()), page_limit or float("inf")
    )
    print(f"Total pages varaible: {total_pages}")
    print(f"Total pages shown on website: {page_buttons[-1].text.strip()}")

    for page in range(1, total_pages + 1):
        try:
            if page != 1:
                page_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[contains(@class, 'page-bar-type-button page-bar-type-button-width-auto btn btn-lg ng-star-inserted') and text()='{page}']"))
                )
                page_button.click()
                time.sleep(10)
            # Wait for page to full load
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[@_ngcontent-boerse-frankfurt-c151]")))

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
            bond_links = extract_bond_links(soup)
            all_bond_links.extend(bond_links)
            print(f"Page {page} done")

        except Exception as e:
            print(f"Error at the page {page}: {e}")
            break

    # Collecter les static data pour chaque obligation
    static_bonds_data = []
    print(f"preview of bond links: {all_bond_links}")

    # getting bond data now
    # Base URL
    base_url = "https://www.boerse-frankfurt.de"
    full_bond_links = []

    for link in all_bond_links:  # all_bonds_links should already be collected earlier
        full_url = base_url + link
        full_bond_links.append(full_url)

    # Iterate through each bond link
    # for link in all_bond_links[:1000]:  # all_bonds_links should already be collected earlier
    #     full_url = base_url + link
    #     print(f"Processing bond URL: {full_url}")
    #     bond_data = extract_bond_data(driver, full_url)
    #     static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # add screenshot
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export the DataFrame to a CSV file even if errors occur
finally:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Close the WebDriver
    # save data
    df = pd.DataFrame(full_bond_links)
    df.to_csv("raw_all_bond_links.csv", index=False)
    driver.quit()

In [None]:
print(df.head())
print(df.shape)
print(df.info())

# Getting Static Data for Bond_Dictionary

file path: C:\Users\Alex\Documents\GitHub\EGR-Empirical-Project\data\Static_data\bond_dictionary\bond_dictionary_cleaned

we re-use the above static data web scraping procedure to get the static data for only the bonds in our bond dictionary. making sure to also get the WKN to merge with our price data

In [3]:
# Define the URL of the raw CSV file on GitHub
url = "https://raw.githubusercontent.com/Fijiman001/EGR-Empirical-Project/refs/heads/main/data/Static_data/bond_dictionary/bond_dictionary_cleaned.csv"

bond_dictionary = pd.read_csv(url)
all_bond_links = bond_dictionary.iloc[:, 0].tolist()
# all_bond_links = all_bond_links[:20] # limit to 20 bonds to try
print(len(all_bond_links))
print(all_bond_links[:2])

5204
['https://www.boerse-frankfurt.de/anleihe/xs2694874533-volkswagen-leasing-gmbh-4-75-23-31', 'https://www.boerse-frankfurt.de/anleihe/xs2482887879-rwe-ag-2-75-22-30']


# Multi

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from multiprocessing import Pool
import logging
import random
from tqdm import tqdm

logger = logging.getLogger('selenium')
logger.setLevel(logging.ERROR)

driver = None # set later

def get_driver():
    # Check if Chromedriver is available on system path
    shutil.which("chromedriver")

    # Configuration de Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    # Initialisation de driver
    driver = webdriver.Chrome(options=options)
    url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
    return driver

def init_child_process():
    # set the global 'driver' variable
    # so that calls to `scrape_bond` have access to it
    # without creating a new driver for each bond
    global driver
    driver = get_driver()

# getting bond data for only this URL
def scrape_bond(url):
      # start driver
      assert driver is not None, "Child process init did not happen"
      driver.get(url)
      wait = WebDriverWait(driver, 5)
      # Cooking handler
      try:
          cookie_button = WebDriverWait(driver, 5).until(
              EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
          )
          cookie_button.click()
          # print("Cookie banner handled successfully (Declined).")
      except TimeoutException:
          # print("Cookie banner not found. Skipping...")
          None

      if random.random() > 0.9:
          print(f"Processing bond URL: {url}")
      bond_data = extract_bond_data(driver, url)
      return bond_data

# returns (result, err)
def child_task(url, retries=3):
    try:
        result = scrape_bond(url)
    except KeyboardInterrupt:
        raise
    except Exception as e:
        if retries > 0:
            time.sleep(2**(3-retries))
            return child_task(url, retries-1)
        else:
          print(f"Critical error: {e}")
          # add screenshot
          timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
          screenshot_path = f"error_screenshot_{timestamp}.png"
          driver.save_screenshot(screenshot_path)
          return (None, e)
    else:
        return (result, None)

num_processes = 10 # os.cpu_count() == 2
with Pool(num_processes, initializer=init_child_process) as p:
    results_with_error = p.map(child_task, all_bond_links)

# ignore errors
results = []
for (result, err) in results_with_error:
    if err is None:
        results.append(result)

results[0]

Processing bond URL: https://www.boerse-frankfurt.de/anleihe/de000hlb5469-landesbank-hessen-thueringen-girozentrale-2-85-24-29
Processing bond URL: https://www.boerse-frankfurt.de/anleihe/be6343192710-belfius-bank-s-a-3-925-23-30
Processing bond URL: https://www.boerse-frankfurt.de/anleihe/us172967mm08-citigroup-inc-5-316-20-41
Processing bond URL: https://www.boerse-frankfurt.de/anleihe/de000dw6c979-dz-bank-ag-deutsche-zentral-genossenschaftsbank-frankfurt-am-main-3-25-23-27
Processing bond URL: https://www.boerse-frankfurt.de/anleihe/de000lb2blx6-landesbank-baden-wuerttemberg-0-22-22-28
Processing bond URL: https://www.boerse-frankfurt.de/anleihe/de000lb2bly4-landesbank-baden-wuerttemberg-0-3-22-29
An error occurred while processing https://www.boerse-frankfurt.de/anleihe/de000hlb54p0-landesbank-hessen-thueringen-girozentrale-2-25-24-26: Message: 
Stacktrace:
#0 0x594a63de514a <unknown>
#1 0x594a63882b80 <unknown>
#2 0x594a638d40e9 <unknown>
#3 0x594a638d4271 <unknown>
#4 0x594a63922

{'URL': 'https://www.boerse-frankfurt.de/anleihe/xs2694874533-volkswagen-leasing-gmbh-4-75-23-31',
 "//div[@_ngcontent-boerse-frankfurt-c174]//td[contains(@class, 'widget-table-cell')]": ['',
  '04.03.25 12:06:32',
  'Letzter Preis',
  '106,88',
  'Veränderung zum Vortag',
  '1,08%',
  '',
  '04.03.25 17:30:00',
  'Geld',
  'Brief',
  '0,00',
  '0,00',
  'für 0 Nominal',
  'für 0 Nominal',
  'Differenz zum Vortag',
  '1,14 / 1,08%',
  'Spread absolut / relativ',
  '0 / 0%',
  'Tagesumsatz in Euro',
  '101.551',
  'Tagesumsatz nominal',
  '95.000',
  'Preisfeststellungen',
  '2',
  'Schlusspreis des letzten Handelstages',
  '105,74',
  'Tagestief/-hoch',
  '106,88 / 106,90',
  '52-Wochentief/-hoch',
  '103,92 / 108,83',
  'Handelszeiten',
  '08:00-17:30',
  'Rendite in % zum letzten Preis',
  '3,5519',
  'Rendite in % zu Brief',
  '',
  'Rendite in % zu Geld',
  '',
  'Stückzinsberechnung',
  'k.A.',
  'Stückzinsen in %',
  '2,0918',
  'Aufgelaufene Stückzinstage',
  '161',
  'Duration 

In [6]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Saving df
df = pd.DataFrame(results)
df.to_csv(f"raw_static_data{timestamp}.csv", index=False)
# Save to Google Drive
drive_path = "/content/drive/MyDrive/Colab Notebooks/M2 EGR - Empirical Project/"  # Specify your desired folder in Google Drive
!mkdir -p "{drive_path}"
!cp "raw_static_data.csv" "{drive_path}"
!cp "raw_static_data{timestamp}.csv" "{drive_path}" #Correctly copies the file with timestamp
print(f"Files saved to Google Drive at {drive_path}")

cp: cannot stat 'raw_static_data.csv': No such file or directory
Files saved to Google Drive at /content/drive/MyDrive/Colab Notebooks/M2 EGR - Empirical Project/


# Singel driver - takes very long (~10 hours) - Multiprocessor below

In [None]:
# Check if Chromedriver is available on system path
shutil.which("chromedriver")

# Configuration de Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialisation de driver
driver = webdriver.Chrome(options=options)
url = "https://www.boerse-frankfurt.de/anleihen/most-traded"
# url = "https://www.boerse-frankfurt.de/anleihen/green-bonds"

# Script principal
try:
    # start driver
    driver.get(url)
    wait = WebDriverWait(driver, 5)

    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Decline')]"))
        )
        cookie_button.click()
        print("Cookie banner handled successfully (Declined).")
    except TimeoutException:
        print("Cookie banner not found. Skipping...")

    # Collecter les static data pour chaque obligation
    static_bonds_data = []
    print(f"preview of bond links: {all_bond_links}")

    # Iterate through each bond link
    for link in all_bond_links:  # we loop over all our links
        print(f"Processing bond URL: {link}")
        bond_data = extract_bond_data(driver, link)
        static_bonds_data.append(bond_data)

except Exception as e:
    print(f"Critical error: {e}")
    # add screenshot
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    screenshot_path = f"error_screenshot_{timestamp}.png"
    driver.save_screenshot(screenshot_path)

# Export the DataFrame to a CSV file even if errors occur
finally:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Close the WebDriver
    # save data
    df = pd.DataFrame(static_bonds_data)
    # processed_df = process_bond_dataframe(df)
    # csv_file = f"Static_bond_data{timestamp}.csv"
    # processed_df.to_csv(csv_file, index=False)
    df.to_csv(f"raw_static_data{timestamp}.csv", index=False)
    print(f"Data saved to {csv_file}")
    driver.quit()