This code extracts the issuers and the ISINs of the green bonds, ultimately to obtain a data set of all green-bond-issuing companies, which we can use to build our conventional bond data. With the help of the ISINs we will be able to get external credit rating data if needed. The URLs of this matched data will allow us to web-scrape all the relevant spread data at the end.

# Code for all bond links
The code below is for the file "raw_all_bond_links.csv"

In [2]:
import pandas as pd
import re
import os

# Read the bond link list from the static data folder.
data_filepath = os.path.join(
    "..", "data", "Static_data", "raw_all_bond_links.csv"
)
df = pd.read_csv(data_filepath)

# Preview the frame exactly as it was read, i.e. before the column is renamed.
print(df.head())

# Name the column 'URL'; this assignment only succeeds if the file holds exactly one column.
df.columns = ['URL']

# Function to extract the ISIN and the "company + coupon + maturity" slug from a bond URL
def extract_isin_company(url):
    """Split a bond URL into its ISIN and the remaining slug.

    The last path segment of the URL is cut at its first hyphen: the text
    before the hyphen is returned as the ISIN and everything after it as the
    combined "company-coupon-maturity" slug. The ISIN is returned exactly as
    it is written in the URL; no case normalisation is applied.

    Args:
        url: The bond URL. Surrounding whitespace and trailing slashes are
            ignored.

    Returns:
        A tuple ``(isin, company_coupon_maturity)``. When the last segment
        contains no hyphen the result is ``(None, segment)``. When ``url`` is
        not a string (for example ``None`` or ``NaN``) the result is
        ``(None, None)``.
    """
    if not isinstance(url, str):
        # Non-string cells cannot be split; report both parts as missing.
        return None, None

    # Drop whitespace and trailing slashes first so that ".../<slug>/" yields
    # the slug instead of an empty last segment.
    bond_info = url.strip().rstrip('/').rsplit('/', 1)[-1]

    # Split only once, at the first hyphen.
    isin, hyphen, company_coupon_maturity = bond_info.partition('-')
    if not hyphen:
        return None, bond_info
    return isin, company_coupon_maturity

# Run the extraction on every URL and store the two parts in their own columns
isin_and_slug_pairs = [extract_isin_company(link) for link in df['URL']]
df[['ISIN', 'Company + Kupon and Maturity']] = isin_and_slug_pairs

# Define the function to split the company name from the trailing coupon/maturity numbers
def split_coupon_maturity(text):
    """Separate the company name from the trailing coupon/maturity numbers.

    The coupon/maturity part is the run of hyphen-separated numbers at the
    end of ``text``; its first number may carry a ``.`` or ``,`` decimal part.
    The run must start at the beginning of the string or directly after a
    hyphen, so digits glued to a name token (the ``2`` in ``...-o2-1-5-20-30``)
    stay part of the company name.

    Args:
        text: Slug of the form ``<company>-<coupon>-<maturity>``.

    Returns:
        A tuple ``(company, coupon_maturity)``. ``coupon_maturity`` is ``None``
        when no trailing number run is found. ``(None, None)`` is returned
        when ``text`` is not a string (for example ``None`` or ``NaN``).
    """
    if not isinstance(text, str):
        # Non-string cells cannot be searched; report both parts as missing.
        return None, None

    # (?<![^-]) only allows a match at the start of the string or after "-".
    match = re.search(r"(?<![^-])(\d+(?:[.,]\d+)?(?:-\d+)+)$", text)
    if match:
        coupon_maturity = match.group(1)
        # Drop the hyphen(s) that separated the name from the numbers.
        company = text[:match.start()].rstrip('-')
        return company, coupon_maturity
    return text.rstrip('-'), None

# Build the 'Company' and 'Kupon_Maturity' columns from the combined slug
company_and_numbers = [
    split_coupon_maturity(slug) for slug in df['Company + Kupon and Maturity']
]
df[['Company', 'Kupon_Maturity']] = company_and_numbers

# Remove a single leading hyphen from the company name, if there is one
company_names = df['Company']
df['Company'] = company_names.str.replace(r'^-', '', regex=True)

# Define the function to split the coupon/maturity string into coupon, maturity start and maturity end
def split_coupon_maturity_details(text):
    """Split a ``coupon-start-end`` string into its three components.

    With three or more hyphen-separated parts, the last two parts are the
    maturity start and end, and the remaining leading parts are joined with
    ``.`` to form the coupon (``"4-625-24-26"`` -> ``"4.625"``). With exactly
    two parts, they are taken as coupon and maturity end, and the maturity
    start is reported as ``None``. A ``,`` decimal separator in the coupon is
    replaced by ``.`` so the value can later be parsed as a number.

    Args:
        text: Hyphen-separated coupon/maturity string.

    Returns:
        A tuple ``(coupon, maturity_start, maturity_end)`` of strings.
        ``(None, None, None)`` is returned for an empty string, a single
        part, or a non-string value (for example ``None`` or ``NaN``).
    """
    # NaN is truthy, so a plain truthiness check would let it through to .split().
    if not isinstance(text, str) or not text:
        return None, None, None

    parts = text.split('-')
    if len(parts) >= 3:
        *coupon_parts, maturity_start, maturity_end = parts
        coupon = '.'.join(coupon_parts).replace(',', '.')
        return coupon, maturity_start, maturity_end
    if len(parts) == 2:
        # Only a coupon and an end value are present; the start stays unknown.
        coupon, maturity_end = parts
        return coupon.replace(',', '.'), None, maturity_end
    return None, None, None

# Apply the function to create 'Coupon', 'Maturity_Start', and 'Maturity_End' columns
df[['Coupon', 'Maturity_Start', 'Maturity_End']] = df['Kupon_Maturity'].apply(split_coupon_maturity_details).tolist()

# Convert the extracted text to numbers; values that cannot be parsed become NaN
columns_to_convert = ['Coupon', 'Maturity_Start', 'Maturity_End']
for col in columns_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Save the updated DataFrame to a CSV file
output_filename = "All_Bond_URLs_Processed.csv"
df.to_csv(output_filename, index=False)

# Report the file that was actually written (the message used to name the input file)
print(f"Data processed and saved to {output_filename}")

# Display the DataFrame with the separated columns, including the original
print(df[['URL', 'ISIN', 'Company + Kupon and Maturity', 'Company', 'Coupon', 'Maturity_Start', 'Maturity_End']].head())

# Display the intermediate 'Company' / 'Kupon_Maturity' split as well
print(df[['Company', 'Kupon_Maturity']].head())

                                                   0
0  https://www.boerse-frankfurt.de/anleihe/de0001...
1  https://www.boerse-frankfurt.de/anleihe/no0010...
2  https://www.boerse-frankfurt.de/anleihe/de0001...
3  https://www.boerse-frankfurt.de/anleihe/de000b...
4  https://www.boerse-frankfurt.de/anleihe/us9128...
Data processed and saved to raw_all_bond_links.csv
                                                 URL          ISIN  \
0  https://www.boerse-frankfurt.de/anleihe/de0001...  de0001030716   
1  https://www.boerse-frankfurt.de/anleihe/no0010...  no0010757925   
2  https://www.boerse-frankfurt.de/anleihe/de0001...  de0001102382   
3  https://www.boerse-frankfurt.de/anleihe/de000b...  de000bu22023   
4  https://www.boerse-frankfurt.de/anleihe/us9128...  us91282ckb62   

             Company + Kupon and Maturity                     Company  Coupon  \
0  deutschland-bundesrepublik-0-000-20-25  deutschland-bundesrepublik   0.000   
1          norwegen-koenigreich-1-5-16-26       

# Code for Green bond links
The following code is slightly different because I had already pre-modified the green bond link data. This manual modification simply did what the first step of the code above does (extracting the ISIN and the rest of the string into separate columns), so that the code below can subsequently run a regex to extract the names of the bond issuers.

In [None]:
import pandas as pd
import re
import os

# Loading the data
data_filepath = os.path.join("..", "data", "Static_data", "Green_Bond_URLs.xlsx")

# Open the workbook in a context manager so its file handle is released once the sheet is read
with pd.ExcelFile(data_filepath) as excel_file:
    df = excel_file.parse("Data")  # Parse the "Data" sheet into a DataFrame

print(df.head())

def split_coupon_maturity(text):
    """Separate the company name from the trailing coupon/maturity numbers.

    The coupon/maturity part is the run of hyphen-separated numbers at the
    end of ``text``; its first number may carry a ``.`` or ``,`` decimal part.
    The run must start at the beginning of the string or directly after a
    hyphen, so digits glued to a name token (the ``2`` in ``...-o2-1-5-20-30``)
    stay part of the company name.

    Args:
        text: Slug of the form ``<company>-<coupon>-<maturity>``.

    Returns:
        A tuple ``(company, coupon_maturity)``. ``coupon_maturity`` is ``None``
        when no trailing number run is found. ``(None, None)`` is returned
        when ``text`` is not a string (for example an empty spreadsheet cell
        read as ``NaN``).
    """
    if not isinstance(text, str):
        # Non-string cells cannot be searched; report both parts as missing.
        return None, None

    # (?<![^-]) only allows a match at the start of the string or after "-".
    match = re.search(r"(?<![^-])(\d+(?:[.,]\d+)?(?:-\d+)+)$", text)
    if match:
        coupon_maturity = match.group(1)
        company = text[:match.start()].rstrip('-')  # Exclude trailing hyphens
        return company, coupon_maturity
    return text.rstrip('-'), None  # No trailing number run was found

# Build the 'Company' and 'Kupon_Maturity' columns from the combined slug
company_and_numbers = [
    split_coupon_maturity(slug) for slug in df['Company + Kupon and Maturity']
]
df[['Company', 'Kupon_Maturity']] = company_and_numbers

# Remove a single leading hyphen from the company name, if there is one
company_names = df['Company']
df['Company'] = company_names.str.replace(r'^-', '', regex=True)

# Split Kupon_Maturity into Coupon, Maturity_Start, and Maturity_End
def split_coupon_maturity_details(text):
    """Split a ``coupon-start-end`` string into its three components.

    The last two hyphen-separated parts are the maturity start and end; the
    remaining leading parts are joined with ``.`` to form the coupon
    (``"4-625-24-26"`` -> ``"4.625"``). A ``,`` decimal separator in the
    coupon is replaced by ``.``.

    Args:
        text: Hyphen-separated coupon/maturity string.

    Returns:
        A tuple ``(coupon, maturity_start, maturity_end)`` of strings.
        ``(None, None, None)`` is returned when there are fewer than three
        parts or when ``text`` is not a string (for example ``None`` or
        ``NaN``).
    """
    # NaN is truthy, so a plain truthiness check would let it through to .split().
    if not isinstance(text, str):
        return None, None, None

    parts = text.split('-')
    if len(parts) < 3:  # Need at least a coupon plus both maturity values
        return None, None, None

    *coupon_parts, maturity_start, maturity_end = parts
    coupon = '.'.join(coupon_parts).replace(',', '.')
    return coupon, maturity_start, maturity_end

# Split 'Kupon_Maturity' into the 'Coupon', 'Maturity_Start' and 'Maturity_End' columns
df[['Coupon', 'Maturity_Start', 'Maturity_End']] = df['Kupon_Maturity'].apply(split_coupon_maturity_details).tolist()

# Save the updated DataFrame to a new CSV file.
# The frame is written once: it is not modified after this point, so a second
# identical write would only repeat the same work.
output_filename = "Green_Bond_URLs_Processed.csv"
df.to_csv(output_filename, index=False)

print(f"Data processed and saved to {output_filename}")

# Display the DataFrame with the separated columns, including the original
print(df[['Company + Kupon and Maturity', 'Company', 'Coupon', 'Maturity_Start', 'Maturity_End']].head())

# Display the intermediate 'Company' / 'Kupon_Maturity' split as well
print(df[['Company', 'Kupon_Maturity']].head())

Data processed and saved to Green_Bond_URLs_Processed.csv
             Company + Kupon and Maturity                     Company Coupon  \
0  deutschland-bundesrepublik-0-000-20-25  deutschland-bundesrepublik  0.000   
1          norwegen-koenigreich-1-5-16-26        norwegen-koenigreich    1.5   
2      deutschland-bundesrepublik-1-15-25  deutschland-bundesrepublik      1   
3    deutschland-bundesrepublik-3-1-23-25  deutschland-bundesrepublik    3.1   
4    united-states-of-america-4-625-24-26    united-states-of-america  4.625   

  Maturity_Start Maturity_End  
0             20           25  
1             16           26  
2             15           25  
3             23           25  
4             24           26  
Data processed and saved to Green_Bond_URLs_Processed.csv
                      Company Kupon_Maturity
0  deutschland-bundesrepublik    0-000-20-25
1        norwegen-koenigreich      1-5-16-26
2  deutschland-bundesrepublik        1-15-25
3  deutschland-bundesrepublik  