In [18]:
import pandas as pd

gpd = pd.read_csv('data/gdp.csv', index_col='Year')
inflation = pd.read_csv('data/inflation.csv', index_col='date')
unemployment = pd.read_csv('data/unemployment.csv', index_col='date')
yield_data = pd.read_csv('data/yield.csv') #61

### Yeild curves

In [19]:
import pandas as pd
import numpy as np

# Convert to datetime
yield_data['Date'] = pd.to_datetime(yield_data['Date'], errors='coerce')

# Replace invalid values with NaN
yield_data.replace(-999.99, pd.NA, inplace=True)

# Drop columns with more than 20% missing values
threshold = 0.2 * len(yield_data)
columns_to_drop = yield_data.columns[yield_data.isnull().sum() > threshold]
yield_data_cleaned = yield_data.drop(columns=columns_to_drop)

# Rename columns 
column_rename_mapping = {
    'BETA0': 'LongTermRate',
    'BETA1': 'Slope',
    'BETA2': 'Curvature1',
    'BETA3': 'Curvature2',
}

for i in range(1, 31):
    column_rename_mapping[f'SVENPY{i:02d}'] = f'SpotRate{i}Y'


for i in range(1, 31):
    column_rename_mapping[f'SVEN1F{i:02d}'] = f'ForwardRate{i}Y'

yield_data_cleaned.rename(columns=column_rename_mapping, inplace=True)

# Drop rows where 'Date' or key features are missing
key_features = ['Date', 'LongTermRate', 'ForwardRate1Y']
yield_data_cleaned = yield_data_cleaned.dropna(subset=key_features)

# Set Date as the index for resampling
yield_data_cleaned.set_index('Date', inplace=True)

# Resample to monthly frequency
monthly_data = yield_data_cleaned.resample('M').mean()

# Interpolate missing values for continuous features
monthly_data_interpolated = monthly_data.interpolate(method='linear', limit_direction='forward', axis=0)

# Logarithmic transformation to reduce skewness for curvature columns
curvature_columns = ['Curvature1', 'Curvature2']
for column in curvature_columns:
    monthly_data_interpolated[column] = monthly_data_interpolated[column].apply(
        lambda x: np.log1p(x) if x > 0 else 0
    )

# Normalize continuous features to the range [0, 1]
continuous_features = [
    col for col in monthly_data_interpolated.columns if col.startswith('LongTermRate') or col.startswith('Slope')
    or col.startswith('Curvature') or col.startswith('SpotRate') or col.startswith('ForwardRate')
]
monthly_data_interpolated[continuous_features] = monthly_data_interpolated[continuous_features].apply(
    lambda x: (x - x.min()) / (x.max() - x.min()), axis=0
)

# Final Output: Display the first few rows of the processed dataset
print(monthly_data_interpolated.head())




            LongTermRate     Slope  Curvature1  Curvature2  SVEN1F01  \
Date                                                                   
1961-06-30      0.160110  0.463769    0.000000         0.0  0.233760   
1961-07-31      0.164034  0.446137    0.000000         0.0  0.230211   
1961-08-31      0.169935  0.452569    0.000000         0.0  0.236753   
1961-09-30      0.165211  0.453090    0.000000         0.0  0.237257   
1961-10-31      0.152620  0.455216    0.113939         0.0  0.230314   

            SVEN1F04  SVEN1F09   SVENF01   SVENF02   SVENF03  ...   SVENY02  \
Date                                                          ...             
1961-06-30  0.247301       NaN  0.221101  0.255323  0.261874  ...  0.211179   
1961-07-31  0.254123       NaN  0.214934  0.255350  0.266802  ...  0.205111   
1961-08-31  0.264753       NaN  0.220417  0.263449  0.276710  ...  0.213567   
1961-09-30  0.256468       NaN  0.224142  0.260710  0.270093  ...  0.215156   
1961-10-31  0.247019 

  monthly_data = yield_data_cleaned.resample('M').mean()


# Rename collumn names

In [11]:

column_rename_mapping = {
    'BETA0': 'LongTermRate',
    'BETA1': 'Slope',
    'BETA2': 'Curvature1',
    'BETA3': 'Curvature2',
}

# Add mappings for SVENPY columns 
for i in range(1, 31):
    column_rename_mapping[f'SVENPY{i:02d}'] = f'SpotRate{i}Y'

monthly_data_renamed = monthly_data.rename(columns=column_rename_mapping)
monthly_data_renamed.head()


Unnamed: 0_level_0,LongTermRate,Slope,Curvature1,Curvature2,SVEN1F01,SVEN1F04,SVEN1F09,SVENF01,SVENF02,SVENF03,...,SVENY02,SVENY03,SVENY04,SVENY05,SVENY06,SVENY07,SVENY08,SVENY09,SVENY10,TAU1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1961-06-30,0.16011,0.280936,0.97003,0.029967,0.227681,0.243012,,0.217252,0.248275,0.257198,...,0.207565,0.223522,0.231609,0.235785,0.237472,0.237196,,,,0.386493
1961-07-31,0.164034,0.275829,0.970033,0.029967,0.224266,0.249479,,0.21123,0.248302,0.261927,...,0.201633,0.22035,0.230545,0.236178,0.238899,0.23938,,,,0.565912
1961-08-31,0.169935,0.277692,0.970032,0.029967,0.230561,0.259554,,0.216584,0.256057,0.271435,...,0.209899,0.228819,0.239363,0.245298,0.24825,0.248892,,,,0.565608
1961-09-30,0.165211,0.277843,0.970032,0.029967,0.231046,0.251702,,0.220221,0.253434,0.265085,...,0.211453,0.228349,0.237296,0.242092,0.244227,0.24427,,,,0.492659
1961-10-31,0.15262,0.278459,0.97004,0.029967,0.224365,0.242745,,0.214511,0.246545,0.258798,...,0.205503,0.222201,0.230968,0.23521,0.236494,0.235555,,,,1.505942
