In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
# Load the raw macroeconomic series and the yield-curve data from the data/ directory.
# Every path uses forward slashes: the former 'data\gdp.csv' only resolved on
# Windows and depended on '\g' being an unrecognized (deprecated) escape sequence.
# NOTE(review): "gpd_unclean" looks like a typo for "gdp_unclean"; the name is kept
# so that any other cell referring to it keeps working.
gpd_unclean = pd.read_csv('data/gdp.csv')
inflation_unclean = pd.read_csv('data/inflation.csv')
unemployment_unclean = pd.read_csv('data/unemployment.csv')
yield_data = pd.read_csv('data/yield.csv')

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Clean the raw yield frame loaded from data/yield.csv and aggregate it to
# monthly means. Each step rebinds `yield_data` instead of mutating it in place.

# Parse the Date column (unparseable entries become NaT) and use it as the index.
yield_data['Date'] = pd.to_datetime(yield_data['Date'], errors='coerce')
yield_data = yield_data.set_index('Date')

# Treat the -999.99 sentinel as missing. np.nan is used rather than pd.NA:
# writing pd.NA into a float64 column can upcast it to object dtype, whereas
# np.nan keeps the columns numeric for the mean/interpolate steps below.
yield_data = yield_data.replace(-999.99, np.nan)

# Drop columns with more than 20% missing values.
threshold = 0.2 * len(yield_data)
columns_to_drop = yield_data.columns[yield_data.isna().sum() > threshold]
yield_data = yield_data.drop(columns=columns_to_drop)

# Rename columns to descriptive names. Keys that are absent from the frame
# (for example, columns dropped above) are simply ignored by rename().
column_rename_mapping = {
    'BETA0': 'LongTermRate',    # Reflects investor confidence about long-term economic growth and inflation
    'BETA1': 'Slope',           # A negative slope (yield curve inversion) signals a possible recession
}
for i in range(1, 31):
    column_rename_mapping[f'SVENPY{i:02d}'] = f'SpotRate{i}Y'
    column_rename_mapping[f'SVEN1F{i:02d}'] = f'ForwardRate{i}Y'
yield_data = yield_data.rename(columns=column_rename_mapping)

# Resample to monthly frequency, averaging all observations within each month.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favor of 'ME'; 'M' is
# kept here so the cell still runs on older pandas versions.
yield_data = yield_data.resample('M').mean()

# Linearly interpolate remaining gaps, filling forward in time only (leading
# NaNs at the start of a column are left as they are).
yield_data = yield_data.interpolate(method='linear', limit_direction='forward', axis=0)

  yield_data = yield_data.resample('M').mean()


In [3]:
# Read the two pre-processed yield CSVs from the working directory:
# `inst` from the "_inst" file and `zero` from the "_supervised_flipped" file.
inst, zero = (
    pd.read_csv(csv_name)
    for csv_name in (
        'processed_yield_data_filled_inst.csv',
        'processed_yield_data_filled_supervised_flipped.csv',
    )
)

In [32]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Previously merged dataset, minus its old LongTermRate/Slope columns.
merged = (
    pd.read_csv('clean_data/merged_data_old.csv')
    .drop(columns=['LongTermRate', 'Slope'])
)

# Updated yield data, minus its Date column.
yield_data = pd.read_csv('updated_yield_data.csv').drop(columns=['Date'])

# join() aligns the two frames on their default row index, i.e. by row position.
# NOTE(review): this assumes both CSVs hold the same dates in the same order - TODO confirm.
final_merged_data = (
    merged
    .join(yield_data)
    .drop(columns=["Unnamed: 0"])
    .set_index("DATE")
)

# Persist the combined frame with DATE as the index column.
final_merged_data.to_csv("merged_data.csv")

In [41]:
# Read the merged feature table and the recession indicator series.
merged_df, recession_df = map(
    pd.read_csv,
    ('models/merged_data.csv', 'data/recession.csv'),
)

In [42]:
# Parse the DATE column of both frames so they can be merged on real timestamps.
recession_df = recession_df.assign(DATE=pd.to_datetime(recession_df['DATE']))
merged_df = merged_df.assign(DATE=pd.to_datetime(merged_df['DATE']))

# Put the recession series on a monthly grid, carrying the last known value
# forward into months that have no observation of their own.
# NOTE(review): resample('M') labels rows with month-end dates; confirm that
# merged_df's DATE values use the same convention, otherwise the left merge
# below will find no matches.
recession_df = (
    recession_df
    .set_index('DATE')
    .resample('M')
    .ffill()
    .reset_index()
)

# Discard any existing Recession column so the fresh indicator replaces it.
merged_df = merged_df.drop(columns=['Recession'], errors='ignore')

# Left-merge on DATE (every row of merged_df is kept), then expose the
# JHDUSRGDPBR column under the name Recession for consistency.
merged_updated_df = (
    merged_df
    .merge(recession_df, on='DATE', how='left')
    .rename(columns={'JHDUSRGDPBR': 'Recession'})
)



  recession_df = recession_df.set_index('DATE').resample('M').ffill().reset_index()


In [44]:
# Write the merged dataset to disk with DATE as the index column.
# set_index() returns a new frame rather than modifying in place; previously its
# result was discarded, so the CSV was written with the default integer index,
# which shows up as an "Unnamed: 0" column when the file is read back.
# The in-memory merged_updated_df is intentionally left unchanged.
merged_updated_df.set_index("DATE").to_csv("merged_data.csv")

In [None]:
# Show the frame as the cell output (a bare last expression is rendered by the notebook).
# NOTE(review): final_merged_data is the frame built in the earlier join cell, not
# merged_updated_df with the refreshed Recession column - confirm this is the one to inspect.
final_merged_data