In [3]:
import pandas as pd
import numpy as np

In [4]:
# 1️⃣ Load CSV
df = pd.read_csv("../data/feature-engineered/recession_probability.csv")
df['date'] = pd.to_datetime(df['date'])

# 2️⃣ Priority 1: Keep core economic indicators
priority1_cols = [
    'unemployment_rate', 'CPI', 'INDPRO', 'gdp_per_capita',
    '1_year_rate', '3_months_rate', '6_months_rate', '10_year_rate', 'share_price', 'unemployment_rate', 'PPI' , 'OECD_CLI_index', 'CSI_index', 'Quarter_Number' , 'Month_Number', 'gdp_per_capita', 
    # Anomaly indicators
    'anomaly_INDPRO', 'anomaly_CPI', 'anomaly_unemployment_rate', 'anomaly_PPI',
    'anomaly_share_price', 'anomaly_1_year_rate', 'anomaly_3_months_rate',
    'anomaly_6_months_rate', 'anomaly_10_year_rate'
]

# Add first differences and seasonal adjustments if present
first_diff_cols = [c for c in df.columns if "_diff1" in c or "_diff2" in c]
seasonal_cols = [c for c in df.columns if "seasonally_adjusted" in c]

priority1_keep = priority1_cols + first_diff_cols + seasonal_cols

# 3️⃣ Priority 2: Selective time-series features
# Only keep acf1 and sumsq_acf for key variables
key_vars = ['1_year_rate','3_months_rate','6_months_rate','CPI','INDPRO','10_year_rate']
acf_cols = []
for var in key_vars:
    acf_cols += [c for c in df.columns if c.startswith(var+"_acf1") or c.startswith(var+"_sumsq_acf")]
    # Only include seasonal ACF if strong seasonality
    seasonal_acf = [c for c in df.columns if c.startswith(var+"_acf_seasonal")]
    if seasonal_acf:
        acf_cols += seasonal_acf

priority2_keep = acf_cols

# 4️⃣ Combine Priority 1 & 2, drop everything else
keep_cols = ['date'] + priority1_keep + priority2_keep
df_clean = df[keep_cols].copy()

# # 5️⃣ Optional: Fill missing values
# df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
# df_clean = df_clean.ffill().bfill()

# 6️⃣ Inspect final columns
print("Columns kept for modeling:")
print(df_clean.columns.tolist())

# 7️⃣ Save cleaned DataFrame
df_clean.to_csv("../data/feature-engineered/dim_reduced_data.csv", index=False)


Columns kept for modeling:
['date', 'unemployment_rate', 'CPI', 'INDPRO', 'gdp_per_capita', '1_year_rate', '3_months_rate', '6_months_rate', '10_year_rate', 'share_price', 'unemployment_rate', 'PPI', 'OECD_CLI_index', 'CSI_index', 'Quarter_Number', 'Month_Number', 'gdp_per_capita', 'anomaly_INDPRO', 'anomaly_CPI', 'anomaly_unemployment_rate', 'anomaly_PPI', 'anomaly_share_price', 'anomaly_1_year_rate', 'anomaly_3_months_rate', 'anomaly_6_months_rate', 'anomaly_10_year_rate', '1_year_rate_acf1_diff1', '1_year_rate_sumsq_acf_diff1', '1_year_rate_acf1_diff2', '1_year_rate_sumsq_acf_diff2', '3_months_rate_acf1_diff1', '3_months_rate_sumsq_acf_diff1', '3_months_rate_acf1_diff2', '3_months_rate_sumsq_acf_diff2', '6_months_rate_acf1_diff1', '6_months_rate_sumsq_acf_diff1', '6_months_rate_acf1_diff2', '6_months_rate_sumsq_acf_diff2', 'CPI_acf1_diff1', 'CPI_sumsq_acf_diff1', 'CPI_acf1_diff2', 'CPI_sumsq_acf_diff2', 'INDPRO_acf1_diff1', 'INDPRO_sumsq_acf_diff1', 'INDPRO_acf1_diff2', 'INDPRO_sums

In [5]:
df_2 = pd.read_csv("../data/feature-engineered/recession_probability.csv")

In [6]:
df_2.shape

(700, 121)

In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# 1️⃣ Load dataframe
df = df_2
df['date'] = pd.to_datetime(df['date'])

# 2️⃣ Config
TARGET_COLS = [
    '1_year_rate','3_months_rate','6_months_rate','CPI','INDPRO',
    '10_year_rate','share_price','unemployment_rate','PPI',
    'OECD_CLI_index','CSI_index','gdp_per_capita'
]

RECESSION_COLS = [
    'recession_probability','1_month_recession_probability',
    '3_month_recession_probability','6_month_recession_probability'
]

# Drop date + targets for features
X = df.drop(columns=['date'] + TARGET_COLS)
feature_importance_dict = {}

# 3️⃣ RFECV parameters
cv = KFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

# 4️⃣ Run RFECV for each target to get feature importance
for target in TARGET_COLS:
    y = df[target]
    # Drop rows where target is NaN
    mask = ~y.isna()
    X_clean = X.loc[mask]
    y_clean = y.loc[mask]

    rfecv = RFECV(estimator=model, step=1, cv=cv, scoring='r2', min_features_to_select=5)
    rfecv.fit(X_clean, y_clean)

    # Accumulate feature importance
    for f, imp in zip(X.columns, model.fit(X_clean, y_clean).feature_importances_):
        feature_importance_dict[f] = feature_importance_dict.get(f, 0) + imp

# 5️⃣ Select top 35 features based on accumulated importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
top_35_features = [f for f, _ in sorted_features[:35]]

# 6️⃣ Combine with recession columns
final_features = top_35_features + RECESSION_COLS

# 7️⃣ Create reduced dataframe
df_reduced = df[['date'] + TARGET_COLS + final_features].copy()

# 8️⃣ Optional: save reduced dataframe
df_reduced.to_csv("df_2_reduced.csv", index=False)

print(f"Final features ({len(top_35_features)} + {len(RECESSION_COLS)} recession):")
print(final_features)


KeyboardInterrupt: 

In [6]:
df_reduced.columns

Index(['date', '1_year_rate', '3_months_rate', '6_months_rate', 'CPI',
       'INDPRO', '10_year_rate', 'share_price', 'unemployment_rate', 'PPI',
       'OECD_CLI_index', 'CSI_index', 'gdp_per_capita',
       'seasonally_adjusted_unemployment_rate', 'CPI_trend',
       'CPI_sumsq_acf_diff1', 'seasonally_adjusted_CPI', 'INDPRO_trend',
       'seasonally_adjusted_INDPRO', 'INDPRO_acf1_original',
       'CPI_sumsq_acf_original', 'INDPRO_sumsq_acf_original',
       'CPI_acf1_original', 'recession_probability', 'PPI_trend',
       '1_year_rate_acf1_diff1', '3_months_rate_acf1_original',
       'gdp_per_capita_acf1_original', 'share_price_acf1_original',
       'seasonally_adjusted_PPI', '6_months_rate_sumsq_acf_diff1',
       'gdp_per_capita_sumsq_acf_original', 'Year',
       'share_price_sumsq_acf_original', 'CPI_acf1_diff2',
       '3_months_rate_sumsq_acf_original', 'CPI_acf1_diff1',
       '6_months_rate_sumsq_acf_original', '6_months_rate_acf1_original',
       '10_year_rate_acf1_ori

In [13]:
df_reduced.to_csv("df_2_reduced.csv", index=False)


In [8]:
df_3 = pd.read_csv('df_2_reduced.csv')

In [9]:
df_3.shape

(700, 52)

In [10]:
df_3.dtypes

date                                      object
1_year_rate                              float64
3_months_rate                            float64
6_months_rate                            float64
CPI                                      float64
INDPRO                                   float64
10_year_rate                             float64
share_price                              float64
unemployment_rate                        float64
PPI                                      float64
OECD_CLI_index                           float64
CSI_index                                float64
gdp_per_capita                           float64
seasonally_adjusted_unemployment_rate    float64
CPI_trend                                float64
CPI_sumsq_acf_diff1                      float64
seasonally_adjusted_CPI                  float64
INDPRO_trend                             float64
seasonally_adjusted_INDPRO               float64
INDPRO_acf1_original                     float64
CPI_sumsq_acf_origin