In [1]:
# Step 1: Set up the environment to use the FRED API

# Import necessary libraries
import pandas as pd
import numpy as np
from fredapi import Fred
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set up the FRED API with the provided key
fred = Fred(api_key='bdb4f5e5fc848858bd5578696d6b810e')

print("FRED API setup complete.")

# Define the list of FRED indicators
FRED_INDICATORS = ['GDP', 'GDPC1', 'GDPPOT', 'NYGDPMKTPCDWLD', 'CPIAUCSL', 'CPILFESL', 'GDPDEF',
                   'M1SL', 'WM1NS', 'WM2NS', 'M1V', 'M2V', 'WALCL', 'DFF', 'DTB3', 'DGS5', 'DGS10',
                   'DGS30', 'T5YIE', 'T10YIE', 'T5YIFR', 'TEDRATE', 'DPRIME', 'UNRATE', 'NROU',
                   'CIVPART', 'EMRATIO', 'UNEMPLOY', 'PAYEMS', 'MANEMP', 'ICSA', 'IC4WSA', 'CDSP',
                   'MDSP', 'FODSP', 'DSPIC96', 'PCE', 'PCEDG', 'PSAVERT', 'DSPI', 'RSXFS', 'INDPRO',
                   'TCU', 'HOUST', 'GPDI', 'CP', 'STLFSI2', 'DCOILWTICO', 'DTWEXAFEGS', 'DTWEXBGS',
                   'GFDEBTN', 'GFDEGDQ188S', 'DEXUSEU', 'GVZCLS', 'VIXCLS', 'DIVIDEND',
                   'MORTGAGE30US', 'SPCS20RSA']

print(f"Number of FRED indicators: {len(FRED_INDICATORS)}")
print("Environment setup complete. Ready to fetch data.")

FRED API setup complete.
Number of FRED indicators: 58
Environment setup complete. Ready to fetch data.


In [2]:
# Function to fetch data for a given indicator
def fetch_fred_data(indicator):
    data = fred.get_series(indicator)
    return pd.DataFrame(data, columns=[indicator])

# Fetch data for all indicators
all_data = {}
for indicator in FRED_INDICATORS:
    try:
        data = fetch_fred_data(indicator)
        all_data[indicator] = data
        print(f"Successfully fetched data for {indicator}")
    except Exception as e:
        print(f"Failed to fetch data for {indicator}: {str(e)}")

# Combine all data into a single DataFrame
combined_data = pd.concat(all_data.values(), axis=1)

# Fill NaN values with forward fill method
combined_data = combined_data.fillna(method='ffill')

print("\
Data fetching complete.")
print(f"Shape of combined data: {combined_data.shape}")
print("\
First few rows of the combined data:")
print(combined_data.head())

# Save the combined data to a CSV file
combined_data.to_csv('economic_indicators.csv')
print("\
Data saved to 'economic_indicators.csv'")

# Basic statistics of the data
print("\
Basic statistics of the data:")
print(combined_data.describe())

Successfully fetched data for GDP
Successfully fetched data for GDPC1
Successfully fetched data for GDPPOT
Successfully fetched data for NYGDPMKTPCDWLD
Successfully fetched data for CPIAUCSL
Successfully fetched data for CPILFESL
Successfully fetched data for GDPDEF
Successfully fetched data for M1SL
Successfully fetched data for WM1NS
Successfully fetched data for WM2NS
Successfully fetched data for M1V
Successfully fetched data for M2V
Successfully fetched data for WALCL
Successfully fetched data for DFF
Successfully fetched data for DTB3
Successfully fetched data for DGS5
Successfully fetched data for DGS10
Successfully fetched data for DGS30
Successfully fetched data for T5YIE
Successfully fetched data for T10YIE
Successfully fetched data for T5YIFR
Successfully fetched data for TEDRATE
Successfully fetched data for DPRIME
Successfully fetched data for UNRATE
Successfully fetched data for NROU
Successfully fetched data for CIVPART
Successfully fetched data for EMRATIO
Successfully 

In [4]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('economic_indicators.csv', index_col=0, parse_dates=True)

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values before preprocessing:")
print(missing_values[missing_values > 0])

# Fill missing values
# For economic data, it's often better to use 'ffill' (forward fill) method
df_filled = df.fillna(method='ffill')

# Check if there are any remaining missing values
remaining_missing = df_filled.isnull().sum()
print("\
Remaining missing values after forward fill:")
print(remaining_missing[remaining_missing > 0])

# If there are still missing values at the beginning of the series, use backfill
df_filled = df_filled.fillna(method='bfill')

# Final check for missing values
final_missing = df_filled.isnull().sum()
print("\
Final missing values check:")
print(final_missing[final_missing > 0])

# Save the preprocessed data
df_filled.to_csv('economic_indicators_preprocessed.csv')
print("\
Preprocessed data saved to 'economic_indicators_preprocessed.csv'")

# Display basic info about the preprocessed dataset
print("\
Dataset Info:")
print(df_filled.info())

print("\
First few rows of the preprocessed data:")
print(df_filled.head())

Missing values before preprocessing:
GDP                 336
GDPC1               336
GDPPOT              360
NYGDPMKTPCDWLD     2560
CPIAUCSL            336
CPILFESL           1465
GDPDEF              336
M1SL               2195
WM1NS              8044
WM2NS             10172
M1V                2195
M2V                2195
WALCL             18252
DFF                 550
DTB3                421
DGS5               3292
DGS10              3292
DGS30              8815
T5YIE             18267
T10YIE            18267
T5YIFR            18267
TEDRATE           12058
DPRIME              949
UNRATE              348
NROU                360
CIVPART             348
EMRATIO             348
UNEMPLOY            348
PAYEMS              240
MANEMP              240
ICSA               5123
IC4WSA             5144
CDSP               9865
MDSP               9865
FODSP              9865
DSPIC96            2195
PCE                2195
PCEDG              2195
PSAVERT            2195
DSPI               2195
RSX

In [5]:
import pandas as pd
import numpy as np

# Load the preprocessed data
df = pd.read_csv('economic_indicators_preprocessed.csv', index_col=0, parse_dates=True)

# Function to convert to monthly frequency
def to_monthly(series):
    if series.index.freq == 'M':
        return series
    elif series.index.freq in ['D', 'B']:
        return series.resample('M').last()
    elif series.index.freq == 'W':
        return series.resample('M').last()
    elif series.index.freq == 'Q':
        return series.resample('M').ffill()
    elif series.index.freq == 'A':
        return series.resample('M').ffill()
    else:
        return series.resample('M').last()

# Apply the conversion to each column
df_monthly = df.apply(to_monthly)

# Check the frequency of each column
freq_check = df_monthly.apply(lambda x: x.index.freq)
print("Frequency check after conversion:")
print(freq_check)

# Calculate the percentage of non-null values for each indicator
non_null_percentage = df_monthly.count() / len(df_monthly) * 100

print("\
Percentage of non-null values for each indicator:")
print(non_null_percentage)

# Save the monthly data
df_monthly.to_csv('economic_indicators_monthly.csv')
print("\
Monthly data saved to 'economic_indicators_monthly.csv'")

print("\
Shape of the monthly dataset:", df_monthly.shape)

print("\
First few rows of the monthly data:")
print(df_monthly.head())

# Calculate year-over-year percentage changes
df_yoy = df_monthly.pct_change(periods=12) * 100

print("\
First few rows of year-over-year percentage changes:")
print(df_yoy.head())

Frequency check after conversion:
GDP               <MonthEnd>
GDPC1             <MonthEnd>
GDPPOT            <MonthEnd>
NYGDPMKTPCDWLD    <MonthEnd>
CPIAUCSL          <MonthEnd>
CPILFESL          <MonthEnd>
GDPDEF            <MonthEnd>
M1SL              <MonthEnd>
WM1NS             <MonthEnd>
WM2NS             <MonthEnd>
M1V               <MonthEnd>
M2V               <MonthEnd>
WALCL             <MonthEnd>
DFF               <MonthEnd>
DTB3              <MonthEnd>
DGS5              <MonthEnd>
DGS10             <MonthEnd>
DGS30             <MonthEnd>
T5YIE             <MonthEnd>
T10YIE            <MonthEnd>
T5YIFR            <MonthEnd>
TEDRATE           <MonthEnd>
DPRIME            <MonthEnd>
UNRATE            <MonthEnd>
NROU              <MonthEnd>
CIVPART           <MonthEnd>
EMRATIO           <MonthEnd>
UNEMPLOY          <MonthEnd>
PAYEMS            <MonthEnd>
MANEMP            <MonthEnd>
ICSA              <MonthEnd>
IC4WSA            <MonthEnd>
CDSP              <MonthEnd>
MDSP     

In [6]:
import pandas as pd
import numpy as np

# Load the monthly data
df = pd.read_csv('economic_indicators_monthly.csv', index_col=0, parse_dates=True)

# Print available columns
print("Available columns:")
print(df.columns.tolist())

# Adjust the list of leading indicators based on available columns
leading_indicators = ['UNRATE', 'ICSA', 'HOUST', 'M2V', 'T10YIE', 'STLFSI2', 'VIXCLS']

print("\
Adjusted leading indicators:")
print(leading_indicators)

# Create lagged variables (3-month and 6-month lags)
for indicator in leading_indicators:
    df[f'{indicator}_lag3'] = df[indicator].shift(3)
    df[f'{indicator}_lag6'] = df[indicator].shift(6)

print("\
Lagged variables created. New shape of the dataset:", df.shape)

# Calculate moving averages (3-month, 6-month, 12-month)
for indicator in leading_indicators:
    df[f'{indicator}_MA3'] = df[indicator].rolling(window=3).mean()
    df[f'{indicator}_MA6'] = df[indicator].rolling(window=6).mean()
    df[f'{indicator}_MA12'] = df[indicator].rolling(window=12).mean()

print("Moving averages calculated. New shape of the dataset:", df.shape)

# Create interaction terms
df['UNRATE_ICSA'] = df['UNRATE'] * df['ICSA']
df['T10YIE_STLFSI2'] = df['T10YIE'] * df['STLFSI2']

print("Interaction terms created. Final shape of the dataset:", df.shape)

# Display the first few rows of the new features
print("\
First few rows of the dataset with new features:")
print(df[leading_indicators + 
         [f'{indicator}_lag3' for indicator in leading_indicators] + 
         [f'{indicator}_lag6' for indicator in leading_indicators] +
         [f'{indicator}_MA3' for indicator in leading_indicators] +
         [f'{indicator}_MA6' for indicator in leading_indicators] +
         [f'{indicator}_MA12' for indicator in leading_indicators] +
         ['UNRATE_ICSA', 'T10YIE_STLFSI2']].head())

Available columns:
['GDP', 'GDPC1', 'GDPPOT', 'NYGDPMKTPCDWLD', 'CPIAUCSL', 'CPILFESL', 'GDPDEF', 'M1SL', 'WM1NS', 'WM2NS', 'M1V', 'M2V', 'WALCL', 'DFF', 'DTB3', 'DGS5', 'DGS10', 'DGS30', 'T5YIE', 'T10YIE', 'T5YIFR', 'TEDRATE', 'DPRIME', 'UNRATE', 'NROU', 'CIVPART', 'EMRATIO', 'UNEMPLOY', 'PAYEMS', 'MANEMP', 'ICSA', 'IC4WSA', 'CDSP', 'MDSP', 'FODSP', 'DSPIC96', 'PCE', 'PCEDG', 'PSAVERT', 'DSPI', 'RSXFS', 'INDPRO', 'TCU', 'HOUST', 'GPDI', 'CP', 'STLFSI2', 'DCOILWTICO', 'DTWEXAFEGS', 'DTWEXBGS', 'GFDEBTN', 'GFDEGDQ188S', 'DEXUSEU', 'GVZCLS', 'VIXCLS', 'DIVIDEND', 'MORTGAGE30US', 'SPCS20RSA']
Adjusted leading indicators:
['UNRATE', 'ICSA', 'HOUST', 'M2V', 'T10YIE', 'STLFSI2', 'VIXCLS']
Lagged variables created. New shape of the dataset: (1390, 72)
Moving averages calculated. New shape of the dataset: (1390, 93)
Interaction terms created. Final shape of the dataset: (1390, 95)
First few rows of the dataset with new features:
            UNRATE      ICSA   HOUST    M2V  T10YIE  STLFSI2  VIX

In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# Load the data
df = pd.read_csv('economic_indicators_monthly.csv', index_col=0, parse_dates=True)

# Function to impute missing values
def impute_missing_values(df):
    # Forward fill for time series data
    df_ffill = df.ffill()
    
    # Use KNN imputation for remaining missing values
    imputer = KNNImputer(n_neighbors=5)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_ffill), columns=df.columns, index=df.index)
    
    return df_imputed

# Impute missing values
df_imputed = impute_missing_values(df)

# Check for any remaining missing values
remaining_missing = df_imputed.isnull().sum()
print("Remaining missing values after imputation:")
print(remaining_missing[remaining_missing > 0])

# Calculate the percentage change for each variable
df_pct_change = df_imputed.pct_change()

# Create lagged variables (3-month and 6-month lags)
for col in df_imputed.columns:
    df_imputed[f'{col}_lag3'] = df_imputed[col].shift(3)
    df_imputed[f'{col}_lag6'] = df_imputed[col].shift(6)

# Calculate moving averages (3-month, 6-month, 12-month)
for col in df_imputed.columns:
    df_imputed[f'{col}_MA3'] = df_imputed[col].rolling(window=3).mean()
    df_imputed[f'{col}_MA6'] = df_imputed[col].rolling(window=6).mean()
    df_imputed[f'{col}_MA12'] = df_imputed[col].rolling(window=12).mean()

# Handle missing values in newly created features
df_final = impute_missing_values(df_imputed)

# Check for any remaining missing values in the final dataset
final_missing = df_final.isnull().sum()
print("\
Remaining missing values in the final dataset:")
print(final_missing[final_missing > 0])

# Save the final preprocessed dataset
df_final.to_csv('economic_indicators_preprocessed.csv')
print("\
Final preprocessed dataset saved as 'economic_indicators_preprocessed.csv'")

# Display basic information about the final dataset
print("\
Final dataset info:")
print(df_final.info())

print("\
First few rows of the final dataset:")
print(df_final.head())

Remaining missing values after imputation:
Series([], dtype: int64)
Remaining missing values in the final dataset:
Series([], dtype: int64)
Final preprocessed dataset saved as 'economic_indicators_preprocessed.csv'
Final dataset info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1390 entries, 1919-01-31 to 2034-10-31
Columns: 696 entries, GDP to SPCS20RSA_lag6_MA12
dtypes: float64(696)
memory usage: 7.4 MB
None
First few rows of the final dataset:
                GDP     GDPC1       GDPPOT  NYGDPMKTPCDWLD  CPIAUCSL  \
1919-01-31  243.164  2182.681  2256.084579    1.364504e+12     21.48   
1919-02-28  243.164  2182.681  2256.084579    1.364504e+12     21.48   
1919-03-31  243.164  2182.681  2256.084579    1.364504e+12     21.48   
1919-04-30  243.164  2182.681  2256.084579    1.364504e+12     21.48   
1919-05-31  243.164  2182.681  2256.084579    1.364504e+12     21.48   

            CPILFESL  GDPDEF   M1SL  WM1NS   WM2NS  ...  \
1919-01-31      28.5  11.141  138.9  286.5  1595

In [8]:
import pandas as pd
import numpy as np

# Load the preprocessed data
df = pd.read_csv('economic_indicators_preprocessed.csv', index_col=0, parse_dates=True)

# Define variable groups
variable_groups = {
    'Growth': ['GDP', 'GDPC1', 'GDPPOT', 'INDPRO', 'PAYEMS', 'MANEMP', 'HOUST', 'GPDI'],
    'Inflation': ['CPIAUCSL', 'CPILFESL', 'GDPDEF', 'T5YIE', 'T10YIE'],
    'Liquidity': ['M1SL', 'WM1NS', 'WM2NS', 'M1V', 'M2V', 'WALCL'],
    'Interest_Rates': ['DFF', 'DTB3', 'DGS5', 'DGS10', 'DGS30', 'MORTGAGE30US'],
    'Labor_Market': ['UNRATE', 'CIVPART', 'EMRATIO', 'UNEMPLOY', 'ICSA', 'IC4WSA'],
    'Consumer': ['PCE', 'PCEDG', 'PSAVERT', 'DSPI', 'RSXFS'],
    'Financial_Markets': ['STLFSI2', 'VIXCLS', 'GVZCLS', 'DIVIDEND'],
    'International': ['DTWEXAFEGS', 'DTWEXBGS', 'DEXUSEU'],
    'Other': ['NROU', 'TCU', 'CP', 'DCOILWTICO', 'GFDEBTN', 'GFDEGDQ188S', 'SPCS20RSA']
}

# Function to get all related features (including lags and moving averages)
def get_related_features(df, base_features):
    all_features = []
    for feature in base_features:
        related = [col for col in df.columns if col.startswith(feature)]
        all_features.extend(related)
    return all_features

# Group all features (including derived ones)
grouped_features = {group: get_related_features(df, features) for group, features in variable_groups.items()}

# Print the number of features in each group
for group, features in grouped_features.items():
    print(f"{group}: {len(features)} features")

# Save the grouped features for later use
import pickle
with open('grouped_features.pkl', 'wb') as f:
    pickle.dump(grouped_features, f)

print("\
Grouped features saved to 'grouped_features.pkl'")

Growth: 132 features
Inflation: 60 features
Liquidity: 72 features
Interest_Rates: 72 features
Labor_Market: 72 features
Consumer: 84 features
Financial_Markets: 48 features
International: 36 features
Other: 108 features
Grouped features saved to 'grouped_features.pkl'
