In [None]:
# === Environment Setup ===
# Purpose: Import necessary libraries for data manipulation, visualization, and machine learning.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-learn provides the tools for preprocessing (StandardScaler), dimensionality reduction (PCA),
# modeling (LinearRegression), and robust evaluation (TimeSeriesSplit, mean_squared_error).
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
# pandas_datareader is used to fetch data directly from online sources like the FRED database.
import pandas_datareader.data as web
from IPython.display import display, Markdown

# --- Configuration ---
# Purpose: Apply one plotting theme and one numeric print format across the whole notebook,
# so every figure and printed array looks the same.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['figure.dpi'] = 150
# Print floats without scientific notation, 4 decimals, up to 120 characters per line.
np.set_printoptions(precision=4, linewidth=120, suppress=True)

# --- Utility Functions ---
# Purpose: Create helper functions for formatted output in the notebook.
def note(msg):
    """Render `msg` in the notebook output as an info-styled alert box."""
    alert_html = f"<div class='alert alert-info'>📝 {msg}</div>"
    display(Markdown(alert_html))
def sec(title):
    """Print `title` in upper case, framed above and below by an 80-character rule."""
    rule = "=" * 80
    banner = "\n".join(["", rule, f"| {title.upper()} |", rule])
    print(banner)

# Show a confirmation banner once the imports, configuration and helpers above have run.
note("Environment initialized for Macroeconomic Forecasting with ML.")

# Part 7: Machine Learning for Economists
## Chapter 7.18: Applied ML - Macroeconomic Forecasting with Principal Components

### 1. The Challenge of High-Dimensional Macro Data
**Intellectual Provenance:** The use of factor models in macroeconomics has a long history, but the application of principal components to large macroeconomic datasets was popularized by the work of James Stock and Mark Watson in the early 2000s. Their papers demonstrated that a few principal components extracted from a large panel of macroeconomic series could effectively summarize the state of the economy and produce superior forecasts compared to traditional, smaller-scale models. This 'diffusion index' or 'factor-augmented' forecasting approach has since become a standard tool in empirical macroeconomics.

Modern macroeconomic forecasting often faces a high-dimensionality problem. Central banks and researchers have access to hundreds, if not thousands, of potentially relevant time series (e.g., industrial production for different sectors, various interest rates, employment figures, surveys, etc.). Using all of these predictors in a standard regression model is infeasible due to the curse of dimensionality and severe multicollinearity.

A powerful solution is to use **Principal Component Analysis (PCA)** to distill the information from this large set of predictors into a small number of **estimated macroeconomic factors**. These factors, which are the principal components of the data, can be interpreted as capturing the primary drivers of economic variation (e.g., a 'real activity' factor, an 'inflation' factor, a 'financial conditions' factor).

This notebook provides a practical, step-by-step guide to building a **Principal Component Regression (PCR)** model to forecast GDP growth, a common workflow in modern macro-econometrics.

### 2. Data Acquisition and Preparation
We will use the widely-cited **FRED-MD dataset**, a large panel of monthly US macroeconomic variables maintained by the St. Louis Fed. Our goal will be to forecast quarterly GDP growth using the information contained in the monthly series.

The process involves:
1.  **Loading the data:** We'll fetch the FRED-MD dataset and the target variable (Real GDP).
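2.  **Data Cleaning:** Handling missing values is crucial. We will use a simple forward-fill and back-fill strategy. (Back-filling uses later observations to fill gaps at the start of a series; this is acceptable for illustration, but a strict real-time exercise would avoid it because it introduces look-ahead.)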
3.  **Transformations:** Many macro series are non-stationary. We will apply the transformations (e.g., taking logs, differencing) suggested by the creators of the dataset to induce stationarity.
4.  **Aggregation:** We will aggregate the monthly predictor data to a quarterly frequency to match the frequency of our GDP target variable.

In [None]:
sec("Data Acquisition and Preparation")

try:
    # Step 1: Load Data from FRED.
    # FRED-MD is a large monthly panel of U.S. macroeconomic variables, commonly used to
    # test forecasting models in a data-rich environment.
    fred_md_url = 'https://files.stlouisfed.org/files/htdocs/fred-md/monthly/current.csv'
    df_raw = pd.read_csv(fred_md_url)

    # The first row of the CSV holds the transformation codes and carries the label
    # 'Transform:' in the date column. It must be split off BEFORE the dates are parsed,
    # otherwise pd.to_datetime(..., format=...) raises on that non-date label.
    is_tcode_row = df_raw['sasdate'].astype(str).str.strip().str.lower().str.startswith('transform')
    if not is_tcode_row.any():
        raise ValueError("No 'Transform:' row found in the FRED-MD file; its layout may have changed.")
    # Kept as a one-row DataFrame (one column per series). Every series column is retained:
    # 'sasdate' is dropped by name, not by position, so the first series is not lost.
    tcode_df = df_raw.loc[is_tcode_row].drop(columns='sasdate').iloc[:1]

    # Data rows: everything that is not the code row and has a date (some vintages end in blank lines).
    df = df_raw.loc[~is_tcode_row & df_raw['sasdate'].notna()].copy()
    df['sasdate'] = pd.to_datetime(df['sasdate'], format='%m/%d/%Y')
    df = df.set_index('sasdate').sort_index().astype(float)

    # Step 2: Clean the data.
    # Forward-fill gaps, then back-fill whatever is still missing at the start of a series.
    # (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
    df = df.ffill().bfill()

    # Step 3: Apply transformations to induce stationarity.
    # The data providers attach a transformation code to every series.
    def transform(series, tcode):
        """Return `series` transformed according to its FRED-MD transformation code.

        Codes: 1 = level, 2 = first difference, 3 = second difference, 4 = log,
        5 = log first difference, 6 = log second difference,
        7 = first difference of the percentage change. Any other code returns the level.
        """
        if tcode == 2: return series.diff()
        if tcode == 3: return series.diff().diff()
        if tcode == 4: return np.log(series)
        if tcode == 5: return np.log(series).diff()
        if tcode == 6: return np.log(series).diff().diff()
        if tcode == 7: return (series / series.shift(1) - 1.0).diff()
        return series

    # A missing code is treated as "no transformation" (code 1).
    tcodes = tcode_df.iloc[0].fillna(1).astype(int)
    # Build all columns at once; inserting columns one by one fragments the DataFrame.
    df_transformed = pd.DataFrame({col: transform(df[col], tcodes[col]) for col in df.columns})
    # Differencing leaves NaNs at the start, and logs of non-positive values give +/-inf;
    # both are removed so that scaling and PCA receive finite numbers only.
    df_transformed = df_transformed.replace([np.inf, -np.inf], np.nan).dropna()

    # Step 4: Load and prepare the target variable (quarterly GDP growth).
    gdp = web.DataReader('GDPC1', 'fred', start='1960-01-01', end='2023-12-31')
    # Quarterly growth rate as the log difference, in percent.
    gdp_growth = np.log(gdp).diff().dropna() * 100

    # Step 5: Align the data frequencies.
    # Monthly predictors are averaged within each quarter. The 'QS' rule labels each quarter
    # by its FIRST day, which is how FRED date-stamps GDPC1; a quarter-end rule ('Q') would
    # never match those dates and the inner join below would come back empty.
    X = df_transformed.resample('QS').mean()
    data_full = X.join(gdp_growth, how='inner').rename(columns={'GDPC1': 'GDP_Growth'})
    if data_full.empty:
        raise ValueError("Predictors and GDP growth share no dates after quarterly alignment.")

    # Step 6: Create lagged predictors for forecasting.
    # To predict GDP growth in quarter t we may only use information up to quarter t-1,
    # so the predictor matrix is shifted forward by one period.
    y = data_full['GDP_Growth']
    X_full = data_full.drop('GDP_Growth', axis=1)
    X_lagged = X_full.shift(1).dropna()
    # Re-align y with the (shorter) lagged predictor matrix.
    y = y.loc[X_lagged.index]
    if X_lagged.empty:
        raise ValueError("No observations remain after lagging the predictors.")

    note(f"Data preparation complete. We have {X_lagged.shape[1]} potential predictors to forecast GDP growth.")
    DATA_LOADED = True
except Exception as e:
    # Deliberate best-effort: the notebook needs network access, so any failure is reported
    # and the later cells are skipped via DATA_LOADED instead of crashing the run.
    note(f"Could not load data. Error: {e}. Skipping notebook execution.")
    DATA_LOADED = False

### 3. Principal Component Regression
Now we apply PCA to our large set of predictors ($X_{lagged}$) to extract a smaller number of principal components. We will then use these components as regressors in a standard linear regression model to predict GDP growth.

In [None]:
sec("Principal Component Analysis of Macro Data")

if DATA_LOADED:
    # Step 1: Standardize the data.
    # PCA is scale-sensitive: a high-variance series would dominate the first component,
    # so each series is rescaled to mean 0 and standard deviation 1.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_lagged)

    # Step 2: Fit PCA on the scaled predictors.
    # NOTE: this is a full-sample fit and is meant for describing the data. An out-of-sample
    # evaluation should re-fit the scaler and PCA on training observations only.
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)

    # Step 3: Analyze the explained variance.
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    # Component counts start at 1. Plotting against the default 0-based index would shift the
    # curve one component to the left relative to the 'Number of Components' axis label.
    component_counts = np.arange(1, len(cum_var) + 1)
    # cum_var is non-decreasing, so searchsorted returns the first position reaching 80%.
    n_components_80 = int(np.searchsorted(cum_var, 0.8)) + 1

    fig, ax = plt.subplots(figsize=(12, 7))
    ax.plot(component_counts, cum_var)
    ax.set_xlabel('Number of Components')
    ax.set_ylabel('Cumulative Explained Variance')
    ax.set_title('Scree Plot: Variance Explained by Principal Components')
    # A common rule of thumb is to keep enough components to explain 80% of the variance.
    ax.axhline(0.8, color='r', ls='--', label=f'{n_components_80} components explain 80% of variance')
    ax.legend()
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.show()

    note(f"The scree plot shows that the first few principal components capture a large fraction of the total variation in the macroeconomic dataset. We can explain 80% of the variance with just {n_components_80} factors, achieving significant dimensionality reduction.")

In [None]:
sec("Forecasting GDP Growth with Principal Components")

if DATA_LOADED:
    # Step 1: Select the number of principal components (factors) to use as predictors.
    n_pcs = 5
    # Full-sample factors, kept only for inspection. They are NOT used in the evaluation
    # below, because they were estimated with data from the test periods.
    X_factors = X_pca[:, :n_pcs]

    # Step 2: Set up a time-series cross-validation.
    # K-fold CV would train on the future and test on the past; TimeSeriesSplit uses an
    # expanding window that respects temporal order.
    X_values = X_lagged.to_numpy()
    tscv = TimeSeriesSplit(n_splits=10)
    model = LinearRegression()

    # Out-of-sample forecasts, realized values, benchmark forecasts and their dates.
    predictions = []
    actuals = []
    benchmark = []
    oos_dates = []

    # Step 3: Re-estimate everything on each expanding training window.
    # The scaler and the PCA are fitted on the training observations only, so the factors
    # used for a forecast never contain information from the period being forecast.
    for train_index, test_index in tscv.split(X_values):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        fold_scaler = StandardScaler().fit(X_values[train_index])
        train_scaled = fold_scaler.transform(X_values[train_index])
        test_scaled = fold_scaler.transform(X_values[test_index])

        # PCA cannot extract more components than there are training rows or columns.
        n_fold_pcs = min(n_pcs, train_scaled.shape[0], train_scaled.shape[1])
        fold_pca = PCA(n_components=n_fold_pcs).fit(train_scaled)
        X_train = fold_pca.transform(train_scaled)
        X_test = fold_pca.transform(test_scaled)

        # Train on the 'past', predict the 'future'.
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        predictions.extend(preds)
        actuals.extend(y_test)
        # Benchmark: the mean of GDP growth over the training window, i.e. the historical
        # mean a forecaster would actually have known at the time.
        benchmark.extend([y_train.mean()] * len(test_index))
        oos_dates.extend(y.index[test_index])

    # Step 4: Evaluate the out-of-sample performance.
    # Out-of-sample R-squared relative to the historical-mean forecast: a positive value
    # means the model beats that benchmark.
    actuals_arr = np.asarray(actuals, dtype=float)
    predictions_arr = np.asarray(predictions, dtype=float)
    benchmark_arr = np.asarray(benchmark, dtype=float)
    sse_model = np.sum((actuals_arr - predictions_arr) ** 2)
    sse_benchmark = np.sum((actuals_arr - benchmark_arr) ** 2)
    r_squared = 1 - sse_model / sse_benchmark
    note(f"Out-of-Sample R-squared from the Principal Component Regression: {r_squared:.3f}")

    # Step 5: Visualize the forecasts against the actual data.
    fig, ax = plt.subplots(figsize=(14, 7))
    # Index by the dates of the test observations themselves instead of inferring them
    # from the tail of y.
    plot_df = pd.DataFrame(
        {'Actual GDP Growth': actuals_arr, 'Predicted GDP Growth': predictions_arr},
        index=pd.DatetimeIndex(oos_dates),
    )
    plot_df.plot(ax=ax, style=['-', '--'])
    ax.set_title('GDP Growth Forecast: Actual vs. Predicted (Out-of-Sample)')
    ax.set_ylabel('Quarterly GDP Growth (%)')
    ax.legend()
    plt.show()

### 4. Conclusion and Extensions
This notebook demonstrated a powerful and practical technique for macroeconomic forecasting in a data-rich environment. By combining PCA for dimensionality reduction with a simple linear regression, we built a model that produces out-of-sample forecasts of GDP growth. How well it performs should be judged by the out-of-sample R-squared reported above, which depends on the data vintage and the evaluation window.

This approach can be extended in several ways:
- **Dynamic Factor Models (DFM):** A more sophisticated version that explicitly models the time-series dynamics of the factors and idiosyncratic components.
- **Alternative Non-linear Models:** Instead of a linear regression on the factors, one could use more flexible models like Random Forests or Gradient Boosting Machines.
- **Interpreting the Factors:** A key challenge is to assign economic interpretations to the estimated principal components. This can be done by examining the 'factor loadings' (the weights of the original series in each component, which for standardized data are proportional to the correlations of the series with that component) to see which variables are most important for each factor.