In [None]:
# === Environment Setup ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import pandas_datareader.data as web
from IPython.display import display, Markdown

# --- Configuration ---
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (12, 8),
    'figure.dpi': 150,
})
np.set_printoptions(suppress=True, linewidth=120, precision=4)


# --- Utility Functions ---
def note(msg):
    """Render `msg` inside an info-styled alert box in the cell output."""
    html = f"<div class='alert alert-info'>📝 {msg}</div>"
    display(Markdown(html))


def sec(title):
    """Print `title` in upper case, framed by two 80-character '=' rules."""
    rule = 80 * "="
    print(f'\n{rule}\n| {title.upper()} |\n{rule}')


note("Environment initialized for Macroeconomic Forecasting with ML.")

# Part 7: Machine Learning for Economists\n## Chapter 7.18: Applied ML - Macroeconomic Forecasting with Principal Components

### 1. The Challenge of High-Dimensional Macro Data\nModern macroeconomic forecasting often faces a high-dimensionality problem. Central banks and researchers have access to hundreds, if not thousands, of potentially relevant time series (e.g., industrial production for different sectors, various interest rates, employment figures, surveys, etc.). Using all of these predictors in a standard regression model is infeasible due to the curse of dimensionality and severe multicollinearity.\n\nA powerful solution is to use **Principal Component Analysis (PCA)** to distill the information from this large set of predictors into a small number of **estimated macroeconomic factors**. These factors, which are the principal components of the data, can be interpreted as capturing the primary drivers of economic variation (e.g., a 'real activity' factor, an 'inflation' factor, a 'financial conditions' factor).\n\nThis notebook provides a practical, step-by-step guide to building a **Principal Component Regression (PCR)** model to forecast GDP growth, a common workflow in modern macro-econometrics.

### 2. Data Acquisition and Preparation\nWe will use the widely cited **FRED-MD dataset**, a large panel of monthly US macroeconomic variables maintained by the St. Louis Fed. Our goal will be to forecast quarterly GDP growth using the information contained in the monthly series.\n\nThe process involves:\n1.  **Loading the data:** We'll fetch the FRED-MD dataset and the target variable (Real GDP).\n2.  **Data Cleaning:** Handling missing values is crucial. We will use a simple forward-fill and back-fill strategy.\n3.  **Transformations:** Many macro series are non-stationary. We will apply the transformations (e.g., taking logs, differencing) suggested by the creators of the dataset to induce stationarity.\n4.  **Aggregation:** We will aggregate the monthly predictor data to a quarterly frequency to match the frequency of our GDP target variable.

In [None]:
sec("Data Acquisition and Preparation")


def transform(series, tcode):
    """Apply a FRED-MD transformation code to a single series.

    Parameters
    ----------
    series : pd.Series
        Series in levels (float dtype).
    tcode : int or float
        FRED-MD transformation code:
        1 = level, 2 = first difference, 3 = second difference,
        4 = log, 5 = log first difference, 6 = log second difference,
        7 = first difference of the one-period percent change.

    Returns
    -------
    pd.Series
        The transformed series. Differencing leaves leading NaNs; any code
        not listed above returns the series unchanged.
    """
    if tcode == 2:
        return series.diff()
    if tcode == 3:
        return series.diff().diff()
    if tcode == 4:
        return np.log(series)
    if tcode == 5:
        return np.log(series).diff()
    if tcode == 6:
        return np.log(series).diff().diff()
    if tcode == 7:
        return (series / series.shift(1) - 1.0).diff()
    return series


try:
    # 1. Load Data
    fred_md_url = 'https://files.stlouisfed.org/files/htdocs/fred-md/monthly/current.csv'
    df_raw = pd.read_csv(fred_md_url)

    # The first row carries the transformation codes rather than an
    # observation, so it is split off BEFORE the date column is parsed.
    # Only 'sasdate' is dropped here: slicing columns by position after
    # set_index would silently discard the first real series.
    tcode_df = df_raw.iloc[:1].drop(columns='sasdate')
    df = df_raw.iloc[1:].dropna(subset=['sasdate']).copy()  # skip blank trailing rows
    df['sasdate'] = pd.to_datetime(df['sasdate'], format='%m/%d/%Y')
    df = df.set_index('sasdate').astype(float)

    # 2. Clean Data (handle NaNs): carry the last value forward, then
    # back-fill whatever is still missing at the start of the sample.
    df = df.ffill().bfill()

    # 3. Transform data to be stationary. Built in one shot from a dict to
    # avoid fragmenting the frame with ~100 single-column insertions.
    df_transformed = pd.DataFrame(
        {col: transform(df[col], tcode_df[col].iloc[0]) for col in df.columns}
    ).dropna()

    # 4. Load and prepare target variable (GDP growth)
    gdp = web.DataReader('GDPC1', 'fred', start='1960-01-01', end='2023-12-31')
    gdp_growth = np.log(gdp).diff().dropna() * 100  # Quarterly growth rate in %

    # 5. Align data: aggregate monthly predictors to quarterly and join.
    # FRED stamps quarterly observations on the FIRST day of the quarter, so
    # the predictors must be labelled by quarter start ('QS'); quarter-end
    # labels never match and the inner join would come back empty.
    X = df_transformed.resample('QS').mean()
    data_full = (
        X.join(gdp_growth, how='inner')
         .rename(columns={'GDPC1': 'GDP_Growth'})
    )
    if data_full.empty:
        raise ValueError("No overlapping quarters between predictors and GDP growth")

    # Create lagged predictors: quarter t-1 information predicts quarter t growth
    X_full = data_full.drop(columns='GDP_Growth')
    X_lagged = X_full.shift(1).dropna()
    y = data_full.loc[X_lagged.index, 'GDP_Growth']

    note(f"Data preparation complete. We have {X_lagged.shape[1]} potential predictors to forecast GDP growth.")
    DATA_LOADED = True
except Exception as e:
    # Deliberate best-effort: downstream cells check DATA_LOADED and skip.
    note(f"Could not load data. Error: {e}. Skipping notebook execution.")
    DATA_LOADED = False

### 3. Principal Component Regression\nNow we standardize our large set of lagged predictors (`X_lagged`) and apply PCA to extract a smaller number of principal components. We will then use these components as regressors in a standard linear regression model to predict GDP growth.

In [None]:
sec("Principal Component Analysis of Macro Data")

if DATA_LOADED:
    # Standardize the data before PCA: PCA is driven by variance, so series
    # measured on larger scales would otherwise dominate the components.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_lagged)

    # Fit PCA (all components retained; used here for description only)
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)

    cum_var = np.cumsum(pca.explained_variance_ratio_)
    # Entry i of cum_var covers i + 1 components, so the x-axis must start at 1
    # to agree with the axis label and with n_components_80 below.
    component_counts = np.arange(1, len(cum_var) + 1)
    n_components_80 = np.where(cum_var >= 0.8)[0][0] + 1

    # Plot the cumulative explained variance (Scree Plot)
    fig, ax = plt.subplots(figsize=(12, 7))
    ax.plot(component_counts, cum_var)
    ax.set_xlabel('Number of Components')
    ax.set_ylabel('Cumulative Explained Variance')
    ax.set_title('Variance Explained by Principal Components')
    ax.axhline(0.8, color='r', ls='--', label=f'{n_components_80} components explain 80% of variance')
    ax.legend()
    plt.show()

    note(f"The scree plot shows that the first few principal components capture a large fraction of the total variation in the macroeconomic dataset. We can explain 80% of the variance with just {n_components_80} factors.")

In [None]:
sec("Forecasting GDP Growth with Principal Components")

if DATA_LOADED:
    # Choose number of components to use
    n_pcs = 5

    # Use a time series split for cross-validation to respect the temporal order of the data
    tscv = TimeSeriesSplit(n_splits=10)
    model = LinearRegression()

    predictions = []
    actuals = []
    test_dates = []

    for train_index, test_index in tscv.split(X_lagged):
        X_train_raw, X_test_raw = X_lagged.iloc[train_index], X_lagged.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Estimate the scaling and the factors on the training window ONLY.
        # Factors extracted from the full sample would embed information from
        # the test period, so the forecasts would not be truly out-of-sample.
        fold_scaler = StandardScaler().fit(X_train_raw)
        fold_pca = PCA(n_components=n_pcs).fit(fold_scaler.transform(X_train_raw))
        X_train = fold_pca.transform(fold_scaler.transform(X_train_raw))
        X_test = fold_pca.transform(fold_scaler.transform(X_test_raw))

        model.fit(X_train, y_train)
        predictions.extend(model.predict(X_test))
        actuals.extend(y_test)
        test_dates.extend(y_test.index)  # keep the dates that were actually forecast

    # Calculate overall out-of-sample R-squared (benchmark: mean of the test-period actuals)
    actual_arr = np.asarray(actuals)
    pred_arr = np.asarray(predictions)
    ss_res = np.sum((actual_arr - pred_arr) ** 2)
    ss_tot = np.sum((actual_arr - actual_arr.mean()) ** 2)
    r_squared = 1 - ss_res / ss_tot
    note(f"Out-of-Sample R-squared from the Principal Component Regression: {r_squared:.3f}")

    # Plot actual vs. predicted
    fig, ax = plt.subplots(figsize=(14, 7))
    plot_df = pd.DataFrame(
        {'Actual GDP Growth': actuals, 'Predicted GDP Growth': predictions},
        index=pd.DatetimeIndex(test_dates),
    )
    plot_df.plot(ax=ax)
    ax.set_title('GDP Growth Forecast: Actual vs. Predicted (Out-of-Sample)')
    ax.set_ylabel('Quarterly GDP Growth (%)')
    plt.show()

### 4. Conclusion and Extensions\nThis notebook demonstrated a powerful and practical technique for macroeconomic forecasting in a data-rich environment. By combining PCA for dimensionality reduction with a simple linear regression, we built a model for forecasting GDP growth and evaluated it out-of-sample; the reported out-of-sample R-squared indicates how much it improves on a constant-mean forecast.\n\nThis approach can be extended in several ways:\n- **Dynamic Factor Models (DFM):** A more sophisticated version that explicitly models the time-series dynamics of the factors and idiosyncratic components.\n- **Alternative Non-linear Models:** Instead of a linear regression on the factors, one could use more flexible models like Random Forests or Gradient Boosting Machines.\n- **Interpreting the Factors:** A key challenge is to assign economic interpretations to the estimated principal components. This can be done by examining the 'factor loadings' (the weights of the original series in each component, which for standardized data are proportional to the correlations between the series and the component) to see which variables are most important for each factor.