In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.optimize import minimize
try:
    import graphviz
    GRAPHVIZ_AVAILABLE = True
except ImportError:
    GRAPHVIZ_AVAILABLE = False
from IPython.display import display, Markdown

# --- Configuration ---
# Plot appearance: white-grid seaborn style sheet, larger fonts, big high-DPI figures.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (12, 8),
    'figure.dpi': 150,
})
# Array printing: 4 decimals, no scientific notation, 120-character lines.
np.set_printoptions(precision=4, linewidth=120, suppress=True)
# Keep notebook output free of FutureWarning messages.
warnings.filterwarnings('ignore', category=FutureWarning)

# --- Utility Functions ---
def note(msg):
    """Display `msg` in the notebook as Markdown inside an `alert-info` styled <div>.

    The text is re-wrapped to 100 columns with `textwrap.fill` before being embedded.
    """
    wrapped = textwrap.fill(msg, width=100)
    html = f"<div class='alert alert-block alert-info'>📝 {wrapped}</div>"
    display(Markdown(html))
def sec(title):
    """Print `title` upper-cased as a banner framed by two 80-character '=' rules."""
    rule = '=' * 80
    banner = '\n' + rule + '\n| ' + title.upper() + ' |\n' + rule
    print(banner)

# Last statement of the setup cell: exercises `note` once and signals that setup finished.
note("Environment initialized.")

# Part 3: Dynamic Models
## Chapter 3.6: Structural Estimation: Modeling the Engine of Choice

### Table of Contents
1.  [The Structural Approach and the Lucas Critique](#1.-The-Structural-Approach-and-the-Lucas-Critique)
2.  [Dynamic Discrete Choice (DDC) Models](#2.-Dynamic-Discrete-Choice-(DDC)-Models)
    *   [2.1 The Value Function and Choice Probabilities](#2.1-The-Value-Function-and-Choice-Probabilities)
    *   [2.2 Identification](#2.2-Identification)
3.  [Estimation Algorithms](#3.-Estimation-Algorithms)
    *   [3.1 The Nested Fixed Point (NFXP) Algorithm](#3.1-The-Nested-Fixed-Point-(NFXP)-Algorithm)
    *   [3.2 Two-Step Estimators (Hotz-Miller / CCP)](#3.2-Two-Step-Estimators-(Hotz-Miller-/-CCP))
    *   [3.3 MPEC: Constrained Optimization](#3.3-MPEC:-Constrained-Optimization)
4.  [Code Lab: Estimating a Bus Engine Replacement Model](#4.-Code-Lab:-Estimating-a-Bus-Engine-Replacement-Model)
    *   [4.1 The NFXP Implementation](#4.1-The-NFXP-Implementation)
    *   [4.2 Counterfactuals: The Power of Structural Models](#4.2-Counterfactuals:-The-Power-of-Structural-Models)
5.  [Chapter Summary](#5.-Chapter-Summary)
6.  [Exercises](#6.-Exercises)

### 1. The Structural Approach and the Lucas Critique
This chapter introduces **structural estimation**, an approach that deeply integrates economic theory with empirical data. It stands in contrast to **reduced-form** methods (like OLS or IV), which estimate statistical relationships without fully modeling the underlying behavior that generated them.

The foundational motivation is the **Lucas Critique**. Robert Lucas Jr. argued that reduced-form relationships are fundamentally unreliable for evaluating future policy changes, because rational, forward-looking agents will change their behavior and expectations in response to a new policy, rendering the old statistical relationships obsolete. The classic example is the Phillips Curve, which appeared to offer a stable trade-off between inflation and unemployment, but broke down when central banks tried to exploit it.

The solution, according to Lucas, is to estimate the **"deep" structural parameters** that govern behavior but are invariant to policy changes—parameters of preferences (like risk aversion), technology (like production costs), and constraints. A structural model, by estimating this underlying economic engine, allows us to simulate how optimizing agents *would* behave under entirely new policy regimes. It allows us to ask not just "what happened?" but also "what if?"

### 2. Dynamic Discrete Choice (DDC) Models
A workhorse of modern structural analysis is the **Dynamic Discrete Choice (DDC)** model. These models analyze situations where forward-looking agents make a sequence of choices from a discrete set over time, and where today's choices affect tomorrow's state. The pioneering work is John Rust's (1987) analysis of Harold Zurcher, the superintendent of maintenance for the Madison, Wisconsin bus company, who had to decide each month whether to replace a bus engine.

#### 2.1 The Value Function and Choice Probabilities
A DDC model is formally defined by the components of a Bellman equation. To allow for behavior that is not perfectly deterministic from the perspective of the econometrician, we assume that the agent's utility for each choice $i$ is subject to a random taste shock, $\epsilon_i$, that is unobserved by us. The choice-specific value function is:
$$ v(s_t, i, \theta) = u(s_t, i, \theta) + \beta E[V(s_{t+1}) | s_t, i] $$ 
where $u(s_t, i, \theta)$ is the flow utility and $E[V(s_{t+1})]$ is the expected continuation value. The agent's total utility for choice $i$ is $v(s_t, i, \theta) + \epsilon_i$. The agent chooses the action with the highest total utility.

If we assume the shocks $\epsilon_i$ follow a **Type-I Extreme Value (Gumbel)** distribution, we get two convenient results:
1.  The choice probabilities have the familiar **logit formula**:
$$ P(i | s_t, \theta) = \frac{\exp(v(s_t, i, \theta))}{\sum_{j \in A} \exp(v(s_t, j, \theta))} $$
2.  The *ex-ante* value function $V(s_t, \theta) = E_\epsilon[\max_i \{v(s_t, i, \theta) + \epsilon_i\}]$ has a closed-form solution known as the **log-sum-exp** formula:
$$ V(s_t, \theta) = \ln \left( \sum_{j \in A} \exp(v(s_t, j, \theta)) \right) + C $$
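where $C \approx 0.5772$ is the Euler–Mascheroni constant. Because $C$ enters the value of every alternative identically, it cancels out of the choice probabilities, and the code below drops it. This gives us a tractable Bellman equation to solve for the value function.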

#### 2.2 Identification
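**Identification** is the question of whether it is theoretically possible to uniquely recover the structural parameters from the observed data. In DDC models, identification comes from how the model predicts that choices will change as the state variables change. For example, we identify cost parameters by observing how the probability of an action (e.g., replacement) changes as a cost-relevant state variable (e.g., mileage) increases. In principle, we identify the discount factor $\beta$ by observing how much a change in the *expected future value* of a choice affects an agent's *current* choice. In practice, $\beta$ is usually only weakly identified unless some variable shifts expected future payoffs without entering current utility (an exclusion restriction), so it is common to fix $\beta$ at a plausible value rather than estimate it—as we do in the code lab below, where $\beta = 0.95$.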

### 3. Estimation Algorithms
The goal is to find the parameter vector $\theta$ that maximizes the log-likelihood of the observed data:
$$ \mathcal{L}(\theta) = \sum_{n=1}^N \sum_{t=1}^{T_n} \ln P(a_{nt} | s_{nt}, \theta) $$

#### 3.1 The Nested Fixed Point (NFXP) Algorithm
The challenge is that to calculate the likelihood, we need the choice probabilities $P$, which depend on the value function $V$, which itself depends on the parameters $\theta$ we are trying to estimate. The **Nested Fixed Point (NFXP)** algorithm (Rust, 1987) solves this by nesting two loops:
1.  **Outer Loop:** A numerical optimizer searches over the parameter space $\Theta$ to maximize the log-likelihood.
2.  **Inner Loop:** For each guess of $\theta$ from the outer loop, this loop solves the agent's DP problem by iterating the Bellman equation to a fixed point, $V(s; \theta)$.

#### 3.2 Two-Step Estimators (Hotz-Miller / CCP)
NFXP can be computationally slow because the inner loop must be solved to convergence for every single step of the outer loop. **Two-step estimators** avoid this. The key insight of **Hotz and Miller (1993)** is that one can *invert* the choice probability formula to back out *differences* in choice-specific value functions directly from the choice probabilities—in the logit case, $v(s, i) - v(s, j) = \ln P(i|s) - \ln P(j|s)$—avoiding the need to solve the dynamic program iteratively.

**Algorithm:**
1.  **Step 1 (Reduced Form):** Flexibly estimate the choice probabilities $P(a|s)$ from the data, for example using a non-parametric method or a simple logit regression. This step does *not* impose the full structural model.
2.  **Step 2 (Structural):** Use the estimated probabilities $\hat{P}(a|s)$ and the logit inversion formula to construct an estimate of the value function. Then, use this estimated value function in a GMM or regression framework to estimate the structural parameters $\theta$.

This approach trades statistical efficiency (if the model is perfectly specified, NFXP is more efficient) for computational speed and robustness to certain types of misspecification.

#### 3.3 MPEC: Constrained Optimization
A third approach, **Mathematical Programming with Equilibrium Constraints (MPEC)**, reframes the entire problem as a single, large, constrained optimization. The goal is to find the parameters $\theta$ and the value function $V$ that maximize the log-likelihood, *subject to the constraint that $V$ must satisfy the Bellman equation*.
$$ \max_{\theta, V} \mathcal{L}(\theta, V | \text{Data}) \quad \text{s.t.} \quad V = T(V|\theta) $$
This avoids the nested loop structure entirely, which can be a significant advantage. However, it results in a very high-dimensional optimization problem that requires specialized solvers.

### 4. Code Lab: Estimating a Bus Engine Replacement Model
We implement a simplified version of Rust's model using the NFXP algorithm. The state `s` is the machine's age. The agent chooses to replace ($a=1$) or maintain ($a=0$). The parameters to estimate are the replacement cost $\theta_R$ and the maintenance cost parameter $\theta_M$; the discount factor $\beta$ is held fixed rather than estimated.

#### 4.1 The NFXP Implementation

In [None]:
# Print the section banner that heads the output of the NFXP code-lab cell.
sec("NFXP Implementation for Bus Replacement Model")

class RustNFXPSolver:
    """Nested fixed point (NFXP) solver and estimator for a two-action replacement model.

    The agent either maintains (action 0) or replaces (action 1) in each of
    `n_states` discrete states. Flow utilities are ``-theta_M * s`` for
    maintenance and ``-theta_R`` for replacement; additive taste shocks are
    treated as Type-I extreme value, so the Bellman operator is a log-sum-exp
    (the additive Euler–Mascheroni constant is omitted, which leaves choice
    probabilities unchanged).

    Parameters
    ----------
    n_states : int
        Number of discrete states.
    beta : float
        Discount factor.
    transitions : dict
        Maps ``'maintain'`` and ``'replace'`` to ``(n_states, n_states)``
        matrices; row ``s`` is multiplied with the value vector to obtain the
        expected next-period value from state ``s`` under that action.
    """

    def __init__(self, n_states, beta, transitions):
        self.n_states = n_states
        self.beta = beta
        self.states = np.arange(n_states)
        self.transitions = transitions  # A dict mapping action to transition matrix

    def _get_flow_utilities(self, params):
        """Return ``(u_maintain, u_replace)`` per state for ``params = (theta_R, theta_M)``."""
        theta_R, theta_M = params
        u_maintain = -theta_M * self.states
        u_replace = -theta_R * np.ones(self.n_states)
        return u_maintain, u_replace

    def _choice_values(self, params, EV):
        """Return the choice-specific values ``(v_maintain, v_replace)`` given value vector `EV`."""
        u_maintain, u_replace = self._get_flow_utilities(params)
        v_maintain = u_maintain + self.beta * (self.transitions['maintain'] @ EV)
        v_replace = u_replace + self.beta * (self.transitions['replace'] @ EV)
        return v_maintain, v_replace

    def solve_dp_problem(self, params, tol=1e-8, max_iter=5000):
        """Iterate the Bellman operator to a fixed point and return the value vector.

        Parameters
        ----------
        params : sequence of two floats
            ``(theta_R, theta_M)``.
        tol : float, optional
            Sup-norm change between successive iterates below which iteration stops.
        max_iter : int, optional
            Iteration cap. Successive approximation contracts at rate `beta`, so
            reaching ``tol=1e-8`` at ``beta=0.95`` takes roughly 400 iterations;
            the default leaves ample headroom.

        Returns
        -------
        numpy.ndarray
            The last iterate (length `n_states`). A ``RuntimeWarning`` is issued
            if the tolerance was not met within `max_iter` iterations.
        """
        EV = np.zeros(self.n_states)
        for _ in range(max_iter):
            v_maintain, v_replace = self._choice_values(params, EV)
            # logaddexp evaluates log(exp(a) + exp(b)) without overflow/underflow.
            V_new = np.logaddexp(v_maintain, v_replace)
            gap = np.max(np.abs(EV - V_new))
            EV = V_new
            if gap < tol:
                break
        else:
            warnings.warn(
                f"Value function iteration did not converge to tol={tol} "
                f"within {max_iter} iterations.",
                RuntimeWarning,
            )
        return EV

    def _log_likelihood(self, params, data_choices, data_states):
        """Return the NEGATIVE log-likelihood of the observed choices (the minimizer's objective).

        Negative parameter values are rejected with a large penalty (``1e12``).
        Each observation's probability is floored at ``1e-12`` before taking logs.
        """
        if params[0] < 0 or params[1] < 0:
            return 1e12
        EV = self.solve_dp_problem(params)
        v_maintain, v_replace = self._choice_values(params, EV)
        # Work with log-probabilities so extreme parameter guesses cannot yield 0/0.
        log_denominator = np.logaddexp(v_maintain, v_replace)
        log_p_maintain = v_maintain - log_denominator
        log_p_replace = v_replace - log_denominator
        choices = np.asarray(data_choices)
        states = np.asarray(data_states, dtype=np.intp)
        log_p_chosen = np.where(choices == 0, log_p_maintain[states], log_p_replace[states])
        return -np.sum(np.maximum(log_p_chosen, np.log(1e-12)))

    def estimate(self, data_choices, data_states, initial_guess):
        """Maximize the likelihood with Nelder-Mead and return the SciPy `OptimizeResult`."""
        print(f"Starting NFXP estimation from guess: {initial_guess}")
        result = minimize(self._log_likelihood, initial_guess,
                          args=(data_choices, data_states), method='Nelder-Mead',
                          options={'xatol': 1e-6, 'fatol': 1e-6, 'disp': True})
        return result

# === Main Execution ===
N_STATES = 50
DISCOUNT_FACTOR = 0.95

# Transitions: if maintain, age increases by 1. if replace, age resets to 0.
# np.eye(..., k=1) puts the ones on the superdiagonal (s -> s + 1) and leaves the last row
# empty, so the oldest age is made absorbing explicitly. Rolling the identity matrix instead
# wraps the last row around to column 0, which leaves that row summing to 2 once
# [-1, -1] is also set to 1.
T_MAINTAIN = np.eye(N_STATES, k=1)
T_MAINTAIN[-1, -1] = 1.0
T_REPLACE = np.zeros((N_STATES, N_STATES))
T_REPLACE[:, 0] = 1.0
transitions = {'maintain': T_MAINTAIN, 'replace': T_REPLACE}
# Each row of a transition matrix must be a probability distribution over next states.
for _action, _matrix in transitions.items():
    if not np.allclose(_matrix.sum(axis=1), 1.0):
        raise ValueError(f"Rows of the '{_action}' transition matrix must sum to 1.")

solver = RustNFXPSolver(N_STATES, DISCOUNT_FACTOR, transitions)
TRUE_PARAMS = [5.0, 0.2]
rng = np.random.default_rng(123)

# Solve the model at the true parameters to get the data-generating choice probabilities.
EV_true = solver.solve_dp_problem(TRUE_PARAMS)
u_m_true, u_r_true = solver._get_flow_utilities(TRUE_PARAMS)
v_m_true = u_m_true + DISCOUNT_FACTOR * (T_MAINTAIN @ EV_true)
v_r_true = u_r_true + DISCOUNT_FACTOR * (T_REPLACE @ EV_true)
prob_maintain_true = np.exp(v_m_true) / (np.exp(v_m_true) + np.exp(v_r_true))

# Simulate 500 observations: states are drawn uniformly over all ages, and the choice is
# 1 (replace) whenever a uniform draw exceeds P(maintain | state).
sim_states = rng.integers(0, N_STATES, 500)
sim_choices = (rng.random(500) > prob_maintain_true[sim_states]).astype(int)

mle_result = solver.estimate(sim_choices, sim_states, initial_guess=[4.0, 0.1])
theta_R_hat, theta_M_hat = mle_result.x
if not mle_result.success:
    # Surface optimizer failures instead of reporting the estimates as if they were final.
    note(f"Optimizer did not report success: {mle_result.message}")
# Cast to built-in floats so the message shows plain numbers rather than NumPy scalar reprs.
note(f"True Parameters: {TRUE_PARAMS} | Estimated Parameters: {[round(float(p), 4) for p in mle_result.x]}")

#### 4.2 Counterfactuals: The Power of Structural Models
The real power of the model comes from using the estimated parameters, $\hat{\theta}$, to analyze behavior and simulate policy experiments. Suppose the government offers a 20% subsidy on replacement parts. We can re-solve the model with a new, counterfactual cost, $\theta_R' = \hat{\theta}_R \cdot (1 - 0.20)$, and trace out the new optimal policy.

In [None]:
sec("Counterfactual Policy Simulation")


def _solve_policy(params):
    """Solve the model at `params`; return (EV, u_m, u_r, v_m, v_r, P(replace)) per state."""
    ev = solver.solve_dp_problem(params)
    u_m, u_r = solver._get_flow_utilities(params)
    v_m = u_m + solver.beta * (solver.transitions['maintain'] @ ev)
    v_r = u_r + solver.beta * (solver.transitions['replace'] @ ev)
    p_replace = 1 - (np.exp(v_m) / (np.exp(v_m) + np.exp(v_r)))
    return ev, u_m, u_r, v_m, v_r, p_replace


# Baseline policy at the estimated parameters.
EV_hat, u_m_hat, u_r_hat, v_m_hat, v_r_hat, prob_replace_hat = _solve_policy(mle_result.x)

# Counterfactual: the subsidy scales the replacement cost down; maintenance cost is unchanged.
subsidy = 0.20
params_cf = [theta_R_hat * (1 - subsidy), theta_M_hat]
EV_cf, u_m_cf, u_r_cf, v_m_cf, v_r_cf, prob_replace_cf = _solve_policy(params_cf)

# Overlay the two replacement-probability curves.
fig, ax = plt.subplots()
curves = (
    (prob_replace_hat, '-o', 'Original Estimated Policy'),
    (prob_replace_cf, '-s', f'Policy with {subsidy:.0%} Subsidy'),
)
for probabilities, line_format, curve_label in curves:
    ax.plot(solver.states, probabilities, line_format, ms=5, label=curve_label)
ax.set(
    xlabel="State (Engine Mileage)",
    ylabel="Probability of Replacement",
    title="Counterfactual Policy Simulation",
)
ax.legend()
ax.grid(True, which='both', linestyle='--')

### 5. Chapter Summary
- **Structural vs. Reduced-Form:** Structural models estimate deep parameters of preferences and technology, allowing for credible counterfactual analysis, which is the key defense against the **Lucas Critique**.
- **Dynamic Discrete Choice Models** are a workhorse for this, modeling forward-looking agents making discrete choices over time.
- **Estimation Algorithms:** The **NFXP** algorithm solves the model by nesting an inner DP loop inside an outer likelihood maximization loop. Alternatives like **two-step estimators** (Hotz-Miller, CCP) and **MPEC** offer computational advantages by avoiding the nested loop structure.
- **Identification** is a fundamental prerequisite for estimation, ensuring that the model's parameters can be uniquely recovered from the data.

### 6. Exercises

1.  **Explaining the Lucas Critique:** In your own words, use the results of the counterfactual simulation to explain the Lucas Critique to a non-economist. Why would a simple logit regression of `choice` on `state` fail to predict the effect of the subsidy?

2.  **MPEC Formulation:** Write down the objective function and the full set of constraints for solving the bus engine replacement model using the MPEC framework. The variables to be optimized over are $\theta_R, \theta_M$, and the entire value function vector $V$. 

3.  **Two-Step Estimation vs. NFXP:** Compare the workflow of the NFXP algorithm to the Hotz-Miller two-step estimator. What is the key trade-off between the two methods in terms of statistical assumptions, computational burden, and statistical efficiency?

4.  **Coding: The Role of the Discount Factor.** How would you expect the agent's replacement behavior to change if they were less patient (had a lower $\beta$)? Modify the code to solve for and plot the policy function for `beta = 0.8` using the estimated parameters. Explain the economic intuition for the change.

5.  **Coding: A More Complex Cost Function.** Modify the `RustNFXPSolver` class to allow for a non-linear, quadratic maintenance cost function: $u(s, 0, \theta) = -\theta_{M1} s - \theta_{M2} s^2$. Re-run the simulation and estimation. How does this change the shape of the estimated policy function?

### 6. Bayesian Structural Estimation

#### 6.1 The Bayesian Approach to Structural Models

This section introduces the modern workhorse for empirical macroeconomics: **Bayesian estimation of Dynamic Stochastic General Equilibrium (DSGE) models**. Pioneered by researchers like Schorfheide, Del Negro, and Canova, this approach has become the standard at central banks and research institutions for taking complex theoretical models to the data.

Classical methods for estimating structural models, such as Maximum Likelihood, often struggle. The likelihood surface can be flat and multi-modal, and many parameters are only **weakly identified** by aggregate macro data alone. The Bayesian approach provides a powerful solution:

1.  **Incorporate Prior Information:** Use findings from previous studies or microeconomic evidence to form **priors** about the plausible range of structural parameters. This helps to discipline the estimation.
2.  **Characterize Full Uncertainty:** The output is not a single point estimate, but the entire joint **posterior distribution** for all parameters, revealing a complete picture of parameter uncertainty and correlations.
3.  **Systematically Compare Models:** The Bayesian framework provides a coherent way to compare different, non-nested models using their posterior model probabilities.

The price for this flexibility is computational. Since the posterior distribution is not available in closed form, we must use simulation methods like **Markov Chain Monte Carlo (MCMC)** to generate samples from it.

#### 6.2 Bayesian Estimation of DSGE Models

##### The State-Space Representation and the Kalman Filter

The solution to a linearized DSGE model can be written in a **linear state-space form**:
$$ s_t = T(\theta) s_{t-1} + R(\theta) \epsilon_t \quad (\text{Transition Equation}) $$
$$ y_t = Z(\theta) s_t + d(\theta) + u_t \quad (\text{Measurement Equation}) $$
Here, $s_t$ is a vector of unobserved state variables (like the capital stock or productivity), $y_t$ is a vector of observed data (like GDP growth and inflation), and $\epsilon_t$ and $u_t$ are shocks. The matrices $T, R, Z$ and the vector $d$ are functions of the deep structural parameters $\theta$.

Given this state-space form, the **Kalman filter** is a recursive algorithm that provides the optimal estimate of the unobserved state $s_t$ given the history of observed data. Crucially for estimation, it also produces the value of the **log-likelihood function** of the data as a byproduct. This allows us to evaluate the likelihood $\mathcal{L}(y_{1:T} | \theta)$ for any given parameter vector $\theta$.

##### MCMC for DSGE Models

With the ability to evaluate the likelihood (via the Kalman filter) and specified priors for the parameters, we can use MCMC to sample from the posterior distribution $p(\theta | y) \propto \mathcal{L}(y | \theta) p(\theta)$.

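However, DSGE models can have dozens of parameters, leading to a high-dimensional and complex posterior surface. Standard MCMC samplers can be inefficient. A common strategy is to use a **Metropolis-within-Gibbs** sampler, which groups parameters into blocks and uses different, more efficient proposal distributions for different blocks. The gold standard in modern software like `PyMC` and `Stan` is the **No-U-Turn Sampler (NUTS)**, a highly efficient variant of Hamiltonian Monte Carlo that automatically adapts to the geometry of the posterior surface. Note that NUTS, like any Hamiltonian method, requires gradients of the log-posterior; when the likelihood comes from a numerical model solver that is treated as a black box, a gradient-free sampler (e.g., Metropolis or slice sampling) must be used unless the gradients are supplied separately.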

#### 6.3 Code Lab: Bayesian NFXP

In [None]:
# Print the section banner that heads the output of the solver cell of the Bayesian code lab.
sec("Step 1: The Structural Model Solver (from previous chapter)")
# Numba is optional. No import of `njit` is visible in this notebook, so import it here and
# fall back to a no-op decorator (plain NumPy execution) when Numba is not installed.
try:
    from numba import njit
except ImportError:
    def njit(func):
        """No-op stand-in for `numba.njit` (supports bare-decorator use only)."""
        return func


@njit
def solve_dp_problem_numba(theta_R, theta_M, n_states, beta):
    """Solve the replacement model and return P(maintain | state) for every state.

    Maintenance moves the state from s to s + 1 (the last state is absorbing);
    replacement moves it to state 0. Flow utilities are ``-theta_M * s`` and
    ``-theta_R``. The Bellman operator is iterated until the sup-norm change
    falls below 1e-8 or the iteration cap is hit.

    Parameters
    ----------
    theta_R, theta_M : float
        Replacement cost and per-state maintenance cost.
    n_states : int
        Number of discrete states.
    beta : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        Length-`n_states` vector of maintenance probabilities.
    """
    # Successive approximation contracts at rate beta; 250 iterations cannot reach 1e-8
    # at beta = 0.95 (about 400 are needed), so the cap is set generously.
    max_iter = 5000
    tol = 1e-8

    states = np.arange(n_states)
    u_maintain = -theta_M * states
    u_replace = -theta_R * np.ones(n_states)

    EV = np.zeros(n_states)
    for _ in range(max_iter):
        next_if_maintain = np.empty(n_states)
        next_if_maintain[:-1] = EV[1:]
        next_if_maintain[-1] = EV[-1]  # oldest state stays put under maintenance
        v_maintain = u_maintain + beta * next_if_maintain
        v_replace = u_replace + beta * EV[0]
        # Max-shifted log-sum-exp: avoids exp() underflow to 0 and log(0) = -inf.
        v_max = np.maximum(v_maintain, v_replace)
        V_new = v_max + np.log(np.exp(v_maintain - v_max) + np.exp(v_replace - v_max))
        gap = np.max(np.abs(EV - V_new))
        EV = V_new
        if gap < tol:
            break

    # Choice-specific values at the solution. The continuation under maintenance must use the
    # same absorbing-state rule as the loop; a bare np.roll would hand the last state EV[0].
    next_if_maintain = np.empty(n_states)
    next_if_maintain[:-1] = EV[1:]
    next_if_maintain[-1] = EV[-1]
    v_maintain_total = u_maintain + beta * next_if_maintain
    v_replace_total = u_replace + beta * EV[0]
    v_max = np.maximum(v_maintain_total, v_replace_total)
    exp_maintain = np.exp(v_maintain_total - v_max)
    exp_replace = np.exp(v_replace_total - v_max)
    return exp_maintain / (exp_maintain + exp_replace)
# NOTE(review): this message is emitted right after the definition, before the solver is ever
# called — presumably Numba compiles lazily on first call, so it signals "decorated" rather
# than "compiled"; confirm.
note("Solver function compiled with Numba.")

def loglike(theta, data_choices, data_states, n_states, beta):
    """Return the model log-likelihood as a symbolic PyTensor scalar.

    `solve_dp_problem_numba` is a numeric routine: it cannot be called with
    symbolic PyTensor variables, and its output cannot be indexed symbolically.
    The numeric evaluation is therefore wrapped in a black-box PyTensor `Op`
    whose `perform` method receives concrete parameter values at sampling time.

    Parameters
    ----------
    theta : tensor-like, length 2
        ``(theta_R, theta_M)``; may be a symbolic PyTensor vector.
    data_choices : array-like of int
        Observed actions (0 = maintain, anything else is treated as replace).
    data_states : array-like of int
        Observed state index of each observation.
    n_states : int
        Number of discrete states passed to the solver.
    beta : float
        Discount factor passed to the solver.

    Returns
    -------
    pytensor scalar
        Sum of log choice probabilities, each probability floored at 1e-12.

    Notes
    -----
    The Op defines no gradient, so gradient-based samplers such as NUTS cannot
    be applied to this term. PyMC is expected to assign a gradient-free step
    method instead (see the PyMC black-box likelihood example) — confirm in
    the sampling log.
    """
    # Imported locally so that merely defining this function does not require PyTensor.
    import pytensor.tensor as pt
    from pytensor.graph.op import Op

    is_maintain = np.asarray(data_choices) == 0
    state_index = np.asarray(data_states, dtype=np.intp)

    class _RustLogLikeOp(Op):
        """Black-box Op: parameter vector (float64) -> scalar log-likelihood (float64)."""

        itypes = [pt.dvector]
        otypes = [pt.dscalar]

        def perform(self, node, inputs, outputs):
            (theta_value,) = inputs
            prob_maintain = np.asarray(
                solve_dp_problem_numba(float(theta_value[0]), float(theta_value[1]), n_states, beta)
            )
            p_state = prob_maintain[state_index]
            p_chosen = np.where(is_maintain, p_state, 1.0 - p_state)
            total = np.sum(np.log(np.maximum(p_chosen, 1e-12)))
            outputs[0][0] = np.asarray(total, dtype=np.float64)

    theta_vector = pt.cast(pt.as_tensor_variable(theta), "float64")
    return _RustLogLikeOp()(theta_vector)

In [None]:
sec("Step 2: Defining and Sampling the PyMC Model")

# PyMC and PyTensor are optional. No import of `pm` / `pt` and no definition of
# PYMC_AVAILABLE is visible in this notebook, so set all three up here, mirroring the
# guarded graphviz import in the setup cell.
try:
    import pymc as pm
    import pytensor.tensor as pt
    PYMC_AVAILABLE = True
except ImportError:
    PYMC_AVAILABLE = False

if PYMC_AVAILABLE:
    N_STATES, DISCOUNT_FACTOR = 50, 0.95
    TRUE_PARAMS = [5.0, 0.2]

    # Simulate 500 (state, choice) pairs from the model solved at the true parameters:
    # states are drawn uniformly, and the choice is 1 (replace) when a uniform draw
    # exceeds P(maintain | state).
    prob_maintain_true = solve_dp_problem_numba(TRUE_PARAMS[0], TRUE_PARAMS[1], N_STATES, DISCOUNT_FACTOR)
    rng = np.random.default_rng(123)
    sim_states = rng.integers(0, N_STATES, 500)
    sim_choices = (rng.random(500) > prob_maintain_true[sim_states]).astype(int)

    with pm.Model() as rust_model:
        # Gamma priors keep both costs positive; the prior means are alpha / beta = 4.0 and 0.2.
        theta_R = pm.Gamma('theta_R', alpha=2.0, beta=0.5)
        theta_M = pm.Gamma('theta_M', alpha=2.0, beta=10.0)
        thetas = pt.as_tensor_variable([theta_R, theta_M])
        # The structural log-likelihood enters the model as an additive potential term.
        pm.Potential("likelihood", loglike(thetas, sim_choices, sim_states, N_STATES, DISCOUNT_FACTOR))

        note("Sampling from the posterior... This will be slow.")
        # NOTE(review): NUTS needs gradients of the log-likelihood with respect to the
        # parameters; confirm that `loglike` provides them, or expect a gradient-free
        # step method to be used instead.
        # A fixed seed makes the posterior draws reproducible, like the simulated data above.
        idata = pm.sample(2000, tune=1000, chains=2, cores=1, random_seed=123)
    note("MCMC sampling complete.")
else:
    note("PyMC not available. Skipping model estimation.")