# Causal Imitation Learning Examples - Inverse Reinforcement Learning

In [1]:
import numpy as np

# Sample size reused by the expert and evaluation rollouts below.
N = 100_000
# One seeded generator drives every random draw, so a fresh-kernel run is reproducible.
rng = np.random.default_rng(seed=0)

# utility
def logistic(size=None):
    """Draw logistic noise with location 0 and scale 1 from the notebook-level `rng`.

    `size` is forwarded unchanged to `rng.logistic`; the default `None`
    requests a single draw.
    """
    noise = rng.logistic(0.0, 1.0, size)
    return noise

### 9.24: Inverse RL in MAB

In [2]:
# Expert demonstrations for the one-step bandit (NUC holds, expert copies U).
N_expert = N
p, theta = 0.9, 2.0  # P(U = 1) and the reward scale

U_expert = rng.binomial(n=1, p=p, size=N_expert)  # U ~ Bern(p)
X_expert = np.array(U_expert)   # X := U (independent copy)
Y_expert = np.array(X_expert)   # Y := X (independent copy)
R_expert = Y_expert * theta     # R = theta * Y
p_hat = np.mean(Y_expert)       # empirical mean of Y in the demonstrations

In [3]:
# Behavioral-cloning baseline: play X = 1 with the empirical frequency p_hat.
N_eval = N
q_bc = p_hat
X_bc = rng.binomial(n=1, p=q_bc, size=N_eval)
Y_bc = np.array(X_bc)   # Y := X (independent copy)
R_bc = Y_bc * theta     # R = theta * Y

In [4]:
# IRL minimax policy (NUC holds): q* = 1, i.e. the rollout always plays X = 1.
q_star = 1.0

X_irl = np.full(N_eval, 1, dtype=int)
Y_irl = np.array(X_irl)   # Y := X (independent copy)
R_irl = Y_irl * theta     # R = theta * Y

# show worst-case gap
def worst_case_gap(q, p_expert):
    """Return `p_expert - q` when that difference is positive, otherwise 0.0.

    q:        probability with which the evaluated policy plays X = 1.
    p_expert: probability with which the expert plays X = 1.
    """
    shortfall = p_expert - q
    if shortfall > 0.0:
        return shortfall
    return 0.0

# Evaluate the gap on six evenly spaced policies q in [0, 1].
grid_q = np.linspace(0.0, 1.0, num=6)
gaps = []
for q_candidate in grid_q:
    gaps.append(worst_case_gap(q_candidate, p_hat))

In [5]:
# Report for section 9.24: gather every line first, then write them out in one go.
summary = [
    "IRL minimax in 1-step MAB (NUC holds):",
    "  U ~ Bern(p), X := U, Y := X, Rθ(Y) = θ * Y (θ > 0)",
    f"  Demo p = P(U=1) = E_demo[Y] = {p:.3f} (empirical {p_hat:.5f})",
    "",
    "Behavioral Cloning (baseline):",
    f"  q_bc := π_BC(X=1) = P_demo(X=1) = {q_bc:.5f}",
    f"  Value_BC = θ * q_bc = {theta*q_bc:.5f} (empirical {R_bc.mean():.5f})",
    "",
    "IRL (minimax) solution:",
    "  Choose q* := π*(X=1) = 1.0  (any q ≥ p makes the worst-case gap zero; choosing 1 is canonical)",
    f"  Value_IRL = θ * 1 = {theta*1.0:.5f} (empirical {R_irl.mean():.5f})",
    "",
    "Comparison:",
    f"  Expert value = θ * p = {theta*p:.5f}",
    f"  BC value     = θ * p = {theta*q_bc:.5f}",
    f"  IRL value    = θ * 1 = {theta*1.0:.5f}",
    f"  IRL improves over expert/BC by: {theta*(1.0 - p):.5f}",
    "",
    "Worst-case gap max_{θ>0} θ*(p - q) (up to positive scaling):",
]
# One row per grid point: the policy q and its gap.
for q, g in zip(grid_q, gaps):
    summary.append(f"  q={q:.1f} -> gap ~ {g:.3f}")
print("\n".join(summary))

IRL minimax in 1-step MAB (NUC holds):
  U ~ Bern(p), X := U, Y := X, Rθ(Y) = θ * Y (θ > 0)
  Demo p = P(U=1) = E_demo[Y] = 0.900 (empirical 0.90095)

Behavioral Cloning (baseline):
  q_bc := π_BC(X=1) = P_demo(X=1) = 0.90095
  Value_BC = θ * q_bc = 1.80190 (empirical 1.80478)

IRL (minimax) solution:
  Choose q* := π*(X=1) = 1.0  (any q ≥ p makes the worst-case gap zero; choosing 1 is canonical)
  Value_IRL = θ * 1 = 2.00000 (empirical 2.00000)

Comparison:
  Expert value = θ * p = 1.80000
  BC value     = θ * p = 1.80190
  IRL value    = θ * 1 = 2.00000
  IRL improves over expert/BC by: 0.20000

Worst-case gap max_{θ>0} θ*(p - q) (up to positive scaling):
  q=0.0 -> gap ~ 0.901
  q=0.2 -> gap ~ 0.701
  q=0.4 -> gap ~ 0.501
  q=0.6 -> gap ~ 0.301
  q=0.8 -> gap ~ 0.101
  q=1.0 -> gap ~ 0.000


### 9.25: Inverse RL fails without NUC

In [6]:
# Reward scale and sample sizes for the confounded example.
theta = 2.0
N_expert = N_eval = N

def xor(a, b):
    """Element-wise XOR of two binary inputs, returned with integer dtype.

    a, b: integer or boolean values -- NumPy arrays, NumPy scalars, plain
          Python ints/bools, or lists of them; they broadcast against each
          other under the usual NumPy rules.

    Returns a NumPy integer array (a NumPy integer scalar for scalar inputs).
    """
    # np.bitwise_xor converts array-likes itself, so the `.astype` call also
    # works for plain Python scalars and lists, which the `^` operator alone
    # does not turn into NumPy objects.
    return np.bitwise_xor(a, b).astype(int)

In [7]:
# Confounded demonstrations: the expert observes U and plays X := NOT U.
U_expert = rng.binomial(n=1, p=0.5, size=N_expert)
X_expert = 1 - U_expert
Y_expert = xor(X_expert, U_expert)

# Mean outcome in the demonstrations, conditioned on each action.
Ey_x0, Ey_x1 = (Y_expert[X_expert == action].mean() for action in (0, 1))

In [8]:
# Off-Policy Evaluation
def naive_ope_value(q, Ey_x0, Ey_x1):
    """Mix the two conditional outcome means with weights (1 - q) and q.

    q:     probability that the evaluated policy plays X = 1.
    Ey_x0: mean outcome observed when X = 0.
    Ey_x1: mean outcome observed when X = 1.
    """
    weight_x0 = 1 - q
    weight_x1 = q
    return weight_x0 * Ey_x0 + weight_x1 * Ey_x1

In [9]:
# Policies to score: the IRL choice q* = 1 and the BC baseline (empirical P(X = 1)).
q_star = 1.0
q_bc = np.mean(X_expert)

# Predicted values from the naive estimator, scaled by the reward weight theta.
V_hat_irl = theta * naive_ope_value(q_star, Ey_x0, Ey_x1)
V_hat_bc = theta * naive_ope_value(q_bc, Ey_x0, Ey_x1)

In [10]:
# Deployment rollouts: all three policies are evaluated on the same fresh draw of U.

# irl rollout
U_eval = rng.binomial(1, 0.5, size=N_eval)
X_irl = np.ones(N_eval, dtype=int)
Y_irl = xor(X_irl, U_eval)
R_irl = theta * Y_irl
V_true_irl = R_irl.mean()

# expert rollout
# Stored under *_eval names so the demonstration arrays X_expert / Y_expert
# are not overwritten; the report cell reads Y_expert for E_demo[Y].
X_expert_eval = 1 - U_eval
Y_expert_eval = xor(X_expert_eval, U_eval)
V_true_expert = (theta * Y_expert_eval).mean()

# bc rollout
X_bc = rng.binomial(1, q_bc, size=N_eval)
Y_bc = xor(X_bc, U_eval)
V_true_bc = (theta * Y_bc).mean()

In [11]:
# Report for section 9.25: four paragraphs, each ending in a blank line.
report_lines = (
    "SCM (confounded XOR MAB):",
    "  U ~ Bern(0.5)",
    "  Expert: X := NOT U",
    "  Outcome: Y := XOR(X, U)",
    "  Reward: R = θ * Y  (θ > 0)\n",
    "From expert (observational) data:",
    f"  E[Y|X=0] ≈ {Ey_x0:.5f},  E[Y|X=1] ≈ {Ey_x1:.5f}  (both ~1 ⇒ naïve off-policy eval believes any policy gets θ)",
    f"  E_demo[Y] ≈ {Y_expert.mean():.5f} (should be ~1)\n",
    "Naive Off-policy Evaluation (WRONG, assumes NUC):",
    f"  Predicted IRL value for q* = 1.0:  V_hat(π*) = {V_hat_irl:.5f}",
    f"  Predicted BC value  for q_bc={q_bc:.2f}:  V_hat(BC) = {V_hat_bc:.5f}\n",
    "TRUE deployment on the SCM:",
    f"  Expert true value         = {V_true_expert:.5f}   (θ * 1)",
    f"  IRL(π*: X≡1) true value   = {V_true_irl:.5f}   (≈ θ * 0.5)",
    f"  BC  true value            = {V_true_bc:.5f}   (≈ θ * 0.5)\n",
)
print(*report_lines, sep="\n")

SCM (confounded XOR MAB):
  U ~ Bern(0.5)
  Expert: X := NOT U
  Outcome: Y := XOR(X, U)
  Reward: R = θ * Y  (θ > 0)

From expert (observational) data:
  E[Y|X=0] ≈ 1.00000,  E[Y|X=1] ≈ 1.00000  (both ~1 ⇒ naïve off-policy eval believes any policy gets θ)
  E_demo[Y] ≈ 1.00000 (should be ~1)

Naive Off-policy Evaluation (WRONG, assumes NUC):
  Predicted IRL value for q* = 1.0:  V_hat(π*) = 2.00000
  Predicted BC value  for q_bc=0.50:  V_hat(BC) = 2.00000

TRUE deployment on the SCM:
  Expert true value         = 2.00000   (θ * 1)
  IRL(π*: X≡1) true value   = 0.99728   (≈ θ * 0.5)
  BC  true value            = 1.00256   (≈ θ * 0.5)

