In [1]:
# Core
import os, json, numpy as np, pandas as pd, joblib, warnings
from pathlib import Path
warnings.filterwarnings("ignore")

# Viz & metrics
import matplotlib.pyplot as plt

# Our project modules
import sys
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.features import NUM_COLS, CAT_COLS, make_preprocessor   # for column lists
from src.data_utils import load_and_clean, time_split, get_blocks
from src.profit_threshold import break_even_p, decisions_from_probs
from src.rl_cql import build_mdp_dataset, train_cql, policy_actions
# FQE lives under d3rlpy.ope; some versions expose DiscreteFQE separately
try:
    from d3rlpy.ope import DiscreteFQE as FQE, FQEConfig
except Exception:
    # fallback for versions that only expose FQE (works for both spaces, but we use it as discrete)
    from d3rlpy.ope import FQE, FQEConfig


# Paths
DATA_PATH   = Path("/Users/macaryan/Coding/Shodh_AI/lendingclub-policy-optimization/data/accepted_2007_to_2018Q4.csv")  # adjust if different
PREPROC_PKL = Path("/Users/macaryan/Coding/Shodh_AI/lendingclub-policy-optimization/notebooks/preprocessor.joblib")
ARTIFACTS   = Path(".")
ARTIFACTS.mkdir(exist_ok=True)

plt.rcParams["figure.figsize"] = (6,4)
pd.set_option("display.max_columns", 200)


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
META_COLS = ["issue_d","loan_amnt","int_rate"]

df = load_and_clean(DATA_PATH, NUM_COLS, CAT_COLS, META_COLS)

train, valid, test = time_split(df, train_end="2016-12-31", val_year="2017")

Xtr_raw, ytr, tr_meta = get_blocks(train, NUM_COLS, CAT_COLS)
Xva_raw, yva, va_meta = get_blocks(valid, NUM_COLS, CAT_COLS)
Xte_raw, yte, te_meta = get_blocks(test,  NUM_COLS, CAT_COLS)

print("Shapes raw:", Xtr_raw.shape, Xva_raw.shape, Xte_raw.shape)
print("Default rate (train/val/test):", round(ytr.mean(),4), round(yva.mean(),4), round(yte.mean(),4))


Shapes raw: (1129956, 20) (178817, 20) (65142, 20)
Default rate (train/val/test): 0.2025 0.2721 0.2717


In [4]:
# Save once
np.save("Xtr.npy", Xtr_raw); np.save("Xva.npy", Xva_raw); np.save("Xte.npy", Xte_raw)
np.save("ytr.npy", ytr); np.save("yva.npy", yva); np.save("yte.npy", yte)
tr_meta.to_parquet("tr_meta.parquet"); va_meta.to_parquet("va_meta.parquet"); te_meta.to_parquet("te_meta.parquet")


In [None]:
# Reload instantly (no CSV parse)
Xtr = np.load("Xtr.npy"); Xva = np.load("Xva.npy"); Xte = np.load("Xte.npy")
ytr = np.load("ytr.npy"); yva = np.load("yva.npy"); yte = np.load("yte.npy")
import pandas as pd
tr_meta = pd.read_parquet("tr_meta.parquet")
va_meta = pd.read_parquet("va_meta.parquet")
te_meta = pd.read_parquet("te_meta.parquet")


In [5]:
preprocessor = joblib.load(PREPROC_PKL)

def transform_df(pre, X_df):
    X = pre.transform(X_df)
    if hasattr(X, "toarray"):
        X = X.toarray()
    return X

Xtr = transform_df(preprocessor, Xtr_raw)
Xva = transform_df(preprocessor, Xva_raw)
Xte = transform_df(preprocessor, Xte_raw)

Xtr.shape, Xva.shape, Xte.shape

((1129956, 63), (178817, 63), (65142, 63))

In [6]:
pte_path = ARTIFACTS / "test_predictions.npy"
if pte_path.exists():
    p_te = np.load(pte_path)                 # p(default | x)
    print("Loaded DL test probabilities:", p_te.shape)
else:
    p_te = None
    print("No test_predictions.npy found; skipping DL policy comparison.")

# Profit-aware DL policy (approve if p < p*)
if p_te is not None:
    thr_te = break_even_p(te_meta["int_rate"].values)
    dl_actions_te = (p_te < thr_te).astype(int)  # 1=approve, 0=deny


Loaded DL test probabilities: (65142,)


In [7]:
def realized_profit(actions, y_true, loan_amnt, int_rate):
    actions = np.asarray(actions).astype(int)
    y_true  = np.asarray(y_true).astype(int)
    loan_amnt = np.asarray(loan_amnt).astype(float)
    int_rate  = np.asarray(int_rate).astype(float)
    # reward when approve:
    approve_profit = np.where(y_true==0, loan_amnt*int_rate, -loan_amnt)
    # reward when deny:
    deny_profit = np.zeros_like(approve_profit)
    return np.where(actions==1, approve_profit, deny_profit)

def policy_summary(name, actions, y_true, loan_amnt, int_rate):
    r = realized_profit(actions, y_true, loan_amnt, int_rate)
    return {
        "policy": name,
        "n": int(len(r)),
        "approve_rate": float(np.mean(actions)),
        "avg_profit": float(np.mean(r)),
        "median_profit": float(np.median(r)),
        "p95_profit": float(np.percentile(r,95)),
        "p5_profit": float(np.percentile(r,5)),
        "total_profit": float(np.sum(r))
    }


In [8]:
from src.rl_cql import build_mdp_dataset, train_cql, policy_actions, fqe_estimate

# Build datasets
mdp_train = build_mdp_dataset(Xtr, tr_meta["loan_amnt"].values, tr_meta["int_rate"].values, ytr)
mdp_valid = build_mdp_dataset(Xva, va_meta["loan_amnt"].values, va_meta["int_rate"].values, yva)
mdp_test  = build_mdp_dataset(Xte, te_meta["loan_amnt"].values, te_meta["int_rate"].values, yte)

# Train (CPU)
algo = train_cql(mdp_train, n_steps=200_000)

# # Actions on test
# rl_actions_te = policy_actions(algo, Xte)

# # FQE (optional, robust helper)
# est_val = fqe_estimate(algo, mdp_train, mdp_valid, n_steps=100_000)
# est_te  = fqe_estimate(algo, mdp_train, mdp_test,  n_steps=100_000)
# print("FQE Estimated Policy Value — Val:", est_val, " Test:", est_te)


[2m2025-10-29 23:30.47[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(63,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-10-29 23:30.47[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-10-29 23:30.50[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m2[0m
[2m2025-10-29 23:30.52[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int64')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(63,)])[0m [36mreward_signature[0m=[35mSignature(dtyp

Epoch 1/20: 100%|██████████| 10000/10000 [03:30<00:00, 47.44it/s, loss=3.7e+3, td_loss=3.7e+3, conservative_loss=0.000979]

[2m2025-10-29 23:34.24[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.013707966494560242, 'time_algorithm_update': 0.0071500230312347415, 'loss': 3698.7780274414063, 'td_loss': 3698.777049658203, 'conservative_loss': 0.000977902539097704, 'time_step': 0.020981254863739012}[0m [36mstep[0m=[35m10000[0m
[2m2025-10-29 23:34.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_10000.d3[0m



Epoch 2/20: 100%|██████████| 10000/10000 [02:43<00:00, 61.06it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0]

[2m2025-10-29 23:37.08[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=2 step=20000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00993980803489685, 'time_algorithm_update': 0.006267837572097778, 'loss': 3644.524125854492, 'td_loss': 3644.524125854492, 'conservative_loss': 0.0, 'time_step': 0.016311634373664854}[0m [36mstep[0m=[35m20000[0m





[2m2025-10-29 23:37.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_20000.d3[0m


Epoch 3/20: 100%|██████████| 10000/10000 [02:38<00:00, 63.25it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=1.2e-8]

[2m2025-10-29 23:39.46[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=3 step=30000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009476372647285462, 'time_algorithm_update': 0.006169161891937256, 'loss': 3640.9628619140626, 'td_loss': 3640.9628607421873, 'conservative_loss': 1.1954277753829956e-06, 'time_step': 0.015747848391532898}[0m [36mstep[0m=[35m30000[0m
[2m2025-10-29 23:39.46[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_30000.d3[0m



Epoch 4/20: 100%|██████████| 10000/10000 [02:37<00:00, 63.31it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0.00148]

[2m2025-10-29 23:42.24[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=4 step=40000[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00944580602645874, 'time_algorithm_update': 0.006183750557899475, 'loss': 3641.2178373046877, 'td_loss': 3641.2163557861327, 'conservative_loss': 0.0014814888834953307, 'time_step': 0.01573201675415039}[0m [36mstep[0m=[35m40000[0m
[2m2025-10-29 23:42.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_40000.d3[0m



Epoch 5/20: 100%|██████████| 10000/10000 [02:37<00:00, 63.68it/s, loss=3.62e+3, td_loss=3.62e+3, conservative_loss=0.041] 

[2m2025-10-29 23:45.01[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=5 step=50000[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009415064072608947, 'time_algorithm_update': 0.006122259497642517, 'loss': 3619.127306640625, 'td_loss': 3619.08631015625, 'conservative_loss': 0.04099639351367951, 'time_step': 0.015638995027542114}[0m [36mstep[0m=[35m50000[0m
[2m2025-10-29 23:45.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_50000.d3[0m



Epoch 6/20: 100%|██████████| 10000/10000 [02:40<00:00, 62.17it/s, loss=3.6e+3, td_loss=3.6e+3, conservative_loss=0.0302] 

[2m2025-10-29 23:47.42[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=6 step=60000[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009753345704078674, 'time_algorithm_update': 0.006166534185409546, 'loss': 3603.3037314208987, 'td_loss': 3603.2735423828126, 'conservative_loss': 0.030189023208618163, 'time_step': 0.0160210990190506}[0m [36mstep[0m=[35m60000[0m
[2m2025-10-29 23:47.42[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_60000.d3[0m



Epoch 7/20: 100%|██████████| 10000/10000 [02:35<00:00, 64.20it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00811]

[2m2025-10-29 23:50.18[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=7 step=70000[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009282199597358704, 'time_algorithm_update': 0.006129122614860535, 'loss': 3594.0172120117186, 'td_loss': 3594.009112915039, 'conservative_loss': 0.00809900312423706, 'time_step': 0.015512884020805359}[0m [36mstep[0m=[35m70000[0m
[2m2025-10-29 23:50.18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_70000.d3[0m



Epoch 8/20: 100%|██████████| 10000/10000 [02:40<00:00, 62.30it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00236]

[2m2025-10-29 23:52.58[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=8 step=80000[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009695485854148865, 'time_algorithm_update': 0.006189799427986145, 'loss': 3593.489909716797, 'td_loss': 3593.4875483154296, 'conservative_loss': 0.0023613900184631348, 'time_step': 0.0159892014503479}[0m [36mstep[0m=[35m80000[0m
[2m2025-10-29 23:52.58[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_80000.d3[0m



Epoch 9/20: 100%|██████████| 10000/10000 [02:38<00:00, 62.97it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00333]

[2m2025-10-29 23:55.37[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=9 step=90000[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00953582272529602, 'time_algorithm_update': 0.006176239633560181, 'loss': 3591.6557236816407, 'td_loss': 3591.6523993896485, 'conservative_loss': 0.0033242856979370117, 'time_step': 0.015815213227272033}[0m [36mstep[0m=[35m90000[0m
[2m2025-10-29 23:55.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_90000.d3[0m



Epoch 10/20: 100%|██████████| 10000/10000 [02:30<00:00, 66.59it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00079]

[2m2025-10-29 23:58.07[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=10 step=100000[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008753699135780335, 'time_algorithm_update': 0.006100441098213196, 'loss': 3585.3560107421877, 'td_loss': 3585.3552215332033, 'conservative_loss': 0.000789171028137207, 'time_step': 0.014957831740379334}[0m [36mstep[0m=[35m100000[0m
[2m2025-10-29 23:58.07[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_100000.d3[0m



Epoch 11/20: 100%|██████████| 10000/10000 [02:31<00:00, 65.93it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0]

[2m2025-10-30 00:00.39[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=11 step=110000[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008875428175926209, 'time_algorithm_update': 0.006132446122169495, 'loss': 3586.2466073242185, 'td_loss': 3586.2466073242185, 'conservative_loss': 0.0, 'time_step': 0.015108173704147338}[0m [36mstep[0m=[35m110000[0m





[2m2025-10-30 00:00.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_110000.d3[0m


Epoch 12/20: 100%|██████████| 10000/10000 [02:32<00:00, 65.64it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 00:03.11[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=12 step=120000[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009035431957244874, 'time_algorithm_update': 0.006036876058578491, 'loss': 3583.1606893310545, 'td_loss': 3583.1606893310545, 'conservative_loss': 0.0, 'time_step': 0.015175068521499633}[0m [36mstep[0m=[35m120000[0m
[2m2025-10-30 00:03.11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_120000.d3[0m



Epoch 13/20: 100%|██████████| 10000/10000 [02:27<00:00, 67.61it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 00:05.39[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=13 step=130000[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008853242325782777, 'time_algorithm_update': 0.005779248046875, 'loss': 3577.5796701660156, 'td_loss': 3577.5796701660156, 'conservative_loss': 0.0, 'time_step': 0.014732733058929443}[0m [36mstep[0m=[35m130000[0m
[2m2025-10-30 00:05.39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_130000.d3[0m



Epoch 14/20: 100%|██████████| 10000/10000 [02:28<00:00, 67.28it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 00:08.08[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=14 step=140000[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00890665259361267, 'time_algorithm_update': 0.005797763276100158, 'loss': 3573.070932836914, 'td_loss': 3573.070932836914, 'conservative_loss': 0.0, 'time_step': 0.014805224514007568}[0m [36mstep[0m=[35m140000[0m
[2m2025-10-30 00:08.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_140000.d3[0m



Epoch 15/20: 100%|██████████| 10000/10000 [02:29<00:00, 66.70it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 00:10.38[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=15 step=150000[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009019245433807372, 'time_algorithm_update': 0.005813837099075317, 'loss': 3571.851830517578, 'td_loss': 3571.851830517578, 'conservative_loss': 0.0, 'time_step': 0.014933407068252564}[0m [36mstep[0m=[35m150000[0m
[2m2025-10-30 00:10.38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_150000.d3[0m



Epoch 16/20: 100%|██████████| 10000/10000 [02:30<00:00, 66.29it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 00:13.09[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=16 step=160000[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009110367202758788, 'time_algorithm_update': 0.005813847017288208, 'loss': 3568.883147753906, 'td_loss': 3568.883147753906, 'conservative_loss': 0.0, 'time_step': 0.015025371217727661}[0m [36mstep[0m=[35m160000[0m
[2m2025-10-30 00:13.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_160000.d3[0m



Epoch 17/20: 100%|██████████| 10000/10000 [02:29<00:00, 67.06it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 00:15.38[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=17 step=170000[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008985117888450623, 'time_algorithm_update': 0.005770624113082885, 'loss': 3568.686716381836, 'td_loss': 3568.686716381836, 'conservative_loss': 0.0, 'time_step': 0.014854708909988404}[0m [36mstep[0m=[35m170000[0m
[2m2025-10-30 00:15.38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_170000.d3[0m



Epoch 18/20: 100%|██████████| 10000/10000 [02:29<00:00, 66.83it/s, loss=3.56e+3, td_loss=3.56e+3, conservative_loss=0]


[2m2025-10-30 00:18.08[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=18 step=180000[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009027961468696595, 'time_algorithm_update': 0.005775844144821167, 'loss': 3561.191135498047, 'td_loss': 3561.191135498047, 'conservative_loss': 0.0, 'time_step': 0.014903050661087037}[0m [36mstep[0m=[35m180000[0m
[2m2025-10-30 00:18.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_180000.d3[0m


Epoch 19/20: 100%|██████████| 10000/10000 [02:29<00:00, 66.86it/s, loss=3.56e+3, td_loss=3.56e+3, conservative_loss=1.91e-10]

[2m2025-10-30 00:20.37[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=19 step=190000[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00902596538066864, 'time_algorithm_update': 0.0057707313299179075, 'loss': 3559.7494353515626, 'td_loss': 3559.7494353515626, 'conservative_loss': 1.9073486328125e-10, 'time_step': 0.014898727560043335}[0m [36mstep[0m=[35m190000[0m
[2m2025-10-30 00:20.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_190000.d3[0m



Epoch 20/20: 100%|██████████| 10000/10000 [02:33<00:00, 65.25it/s, loss=3.56e+3, td_loss=3.56e+3, conservative_loss=1.77e-5]

[2m2025-10-30 00:23.10[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251029233053: epoch=20 step=200000[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009408315253257752, 'time_algorithm_update': 0.005753175973892212, 'loss': 3560.088860961914, 'td_loss': 3560.088843310547, 'conservative_loss': 1.7656707763671873e-05, 'time_step': 0.015263005924224854}[0m [36mstep[0m=[35m200000[0m
[2m2025-10-30 00:23.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251029233053/model_200000.d3[0m





In [11]:
import inspect
from src.rl_cql import _make_cql  # we already wrote this helper earlier

def train_cql_nb(mdp_train, n_steps=200_000):
    """
    Notebook override of train_cql so we ALWAYS return the model,
    even if .fit(...) returns a list.
    """
    algo = _make_cql()

    # try to build
    try:
        algo.build_with_dataset(mdp_train)
    except Exception:
        pass

    fit_sig  = inspect.signature(algo.fit)
    fit_kwds = set(fit_sig.parameters.keys())

    kwargs = {}
    if "dataset" in fit_kwds:
        kwargs["dataset"] = mdp_train
    if "n_steps" in fit_kwds:
        kwargs["n_steps"] = n_steps
    if "n_steps_per_epoch" in fit_kwds:
        kwargs["n_steps_per_epoch"] = 10_000

    # call fit
    try:
        _ = algo.fit(**kwargs)
    except TypeError:
        ds = kwargs.pop("dataset", mdp_train)
        _ = algo.fit(ds, **kwargs)

    # 🔴 IMPORTANT: always return the model, ignore what fit returned
    return algo


In [17]:
algo = train_cql_nb(mdp_train, n_steps=200_000)

[2m2025-10-30 07:56.14[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(63,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-10-30 07:56.14[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteCQL_20251030075614[0m
[2m2025-10-30 07:56.14[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [63], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 1024, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'compile_graph': False, 'learning_rate': 0.0003, 'optim_factory': {'type': 'adam', 'pa

Epoch 1/20: 100%|██████████| 10000/10000 [04:22<00:00, 38.12it/s, loss=3.7e+3, td_loss=3.7e+3, conservative_loss=0.000884] 

[2m2025-10-30 08:00.37[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01772546148300171, 'time_algorithm_update': 0.008224328088760375, 'loss': 3703.496393994141, 'td_loss': 3703.495511083984, 'conservative_loss': 0.0008830552247120067, 'time_step': 0.026101881909370423}[0m [36mstep[0m=[35m10000[0m
[2m2025-10-30 08:00.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_10000.d3[0m



Epoch 2/20: 100%|██████████| 10000/10000 [04:12<00:00, 39.59it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0]

[2m2025-10-30 08:04.49[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=2 step=20000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.016973043203353883, 'time_algorithm_update': 0.008015899634361267, 'loss': 3642.636858544922, 'td_loss': 3642.636858544922, 'conservative_loss': 0.0, 'time_step': 0.025129512453079223}[0m [36mstep[0m=[35m20000[0m





[2m2025-10-30 08:04.49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_20000.d3[0m


Epoch 3/20: 100%|██████████| 10000/10000 [04:34<00:00, 36.46it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0]

[2m2025-10-30 08:09.24[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=3 step=30000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.018579974150657652, 'time_algorithm_update': 0.008536725425720215, 'loss': 3642.715951098633, 'td_loss': 3642.715951098633, 'conservative_loss': 0.0, 'time_step': 0.027277433228492735}[0m [36mstep[0m=[35m30000[0m





[2m2025-10-30 08:09.24[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_30000.d3[0m


Epoch 4/20: 100%|██████████| 10000/10000 [03:53<00:00, 42.83it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0.000618]

[2m2025-10-30 08:13.17[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=4 step=40000[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.015387221908569336, 'time_algorithm_update': 0.007714990639686585, 'loss': 3636.611420727539, 'td_loss': 3636.6108035888674, 'conservative_loss': 0.0006171760350465775, 'time_step': 0.023237613368034364}[0m [36mstep[0m=[35m40000[0m
[2m2025-10-30 08:13.17[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_40000.d3[0m



Epoch 5/20: 100%|██████████| 10000/10000 [04:52<00:00, 34.18it/s, loss=3.62e+3, td_loss=3.62e+3, conservative_loss=0.043] 


[2m2025-10-30 08:18.10[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=5 step=50000[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.020135783195495607, 'time_algorithm_update': 0.008764088678359986, 'loss': 3619.4839703125, 'td_loss': 3619.441000341797, 'conservative_loss': 0.04296980444192886, 'time_step': 0.029105269050598143}[0m [36mstep[0m=[35m50000[0m
[2m2025-10-30 08:18.10[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_50000.d3[0m


Epoch 6/20: 100%|██████████| 10000/10000 [03:57<00:00, 42.07it/s, loss=3.61e+3, td_loss=3.61e+3, conservative_loss=0.0404]

[2m2025-10-30 08:22.07[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=6 step=60000[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01605104558467865, 'time_algorithm_update': 0.007472174024581909, 'loss': 3606.408186254883, 'td_loss': 3606.3678606689455, 'conservative_loss': 0.04032558546066284, 'time_step': 0.023661257457733155}[0m [36mstep[0m=[35m60000[0m
[2m2025-10-30 08:22.08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_60000.d3[0m



Epoch 7/20: 100%|██████████| 10000/10000 [04:25<00:00, 37.67it/s, loss=3.6e+3, td_loss=3.6e+3, conservative_loss=0.0123]  

[2m2025-10-30 08:26.33[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=7 step=70000[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.018119954442977905, 'time_algorithm_update': 0.008134889960289002, 'loss': 3597.300731982422, 'td_loss': 3597.288395776367, 'conservative_loss': 0.012336229419708251, 'time_step': 0.02640734374523163}[0m [36mstep[0m=[35m70000[0m
[2m2025-10-30 08:26.33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_70000.d3[0m



Epoch 8/20: 100%|██████████| 10000/10000 [04:18<00:00, 38.68it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00648]

[2m2025-10-30 08:30.52[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=8 step=80000[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.017558500814437866, 'time_algorithm_update': 0.008012994599342346, 'loss': 3589.768605786133, 'td_loss': 3589.7621342773436, 'conservative_loss': 0.0064714768409729, 'time_step': 0.025719834446907043}[0m [36mstep[0m=[35m80000[0m





[2m2025-10-30 08:30.52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_80000.d3[0m


Epoch 9/20: 100%|██████████| 10000/10000 [07:30<00:00, 22.22it/s, loss=3.6e+3, td_loss=3.6e+3, conservative_loss=0.00422]  

[2m2025-10-30 08:38.22[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=9 step=90000[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.03661282286643982, 'time_algorithm_update': 0.008104105544090271, 'loss': 3595.778217163086, 'td_loss': 3595.7739975341797, 'conservative_loss': 0.004219574737548828, 'time_step': 0.04486341814994812}[0m [36mstep[0m=[35m90000[0m





[2m2025-10-30 08:38.22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_90000.d3[0m


Epoch 10/20: 100%|██████████| 10000/10000 [03:35<00:00, 46.50it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=0.00157]

[2m2025-10-30 08:41.57[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=10 step=100000[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.013856401419639588, 'time_algorithm_update': 0.00741534423828125, 'loss': 3594.768078515625, 'td_loss': 3594.7665136230466, 'conservative_loss': 0.0015649120330810547, 'time_step': 0.021406792998313905}[0m [36mstep[0m=[35m100000[0m
[2m2025-10-30 08:41.57[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_100000.d3[0m



Epoch 11/20: 100%|██████████| 10000/10000 [03:41<00:00, 45.16it/s, loss=3.59e+3, td_loss=3.59e+3, conservative_loss=7.83e-6]

[2m2025-10-30 08:45.38[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=11 step=110000[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.014370469546318054, 'time_algorithm_update': 0.007525546455383301, 'loss': 3585.5598541259765, 'td_loss': 3585.5598463134766, 'conservative_loss': 7.820320129394532e-06, 'time_step': 0.022032292675971985}[0m [36mstep[0m=[35m110000[0m
[2m2025-10-30 08:45.38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_110000.d3[0m



Epoch 12/20: 100%|██████████| 10000/10000 [04:58<00:00, 33.50it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 08:50.37[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=12 step=120000[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.020632814931869508, 'time_algorithm_update': 0.008892390179634094, 'loss': 3584.876978491211, 'td_loss': 3584.876978491211, 'conservative_loss': 0.0, 'time_step': 0.02968693175315857}[0m [36mstep[0m=[35m120000[0m





[2m2025-10-30 08:50.37[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_120000.d3[0m


Epoch 13/20: 100%|██████████| 10000/10000 [04:50<00:00, 34.38it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 08:55.28[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=13 step=130000[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.019927389740943907, 'time_algorithm_update': 0.008856986975669862, 'loss': 3583.13093425293, 'td_loss': 3583.13093425293, 'conservative_loss': 0.0, 'time_step': 0.028938938641548156}[0m [36mstep[0m=[35m130000[0m





[2m2025-10-30 08:55.28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_130000.d3[0m


Epoch 14/20: 100%|██████████| 10000/10000 [04:21<00:00, 38.26it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 08:59.49[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=14 step=140000[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.017860235953330993, 'time_algorithm_update': 0.007996753907203674, 'loss': 3576.9740880615236, 'td_loss': 3576.9740880615236, 'conservative_loss': 0.0, 'time_step': 0.02600061273574829}[0m [36mstep[0m=[35m140000[0m
[2m2025-10-30 08:59.49[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_140000.d3[0m



Epoch 15/20: 100%|██████████| 10000/10000 [04:21<00:00, 38.19it/s, loss=3.58e+3, td_loss=3.58e+3, conservative_loss=0]

[2m2025-10-30 09:04.11[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=15 step=150000[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.017867127966880798, 'time_algorithm_update': 0.00803241422176361, 'loss': 3579.9559470947265, 'td_loss': 3579.9559470947265, 'conservative_loss': 0.0, 'time_step': 0.026047499561309813}[0m [36mstep[0m=[35m150000[0m





[2m2025-10-30 09:04.11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_150000.d3[0m


Epoch 16/20: 100%|██████████| 10000/10000 [04:31<00:00, 36.85it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 09:08.42[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=16 step=160000[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.018404480361938477, 'time_algorithm_update': 0.008406436729431152, 'loss': 3573.175107446289, 'td_loss': 3573.175107446289, 'conservative_loss': 0.0, 'time_step': 0.026990618658065797}[0m [36mstep[0m=[35m160000[0m
[2m2025-10-30 09:08.42[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_160000.d3[0m



Epoch 17/20: 100%|██████████| 10000/10000 [04:05<00:00, 40.75it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 09:12.48[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=17 step=170000[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01667151975631714, 'time_algorithm_update': 0.007608928656578064, 'loss': 3569.9510028320315, 'td_loss': 3569.9510028320315, 'conservative_loss': 0.0, 'time_step': 0.024419804072380067}[0m [36mstep[0m=[35m170000[0m
[2m2025-10-30 09:12.48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_170000.d3[0m



Epoch 18/20: 100%|██████████| 10000/10000 [04:55<00:00, 33.85it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]


[2m2025-10-30 09:17.43[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=18 step=180000[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.020451636028289796, 'time_algorithm_update': 0.008757887268066407, 'loss': 3569.9036631103518, 'td_loss': 3569.9036631103518, 'conservative_loss': 0.0, 'time_step': 0.029380690479278566}[0m [36mstep[0m=[35m180000[0m
[2m2025-10-30 09:17.43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_180000.d3[0m


Epoch 19/20: 100%|██████████| 10000/10000 [04:47<00:00, 34.82it/s, loss=3.57e+3, td_loss=3.57e+3, conservative_loss=0]

[2m2025-10-30 09:22.30[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=19 step=190000[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.01874050612449646, 'time_algorithm_update': 0.009679274559020996, 'loss': 3565.9149115234377, 'td_loss': 3565.9149115234377, 'conservative_loss': 0.0, 'time_step': 0.028572401356697082}[0m [36mstep[0m=[35m190000[0m
[2m2025-10-30 09:22.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_190000.d3[0m



Epoch 20/20: 100%|██████████| 10000/10000 [04:40<00:00, 35.60it/s, loss=3.56e+3, td_loss=3.56e+3, conservative_loss=0]

[2m2025-10-30 09:27.11[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075614: epoch=20 step=200000[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.018260151767730715, 'time_algorithm_update': 0.009531409692764283, 'loss': 3561.279971557617, 'td_loss': 3561.279971557617, 'conservative_loss': 0.0, 'time_step': 0.027950141501426697}[0m [36mstep[0m=[35m200000[0m
[2m2025-10-30 09:27.11[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075614/model_200000.d3[0m





In [18]:
import numpy as np

def realized_reward(actions, y_true, loan_amnt, int_rate):
    actions   = np.asarray(actions).astype(int)
    y_true    = np.asarray(y_true).astype(int)
    loan_amnt = np.asarray(loan_amnt).astype(float)
    int_rate  = np.asarray(int_rate).astype(float)

    approve_reward = np.where(y_true == 0, loan_amnt * int_rate, -loan_amnt)
    deny_reward    = np.zeros_like(approve_reward)

    return np.where(actions == 1, approve_reward, deny_reward)

def policy_value(actions, y_true, loan_amnt, int_rate):
    r = realized_reward(actions, y_true, loan_amnt, int_rate)
    return {
        "approve_rate": float((np.asarray(actions)==1).mean()),
        "mean_reward": float(r.mean()),
        "total_reward": float(r.sum())
    }

# 1) RL policy actions
rl_actions_te = algo.predict(Xte.astype(np.float32))
rl_val = policy_value(
    rl_actions_te,
    yte,
    te_meta["loan_amnt"].values,
    te_meta["int_rate"].values,
)
print("RL policy:", rl_val)

# 2) Always-approve baseline
always_approve = np.ones_like(yte)
aa_val = policy_value(
    always_approve,
    yte,
    te_meta["loan_amnt"].values,
    te_meta["int_rate"].values,
)
print("Always-approve:", aa_val)

# 3) Always-deny baseline
always_deny = np.zeros_like(yte)
ad_val = policy_value(
    always_deny,
    yte,
    te_meta["loan_amnt"].values,
    te_meta["int_rate"].values,
)
print("Always-deny:", ad_val)


RL policy: {'approve_rate': 1.0, 'mean_reward': -3316.01138286359, 'total_reward': -216011613.50249997}
Always-approve: {'approve_rate': 1.0, 'mean_reward': -3316.01138286359, 'total_reward': -216011613.50249997}
Always-deny: {'approve_rate': 0.0, 'mean_reward': 0.0, 'total_reward': 0.0}


In [15]:
from src.rl_cql import fqe_estimate, build_mdp_dataset

# we already have: mdp_train, mdp_valid, mdp_test
est_val = fqe_estimate(algo, mdp_train, mdp_valid, n_steps=10_000)
est_te  = fqe_estimate(algo, mdp_train, mdp_test,  n_steps=10_000)

print("Estimated Policy Value (FQE) — Val:", est_val)
print("Estimated Policy Value (FQE) — Test:", est_te)


[2m2025-10-30 07:29.15[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(63,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-10-30 07:29.15[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-10-30 07:29.15[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-10-30 07:29.15[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DiscreteFQE_20251030072915[0m
[2m2025-10-30 07:29.15[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [63], 'action_size': 2, 'config': {'type': 'fqe', 'params': {'batch_size': 100, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_sc

Epoch 1/1: 100%|██████████| 10000/10000 [00:45<00:00, 217.63it/s, loss=3.86e+3]

[2m2025-10-30 07:30.01[0m [[32m[1minfo     [0m] [1mDiscreteFQE_20251030072915: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00188371741771698, 'time_algorithm_update': 0.0026010443449020386, 'loss': 3860.603049987793, 'time_step': 0.00456871645450592}[0m [36mstep[0m=[35m10000[0m





[2m2025-10-30 07:30.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteFQE_20251030072915/model_10000.d3[0m


AttributeError: 'DiscreteFQE' object has no attribute 'estimate_value'