In [1]:
!pip install d3rlpy pyarrow

Collecting d3rlpy
  Downloading d3rlpy-2.8.1-py3-none-any.whl.metadata (11 kB)
Collecting gym>=0.26.0 (from d3rlpy)
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting structlog (from d3rlpy)
  Downloading structlog-25.4.0-py3-none-any.whl.metadata (7.6 kB)
Collecting colorama (from d3rlpy)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dataclasses-json (from d3rlpy)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting gymnasium==1.0.0 (from d3rlpy)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json->d3rlpy)
  Downloading marshmallow-3.26.1-py3-none-any.whl.met

In [3]:
import d3rlpy

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [4]:
# Display the installed d3rlpy version as the cell output.
d3rlpy.__version__

'2.8.1'

In [15]:
from d3rlpy.metrics import EnvironmentEvaluator

In [None]:
import pandas as pd
import numpy as np
import torch
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import d3rlpy
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import DiscreteCQLConfig
from d3rlpy.ope import FQEConfig, DiscreteFQE

# Silence every warning category for the rest of the session, then report the
# library version the notebook is running against.
warnings.simplefilter('ignore')
print(f"Using d3rlpy version: {d3rlpy.__version__}")

# =============================
# STEP 1 — Preprocessing
# =============================
def build_preprocessor(X_data):
    """Build an unfitted ColumnTransformer for the loan features found in ``X_data``.

    Four feature groups are declared below; each is filtered down to the
    columns that actually exist in ``X_data`` and routed to its own pipeline:

    * log features      -> median impute -> ``log1p`` -> standard scale
    * numeric features  -> median impute -> standard scale
    * categorical       -> most-frequent impute -> one-hot (unknowns ignored)
    * ordinal           -> most-frequent impute -> ordinal codes (unknowns -> -1)

    Parameters
    ----------
    X_data : pandas.DataFrame
        Frame used to decide which columns are available and to collect the
        observed category values of the ordinal columns.

    Returns
    -------
    tuple
        ``(preprocessor, all_features)`` where ``preprocessor`` is the unfitted
        ``ColumnTransformer`` and ``all_features`` is the list of input column
        names it consumes (log + numeric + categorical + ordinal, in that order).
    """
    log_transform_features = [
        'annual_inc', 'tot_coll_amt', 'delinq_amnt', 'tax_liens',
        'pub_rec', 'revol_bal', 'tot_cur_bal'
    ]
    numeric_features = [
        'loan_amnt', 'funded_amnt', 'installment', 'fico_range_low',
        'num_tl_op_past_12m', 'acc_open_past_24mths', 'inq_last_6mths', 'dti',
        'revol_util', 'open_acc', 'mort_acc', 'bc_util', 'emp_length_int',
        'credit_history_length_mths', 'avg_cur_bal', 'bc_open_to_buy',
        'chargeoff_within_12_mths', 'delinq_2yrs', 'mo_sin_old_il_acct',
        'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
        'mths_since_recent_bc', 'num_accts_ever_120_pd', 'num_actv_bc_tl',
        'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
        'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
        'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
        'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
        'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
        'total_il_high_credit_limit', 'total_acc', 'total_rev_hi_lim'
    ]
    categorical_features = [
        'verification_status', 'home_ownership', 'purpose',
        'initial_list_status', 'application_type'
    ]
    ordinal_features = ['grade', 'sub_grade', 'term']

    # Keep only the columns that are actually present in the supplied frame.
    existing_cols = X_data.columns
    log_transform_features = [c for c in log_transform_features if c in existing_cols]
    numeric_features = [c for c in numeric_features if c in existing_cols]
    categorical_features = [c for c in categorical_features if c in existing_cols]
    ordinal_features = [c for c in ordinal_features if c in existing_cols]

    # OrdinalEncoder needs exactly one category list per encoded column, in
    # column order. Build the lists from the *filtered* ordinal columns so the
    # two stay aligned when grade / sub_grade / term is absent from X_data.
    ordinal_categories = [
        sorted(X_data[col].dropna().unique()) for col in ordinal_features
    ]

    log_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        # log1p yields NaN / -inf for inputs <= -1; these columns are
        # presumably non-negative amounts and counts (verify against the data).
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scale', StandardScaler())
    ])
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ])
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    ordinal_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Columns not named in any group are dropped from the output.
    preprocessor = ColumnTransformer([
        ('log', log_pipeline, log_transform_features),
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        ('ord', ordinal_pipeline, ordinal_features)
    ], remainder='drop')

    all_features = log_transform_features + numeric_features + categorical_features + ordinal_features
    return preprocessor, all_features


# =============================
# STEP 2 — Load Data
# =============================
print("Loading data...")
df_rl = pd.read_parquet('processed_loan_data.parquet')

# Actions are synthetic: each row is randomly assigned deny/approve. A seeded
# generator keeps the assignment identical across kernel restarts.
rng = np.random.default_rng(42)
# two possible actions: 0 = deny, 1 = approve
df_rl['action'] = rng.integers(0, 2, size=len(df_rl))

# Reward of approving: lose the principal on default, otherwise earn the
# interest-rate fraction of the loan amount.
approve_reward = np.where(df_rl['is_default'] == 1, -df_rl['loan_amnt'],
                          df_rl['loan_amnt'] * (df_rl['int_rate'] / 100.0))
# Denying a loan neither earns nor loses money. Without this the reward would
# be independent of the action and the learned Q-values could not tell the two
# actions apart.
df_rl['reward'] = np.where(df_rl['action'] == 1, approve_reward, 0.0)
# Every row is a one-step episode.
df_rl['terminal'] = 1

# =============================
# STEP 3 — Split into train/test
# =============================
# Split row positions first so the preprocessing statistics below are learned
# from training rows only (no test-set leakage into imputers / scalers).
train_idx, test_idx = train_test_split(np.arange(len(df_rl)), test_size=0.2, random_state=42)

# =============================
# STEP 4 — Transform Data
# =============================
train_rows = df_rl.iloc[train_idx]
preprocessor, used_features = build_preprocessor(train_rows)
preprocessor.fit(train_rows[used_features])

# Transform all rows with the train-fitted preprocessor; X stays aligned with
# df_rl row positions so train_idx / test_idx index it directly.
X = preprocessor.transform(df_rl[used_features]).astype(np.float32)
X = np.nan_to_num(X, nan=0.0)

actions = df_rl['action'].values.astype(np.int32)
rewards = df_rl['reward'].values.astype(np.float32)
terminals = df_rl['terminal'].values.astype(np.float32)

train_data = MDPDataset(X[train_idx], actions[train_idx], rewards[train_idx], terminals[train_idx])
test_data = MDPDataset(X[test_idx], actions[test_idx], rewards[test_idx], terminals[test_idx])

print(f"Train: {len(train_idx)}, Test: {len(test_idx)}")

# =============================
# STEP 5 — Offline RL: Discrete CQL
# =============================
# Seed Python, NumPy and PyTorch RNGs so network initialisation and mini-batch
# sampling are repeatable across runs.
d3rlpy.seed(42)

# Train on the first CUDA device when one is available, otherwise on CPU.
use_gpu = torch.cuda.is_available()
device = "cuda:0" if use_gpu else "cpu"
print(f"Using device: {device}")

# Discrete CQL with the library's default hyper-parameters.
cql = DiscreteCQLConfig().create(device=device)
print("Training Discrete CQL (offline)...")

cql.fit(train_data, n_steps=50_000)
# Persist the learned parameters next to the notebook.
cql.save_model("discrete_cql_model.pt")

# =============================
# STEP 6 — Offline Evaluation: FQE
# =============================
print("\nRunning Fitted Q Evaluation (FQE)...")

# FQE fits a separate Q-function that estimates the value of the (frozen)
# CQL policy from the logged training transitions.
fqe_config = FQEConfig()
fqe = DiscreteFQE(algo=cql, config=fqe_config, device=device)

fqe.fit(dataset=train_data, n_steps=25_000)

# d3rlpy 2.x algorithms have no ``evaluate`` method; metrics are evaluator
# callables invoked as ``evaluator(algo, dataset)``. This one returns the mean
# estimated action-value at the initial state of each held-out episode.
value_evaluator = d3rlpy.metrics.InitialStateValueEstimationEvaluator()
estimated_value = value_evaluator(fqe, test_data)

print("\n===================================")
print("  Offline Policy Evaluation Result  ")
print("===================================")
print(f"Estimated Policy Value (FQE): {estimated_value:.2f}")
print("===================================")
