In [9]:
import joblib
import numpy as np
import pandas as pd
import torch

# Import d3rlpy
import d3rlpy
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import DiscreteCQLConfig  # FIXED: Use DiscreteCQL for discrete actions
from d3rlpy.ope import DiscreteFQE, FQEConfig  # FIXED: Use DiscreteFQE for discrete actions

# Check for GPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [10]:
# --- Task 3.1: Load Preprocessed Data ---

print("Loading preprocessed data from 'artifacts/preprocessed_data.joblib'...")
try:
    data = joblib.load('artifacts/preprocessed_data.joblib')
    
    # These are the processed states for the model
    X_train = data['X_train']
    X_test = data['X_test']
    
    # These are the raw dataframes needed to calculate rewards
    train_data_rl = data['train_data_rl']
    test_data_rl = data['test_data_rl']
    
    print("Data loaded successfully.")
    print(f"X_train (states) shape: {X_train.shape}")
    print(f"train_data_rl (for rewards) shape: {train_data_rl.shape}")
except FileNotFoundError:
    print("Error: 'artifacts/preprocessed_data.joblib' not found.")
    print("Please run the 01-eda-preprocessing.ipynb notebook to generate the data.")


Loading preprocessed data from 'artifacts/preprocessed_data.joblib'...
Data loaded successfully.
X_train (states) shape: (71541, 37)
train_data_rl (for rewards) shape: (71541, 14)


In [11]:
# --- Task 3.1: Load Preprocessed Data ---

print("Loading preprocessed data from 'artifacts/preprocessed_data.joblib'...")
try:
    data = joblib.load('artifacts/preprocessed_data.joblib')
    
    # These are the processed states for the model
    X_train = data['X_train']
    X_test = data['X_test']
    
    # These are the raw dataframes needed to calculate rewards
    train_data_rl = data['train_data_rl']
    test_data_rl = data['test_data_rl']
    
    print("Data loaded successfully.")
    print(f"X_train (states) shape: {X_train.shape}")
    print(f"train_data_rl (for rewards) shape: {train_data_rl.shape}")
except FileNotFoundError:
    print("Error: 'artifacts/preprocessed_data.joblib' not found.")
    print("Please run the 01-eda-preprocessing.ipynb notebook to generate the data.")

Loading preprocessed data from 'artifacts/preprocessed_data.joblib'...
Data loaded successfully.
X_train (states) shape: (71541, 37)
train_data_rl (for rewards) shape: (71541, 14)


In [12]:
# --- Task 3.3: Train an Offline RL Agent ---

# FIXED: Use DiscreteCQL for discrete action spaces
# DiscreteCQL is specifically designed for discrete actions (0: Deny, 1: Approve)
cql = DiscreteCQLConfig().create(device=device)

# Train the Agent
print("Starting RL agent training (DiscreteCQL)...")
cql.fit(
    train_dataset,
    n_steps=10000,  # This is 'training steps', not epochs. 10k is fast.
    n_steps_per_epoch=1000
)

print("RL agent training complete.")

# Save the model
cql.save_model('artifacts/discrete_cql_model.d3')
print("DiscreteCQL model saved to 'artifacts/discrete_cql_model.d3'")

Starting RL agent training (DiscreteCQL)...
2025-10-29 21:46.20 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(37,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-29 21:46.20 [debug    ] Building models...            
2025-10-29 21:46.22 [debug    ] Models have been built.       
2025-10-29 21:46.22 [info     ] Directory is created at d3rlpy_logs\DiscreteCQL_20251029214622
2025-10-29 21:46.22 [info     ] Parameters                     params={'observation_shape': [37], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'compile_graph': False, 'learning_rat

Epoch 1/10: 100%|██████████| 1000/1000 [00:08<00:00, 115.74it/s, loss=4.47e+3, td_loss=4.47e+3, conservative_loss=0.0556]


2025-10-29 21:46.30 [info     ] DiscreteCQL_20251029214622: epoch=1 step=1000 epoch=1 metrics={'time_sample_batch': 0.0007089624404907226, 'time_algorithm_update': 0.007626190662384034, 'loss': 4470.60221496582, 'td_loss': 4470.547122924805, 'conservative_loss': 0.0550950580611825, 'time_step': 0.008517817497253418} step=1000
2025-10-29 21:46.30 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_1000.d3


Epoch 2/10: 100%|██████████| 1000/1000 [00:06<00:00, 146.45it/s, loss=4.21e+3, td_loss=4.21e+3, conservative_loss=0]


2025-10-29 21:46.37 [info     ] DiscreteCQL_20251029214622: epoch=2 step=2000 epoch=2 metrics={'time_sample_batch': 0.0005082817077636719, 'time_algorithm_update': 0.006107254981994629, 'loss': 4209.876989074707, 'td_loss': 4209.876989074707, 'conservative_loss': 0.0, 'time_step': 0.006747395038604736} step=2000
2025-10-29 21:46.37 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_2000.d3


Epoch 3/10: 100%|██████████| 1000/1000 [00:06<00:00, 148.23it/s, loss=3.98e+3, td_loss=3.98e+3, conservative_loss=0]


2025-10-29 21:46.44 [info     ] DiscreteCQL_20251029214622: epoch=3 step=3000 epoch=3 metrics={'time_sample_batch': 0.0005014455318450928, 'time_algorithm_update': 0.006043472528457642, 'loss': 3985.0275782470703, 'td_loss': 3985.0275782470703, 'conservative_loss': 0.0, 'time_step': 0.006673449754714966} step=3000
2025-10-29 21:46.44 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_3000.d3


Epoch 4/10: 100%|██████████| 1000/1000 [00:06<00:00, 160.31it/s, loss=3.9e+3, td_loss=3.9e+3, conservative_loss=0] 


2025-10-29 21:46.50 [info     ] DiscreteCQL_20251029214622: epoch=4 step=4000 epoch=4 metrics={'time_sample_batch': 0.000455376148223877, 'time_algorithm_update': 0.0055951507091522214, 'loss': 3901.589845458984, 'td_loss': 3901.589845458984, 'conservative_loss': 0.0, 'time_step': 0.0061730678081512455} step=4000
2025-10-29 21:46.50 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_4000.d3


Epoch 5/10: 100%|██████████| 1000/1000 [00:05<00:00, 168.18it/s, loss=3.84e+3, td_loss=3.84e+3, conservative_loss=0]


2025-10-29 21:46.56 [info     ] DiscreteCQL_20251029214622: epoch=5 step=5000 epoch=5 metrics={'time_sample_batch': 0.00045026493072509766, 'time_algorithm_update': 0.005317702770233155, 'loss': 3851.515514953613, 'td_loss': 3851.515514953613, 'conservative_loss': 0.0, 'time_step': 0.0058844823837280276} step=5000
2025-10-29 21:46.56 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_5000.d3


Epoch 6/10: 100%|██████████| 1000/1000 [00:06<00:00, 164.47it/s, loss=3.74e+3, td_loss=3.74e+3, conservative_loss=0]


2025-10-29 21:47.02 [info     ] DiscreteCQL_20251029214622: epoch=6 step=6000 epoch=6 metrics={'time_sample_batch': 0.0004787309169769287, 'time_algorithm_update': 0.005416768789291382, 'loss': 3731.7278041381837, 'td_loss': 3731.7278041381837, 'conservative_loss': 0.0, 'time_step': 0.006017614603042603} step=6000
2025-10-29 21:47.02 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_6000.d3


Epoch 7/10: 100%|██████████| 1000/1000 [00:06<00:00, 161.35it/s, loss=3.73e+3, td_loss=3.73e+3, conservative_loss=0]


2025-10-29 21:47.08 [info     ] DiscreteCQL_20251029214622: epoch=7 step=7000 epoch=7 metrics={'time_sample_batch': 0.0005619540214538574, 'time_algorithm_update': 0.0054262735843658445, 'loss': 3728.787400085449, 'td_loss': 3728.787400085449, 'conservative_loss': 0.0, 'time_step': 0.006126837730407715} step=7000
2025-10-29 21:47.08 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_7000.d3


Epoch 8/10: 100%|██████████| 1000/1000 [00:06<00:00, 161.70it/s, loss=3.73e+3, td_loss=3.73e+3, conservative_loss=0]


2025-10-29 21:47.15 [info     ] DiscreteCQL_20251029214622: epoch=8 step=8000 epoch=8 metrics={'time_sample_batch': 0.000549363374710083, 'time_algorithm_update': 0.005420393228530883, 'loss': 3730.991396911621, 'td_loss': 3730.991396911621, 'conservative_loss': 0.0, 'time_step': 0.006110477447509766} step=8000
2025-10-29 21:47.15 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_8000.d3


Epoch 9/10: 100%|██████████| 1000/1000 [00:06<00:00, 161.01it/s, loss=3.66e+3, td_loss=3.66e+3, conservative_loss=0]


2025-10-29 21:47.21 [info     ] DiscreteCQL_20251029214622: epoch=9 step=9000 epoch=9 metrics={'time_sample_batch': 0.0005421669483184814, 'time_algorithm_update': 0.005449480056762696, 'loss': 3658.7352639312744, 'td_loss': 3658.7352639312744, 'conservative_loss': 0.0, 'time_step': 0.006136719226837158} step=9000
2025-10-29 21:47.21 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_9000.d3


Epoch 10/10: 100%|██████████| 1000/1000 [00:05<00:00, 172.46it/s, loss=3.64e+3, td_loss=3.64e+3, conservative_loss=0]


2025-10-29 21:47.27 [info     ] DiscreteCQL_20251029214622: epoch=10 step=10000 epoch=10 metrics={'time_sample_batch': 0.0004952807426452637, 'time_algorithm_update': 0.005097994565963745, 'loss': 3635.4052867126466, 'td_loss': 3635.4052867126466, 'conservative_loss': 0.0, 'time_step': 0.00573124885559082} step=10000
2025-10-29 21:47.27 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251029214622\model_10000.d3
RL agent training complete.
DiscreteCQL model saved to 'artifacts/discrete_cql_model.d3'


In [13]:
# --- Task 3.4: Evaluate the RL Policy (Offline Policy Evaluation) ---

print("Starting Offline Policy Evaluation (DiscreteFQE)...")

# Create a DiscreteFQE object using the FQEConfig and the trained cql algo
# Note: FQEConfig is the config dataclass used by both FQE and DiscreteFQE.
fqe = DiscreteFQE(algo=cql, config=FQEConfig(), device=device)

# Fit the FQE on the training dataset
fqe.fit(
    train_dataset,
    n_steps=10000,
    n_steps_per_epoch=1000
)

# Evaluate the policy on the TEST dataset
test_values = []
for episode in test_dataset.episodes:
    obs = episode.observations[0]
    # Predict action using the trained CQL policy
    action = cql.predict(obs.reshape(1, -1))[0]
    # Predict Q-value for the (state, action)
    value = fqe.predict_value(obs.reshape(1, -1), np.array([action]))
    test_values.append(value[0])

estimated_policy_value = np.mean(test_values)

print("\n--- Model 2 (Offline RL) Results ---")
print(f"Estimated Policy Value: {estimated_policy_value:.4f}")
print(f"Number of test episodes evaluated: {len(test_values)}")
print("------------------------------------")

# Save results
rl_results = {
    'estimated_policy_value': estimated_policy_value,
    'test_values': test_values,
    'mean_value': estimated_policy_value,
    'std_value': np.std(test_values)
}
joblib.dump(rl_results, 'artifacts/rl_results.joblib')
print("\nResults saved to 'artifacts/rl_results.joblib'")

Starting Offline Policy Evaluation (DiscreteFQE)...
2025-10-29 21:48.26 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(37,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-29 21:48.26 [debug    ] Building models...            
2025-10-29 21:48.26 [debug    ] Models have been built.       
2025-10-29 21:48.26 [info     ] Directory is created at d3rlpy_logs\DiscreteFQE_20251029214826
2025-10-29 21:48.26 [info     ] Parameters                     params={'observation_shape': [37], 'action_size': 2, 'config': {'type': 'fqe', 'params': {'batch_size': 100, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'compile_graph': False, 'learning_rat

Epoch 1/10: 100%|██████████| 1000/1000 [00:05<00:00, 188.29it/s, loss=4.39e+3]


2025-10-29 21:48.31 [info     ] DiscreteFQE_20251029214826: epoch=1 step=1000 epoch=1 metrics={'time_sample_batch': 0.0011523680686950683, 'time_algorithm_update': 0.003988391399383545, 'loss': 4387.003826416016, 'time_step': 0.005254304170608521} step=1000
2025-10-29 21:48.31 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_1000.d3


Epoch 2/10: 100%|██████████| 1000/1000 [00:05<00:00, 169.77it/s, loss=3.96e+3]


2025-10-29 21:48.37 [info     ] DiscreteFQE_20251029214826: epoch=2 step=2000 epoch=2 metrics={'time_sample_batch': 0.0011481614112854005, 'time_algorithm_update': 0.004565505743026733, 'loss': 3953.143928222656, 'time_step': 0.0058294498920440675} step=2000
2025-10-29 21:48.37 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_2000.d3


Epoch 3/10: 100%|██████████| 1000/1000 [00:05<00:00, 174.83it/s, loss=3.88e+3]


2025-10-29 21:48.43 [info     ] DiscreteFQE_20251029214826: epoch=3 step=3000 epoch=3 metrics={'time_sample_batch': 0.0011268253326416015, 'time_algorithm_update': 0.004420431852340698, 'loss': 3878.672094482422, 'time_step': 0.005661777496337891} step=3000
2025-10-29 21:48.43 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_3000.d3


Epoch 4/10: 100%|██████████| 1000/1000 [00:05<00:00, 179.32it/s, loss=3.73e+3]


2025-10-29 21:48.48 [info     ] DiscreteFQE_20251029214826: epoch=4 step=4000 epoch=4 metrics={'time_sample_batch': 0.0011364557743072509, 'time_algorithm_update': 0.004270117044448853, 'loss': 3734.0758876953123, 'time_step': 0.005520683765411377} step=4000
2025-10-29 21:48.48 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_4000.d3


Epoch 5/10: 100%|██████████| 1000/1000 [00:05<00:00, 182.22it/s, loss=3.73e+3]


2025-10-29 21:48.54 [info     ] DiscreteFQE_20251029214826: epoch=5 step=5000 epoch=5 metrics={'time_sample_batch': 0.0011284747123718262, 'time_algorithm_update': 0.004190766334533692, 'loss': 3731.352971069336, 'time_step': 0.005433071613311768} step=5000
2025-10-29 21:48.54 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_5000.d3


Epoch 6/10: 100%|██████████| 1000/1000 [00:05<00:00, 182.93it/s, loss=3.66e+3]


2025-10-29 21:48.59 [info     ] DiscreteFQE_20251029214826: epoch=6 step=6000 epoch=6 metrics={'time_sample_batch': 0.0011251637935638429, 'time_algorithm_update': 0.004174154281616211, 'loss': 3663.191140991211, 'time_step': 0.005411334991455078} step=6000
2025-10-29 21:48.59 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_6000.d3


Epoch 7/10: 100%|██████████| 1000/1000 [00:05<00:00, 182.12it/s, loss=3.61e+3]


2025-10-29 21:49.05 [info     ] DiscreteFQE_20251029214826: epoch=7 step=7000 epoch=7 metrics={'time_sample_batch': 0.001154026985168457, 'time_algorithm_update': 0.004166457653045654, 'loss': 3613.525942993164, 'time_step': 0.005434349536895752} step=7000
2025-10-29 21:49.05 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_7000.d3


Epoch 8/10: 100%|██████████| 1000/1000 [00:05<00:00, 174.70it/s, loss=3.65e+3]


2025-10-29 21:49.10 [info     ] DiscreteFQE_20251029214826: epoch=8 step=8000 epoch=8 metrics={'time_sample_batch': 0.0012460391521453857, 'time_algorithm_update': 0.004294162750244141, 'loss': 3638.9418165893553, 'time_step': 0.0056616849899291995} step=8000
2025-10-29 21:49.10 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_8000.d3


Epoch 9/10: 100%|██████████| 1000/1000 [00:06<00:00, 162.51it/s, loss=3.64e+3]


2025-10-29 21:49.17 [info     ] DiscreteFQE_20251029214826: epoch=9 step=9000 epoch=9 metrics={'time_sample_batch': 0.0014486286640167235, 'time_algorithm_update': 0.004489414930343628, 'loss': 3648.560751586914, 'time_step': 0.006077306509017944} step=9000
2025-10-29 21:49.17 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_9000.d3


Epoch 10/10: 100%|██████████| 1000/1000 [00:05<00:00, 168.80it/s, loss=3.59e+3]


2025-10-29 21:49.23 [info     ] DiscreteFQE_20251029214826: epoch=10 step=10000 epoch=10 metrics={'time_sample_batch': 0.0013811919689178467, 'time_algorithm_update': 0.004331860780715942, 'loss': 3585.993712524414, 'time_step': 0.005852826356887817} step=10000
2025-10-29 21:49.23 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251029214826\model_10000.d3

--- Model 2 (Offline RL) Results ---
Estimated Policy Value: 1924.2563
Number of test episodes evaluated: 17886
------------------------------------

Results saved to 'artifacts/rl_results.joblib'
