In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')


In [7]:
# d3rlpy for offline RL
import d3rlpy
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import DiscreteCQLConfig
from d3rlpy.metrics import TDErrorEvaluator, DiscountedSumOfAdvantageEvaluator

import os
os.makedirs('models', exist_ok=True)
os.makedirs('results', exist_ok=True)

print(f"d3rlpy version: {d3rlpy.__version__}")
print("Setup complete! ✓")


d3rlpy version: 2.6.2
Setup complete! ✓


In [8]:

print("\n" + "="*70)
print("LOADING DATA")
print("="*70)

# Load the accepted loans data (memory-efficient approach)
data_path = "D:/policy/accepted_2007_to_2018Q4.csv"
print(f"Loading data from: {data_path}")
print("This may take a minute...")

# First, peek at columns to select only what we need
print("Peeking at data structure...")
df_peek = pd.read_csv(data_path, nrows=100)
print(f"Total columns available: {len(df_peek.columns)}")

# Define columns we actually need (reduces memory significantly)
needed_cols = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    'loan_status', 'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths',
    'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc'
]

# Check which columns exist
available_cols = [col for col in needed_cols if col in df_peek.columns]
print(f"Loading {len(available_cols)} selected columns...")

# Load with chunking to avoid memory issues
chunk_size = 100000
chunks = []
total_rows = 0

for i, chunk in enumerate(pd.read_csv(data_path, usecols=available_cols, 
                                       chunksize=chunk_size, low_memory=False)):
    chunks.append(chunk)
    total_rows += len(chunk)
    if (i + 1) % 5 == 0:
        print(f"  Loaded {total_rows:,} rows...")
    
    # Optional: limit total rows for testing (remove this for full dataset)
    if total_rows >= 500000:  # Load first 500k for testing
        print(f"  Stopping at {total_rows:,} rows for testing (remove limit for full dataset)")
        break

print(f"\nConcatenating chunks...")
df = pd.concat(chunks, ignore_index=True)
del chunks  # Free memory

print(f"\nDataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {df.shape[1]}")
print(f"Rows: {df.shape[0]:,}")

# Display first few rows
print("\nFirst few rows:")
print(df.head())


LOADING DATA
Loading data from: D:/policy/accepted_2007_to_2018Q4.csv
This may take a minute...
Peeking at data structure...
Total columns available: 151
Loading 20 selected columns...
  Loaded 500,000 rows...
  Stopping at 500,000 rows for testing (remove limit for full dataset)

Concatenating chunks...

Dataset loaded successfully!
Shape: (500000, 20)
Columns: 20
Rows: 500,000

First few rows:
   loan_amnt        term  int_rate  installment grade sub_grade emp_length  \
0     3600.0   36 months     13.99       123.03     C        C4  10+ years   
1    24700.0   36 months     11.99       820.28     C        C1  10+ years   
2    20000.0   60 months     10.78       432.66     B        B4  10+ years   
3    35000.0   60 months     14.85       829.90     C        C5  10+ years   
4    10400.0   60 months     22.45       289.91     F        F1    3 years   

  home_ownership  annual_inc verification_status loan_status  \
0       MORTGAGE     55000.0        Not Verified  Fully Paid   
1  

In [9]:

print("\n" + "="*70)
print("EXPLORATORY DATA ANALYSIS")
print("="*70)

# Check loan status distribution
print("\nLoan Status Distribution:")
print(df['loan_status'].value_counts())

# Key columns to explore
print("\nKey Column Info:")
key_cols = ['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'loan_status', 
            'grade', 'emp_length', 'home_ownership', 'purpose']
for col in key_cols:
    if col in df.columns:
        missing_pct = (df[col].isna().sum() / len(df)) * 100
        print(f"{col:20s} - Missing: {missing_pct:.2f}%")



EXPLORATORY DATA ANALYSIS

Loan Status Distribution:
loan_status
Fully Paid            312340
Current               104240
Charged Off            78824
Late (31-120 days)      2977
In Grace Period         1046
Late (16-30 days)        567
Default                    4
Name: count, dtype: int64

Key Column Info:
loan_amnt            - Missing: 0.00%
int_rate             - Missing: 0.00%
annual_inc           - Missing: 0.00%
dti                  - Missing: 0.04%
loan_status          - Missing: 0.00%
grade                - Missing: 0.00%
emp_length           - Missing: 6.18%
home_ownership       - Missing: 0.00%
purpose              - Missing: 0.00%


In [10]:

print("\n" + "="*70)
print("DATA PREPROCESSING")
print("="*70)

# Create binary target: 1 = Default, 0 = Paid
# Consider "Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off" as default
default_statuses = ['Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off']
paid_statuses = ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']

df_clean = df[df['loan_status'].isin(default_statuses + paid_statuses)].copy()
df_clean['defaulted'] = df_clean['loan_status'].isin(default_statuses).astype(int)

print(f"Filtered to completed loans: {len(df_clean):,} rows")
print(f"Default rate: {df_clean['defaulted'].mean():.2%}")

# Select important features for the RL state
# These are features that would be available at the time of loan application
selected_features = [
    # Loan characteristics
    'loan_amnt',
    'term',
    'int_rate',
    'installment',
    'grade',
    'sub_grade',
    
    # Borrower characteristics
    'emp_length',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'purpose',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    
    # Target
    'defaulted',
    'loan_status'
]

# Keep only rows with selected features
df_model = df_clean[selected_features].copy()

print(f"\nSelected {len(selected_features)-2} features for modeling")

# Handle missing values
print("\nHandling missing values...")
# For numeric columns, fill with median
numeric_cols = df_model.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df_model[col].isna().any():
        df_model[col].fillna(df_model[col].median(), inplace=True)

# For categorical columns, fill with mode or 'Unknown'
categorical_cols = df_model.select_dtypes(include=['object']).columns
categorical_cols = [c for c in categorical_cols if c not in ['loan_status', 'defaulted']]
for col in categorical_cols:
    if df_model[col].isna().any():
        df_model[col].fillna(df_model[col].mode()[0] if len(df_model[col].mode()) > 0 else 'Unknown', inplace=True)

print("Missing values handled ✓")



DATA PREPROCESSING
Filtered to completed loans: 391,168 rows
Default rate: 20.15%

Selected 19 features for modeling

Handling missing values...
Missing values handled ✓


In [11]:
# ============================================================================
# PART 5: ENCODE CATEGORICAL VARIABLES
# ============================================================================

print("\n" + "="*70)
print("ENCODING CATEGORICAL VARIABLES")
print("="*70)

# Label encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col].astype(str))
    label_encoders[col] = le
    print(f"Encoded {col}: {len(le.classes_)} unique values")



ENCODING CATEGORICAL VARIABLES
Encoded term: 2 unique values
Encoded grade: 7 unique values
Encoded sub_grade: 35 unique values
Encoded emp_length: 11 unique values
Encoded home_ownership: 4 unique values
Encoded verification_status: 3 unique values
Encoded purpose: 14 unique values


In [12]:
"""
Reward Structure:
- State: Loan applicant features
- Action: 0 = Deny, 1 = Approve
- Reward:
  * If Deny: reward = 0 (no risk, no gain)
  * If Approve & Paid: reward = loan_amnt * int_rate (profit from interest)
  * If Approve & Default: reward = -loan_amnt (loss of principal)
  
IMPORTANT: We need to normalize rewards for better learning!
"""

# Calculate potential reward for each loan
df_model['int_rate_decimal'] = df_model['int_rate'] / 100  # Convert to decimal

# Reward if loan is approved (action=1)
df_model['reward_if_approve'] = np.where(
    df_model['defaulted'] == 0,
    df_model['loan_amnt'] * df_model['int_rate_decimal'],  # Paid: profit
    -df_model['loan_amnt']  # Default: loss
)

print("Raw reward statistics if approved:")
print(df_model['reward_if_approve'].describe())
print(f"\nMean reward (approved loans): ${df_model['reward_if_approve'].mean():,.2f}")
print(f"Default rate: {df_model['defaulted'].mean():.2%}")

# CRITICAL FIX: Normalize rewards to improve learning
# This helps the RL agent learn better by keeping rewards in a reasonable range
reward_mean = df_model['reward_if_approve'].mean()
reward_std = df_model['reward_if_approve'].std()
df_model['reward_normalized'] = (df_model['reward_if_approve'] - reward_mean) / reward_std

print(f"\nNormalized reward stats:")
print(f"  Mean: {df_model['reward_normalized'].mean():.4f}")
print(f"  Std: {df_model['reward_normalized'].std():.4f}")

# Historical action (all loans were approved in this dataset)
df_model['action'] = 1

# Store normalization params for later denormalization
reward_norm_params = {'mean': reward_mean, 'std': reward_std}

Raw reward statistics if approved:
count    391168.000000
mean      -1806.302111
std        8089.876912
min      -40000.000000
25%         384.650000
50%        1015.200000
75%        1958.600000
max       12068.000000
Name: reward_if_approve, dtype: float64

Mean reward (approved loans): $-1,806.30
Default rate: 20.15%

Normalized reward stats:
  Mean: 0.0000
  Std: 1.0000


In [13]:

# ============================================================================
# PART 7: CREATE RL DATASET (OFFLINE)
# ============================================================================

print("\n" + "="*70)
print("CREATING OFFLINE RL DATASET")
print("="*70)

# Prepare features for RL
feature_cols = [col for col in df_model.columns 
                if col not in ['defaulted', 'loan_status', 'reward_if_approve', 
                               'action', 'int_rate_decimal']]

print(f"Using {len(feature_cols)} features as state representation")

# Sample data for faster iteration (optional - remove for full training)
# For initial testing, use a sample. For final model, use full data.
SAMPLE_SIZE = 50000  # Set to None to use all data
if SAMPLE_SIZE and len(df_model) > SAMPLE_SIZE:
    print(f"\nSampling {SAMPLE_SIZE:,} rows for faster iteration...")
    df_rl = df_model.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
else:
    df_rl = df_model.copy()

print(f"RL dataset size: {len(df_rl):,} rows")

# Split into train/test
train_df, test_df = train_test_split(df_rl, test_size=0.2, random_state=42, stratify=df_rl['defaulted'])

print(f"Train size: {len(train_df):,}")
print(f"Test size: {len(test_df):,}")

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[feature_cols])
X_test = scaler.transform(test_df[feature_cols])

# Extract actions and rewards
# Extract actions and rewards (USE NORMALIZED REWARDS)
actions_train = train_df['action'].values
rewards_train = train_df['reward_normalized'].values  # Use normalized!

actions_test = test_df['action'].values
rewards_test = test_df['reward_if_approve'].values  # Keep original for evaluation
rewards_test_normalized = test_df['reward_normalized'].values

# Create terminal flags (all episodes are single-step decisions)
terminals_train = np.ones(len(train_df), dtype=np.float32)
terminals_test = np.ones(len(test_df), dtype=np.float32)

print("\nDataset prepared for d3rlpy ✓")


CREATING OFFLINE RL DATASET
Using 20 features as state representation

Sampling 50,000 rows for faster iteration...
RL dataset size: 50,000 rows
Train size: 40,000
Test size: 10,000

Dataset prepared for d3rlpy ✓


In [14]:

# ============================================================================
# PART 8: CREATE D3RLPY MDP DATASET
# ============================================================================

print("\n" + "="*70)
print("CREATING MDP DATASET FOR D3RLPY")
print("="*70)

# Create MDPDataset for d3rlpy
# For discrete actions, we need to convert to int
actions_train_discrete = actions_train.astype(np.int32)
actions_test_discrete = actions_test.astype(np.int32)

# Create the dataset
mdp_dataset = MDPDataset(
    observations=X_train.astype(np.float32),
    actions=actions_train_discrete,
    rewards=rewards_train.astype(np.float32),
    terminals=terminals_train
)

print(f"MDP Dataset created:")
print(f"  - Episodes: {len(mdp_dataset.episodes)}")
if len(mdp_dataset.episodes) > 0:
    first_ep = mdp_dataset.episodes[0]
    print(f"  - Observation shape: {first_ep.observations.shape}")
    print(f"  - Actions shape: {first_ep.actions.shape}")
    print(f"  - Rewards shape: {first_ep.rewards.shape}")
print("✓ Dataset ready for training")



CREATING MDP DATASET FOR D3RLPY
2025-10-26 14:51.07 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(20,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)])
2025-10-26 14:51.07 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-10-26 14:51.07 [info     ] Action size has been automatically determined. action_size=2
MDP Dataset created:
  - Episodes: 40000
  - Observation shape: (1, 20)
  - Actions shape: (1, 1)
  - Rewards shape: (1, 1)
✓ Dataset ready for training


In [20]:

# ============================================================================
# PART 9: TRAIN OFFLINE RL AGENT (CQL)
# ============================================================================

print("\n" + "="*70)
print("TRAINING OFFLINE RL AGENT")
print("="*70)

"""
We'll use Conservative Q-Learning (CQL) - a popular offline RL algorithm
CQL is designed to handle offline datasets and avoid overestimation bias
"""

# Create CQL algorithm configuration
import torch

# Detect device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu:0'
print(f"Using device: {device}")

# CRITICAL FIX: Increase alpha for more conservative learning
# Higher alpha = more conservative = less likely to approve risky loans
cql = DiscreteCQLConfig(
    batch_size=256,
    learning_rate=1e-5,  # Even lower!
    alpha=100.0,  # MUCH higher! (was 10)
    gamma=0.99,
).create(device=device)


print("Training CQL agent with CONSERVATIVE settings...")
print(f"  - Alpha (conservativeness): {10.0}")
print(f"  - This encourages the agent to be more selective")
print("This may take 10-20 minutes depending on your hardware...")
# Train the agent
results = cql.fit(
    mdp_dataset,
    n_steps=50000,  # Number of training steps
    n_steps_per_epoch=1000,
    evaluators={
        'td_error': TDErrorEvaluator(episodes=mdp_dataset.episodes[:100]),
    },
    show_progress=True,
    save_interval=5
)

print("\nTraining complete! ✓")

# Save the trained model
model_path = "models/cql_loan_agent.d3"
cql.save(model_path)
print(f"Model saved to: {model_path}")


TRAINING OFFLINE RL AGENT
Using device: cpu:0
Training CQL agent with CONSERVATIVE settings...
  - Alpha (conservativeness): 10.0
  - This encourages the agent to be more selective
This may take 10-20 minutes depending on your hardware...
2025-10-26 15:06.12 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(20,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-26 15:06.12 [info     ] Directory is created at d3rlpy_logs\DiscreteCQL_20251026150612
2025-10-26 15:06.12 [debug    ] Building models...            
2025-10-26 15:06.12 [debug    ] Models have been built.       
2025-10-26 15:06.12 [info     ] Parameters                     params={'observation_shape': [20], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 256, 'gamma'

Epoch 1/50: 100%|█████████████████| 1000/1000 [00:16<00:00, 60.52it/s, loss=24, td_loss=0.634, conservative_loss=0.233]


2025-10-26 15:06.29 [info     ] DiscreteCQL_20251026150612: epoch=1 step=1000 epoch=1 metrics={'time_sample_batch': 0.008375758647918702, 'time_algorithm_update': 0.007577334642410278, 'loss': 23.795445760250093, 'td_loss': 0.6392312445193529, 'conservative_loss': 0.23156214529275895, 'time_step': 0.01624019241333008, 'td_error': 4.173821310847997} step=1000


Epoch 2/50: 100%|████████████████| 1000/1000 [00:15<00:00, 64.20it/s, loss=3.57, td_loss=1.37, conservative_loss=0.022]


2025-10-26 15:06.45 [info     ] DiscreteCQL_20251026150612: epoch=2 step=2000 epoch=2 metrics={'time_sample_batch': 0.008044461011886597, 'time_algorithm_update': 0.006966717481613159, 'loss': 3.561951793909073, 'td_loss': 1.367792070865631, 'conservative_loss': 0.02194159723073244, 'time_step': 0.01531667709350586, 'td_error': 5.066076453030109} step=2000


Epoch 3/50: 100%|██████████████| 1000/1000 [00:16<00:00, 60.90it/s, loss=2.01, td_loss=1.15, conservative_loss=0.00862]


2025-10-26 15:07.01 [info     ] DiscreteCQL_20251026150612: epoch=3 step=3000 epoch=3 metrics={'time_sample_batch': 0.00891491460800171, 'time_algorithm_update': 0.006910024881362915, 'loss': 2.008893754482269, 'td_loss': 1.1490965706706047, 'conservative_loss': 0.008597971839364619, 'time_step': 0.01610742998123169, 'td_error': 2.8330667671281846} step=3000


Epoch 4/50: 100%|█████████████| 1000/1000 [00:19<00:00, 52.04it/s, loss=1.02, td_loss=0.536, conservative_loss=0.00486]


2025-10-26 15:07.21 [info     ] DiscreteCQL_20251026150612: epoch=4 step=4000 epoch=4 metrics={'time_sample_batch': 0.009893014430999755, 'time_algorithm_update': 0.008653005838394165, 'loss': 1.018404773235321, 'td_loss': 0.5343837744891643, 'conservative_loss': 0.0048402099821250886, 'time_step': 0.018808122873306275, 'td_error': 1.25584376771576} step=4000


Epoch 5/50: 100%|████████████| 1000/1000 [00:17<00:00, 57.22it/s, loss=0.507, td_loss=0.292, conservative_loss=0.00215]


2025-10-26 15:07.39 [info     ] DiscreteCQL_20251026150612: epoch=5 step=5000 epoch=5 metrics={'time_sample_batch': 0.008852696180343627, 'time_algorithm_update': 0.007959452152252198, 'loss': 0.5057920744121075, 'td_loss': 0.2918235276341438, 'conservative_loss': 0.002139685471076518, 'time_step': 0.017099493026733397, 'td_error': 0.7646665642549828} step=5000
2025-10-26 15:07.39 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_5000.d3


Epoch 6/50: 100%|████████████| 1000/1000 [00:15<00:00, 64.15it/s, loss=0.283, td_loss=0.19, conservative_loss=0.000934]


2025-10-26 15:07.54 [info     ] DiscreteCQL_20251026150612: epoch=6 step=6000 epoch=6 metrics={'time_sample_batch': 0.007904782056808472, 'time_algorithm_update': 0.0071508769989013675, 'loss': 0.2826288196146488, 'td_loss': 0.18946297826617955, 'conservative_loss': 0.0009316584141924977, 'time_step': 0.015340634822845459, 'td_error': 0.3306100400431751} step=6000


Epoch 7/50: 100%|██████████| 1000/1000 [00:20<00:00, 48.48it/s, loss=0.115, td_loss=0.0688, conservative_loss=0.000465]


2025-10-26 15:08.15 [info     ] DiscreteCQL_20251026150612: epoch=7 step=7000 epoch=7 metrics={'time_sample_batch': 0.01084462857246399, 'time_algorithm_update': 0.008992270231246949, 'loss': 0.11474971234425903, 'td_loss': 0.06835508312471211, 'conservative_loss': 0.0004639462917111814, 'time_step': 0.020094813108444215, 'td_error': 0.05781770830565975} step=7000


Epoch 8/50: 100%|█████████| 1000/1000 [00:16<00:00, 61.21it/s, loss=0.0373, td_loss=0.0137, conservative_loss=0.000236]


2025-10-26 15:08.32 [info     ] DiscreteCQL_20251026150612: epoch=8 step=8000 epoch=8 metrics={'time_sample_batch': 0.00858044195175171, 'time_algorithm_update': 0.007223143339157105, 'loss': 0.037187652451917526, 'td_loss': 0.013657403782010078, 'conservative_loss': 0.00023530248695169577, 'time_step': 0.015987998485565185, 'td_error': 0.02242523547059286} step=8000


Epoch 9/50: 100%|█████████| 1000/1000 [00:15<00:00, 65.16it/s, loss=0.0188, td_loss=0.00687, conservative_loss=0.00012]


2025-10-26 15:08.47 [info     ] DiscreteCQL_20251026150612: epoch=9 step=9000 epoch=9 metrics={'time_sample_batch': 0.008406785011291504, 'time_algorithm_update': 0.00642278790473938, 'loss': 0.018781250115484, 'td_loss': 0.006861588779371232, 'conservative_loss': 0.00011919661279534922, 'time_step': 0.0150113365650177, 'td_error': 0.013755030518767057} step=9000


Epoch 10/50: 100%|████████| 1000/1000 [00:17<00:00, 56.50it/s, loss=0.0106, td_loss=0.00423, conservative_loss=6.34e-5]


2025-10-26 15:09.05 [info     ] DiscreteCQL_20251026150612: epoch=10 step=10000 epoch=10 metrics={'time_sample_batch': 0.008979621410369874, 'time_algorithm_update': 0.008142537117004394, 'loss': 0.010548305210191757, 'td_loss': 0.004224998004036024, 'conservative_loss': 6.323307192724314e-05, 'time_step': 0.01735379409790039, 'td_error': 0.008068257258449875} step=10000
2025-10-26 15:09.05 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_10000.d3


Epoch 11/50: 100%|███████| 1000/1000 [00:19<00:00, 51.68it/s, loss=0.00628, td_loss=0.00273, conservative_loss=3.55e-5]


2025-10-26 15:09.25 [info     ] DiscreteCQL_20251026150612: epoch=11 step=11000 epoch=11 metrics={'time_sample_batch': 0.009883401870727539, 'time_algorithm_update': 0.008600157976150513, 'loss': 0.0062688160801772025, 'td_loss': 0.0027236034703673794, 'conservative_loss': 3.545212616154458e-05, 'time_step': 0.018848884344100952, 'td_error': 0.004999434066041886} step=11000


Epoch 12/50: 100%|███████| 1000/1000 [00:18<00:00, 53.80it/s, loss=0.00394, td_loss=0.00193, conservative_loss=2.01e-5]


2025-10-26 15:09.43 [info     ] DiscreteCQL_20251026150612: epoch=12 step=12000 epoch=12 metrics={'time_sample_batch': 0.009973551273345947, 'time_algorithm_update': 0.007905846357345581, 'loss': 0.003936404167441651, 'td_loss': 0.0019303331073606386, 'conservative_loss': 2.0060710550751535e-05, 'time_step': 0.01819117093086243, 'td_error': 0.0032515901774638678} step=12000


Epoch 13/50: 100%|███████| 1000/1000 [00:21<00:00, 45.75it/s, loss=0.00265, td_loss=0.00148, conservative_loss=1.16e-5]


2025-10-26 15:10.05 [info     ] DiscreteCQL_20251026150612: epoch=13 step=13000 epoch=13 metrics={'time_sample_batch': 0.011243335485458373, 'time_algorithm_update': 0.009927904844284057, 'loss': 0.0026455274585168807, 'td_loss': 0.0014831287130364216, 'conservative_loss': 1.1623987462371588e-05, 'time_step': 0.021518816709518433, 'td_error': 0.0024092508662275237} step=13000


Epoch 14/50: 100%|████████| 1000/1000 [00:18<00:00, 54.35it/s, loss=0.0019, td_loss=0.00121, conservative_loss=6.93e-6]


2025-10-26 15:10.24 [info     ] DiscreteCQL_20251026150612: epoch=14 step=14000 epoch=14 metrics={'time_sample_batch': 0.00967641282081604, 'time_algorithm_update': 0.008121387720108033, 'loss': 0.0018965933021390811, 'td_loss': 0.0012046508479397744, 'conservative_loss': 6.919424515217543e-06, 'time_step': 0.018105736017227172, 'td_error': 0.0018705851388335758} step=14000


Epoch 15/50: 100%|███████| 1000/1000 [00:16<00:00, 61.97it/s, loss=0.00142, td_loss=0.00101, conservative_loss=4.12e-6]


2025-10-26 15:10.40 [info     ] DiscreteCQL_20251026150612: epoch=15 step=15000 epoch=15 metrics={'time_sample_batch': 0.008541188955307007, 'time_algorithm_update': 0.007074373483657837, 'loss': 0.0014185368744656444, 'td_loss': 0.001007555854099337, 'conservative_loss': 4.109810222871601e-06, 'time_step': 0.015889602184295654, 'td_error': 0.0015510196970237545} step=15000
2025-10-26 15:10.40 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_15000.d3


Epoch 16/50: 100%|██████| 1000/1000 [00:15<00:00, 63.93it/s, loss=0.00112, td_loss=0.000864, conservative_loss=2.52e-6]


2025-10-26 15:10.56 [info     ] DiscreteCQL_20251026150612: epoch=16 step=16000 epoch=16 metrics={'time_sample_batch': 0.008238198041915893, 'time_algorithm_update': 0.006834311008453369, 'loss': 0.0011140980832860805, 'td_loss': 0.0008632251941016875, 'conservative_loss': 2.508728881366551e-06, 'time_step': 0.01541198182106018, 'td_error': 0.0013812364766968698} step=16000


Epoch 17/50: 100%|█████| 1000/1000 [00:19<00:00, 50.26it/s, loss=0.000903, td_loss=0.000749, conservative_loss=1.54e-6]


2025-10-26 15:11.16 [info     ] DiscreteCQL_20251026150612: epoch=17 step=17000 epoch=17 metrics={'time_sample_batch': 0.010718430995941162, 'time_algorithm_update': 0.008542296409606933, 'loss': 0.0009026784370071254, 'td_loss': 0.0007489148324821144, 'conservative_loss': 1.5376360388472675e-06, 'time_step': 0.019554494619369506, 'td_error': 0.0011824871145296712} step=17000


Epoch 18/50: 100%|█████| 1000/1000 [00:15<00:00, 62.68it/s, loss=0.000759, td_loss=0.000662, conservative_loss=9.78e-7]


2025-10-26 15:11.32 [info     ] DiscreteCQL_20251026150612: epoch=18 step=18000 epoch=18 metrics={'time_sample_batch': 0.008364786624908447, 'time_algorithm_update': 0.0071315023899078366, 'loss': 0.0007588553951936774, 'td_loss': 0.0006611912288353778, 'conservative_loss': 9.766416624188424e-07, 'time_step': 0.01573989391326904, 'td_error': 0.001010371419092948} step=18000


Epoch 19/50: 100%|██████| 1000/1000 [00:16<00:00, 61.27it/s, loss=0.00065, td_loss=0.000586, conservative_loss=6.36e-7]


2025-10-26 15:11.49 [info     ] DiscreteCQL_20251026150612: epoch=19 step=19000 epoch=19 metrics={'time_sample_batch': 0.008570176601409912, 'time_algorithm_update': 0.007188028573989868, 'loss': 0.0006497500430850778, 'td_loss': 0.0005862241111462935, 'conservative_loss': 6.352593190968037e-07, 'time_step': 0.016033046960830688, 'td_error': 0.000886736999908102} step=19000


Epoch 20/50: 100%|██████| 1000/1000 [00:16<00:00, 62.08it/s, loss=0.000561, td_loss=0.00052, conservative_loss=4.11e-7]


2025-10-26 15:12.05 [info     ] DiscreteCQL_20251026150612: epoch=20 step=20000 epoch=20 metrics={'time_sample_batch': 0.008467483043670654, 'time_algorithm_update': 0.007091650485992432, 'loss': 0.0005607256806397344, 'td_loss': 0.0005196638543857262, 'conservative_loss': 4.1061826050281527e-07, 'time_step': 0.015866975545883177, 'td_error': 0.0008222692729134451} step=20000
2025-10-26 15:12.05 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_20000.d3


Epoch 21/50: 100%|█████| 1000/1000 [00:13<00:00, 74.09it/s, loss=0.000492, td_loss=0.000466, conservative_loss=2.69e-7]


2025-10-26 15:12.19 [info     ] DiscreteCQL_20251026150612: epoch=21 step=21000 epoch=21 metrics={'time_sample_batch': 0.007417468547821045, 'time_algorithm_update': 0.005716494560241699, 'loss': 0.0004919828455313108, 'td_loss': 0.00046511796157574283, 'conservative_loss': 2.686488442122936e-07, 'time_step': 0.013301262378692627, 'td_error': 0.000755415781826514} step=21000


Epoch 22/50: 100%|██████| 1000/1000 [00:14<00:00, 68.79it/s, loss=0.000437, td_loss=0.00042, conservative_loss=1.74e-7]


2025-10-26 15:12.33 [info     ] DiscreteCQL_20251026150612: epoch=22 step=22000 epoch=22 metrics={'time_sample_batch': 0.007564565181732178, 'time_algorithm_update': 0.006437102556228638, 'loss': 0.0004370036422333214, 'td_loss': 0.000419651612493908, 'conservative_loss': 1.735202968120575e-07, 'time_step': 0.014297362565994263, 'td_error': 0.0006848509955609927} step=22000


Epoch 23/50: 100%|██████| 1000/1000 [00:13<00:00, 73.90it/s, loss=0.000392, td_loss=0.00038, conservative_loss=1.12e-7]


2025-10-26 15:12.47 [info     ] DiscreteCQL_20251026150612: epoch=23 step=23000 epoch=23 metrics={'time_sample_batch': 0.007491914987564087, 'time_algorithm_update': 0.005633583545684814, 'loss': 0.00039138735859887676, 'td_loss': 0.0003801394964393694, 'conservative_loss': 1.1247862130403518e-07, 'time_step': 0.013327855110168457, 'td_error': 0.000628104111224701} step=23000


Epoch 24/50: 100%|█████| 1000/1000 [00:13<00:00, 75.97it/s, loss=0.000351, td_loss=0.000344, conservative_loss=7.42e-8]


2025-10-26 15:13.00 [info     ] DiscreteCQL_20251026150612: epoch=24 step=24000 epoch=24 metrics={'time_sample_batch': 0.007297961711883545, 'time_algorithm_update': 0.0054562842845916745, 'loss': 0.0003512543194519822, 'td_loss': 0.00034383400683873336, 'conservative_loss': 7.420312613248825e-08, 'time_step': 0.012945385456085205, 'td_error': 0.0006112366975503391} step=24000


Epoch 25/50: 100%|██████| 1000/1000 [00:15<00:00, 65.02it/s, loss=0.00032, td_loss=0.000316, conservative_loss=4.75e-8]


2025-10-26 15:13.16 [info     ] DiscreteCQL_20251026150612: epoch=25 step=25000 epoch=25 metrics={'time_sample_batch': 0.007991410732269288, 'time_algorithm_update': 0.0066914610862731936, 'loss': 0.0003202701659029117, 'td_loss': 0.0003155214452272048, 'conservative_loss': 4.748720675706863e-08, 'time_step': 0.015005312204360962, 'td_error': 0.0005473999931427897} step=25000
2025-10-26 15:13.16 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_25000.d3


Epoch 26/50: 100%|█████| 1000/1000 [00:12<00:00, 79.51it/s, loss=0.000294, td_loss=0.000291, conservative_loss=3.07e-8]


2025-10-26 15:13.28 [info     ] DiscreteCQL_20251026150612: epoch=26 step=26000 epoch=26 metrics={'time_sample_batch': 0.006251884698867798, 'time_algorithm_update': 0.006019286394119263, 'loss': 0.00029372646956471725, 'td_loss': 0.00029066516571037935, 'conservative_loss': 3.0613038688898086e-08, 'time_step': 0.012469389915466308, 'td_error': 0.0005206293275633911} step=26000


Epoch 27/50: 100%|██████| 1000/1000 [00:12<00:00, 77.69it/s, loss=0.00027, td_loss=0.000268, conservative_loss=1.95e-8]


2025-10-26 15:13.42 [info     ] DiscreteCQL_20251026150612: epoch=27 step=27000 epoch=27 metrics={'time_sample_batch': 0.006654207468032837, 'time_algorithm_update': 0.00587020468711853, 'loss': 0.00026992135887849144, 'td_loss': 0.00026796921355708034, 'conservative_loss': 1.9521452486515044e-08, 'time_step': 0.012743255138397218, 'td_error': 0.0005117335722852445} step=27000


Epoch 28/50: 100%|█████| 1000/1000 [00:13<00:00, 73.12it/s, loss=0.000249, td_loss=0.000248, conservative_loss=1.15e-8]


2025-10-26 15:13.55 [info     ] DiscreteCQL_20251026150612: epoch=28 step=28000 epoch=28 metrics={'time_sample_batch': 0.007228280782699585, 'time_algorithm_update': 0.006036463499069214, 'loss': 0.00024906899442430583, 'td_loss': 0.00024792202410753815, 'conservative_loss': 1.1469703167676926e-08, 'time_step': 0.013440795183181763, 'td_error': 0.0004456292547157403} step=28000


Epoch 29/50: 100%|██████| 1000/1000 [00:14<00:00, 70.38it/s, loss=0.000232, td_loss=0.000232, conservative_loss=7.1e-9]


2025-10-26 15:14.10 [info     ] DiscreteCQL_20251026150612: epoch=29 step=29000 epoch=29 metrics={'time_sample_batch': 0.007966842412948608, 'time_algorithm_update': 0.005834408283233642, 'loss': 0.00023217920283786952, 'td_loss': 0.00023146986099891365, 'conservative_loss': 7.093418389558792e-09, 'time_step': 0.01403377890586853, 'td_error': 0.0004237716856347262} step=29000


Epoch 30/50: 100%|█████| 1000/1000 [00:13<00:00, 74.66it/s, loss=0.000216, td_loss=0.000215, conservative_loss=4.04e-9]


2025-10-26 15:14.23 [info     ] DiscreteCQL_20251026150612: epoch=30 step=30000 epoch=30 metrics={'time_sample_batch': 0.0072796642780303955, 'time_algorithm_update': 0.005610124111175537, 'loss': 0.00021558429606375283, 'td_loss': 0.00021518126621958799, 'conservative_loss': 4.030298441648483e-09, 'time_step': 0.013089730501174927, 'td_error': 0.00039429911103468386} step=30000
2025-10-26 15:14.23 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_30000.d3


Epoch 31/50: 100%|█████| 1000/1000 [00:14<00:00, 71.42it/s, loss=0.000202, td_loss=0.000201, conservative_loss=2.44e-9]


2025-10-26 15:14.37 [info     ] DiscreteCQL_20251026150612: epoch=31 step=31000 epoch=31 metrics={'time_sample_batch': 0.007656116962432861, 'time_algorithm_update': 0.005871856927871704, 'loss': 0.00020156814804067835, 'td_loss': 0.0002013258179067634, 'conservative_loss': 2.423301339149475e-09, 'time_step': 0.013730573415756226, 'td_error': 0.0003829542842375133} step=31000


Epoch 32/50: 100%|██████| 1000/1000 [00:13<00:00, 75.30it/s, loss=0.000188, td_loss=0.000188, conservative_loss=1.3e-9]


2025-10-26 15:14.51 [info     ] DiscreteCQL_20251026150612: epoch=32 step=32000 epoch=32 metrics={'time_sample_batch': 0.007404232263565063, 'time_algorithm_update': 0.005478645324707031, 'loss': 0.00018799198514898307, 'td_loss': 0.00018786239161272533, 'conservative_loss': 1.2959353625774383e-09, 'time_step': 0.013120784759521484, 'td_error': 0.0003441825886373451} step=32000


Epoch 33/50: 100%|█████| 1000/1000 [00:14<00:00, 70.14it/s, loss=0.000176, td_loss=0.000176, conservative_loss=7.3e-10]


2025-10-26 15:15.05 [info     ] DiscreteCQL_20251026150612: epoch=33 step=33000 epoch=33 metrics={'time_sample_batch': 0.007955808401107788, 'time_algorithm_update': 0.005787583112716675, 'loss': 0.00017571652228070888, 'td_loss': 0.0001756434600247303, 'conservative_loss': 7.306225597858429e-10, 'time_step': 0.013961008310317994, 'td_error': 0.0003144668767471792} step=33000


Epoch 34/50: 100%|████| 1000/1000 [00:14<00:00, 70.66it/s, loss=0.000165, td_loss=0.000165, conservative_loss=4.48e-10]


2025-10-26 15:15.20 [info     ] DiscreteCQL_20251026150612: epoch=34 step=34000 epoch=34 metrics={'time_sample_batch': 0.007766737937927246, 'time_algorithm_update': 0.005841442584991455, 'loss': 0.0001654223748773802, 'td_loss': 0.0001653777179599274, 'conservative_loss': 4.4656917452812193e-10, 'time_step': 0.01387801742553711, 'td_error': 0.0003385834152396061} step=34000


Epoch 35/50: 100%|████| 1000/1000 [00:13<00:00, 74.87it/s, loss=0.000156, td_loss=0.000156, conservative_loss=2.29e-10]


2025-10-26 15:15.33 [info     ] DiscreteCQL_20251026150612: epoch=35 step=35000 epoch=35 metrics={'time_sample_batch': 0.007318354845046997, 'time_algorithm_update': 0.005564684867858887, 'loss': 0.00015591156112350292, 'td_loss': 0.00015588860402203863, 'conservative_loss': 2.2957101464271545e-10, 'time_step': 0.013106711149215699, 'td_error': 0.0003013198305335152} step=35000
2025-10-26 15:15.33 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_35000.d3


Epoch 36/50: 100%|████| 1000/1000 [00:13<00:00, 74.30it/s, loss=0.000147, td_loss=0.000147, conservative_loss=1.13e-10]


2025-10-26 15:15.47 [info     ] DiscreteCQL_20251026150612: epoch=36 step=36000 epoch=36 metrics={'time_sample_batch': 0.007438535928726196, 'time_algorithm_update': 0.0055991849899291995, 'loss': 0.00014741848786798073, 'td_loss': 0.00014740721886482788, 'conservative_loss': 1.126900315284729e-10, 'time_step': 0.013208956718444824, 'td_error': 0.00027872954690842276} step=36000


Epoch 37/50: 100%|████| 1000/1000 [00:13<00:00, 74.32it/s, loss=0.000139, td_loss=0.000139, conservative_loss=5.78e-11]


2025-10-26 15:16.00 [info     ] DiscreteCQL_20251026150612: epoch=37 step=37000 epoch=37 metrics={'time_sample_batch': 0.007462238073348999, 'time_algorithm_update': 0.00545283555984497, 'loss': 0.0001391529761822312, 'td_loss': 0.00013914724854839733, 'conservative_loss': 5.727633833885193e-11, 'time_step': 0.01316091251373291, 'td_error': 0.0002638896643520638} step=37000


Epoch 38/50: 100%|████| 1000/1000 [00:13<00:00, 71.86it/s, loss=0.000132, td_loss=0.000132, conservative_loss=2.73e-11]


2025-10-26 15:16.14 [info     ] DiscreteCQL_20251026150612: epoch=38 step=38000 epoch=38 metrics={'time_sample_batch': 0.007643033027648926, 'time_algorithm_update': 0.005703411102294922, 'loss': 0.00013233994391339366, 'td_loss': 0.00013233724307792726, 'conservative_loss': 2.7008354663848878e-11, 'time_step': 0.013613165855407715, 'td_error': 0.00024653081816208735} step=38000


Epoch 39/50: 100%|████| 1000/1000 [00:13<00:00, 75.90it/s, loss=0.000126, td_loss=0.000126, conservative_loss=1.27e-11]


2025-10-26 15:16.28 [info     ] DiscreteCQL_20251026150612: epoch=39 step=39000 epoch=39 metrics={'time_sample_batch': 0.007091171503067016, 'time_algorithm_update': 0.00564107084274292, 'loss': 0.00012585783057875234, 'td_loss': 0.0001258565732932766, 'conservative_loss': 1.257285475730896e-11, 'time_step': 0.012953288555145263, 'td_error': 0.00023670228061511978} step=39000


Epoch 40/50: 100%|████| 1000/1000 [00:13<00:00, 74.32it/s, loss=0.000119, td_loss=0.000119, conservative_loss=1.46e-11]


2025-10-26 15:16.41 [info     ] DiscreteCQL_20251026150612: epoch=40 step=40000 epoch=40 metrics={'time_sample_batch': 0.0075231845378875735, 'time_algorithm_update': 0.005433382034301758, 'loss': 0.00011874437760707224, 'td_loss': 0.00011874293405708158, 'conservative_loss': 1.4435499906539917e-11, 'time_step': 0.013162339925765992, 'td_error': 0.00021874811386951798} step=40000
2025-10-26 15:16.41 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_40000.d3


Epoch 41/50: 100%|████| 1000/1000 [00:13<00:00, 75.55it/s, loss=0.000114, td_loss=0.000114, conservative_loss=5.17e-12]


2025-10-26 15:16.55 [info     ] DiscreteCQL_20251026150612: epoch=41 step=41000 epoch=41 metrics={'time_sample_batch': 0.007265565395355225, 'time_algorithm_update': 0.005447373628616333, 'loss': 0.00011370600467489567, 'td_loss': 0.00011370549244747963, 'conservative_loss': 5.1222741603851315e-12, 'time_step': 0.012964205980300904, 'td_error': 0.00023309518075060432} step=41000


Epoch 42/50: 100%|█████| 1000/1000 [00:14<00:00, 68.16it/s, loss=0.000108, td_loss=0.000108, conservative_loss=4.7e-13]


2025-10-26 15:17.10 [info     ] DiscreteCQL_20251026150612: epoch=42 step=42000 epoch=42 metrics={'time_sample_batch': 0.008114847183227539, 'time_algorithm_update': 0.006106138467788696, 'loss': 0.00010789889620355098, 'td_loss': 0.00010789884963742224, 'conservative_loss': 4.656612873077393e-13, 'time_step': 0.014412575721740723, 'td_error': 0.00020024155548867384} step=42000


Epoch 43/50: 100%|███████████| 1000/1000 [00:13<00:00, 74.86it/s, loss=0.000103, td_loss=0.000103, conservative_loss=0]


2025-10-26 15:17.23 [info     ] DiscreteCQL_20251026150612: epoch=43 step=43000 epoch=43 metrics={'time_sample_batch': 0.007647024869918823, 'time_algorithm_update': 0.005317799806594849, 'loss': 0.00010280119951494271, 'td_loss': 0.00010280119951494271, 'conservative_loss': 0.0, 'time_step': 0.013188709974288941, 'td_error': 0.00019546855573171484} step=43000


Epoch 44/50: 100%|█████████████| 1000/1000 [00:13<00:00, 75.55it/s, loss=9.93e-5, td_loss=9.93e-5, conservative_loss=0]


2025-10-26 15:17.37 [info     ] DiscreteCQL_20251026150612: epoch=44 step=44000 epoch=44 metrics={'time_sample_batch': 0.006927061796188355, 'time_algorithm_update': 0.0059272701740264895, 'loss': 9.934260779118632e-05, 'td_loss': 9.934260779118632e-05, 'conservative_loss': 0.0, 'time_step': 0.013103506803512573, 'td_error': 0.00017084458582907302} step=44000


Epoch 45/50: 100%|█████████████| 1000/1000 [00:13<00:00, 75.75it/s, loss=9.43e-5, td_loss=9.43e-5, conservative_loss=0]


2025-10-26 15:17.50 [info     ] DiscreteCQL_20251026150612: epoch=45 step=45000 epoch=45 metrics={'time_sample_batch': 0.007287786245346069, 'time_algorithm_update': 0.005544368505477905, 'loss': 9.43169008969562e-05, 'td_loss': 9.43169008969562e-05, 'conservative_loss': 0.0, 'time_step': 0.013019045352935792, 'td_error': 0.0001607446280223712} step=45000
2025-10-26 15:17.50 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_45000.d3


Epoch 46/50: 100%|█████████████| 1000/1000 [00:13<00:00, 73.02it/s, loss=9.04e-5, td_loss=9.04e-5, conservative_loss=0]


2025-10-26 15:18.04 [info     ] DiscreteCQL_20251026150612: epoch=46 step=46000 epoch=46 metrics={'time_sample_batch': 0.007681147575378418, 'time_algorithm_update': 0.00559993314743042, 'loss': 9.035296644287883e-05, 'td_loss': 9.035296644287883e-05, 'conservative_loss': 0.0, 'time_step': 0.01352216100692749, 'td_error': 0.0001820505364217695} step=46000


Epoch 47/50: 100%|█████████████| 1000/1000 [00:13<00:00, 75.49it/s, loss=8.76e-5, td_loss=8.76e-5, conservative_loss=0]


2025-10-26 15:18.17 [info     ] DiscreteCQL_20251026150612: epoch=47 step=47000 epoch=47 metrics={'time_sample_batch': 0.007391171932220459, 'time_algorithm_update': 0.005395124197006226, 'loss': 8.758623108951724e-05, 'td_loss': 8.758623108951724e-05, 'conservative_loss': 0.0, 'time_step': 0.012988981485366822, 'td_error': 0.0001693865545997042} step=47000


Epoch 48/50: 100%|█████████████| 1000/1000 [00:12<00:00, 77.86it/s, loss=8.31e-5, td_loss=8.31e-5, conservative_loss=0]


2025-10-26 15:18.30 [info     ] DiscreteCQL_20251026150612: epoch=48 step=48000 epoch=48 metrics={'time_sample_batch': 0.007048283815383911, 'time_algorithm_update': 0.00545750093460083, 'loss': 8.311769763167831e-05, 'td_loss': 8.311769763167831e-05, 'conservative_loss': 0.0, 'time_step': 0.01265396547317505, 'td_error': 0.000155014893778187} step=48000


Epoch 49/50: 100%|█████████████| 1000/1000 [00:13<00:00, 73.67it/s, loss=8.08e-5, td_loss=8.08e-5, conservative_loss=0]


2025-10-26 15:18.44 [info     ] DiscreteCQL_20251026150612: epoch=49 step=49000 epoch=49 metrics={'time_sample_batch': 0.007392178297042847, 'time_algorithm_update': 0.00575860333442688, 'loss': 8.08268277105526e-05, 'td_loss': 8.08268277105526e-05, 'conservative_loss': 0.0, 'time_step': 0.013424071311950683, 'td_error': 0.00016165180158878912} step=49000


Epoch 50/50: 100%|█████████████| 1000/1000 [00:13<00:00, 73.56it/s, loss=7.77e-5, td_loss=7.77e-5, conservative_loss=0]


2025-10-26 15:18.58 [info     ] DiscreteCQL_20251026150612: epoch=50 step=50000 epoch=50 metrics={'time_sample_batch': 0.007485133409500122, 'time_algorithm_update': 0.005736316680908203, 'loss': 7.768553005007561e-05, 'td_loss': 7.768553005007561e-05, 'conservative_loss': 0.0, 'time_step': 0.01344212532043457, 'td_error': 0.00013826473256501438} step=50000
2025-10-26 15:18.58 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251026150612\model_50000.d3

Training complete! ✓
Model saved to: models/cql_loan_agent.d3


In [21]:

# ============================================================================
# PART 10: EVALUATE THE POLICY
# ============================================================================

print("\n" + "="*70)
print("EVALUATING THE LEARNED POLICY")
print("="*70)

# Predict actions on test set
predicted_actions = cql.predict(X_test)

print(f"Test set predictions:")
print(f"  - Total applications: {len(predicted_actions)}")
print(f"  - Approved by policy: {predicted_actions.sum()}")
print(f"  - Denied by policy: {len(predicted_actions) - predicted_actions.sum()}")
print(f"  - Approval rate: {predicted_actions.mean():.2%}")

# Calculate expected return under the learned policy
# Reward if deny = 0
# Reward if approve = actual reward
policy_rewards = np.where(
    predicted_actions == 1,
    rewards_test,  # If approved, get actual reward
    0  # If denied, no reward
)

print(f"\nPolicy Performance on Test Set:")
print(f"  - Total expected return: ${policy_rewards.sum():,.2f}")
print(f"  - Mean reward per decision: ${policy_rewards.mean():,.2f}")

# Compare to baseline strategies
# Baseline 1: Approve all (current strategy in historical data)
approve_all_return = rewards_test.sum()
approve_all_mean = rewards_test.mean()

print(f"\nBaseline (Approve All):")
print(f"  - Total return: ${approve_all_return:,.2f}")
print(f"  - Mean reward: ${approve_all_mean:,.2f}")

# Baseline 2: Deny all (safest but no profit)
deny_all_return = 0
deny_all_mean = 0

print(f"\nBaseline (Deny All):")
print(f"  - Total return: ${deny_all_return:,.2f}")
print(f"  - Mean reward: ${deny_all_mean:,.2f}")

# Calculate improvement
improvement_vs_approve_all = ((policy_rewards.sum() - approve_all_return) / abs(approve_all_return)) * 100

print(f"\n{'='*70}")
print(f"POLICY IMPROVEMENT: {improvement_vs_approve_all:+.2f}% vs Approve All")
print(f"{'='*70}")



EVALUATING THE LEARNED POLICY
Test set predictions:
  - Total applications: 10000
  - Approved by policy: 10000
  - Denied by policy: 0
  - Approval rate: 100.00%

Policy Performance on Test Set:
  - Total expected return: $-17,998,407.52
  - Mean reward per decision: $-1,799.84

Baseline (Approve All):
  - Total return: $-17,998,407.52
  - Mean reward: $-1,799.84

Baseline (Deny All):
  - Total return: $0.00
  - Mean reward: $0.00

POLICY IMPROVEMENT: +0.00% vs Approve All


In [22]:
print("\n" + "="*70)
print("ANALYZING POLICY DECISIONS")
print("="*70)

# Add predictions to test dataframe
test_df_analysis = test_df.copy()
test_df_analysis['predicted_action'] = predicted_actions
test_df_analysis['actual_reward'] = rewards_test

# Analyze by default status
print("\nPolicy decisions by actual outcome:")
print("-" * 50)
for default_status in [0, 1]:
    mask = test_df_analysis['defaulted'] == default_status
    status_name = "Fully Paid" if default_status == 0 else "Defaulted"
    approval_rate = test_df_analysis[mask]['predicted_action'].mean()
    count = mask.sum()
    avg_reward = test_df_analysis[mask]['actual_reward'].mean()
    print(f"{status_name:12s} ({count:5d} loans):")
    print(f"  - Agent approved: {approval_rate:6.1%}")
    print(f"  - Avg reward: ${avg_reward:8,.2f}")

# Calculate what the OPTIMAL policy would be
print("\n" + "="*70)
print("WHAT IF WE ONLY APPROVED PROFITABLE LOANS?")
print("="*70)
optimal_actions = (rewards_test > 0).astype(int)
optimal_return = (optimal_actions * rewards_test).sum()
optimal_approval_rate = optimal_actions.mean()

print(f"Optimal (Oracle) Strategy:")
print(f"  - Would approve: {optimal_approval_rate:.1%} of loans")
print(f"  - Total return: ${optimal_return:,.2f}")
print(f"  - Mean reward: ${optimal_return / len(rewards_test):,.2f}")

# Compare policies
print("\n" + "="*70)
print("POLICY COMPARISON")
print("="*70)
print(f"{'Strategy':<20s} {'Approval Rate':>15s} {'Total Return':>20s} {'Mean Return':>15s}")
print("-" * 70)
print(f"{'Deny All':<20s} {0:>14.1%} ${0:>18,.2f} ${0:>13,.2f}")
print(f"{'Approve All':<20s} {1.0:>14.1%} ${approve_all_return:>18,.2f} ${approve_all_mean:>13,.2f}")
print(f"{'RL Policy':<20s} {predicted_actions.mean():>14.1%} ${policy_rewards.sum():>18,.2f} ${policy_rewards.mean():>13,.2f}")
print(f"{'Optimal (Oracle)':<20s} {optimal_approval_rate:>14.1%} ${optimal_return:>18,.2f} ${optimal_return/len(rewards_test):>13,.2f}")

# Find interesting cases
print("\n" + "="*70)
print("INTERESTING CASES")
print("="*70)

# High-risk approvals (defaults that policy still approved)
high_risk_approved = test_df_analysis[
    (test_df_analysis['defaulted'] == 1) & 
    (test_df_analysis['predicted_action'] == 1)
]
print(f"High-risk approvals (approved loans that defaulted): {len(high_risk_approved)}")
if len(high_risk_approved) > 0:
    print(f"  - Average loss: ${high_risk_approved['actual_reward'].mean():,.2f}")

# Safe denials (would-be paid loans that were denied)
safe_denials = test_df_analysis[
    (test_df_analysis['defaulted'] == 0) & 
    (test_df_analysis['predicted_action'] == 0)
]
print(f"Opportunity cost (denied loans that would have paid): {len(safe_denials)}")
if len(safe_denials) > 0:
    print(f"  - Missed profit: ${safe_denials['actual_reward'].sum():,.2f}")

# Good denials (correctly denied defaulting loans)
good_denials = test_df_analysis[
    (test_df_analysis['defaulted'] == 1) & 
    (test_df_analysis['predicted_action'] == 0)
]
print(f"Good denials (correctly denied risky loans): {len(good_denials)}")
if len(good_denials) > 0:
    print(f"  - Losses avoided: ${abs(good_denials['actual_reward'].sum()):,.2f}")


ANALYZING POLICY DECISIONS

Policy decisions by actual outcome:
--------------------------------------------------
Fully Paid   ( 7962 loans):
  - Agent approved: 100.0%
  - Avg reward: $1,725.83
Defaulted    ( 2038 loans):
  - Agent approved: 100.0%
  - Avg reward: $-15,573.83

WHAT IF WE ONLY APPROVED PROFITABLE LOANS?
Optimal (Oracle) Strategy:
  - Would approve: 79.6% of loans
  - Total return: $13,741,067.48
  - Mean reward: $1,374.11

POLICY COMPARISON
Strategy               Approval Rate         Total Return     Mean Return
----------------------------------------------------------------------
Deny All                       0.0% $              0.00 $         0.00
Approve All                  100.0% $    -17,998,407.52 $    -1,799.84
RL Policy                    100.0% $    -17,998,407.52 $    -1,799.84
Optimal (Oracle)              79.6% $     13,741,067.48 $     1,374.11

INTERESTING CASES
High-risk approvals (approved loans that defaulted): 2038
  - Average loss: $-15,573.83


In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

sns.set(style="whitegrid", rc={"figure.dpi":120})
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

PATH = "accepted_2007_to_2018Q4.csv"
df = pd.read_csv(PATH, low_memory=False)

print("Dataset loaded:", PATH)
print("Shape:", df.shape)
print("\nColumn types (first 80 cols shown):")
display(df.dtypes.head(80))

print("\nFirst 6 rows:")
display(df.head(6))

if "loan_status" not in df.columns:
    raise KeyError("Required column 'loan_status' not found in dataset. Please provide it.")
else:
    print("\nUnique values in loan_status (top 50):")
    display(df["loan_status"].value_counts(dropna=False).head(50))


def map_loan_status_to_binary(s):
    s = s.astype(str).str.strip().str.lower()
    positive_keywords = ("charged", "default", "late", "collection", "bankruptcy", "miss", "delinquent")
    negative_keywords = ("fully paid", "paid", "current", "ongoing")
    out = pd.Series(index=s.index, dtype="Int64")
    for i, val in s.items():
        if any(k in val for k in positive_keywords):
            out.at[i] = 1
        elif any(k in val for k in negative_keywords):
            out.at[i] = 0
        else:
            out.at[i] = pd.NA
    return out

df["loan_default_bin"] = map_loan_status_to_binary(df["loan_status"])

print("Mapped binary target summary:")
display(df["loan_default_bin"].value_counts(dropna=False))

print("\nRows with ambiguous loan_status (need manual check):")
display(df[df["loan_default_bin"].isna()][["loan_status"]].drop_duplicates().head(50))


missing = df.isnull().sum().sort_values(ascending=False)
missing_pct = (missing / len(df) * 100).round(3)
missing_df = pd.DataFrame({"missing_count": missing, "missing_pct": missing_pct})
print("Top 60 columns by missingness:")
display(missing_df[missing_df["missing_count"] > 0].head(60))

top_missing = missing_df[missing_df["missing_count"]>0].head(30).index.tolist()
if top_missing:
    plt.figure(figsize=(10, min(6, 0.2*len(top_missing)+1)))
    sns.heatmap(df[top_missing].isnull().T, cbar=False)
    plt.title("Missingness heatmap (top 30 columns with most missing values)")
    plt.xlabel("rows")
    plt.ylabel("columns")
    plt.show()

plt.figure(figsize=(10,4))
missing_df.head(30)["missing_pct"].plot(kind="bar")
plt.title("Missing % (top 30 columns)")
plt.ylabel("% missing")
plt.tight_layout()
plt.show()

cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c]) and df[c].nunique(dropna=False) < 200]
print("Categorical columns considered (cardinality <200):", cat_cols)

def plot_cat_counts_and_default_rate(col, top_k=10):
    vc = df[col].fillna("MISSING").value_counts(dropna=False)
    top = vc.head(top_k).index.tolist()
    sub = df[df[col].isin(top) | df[col].isna()].copy()
    sub[col] = sub[col].fillna("MISSING")
    counts = sub[col].value_counts().loc[top]
    default_rates = sub.groupby(col)["loan_default_bin"].apply(lambda s: s.dropna().mean()).reindex(top)
    fig, ax1 = plt.subplots(figsize=(8,3))
    sns.barplot(x=counts.index, y=counts.values, ax=ax1)
    ax1.set_ylabel("count")
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha="right")
    ax2 = ax1.twinx()
    sns.pointplot(x=default_rates.index, y=default_rates.values, ax=ax2, color="red")
    ax2.set_ylabel("default rate")
    plt.title(f"{col}: counts and default rate (top {top_k})")
    plt.tight_layout()
    plt.show()

informative_cat = [c for c in ["term","grade","sub_grade","emp_length","home_ownership","purpose","verification_status"] if c in df.columns]
for c in informative_cat:
    print("\nColumn:", c)
    display(df[c].value_counts(dropna=False).head(20))
    plot_cat_counts_and_default_rate(c, top_k=8)



from scipy import stats
numeric_all = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_for_corr = [c for c in numeric_all if df[c].notna().mean() > 0.6 and df[c].nunique() > 5]
print("Numeric columns used for correlation:", numeric_for_corr)

if numeric_for_corr:
    corr = df[numeric_for_corr].corr()
    plt.figure(figsize=(10,8))
    sns.heatmap(corr, cmap="RdBu_r", center=0, annot=False, fmt=".2f", linewidths=0.2)
    plt.title("Pearson correlation (numeric features)")
    plt.tight_layout()
    plt.show()

    from statsmodels.stats.outliers_influence import variance_inflation_factor
    X_vif = df[numeric_for_corr].fillna(df[numeric_for_corr].median())
    sample = X_vif.sample(min(len(X_vif), 5000), random_state=0)
    vif_df = pd.DataFrame({
        "feature": sample.columns,
        "vif": [variance_inflation_factor(sample.values, i) for i in range(sample.shape[1])]
    }).sort_values("vif", ascending=False)
    print("VIF (top 30):")
    display(vif_df.head(30))
else:
    print("No numeric columns met completeness/variance criteria for correlation/VIF.")


report = []
report.append(("target_positive_rate", df["loan_default_bin"].mean(skipna=True)))
top_missing_cols = missing_df.head(10).index.tolist()
report.append(("top_missing_cols", top_missing_cols))
skew_info = df.select_dtypes(include=[np.number]).apply(lambda s: s.dropna().skew()).sort_values(ascending=False).head(10).to_dict()
report.append(("top_skewed_numeric", skew_info))
high_card_cat = [c for c in df.select_dtypes(exclude=[np.number]).columns if df[c].nunique(dropna=False) > 200][:10]
report.append(("high_cardinality_categoricals", high_card_cat))

report_df = pd.DataFrame(report, columns=["aspect","value"])
display(report_df)

report_df.to_csv("EDA_short_report.csv", index=False)
print("Saved short EDA report to EDA_short_report.csv")




ParserError: Error tokenizing data. C error: out of memory