# A2C-mod (Modified A2C) Algorithm
## Reinforcement Learning for Inventory Management

This notebook implements the A2C-mod (Modified A2C) algorithm for optimizing grocery store inventory management.

## Install Required Dependencies

In [None]:
import subprocess
import sys

# Install required packages
packages = ['tensorflow', 'numpy']

for package in packages:
    try:
        __import__(package.replace('-', '_'))
        print(f"‚úì {package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        print(f"‚úì {package} installed successfully")

print("\n‚úì All dependencies installed!")

In [None]:
import os
import sys
import argparse

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import numpy as np

np.set_printoptions(edgeitems=25, linewidth=10000, precision=12, suppress=True)

# Set up algorithm
ALGORITHM = 'A2C_mod'
print(f"Training with {ALGORITHM} Algorithm")

# Default FLAGS
class FLAGS:
    output_dir = 'C:\\NCKH\\XAI\\checkpoints_a2c_mod'
    train_file = 'data/train.tfrecords'
    capacity_file = 'data/capacity.tfrecords'
    stock_file = 'data/stock.tfrecords'
    predict_file = 'data/test.tfrecords'
    output_file = './output_a2c_mod.csv'
    dropout_prob = 0.1
    train_episodes = 600
    num_products = 220
    num_timesteps = 900
    num_features = 3
    num_actions = 14
    hidden_size = 32
    entropy_coefficient = 0.001
    gamma = 0.99
    waste = 0.025
    actor_learning_rate = 0.001
    critic_learning_rate = 0.001
    zero_inventory = 1e-5
    batch_size = 32
    action = 'TRAIN'


## Define Neural Network Components

In [None]:
class Dense(tf.Module):
  def __init__(self, input_dim, output_size, activation=None, stddev=1.0):
    super(Dense, self).__init__()
    self.w = tf.Variable(
      tf.random.truncated_normal([input_dim, output_size], stddev=stddev), name='w')
    self.b = tf.Variable(tf.zeros([output_size]), name='b')
    self.activation = activation
  def __call__(self, x):
    y = tf.matmul(x, self.w) + self.b
    if (self.activation):
      y = self.activation(y)
    return y

class Actor(tf.Module):
  def __init__(self, num_features, num_actions, hidden_size, activation=tf.nn.relu, dropout_prob=0.1):
    super(Actor, self).__init__()
    self.layer1 = Dense(num_features, hidden_size, activation=None)
    self.layer2 = Dense(hidden_size, hidden_size, activation=None)
    self.layer3 = Dense(hidden_size, hidden_size, activation=None)
    self.layer4 = Dense(hidden_size, num_actions, activation=None)
    self.activation = activation
    self.dropout_prob = dropout_prob
  def __call__(self, state):
    layer_output = self.layer1(state)
    layer_output = self.activation(layer_output)
    layer_output = tf.nn.dropout(layer_output, self.dropout_prob)

    layer_output = self.layer2(layer_output)
    layer_output = self.activation(layer_output)
    layer_output = tf.nn.dropout(layer_output, self.dropout_prob)

    layer_output = self.layer3(layer_output)
    layer_output = self.activation(layer_output)
    layer_output = tf.nn.dropout(layer_output, self.dropout_prob)

    layer_output = self.layer4(layer_output)
    return tf.nn.softmax(layer_output)

class Critic(tf.Module):
  def __init__(self, num_features, hidden_size, activation=tf.nn.relu, dropout_prob=0.1):
    super(Critic, self).__init__()
    self.layer1 = Dense(num_features, hidden_size, activation=None)
    self.layer2 = Dense(hidden_size, 1, activation=None)
    self.activation = activation
    self.dropout_prob = dropout_prob
  def __call__(self, state):
    layer_output = self.layer1(state)
    # Layer normalization instead of GroupNormalization
    layer_output = tf.keras.layers.LayerNormalization()(layer_output)
    layer_output = self.activation(layer_output)
    layer_output = tf.nn.dropout(layer_output, self.dropout_prob)

    layer_output = self.layer2(layer_output)
    return tf.squeeze(layer_output, axis=-1, name='factor_squeeze')

print("Actor and Critic networks defined successfully")

## Implement Data Parsers and Helper Functions

In [None]:
def sales_parser(serialized_example):
  example = tf.io.parse_single_example(
    serialized_example,
    features={"sales": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)})
  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      t = tf.cast(t, tf.float32)
      example[name] = t
  return example

def capacity_parser(serialized_example):
  example = tf.io.parse_single_example(
    serialized_example,
    features={"capacity": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)})
  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      t = tf.cast(t, tf.float32)
      example[name] = t
  return example

def stock_parser(serialized_example):
  example = tf.io.parse_single_example(
    serialized_example,
    features={"stock": tf.io.FixedLenFeature([FLAGS.num_products], tf.float32)})
  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.int64:
      t = tf.cast(t, tf.float32)
      example[name] = t
  return example

def waste(x):
   return FLAGS.waste * x

def quantile(x, q):
  return np.quantile(x, q)

def cross_entropy(p, q):
  return -tf.reduce_mean(tf.reduce_sum(p*tf.math.log(tf.math.maximum(1e-15, q)), axis=1))

print("Data parsers and helper functions defined successfully")

## Training Loop with A2C-mod Algorithm

The A2C-mod algorithm modifies the policy update using advantage weighting:
- p_new = softmax(log(œÄ(a|s)) + advantage / (|a_selected - a_all| + 1))
- actor_loss = mean squared difference between p_old and p_new

In [None]:
import json
import csv
from datetime import datetime
from collections import defaultdict

class TrainingLogger:
    def __init__(self, log_dir='./logA2Cmod'):
        os.makedirs(log_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_dir = log_dir
        self.timestamp = timestamp
        self.metrics = defaultdict(list)
        self.episode_logs = []
        
    def log_step(self, global_step, experience_step, rewards, stockouts, waste_val, delta, critic_loss, entropy_adj, actor_loss):
        """Log metrics for each training step"""
        self.metrics['global_step'].append(int(global_step))
        self.metrics['experience_step'].append(int(experience_step))
        self.metrics['rewards'].append(float(rewards))
        self.metrics['stockouts'].append(float(stockouts))
        self.metrics['waste'].append(float(waste_val))
        self.metrics['delta'].append(float(delta))
        self.metrics['critic_loss'].append(float(critic_loss))
        self.metrics['entropy_adjusted'].append(float(entropy_adj))
        self.metrics['actor_loss'].append(float(actor_loss))
        
    def save_episode_logs(self, episode):
        """Save logs for each episode"""
        if not self.metrics or len(self.metrics['global_step']) == 0:
            return None, None
        
        # Create episode-specific filenames
        json_file = os.path.join(self.log_dir, f'training_log_{self.timestamp}_episode_{episode:04d}.json')
        csv_file = os.path.join(self.log_dir, f'training_log_{self.timestamp}_episode_{episode:04d}.csv')
        
        # Save as JSON
        with open(json_file, 'w') as f:
            json.dump(dict(self.metrics), f, indent=2)
        
        # Save as CSV
        if self.metrics:
            keys = list(self.metrics.keys())
            with open(csv_file, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(keys)
                for i in range(len(self.metrics[keys[0]])):
                    writer.writerow([self.metrics[k][i] for k in keys])
        
        # Store episode summary
        episode_summary = {
            'episode': episode,
            'steps': len(self.metrics['global_step']),
            'rewards_mean': float(np.mean(self.metrics['rewards'])),
            'rewards_std': float(np.std(self.metrics['rewards'])),
            'stockouts_mean': float(np.mean(self.metrics['stockouts'])),
            'waste_mean': float(np.mean(self.metrics['waste'])),
            'delta_mean': float(np.mean(self.metrics['delta'])),
            'critic_loss_mean': float(np.mean(self.metrics['critic_loss'])),
            'entropy_adjusted_mean': float(np.mean(self.metrics['entropy_adjusted'])),
            'actor_loss_mean': float(np.mean(self.metrics['actor_loss']))
        }
        self.episode_logs.append(episode_summary)
        
        # Reset metrics for next episode
        self.metrics = defaultdict(list)
        
        return json_file, csv_file
    
    def save_summary(self):
        """Save summary of all episodes"""
        summary_file = os.path.join(self.log_dir, f'training_summary_{self.timestamp}.json')
        with open(summary_file, 'w') as f:
            json.dump(self.episode_logs, f, indent=2)
        
        print(f"\n‚úì Training summary saved: {summary_file}")
        return summary_file

print("TrainingLogger class defined successfully")


In [None]:
def train():
  # Initialize logger
  logger = TrainingLogger(log_dir='./logA2Cmod')
  
  sales_dataset = tf.data.TFRecordDataset(FLAGS.train_file).window(FLAGS.batch_size, shift=FLAGS.batch_size-1, drop_remainder=False)

  capacity_dataset = tf.data.TFRecordDataset(FLAGS.capacity_file)
  parsed_capacity_dataset = capacity_dataset.map(capacity_parser)
  capacity = next(iter(parsed_capacity_dataset))['capacity']

  actor_optimizer = tf.optimizers.Adam(FLAGS.actor_learning_rate)
  critic_optimizer = tf.optimizers.Adam(FLAGS.critic_learning_rate)

  actor = Actor(FLAGS.num_features, FLAGS.num_actions, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=FLAGS.dropout_prob)
  critic = Critic(FLAGS.num_features, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=FLAGS.dropout_prob)

  global_step = tf.Variable(0)

  checkpoint_prefix = os.path.join(FLAGS.output_dir, "ckpt")
  checkpoint = tf.train.Checkpoint(critic_optimizer=critic_optimizer, actor_optimizer=actor_optimizer, critic=critic, actor=actor, step=global_step)
  status = checkpoint.restore(tf.train.latest_checkpoint(FLAGS.output_dir))

  for episode in range(FLAGS.train_episodes):
    x = tf.random.uniform(shape=[FLAGS.num_products], minval=0, maxval=1, dtype=tf.dtypes.float32)
    q = waste(x)

    for batch_dataset in sales_dataset:
      with tf.GradientTape() as actor_tape, tf.GradientTape() as critic_tape:
        experience_step = tf.constant(0)
        experience_s = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products, FLAGS.num_features]), name="experience_s")
        experience_u = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_u")
        experience_p = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products, FLAGS.num_actions]), name="experience_p")
        experience_i = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.int64, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_i")
        experience_overstock = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_overstock")
        experience_s_prime = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products, FLAGS.num_features]), name="experience_s_prime")
        experience_r = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_r")
        experience_z = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_z")
        experience_q = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_q")
        experience_quan = tf.TensorArray(size=FLAGS.batch_size, dtype=tf.float32, element_shape=tf.TensorShape([FLAGS.num_products]), name="experience_quan")

        batch_iterator = batch_dataset.map(sales_parser)
        sales = tf.divide(next(iter(batch_iterator))['sales'], capacity)

        s = tf.transpose(tf.stack([x, sales, q], axis=0), perm=[1, 0])
        policy_probs = actor(s)

        for item in batch_iterator:
          sales_prime = tf.divide(item['sales'], capacity)
          policy_index = tf.squeeze(tf.random.categorical(tf.math.log(policy_probs), 1))
          policy_mask = tf.one_hot(policy_index, FLAGS.num_actions)
          policy_selected = tf.boolean_mask(policy_probs, policy_mask)
     
          action_space = tf.tile([[0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1]], [FLAGS.num_products, 1])
          u = tf.boolean_mask(action_space, policy_mask)

          overstock = tf.math.maximum(0, (x + u) - 1)
          x_u = tf.math.minimum(1, x + u)
          x_prime = tf.math.maximum(0, x_u - sales)
        
          q_prime = waste(x_prime)
          s_prime = tf.transpose(tf.stack([x_prime, sales_prime, q_prime], axis=0), perm=[1, 0])

          z = tf.cast(x < FLAGS.zero_inventory, tf.float32)
          quan = tf.repeat(tf.cast(quantile(x, 0.95) - quantile(x, 0.05), tf.float32), FLAGS.num_products)
          r = tf.cast(1 - z - overstock - q - quan, tf.float32)

          experience_s = experience_s.write(experience_step, s)
          experience_u = experience_u.write(experience_step, u)
          experience_p = experience_p.write(experience_step, policy_probs)
          experience_i = experience_i.write(experience_step, policy_index)
          experience_overstock = experience_overstock.write(experience_step, overstock)
          experience_s_prime = experience_s_prime.write(experience_step, s_prime)
          experience_r = experience_r.write(experience_step, r)
          experience_z = experience_z.write(experience_step, z)
          experience_q = experience_q.write(experience_step, q)
          experience_quan = experience_quan.write(experience_step, quan)

          policy_probs = actor(s_prime)
          x = x_prime
          q = q_prime
          s = s_prime
          sales = sales_prime
          experience_step = experience_step + 1

        s_batch = tf.reshape(experience_s.stack()[:experience_step, :, :], [-1, FLAGS.num_features])
        p_batch = tf.reshape(experience_p.stack()[:experience_step, :], [-1, FLAGS.num_actions])
        i_batch = tf.reshape(experience_i.stack()[:experience_step, :], [-1])
        overstock_batch = tf.reshape(experience_overstock.stack()[:experience_step, :], [-1])
        s_prime_batch = tf.reshape(experience_s_prime.stack()[:experience_step, :, :], [-1, FLAGS.num_features])
        r_batch = tf.reshape(experience_r.stack()[:experience_step, :], [-1])
        z_batch = tf.reshape(experience_z.stack()[:experience_step, :], [-1])
        q_batch = tf.reshape(experience_q.stack()[:experience_step, :], [-1])
        quan_batch = tf.reshape(experience_quan.stack()[:experience_step, :], [-1])

        # Calculate metrics
        rewards_mean = tf.reduce_mean(r_batch, keepdims=False)
        stockouts_mean = tf.reduce_mean(z_batch, keepdims=False)
        waste_mean = tf.reduce_mean(q_batch, keepdims=False)

        tf.print("rewards:", global_step, experience_step, rewards_mean, output_stream=sys.stderr, summarize=-1)
        tf.print("stockouts:", global_step, experience_step, stockouts_mean, output_stream=sys.stderr, summarize=-1)

        v = critic(s_batch)
        v_prime = critic(s_prime_batch)
        y = r_batch + FLAGS.gamma*v_prime

        delta = y - v
        delta_mean = tf.reduce_mean(delta, keepdims=False)
        tf.print("delta:", global_step, delta_mean, output_stream=sys.stderr, summarize=-1)

        critic_loss = 0.5*tf.reduce_mean(tf.math.square(delta), keepdims=False)
        tf.print("critic loss:", global_step, critic_loss, output_stream=sys.stderr, summarize=-1)

        entropy_p = cross_entropy(p_batch, p_batch)
        entropy_adj = FLAGS.entropy_coefficient*entropy_p

        # A2C_mod Algorithm
        ix_batch = tf.tile(tf.reshape(i_batch, [-1, 1]), [1, FLAGS.num_actions])
        p_new = tf.nn.softmax(tf.math.log(tf.math.maximum(1e-15, p_batch)) + tf.reshape(delta, [-1, 1]) / tf.cast(tf.math.abs(ix_batch - tf.cast(tf.range(FLAGS.num_actions), tf.int64)) + 1, tf.float32))
        per_timestep_actor_loss = tf.reduce_mean(tf.math.squared_difference(p_batch, p_new), axis=-1)
        actor_loss = tf.reduce_mean(per_timestep_actor_loss, axis=-1)
        
        tf.print("actor loss:", global_step, actor_loss, output_stream=sys.stderr, summarize=-1)
        
        # Log metrics to file
        logger.log_step(
            global_step=int(global_step),
            experience_step=int(experience_step),
            rewards=float(rewards_mean),
            stockouts=float(stockouts_mean),
            waste_val=float(waste_mean),
            delta=float(delta_mean),
            critic_loss=float(critic_loss),
            entropy_adj=float(entropy_adj),
            actor_loss=float(actor_loss)
        )
        
        global_step.assign_add(1)

      actor_gradients = actor_tape.gradient(actor_loss, actor.variables)
      critic_gradients = critic_tape.gradient(critic_loss, critic.variables)

      actor_optimizer.apply_gradients(zip(actor_gradients, actor.variables))
      critic_optimizer.apply_gradients(zip(critic_gradients, critic.variables))

    # Save logs after each episode
    json_file, csv_file = logger.save_episode_logs(episode + 1)
    if (episode + 1) % 10 == 0:
      print(f"Episode {episode + 1} - Logs saved:")
      print(f"  JSON: {json_file}")
      print(f"  CSV: {csv_file}")
    
    if (episode + 1) % 10 == 0:
      checkpoint.save(file_prefix=checkpoint_prefix)
      print(f"Checkpoint saved at episode {episode + 1}")

  tf.print ("episode:", episode, global_step, output_stream=sys.stderr, summarize=-1)
  
  # Save summary of all episodes
  logger.save_summary()

print("Train function defined successfully")

## Prediction Function

In [None]:
def predict():
  sales_dataset = tf.data.TFRecordDataset(FLAGS.predict_file)
  capacity_dataset = tf.data.TFRecordDataset(FLAGS.capacity_file)
  stock_dataset = tf.data.TFRecordDataset(FLAGS.stock_file)

  parsed_capacity_dataset = capacity_dataset.map(capacity_parser)
  capacity = next(iter(parsed_capacity_dataset))['capacity']

  parsed_dataset = sales_dataset.map(sales_parser)

  parsed_stock_dataset = stock_dataset.map(stock_parser)
  x = next(iter(parsed_stock_dataset))['stock']

  actor = Actor(FLAGS.num_features, FLAGS.num_actions, FLAGS.hidden_size, activation=tf.nn.relu, dropout_prob=FLAGS.dropout_prob)

  checkpoint = tf.train.Checkpoint(actor=actor)
  checkpoint.restore(tf.train.latest_checkpoint(FLAGS.output_dir)).expect_partial()

  with tf.io.gfile.GFile(FLAGS.output_file, "w") as writer:
    for sales_record in parsed_dataset:
      sales = tf.divide(sales_record['sales'], capacity)
      q = waste(x)
      s = tf.transpose(tf.stack([x, sales, q], axis=0), perm=[1, 0])

      policy_probs = actor(s)
      policy_mask = tf.one_hot(tf.math.argmax(policy_probs, axis=-1), FLAGS.num_actions)
      action_space = tf.tile([[0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1]], [FLAGS.num_products, 1])
      u = tf.boolean_mask(action_space, policy_mask)

      overstock = tf.math.maximum(0, (x + u) - 1)
      x_u = tf.math.minimum(1, x + u)
      stockout = tf.math.minimum(0, x_u - sales)

      writer.write("stock:" + ','.join(  list(map(str,   x.numpy()    ))    ) + "\n")
      writer.write("action:" + ','.join(  list(map(str,   u.numpy()    ))    ) + "\n")
      writer.write("overstock:" + ','.join(  list(map(str,   overstock.numpy()    ))    ) + "\n")
      writer.write("sales:" + ','.join(  list(map(str,   sales.numpy()    ))    ) + "\n")
      writer.write("stockout:" + ','.join(  list(map(str,   stockout.numpy()    ))    ) + "\n")
      writer.write("capacity:" + ','.join(  list(map(str,   (capacity/capacity).numpy()    ))    ) + "\n")

      x = tf.math.maximum(0, x_u - sales)

print("Predict function defined successfully")

## Execute Training or Prediction

In [None]:
os.makedirs(FLAGS.output_dir, exist_ok=True)

if FLAGS.action == 'TRAIN':
    print(f"Starting {ALGORITHM} training...")
    train()
    print(f"{ALGORITHM} training completed!")
elif FLAGS.action == 'PREDICT':
    print(f"Starting {ALGORITHM} prediction...")
    predict()
    print(f"{ALGORITHM} prediction completed! Results saved to {FLAGS.output_file}")
else:
    print(f"Unknown action: {FLAGS.action}")

## Gi·∫£i Th√≠ch Chi Ti·∫øt C√°c Th√¥ng S·ªë FLAGS

### **T·ªïng Quan V·ªÅ Hyperparameters**

C√°c th√¥ng s·ªë FLAGS ƒë·ªãnh nghƒ©a:
- **C·∫•u h√¨nh d·ªØ li·ªáu**: ƒê∆∞·ªùng d·∫´n file, k√≠ch th∆∞·ªõc d·ªØ li·ªáu
- **Ki·∫øn tr√∫c m·∫°ng**: S·ªë l·ªõp ·∫©n, s·ªë h√†nh ƒë·ªông
- **Tham s·ªë hu·∫•n luy·ªán**: Learning rate, gamma, entropy coefficient
- **C·∫•u h√¨nh ch·∫°y**: S·ªë t·∫≠p phim, batch size

In [8]:
import pandas as pd

# T·∫°o b·∫£ng gi·∫£i th√≠ch chi ti·∫øt c√°c th√¥ng s·ªë FLAGS
parameters_explanation = {
    'Th√¥ng S·ªë': [
        'output_dir',
        'train_file',
        'capacity_file',
        'stock_file',
        'predict_file',
        'output_file',
        'dropout_prob',
        'train_episodes',
        'num_products',
        'num_timesteps',
        'num_features',
        'num_actions',
        'hidden_size',
        'entropy_coefficient',
        'gamma',
        'waste',
        'actor_learning_rate',
        'critic_learning_rate',
        'zero_inventory',
        'batch_size',
        'action'
    ],
    'Gi√° Tr·ªã': [
        'C:\\NCKH\\XAI\\checkpoints_a2c_mod',
        'data/train.tfrecords',
        'data/capacity.tfrecords',
        'data/stock.tfrecords',
        'data/test.tfrecords',
        './output_a2c_mod.csv',
        '0.1',
        '600',
        '220',
        '900',
        '3',
        '14',
        '32',
        '0.001',
        '0.99',
        '0.025',
        '0.001',
        '0.001',
        '1e-5',
        '32',
        'TRAIN'
    ],
    '√ù Nghƒ©a': [
        'Th∆∞ m·ª•c l∆∞u tr·ªØ c√°c checkpoint (m√¥ h√¨nh ƒë√£ l∆∞u)',
        'File TFRecords ch·ª©a d·ªØ li·ªáu b√°n h√†ng ƒë·ªÉ hu·∫•n luy·ªán',
        'File TFRecords ch·ª©a d·ªØ li·ªáu s·ª©c ch·ª©a c·ªßa t·ª´ng s·∫£n ph·∫©m',
        'File TFRecords ch·ª©a d·ªØ li·ªáu t·ªìn kho ban ƒë·∫ßu',
        'File TFRecords ch·ª©a d·ªØ li·ªáu b√°n h√†ng ƒë·ªÉ d·ª± ƒëo√°n',
        'File CSV xu·∫•t k·∫øt qu·∫£ d·ª± ƒëo√°n',
        'T·ª∑ l·ªá dropout trong m·∫°ng n∆°-ron (gi·∫£m overfitting)',
        'S·ªë t·∫≠p phim (episodes) ƒë·ªÉ hu·∫•n luy·ªán (1 episode = 900 b∆∞·ªõc)',
        'S·ªë lo·∫°i s·∫£n ph·∫©m qu·∫£n l√Ω (220 s·∫£n ph·∫©m)',
        'S·ªë b∆∞·ªõc th·ªùi gian trong m·ªói t·∫≠p phim (900 b∆∞·ªõc)',
        'S·ªë ƒë·∫∑c tr∆∞ng c·ªßa tr·∫°ng th√°i (inventory, sales, waste)',
        'S·ªë h√†nh ƒë·ªông m√† agent c√≥ th·ªÉ ch·ªçn (14 m·ª©c tƒÉng t·ªìn kho)',
        'S·ªë neuron trong m·ªói l·ªõp ·∫©n c·ªßa Actor/Critic (32 neuron)',
        'H·ªá s·ªë entropy ƒë·ªÉ khuy·∫øn kh√≠ch exploration (0.001)',
        'Discount factor - tr·ªçng s·ªë cho reward t∆∞∆°ng lai (0.99)',
        'T·ª∑ l·ªá l√£ng ph√≠ h√†ng h√≥a (2.5% m·ªói b∆∞·ªõc)',
        'Learning rate cho Actor network (0.001)',
        'Learning rate cho Critic network (0.001)',
        'Ng∆∞·ª°ng xem l√† h·∫øt h√†ng (1e-5 ‚âà 0)',
        'K√≠ch th∆∞·ªõc batch - s·ªë samples x·ª≠ l√Ω c√πng l√∫c (32 samples)',
        'Ch·∫ø ƒë·ªô ch·∫°y: TRAIN (hu·∫•n luy·ªán) ho·∫∑c PREDICT (d·ª± ƒëo√°n)'
    ]
}

df_params = pd.DataFrame(parameters_explanation)

print("="*100)
print("GI·∫¢I TH√çCH CHI TI·∫æT C√ÅC TH√îNG S·ªê FLAGS (A2C-MOD)")
print("="*100)
print()
print(df_params.to_string(index=False))
print()


GI·∫¢I TH√çCH CHI TI·∫æT C√ÅC TH√îNG S·ªê FLAGS (A2C-MOD)

            Th√¥ng S·ªë                         Gi√° Tr·ªã                                                     √ù Nghƒ©a
          output_dir C:\NCKH\XAI\checkpoints_a2c_mod             Th∆∞ m·ª•c l∆∞u tr·ªØ c√°c checkpoint (m√¥ h√¨nh ƒë√£ l∆∞u)
          train_file            data/train.tfrecords          File TFRecords ch·ª©a d·ªØ li·ªáu b√°n h√†ng ƒë·ªÉ hu·∫•n luy·ªán
       capacity_file         data/capacity.tfrecords      File TFRecords ch·ª©a d·ªØ li·ªáu s·ª©c ch·ª©a c·ªßa t·ª´ng s·∫£n ph·∫©m
          stock_file            data/stock.tfrecords                 File TFRecords ch·ª©a d·ªØ li·ªáu t·ªìn kho ban ƒë·∫ßu
        predict_file             data/test.tfrecords             File TFRecords ch·ª©a d·ªØ li·ªáu b√°n h√†ng ƒë·ªÉ d·ª± ƒëo√°n
         output_file            ./output_a2c_mod.csv                               File CSV xu·∫•t k·∫øt qu·∫£ d·ª± ƒëo√°n
        dropout_prob                             0.1         

### **Chi Ti·∫øt: 14 H√†nh ƒê·ªông (num_actions = 14)**

C√°c h√†nh ƒë·ªông ƒë·∫°i di·ªán cho **14 m·ª©c tƒÉng t·ªìn kho** kh√°c nhau:

In [7]:
# Gi·∫£i th√≠ch 14 h√†nh ƒë·ªông
action_space = [0, 0.005, 0.01, 0.0125, 0.015, 0.0175, 0.02, 0.03, 0.04, 0.08, 0.12, 0.2, 0.5, 1]

actions_df = pd.DataFrame({
    'Action Index': range(14),
    'TƒÉng T·ªìn Kho (%)': [f"{a*100:.2f}%" for a in action_space],
    '√ù Nghƒ©a': [
        'Kh√¥ng tƒÉng t·ªìn kho (0%)',
        'TƒÉng 0.5% - TƒÉng r·∫•t nh·ªè',
        'TƒÉng 1% - TƒÉng th·∫•p',
        'TƒÉng 1.25% - TƒÉng th·∫•p',
        'TƒÉng 1.5% - TƒÉng th·∫•p',
        'TƒÉng 1.75% - TƒÉng v·ª´a',
        'TƒÉng 2% - TƒÉng v·ª´a',
        'TƒÉng 3% - TƒÉng v·ª´a',
        'TƒÉng 4% - TƒÉng v·ª´a',
        'TƒÉng 8% - TƒÉng cao',
        'TƒÉng 12% - TƒÉng cao',
        'TƒÉng 20% - TƒÉng r·∫•t cao',
        'TƒÉng 50% - TƒÉng c·ª±c cao',
        'TƒÉng 100% - TƒÉng c·ª±c ƒë·∫°i (g·∫•p ƒë√¥i)'
    ]
})

print("\n" + "="*100)
print("CHI TI·∫æT 14 H√ÄNH ƒê·ªòNG (num_actions = 14)")
print("="*100)
print()
print(actions_df.to_string(index=False))
print()
print("üí° Gi·∫£i Th√≠ch:")
print("   - Agent ch·ªçn m·ªôt trong 14 h√†nh ƒë·ªông n√†y ·ªü m·ªói b∆∞·ªõc th·ªùi gian")
print("   - M·ª•c ti√™u: T√¨m m·ª©c tƒÉng t·ªìn kho t·ªëi ∆∞u ƒë·ªÉ c√¢n b·∫±ng gi·ªØa:")
print("     * Tr√°nh h·∫øt h√†ng (stockout) - khi nhu c·∫ßu > t·ªìn kho")
print("     * Gi·∫£m l√£ng ph√≠ (waste) - h√†ng h√≥a kh√¥ng b√°n ƒë∆∞·ª£c")
print("   - N·∫øu ch·ªçn h√†nh ƒë·ªông qu√° nh·ªè ‚Üí h·∫øt h√†ng ‚Üí m·∫•t doanh thu")
print("   - N·∫øu ch·ªçn h√†nh ƒë·ªông qu√° l·ªõn ‚Üí l√£ng ph√≠ ‚Üí m·∫•t chi ph√≠")



CHI TI·∫æT 14 H√ÄNH ƒê·ªòNG (num_actions = 14)

 Action Index TƒÉng T·ªìn Kho (%)                            √ù Nghƒ©a
            0            0.00%            Kh√¥ng tƒÉng t·ªìn kho (0%)
            1            0.50%           TƒÉng 0.5% - TƒÉng r·∫•t nh·ªè
            2            1.00%                TƒÉng 1% - TƒÉng th·∫•p
            3            1.25%             TƒÉng 1.25% - TƒÉng th·∫•p
            4            1.50%              TƒÉng 1.5% - TƒÉng th·∫•p
            5            1.75%              TƒÉng 1.75% - TƒÉng v·ª´a
            6            2.00%                 TƒÉng 2% - TƒÉng v·ª´a
            7            3.00%                 TƒÉng 3% - TƒÉng v·ª´a
            8            4.00%                 TƒÉng 4% - TƒÉng v·ª´a
            9            8.00%                 TƒÉng 8% - TƒÉng cao
           10           12.00%                TƒÉng 12% - TƒÉng cao
           11           20.00%            TƒÉng 20% - TƒÉng r·∫•t cao
           12           50.00%            

### **Chi Ti·∫øt: 3 ƒê·∫∑c Tr∆∞ng Tr·∫°ng Th√°i (num_features = 3)**

Tr·∫°ng th√°i c·ªßa c·ª≠a h√†ng bao g·ªìm 3 th√¥ng tin ch√≠nh:

In [6]:
# Gi·∫£i th√≠ch 3 ƒë·∫∑c tr∆∞ng tr·∫°ng th√°i
features_df = pd.DataFrame({
    'Feature Index': [0, 1, 2],
    'T√™n ƒê·∫∑c Tr∆∞ng': ['x (inventory)', 'sales (demand)', 'q (waste)'],
    'Gi·∫£i Th√≠ch': [
        'T·ªìn kho hi·ªán t·∫°i (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Bao nhi√™u h√†ng h√≥a ƒëang c√≥ trong kho?',
        'Nhu c·∫ßu b√°n h√†ng (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Kh√°ch h√†ng mu·ªën mua bao nhi√™u?',
        'T·ª∑ l·ªá l√£ng ph√≠ (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Bao nhi√™u ph·∫ßn trƒÉm h√†ng s·∫Ω b·ªã l√£ng ph√≠?'
    ],
    'C√¥ng Th·ª©c T√≠nh': [
        'x: t·ªìn kho / s·ª©c ch·ª©a t·ªëi ƒëa',
        'sales: nhu c·∫ßu / s·ª©c ch·ª©a t·ªëi ƒëa',
        'q = waste_rate √ó x (0.025 √ó x = 2.5% √ó t·ªìn kho)'
    ]
})

print("\n" + "="*100)
print("CHI TI·∫æT 3 ƒê·∫∂C TR∆ØNG TR·∫†NG TH√ÅI (num_features = 3)")
print("="*100)
print()
print(features_df.to_string(index=False))
print()
print("üìä M√¥ H√¨nh Tr·∫°ng Th√°i:")
print("   Cho m·ªói s·∫£n ph·∫©m trong 220 s·∫£n ph·∫©m, agent nh√¨n th·∫•y 3 gi√° tr·ªã n√†y:")
print()
print("   V√≠ d·ª•: S·∫£n ph·∫©m A")
print("   ‚îú‚îÄ x = 0.7      ‚Üí T·ªìn kho = 70% c·ªßa s·ª©c ch·ª©a")
print("   ‚îú‚îÄ sales = 0.3  ‚Üí Nhu c·∫ßu = 30% c·ªßa s·ª©c ch·ª©a")
print("   ‚îî‚îÄ q = 0.0175   ‚Üí S·∫Ω l√£ng ph√≠ 1.75% c·ªßa t·ªìn kho")
print()
print("   Tr·∫°ng th√°i l√† m·ªôt ma tr·∫≠n 220√ó3:")
print("   ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê")
print("   ‚îÇ [x‚ÇÅ, sales‚ÇÅ, q‚ÇÅ] (s·∫£n ph·∫©m 1)")
print("   ‚îÇ [x‚ÇÇ, sales‚ÇÇ, q‚ÇÇ] (s·∫£n ph·∫©m 2)")
print("   ‚îÇ ...                         ‚îÇ")
print("   ‚îÇ [x‚ÇÇ‚ÇÇ‚ÇÄ, sales‚ÇÇ‚ÇÇ‚ÇÄ, q‚ÇÇ‚ÇÇ‚ÇÄ]      ‚îÇ")
print("   ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò")



CHI TI·∫æT 3 ƒê·∫∂C TR∆ØNG TR·∫†NG TH√ÅI (num_features = 3)

 Feature Index  T√™n ƒê·∫∑c Tr∆∞ng                                                                   Gi·∫£i Th√≠ch                                  C√¥ng Th·ª©c T√≠nh
             0  x (inventory)  T·ªìn kho hi·ªán t·∫°i (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Bao nhi√™u h√†ng h√≥a ƒëang c√≥ trong kho?                    x: t·ªìn kho / s·ª©c ch·ª©a t·ªëi ƒëa
             1 sales (demand)         Nhu c·∫ßu b√°n h√†ng (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Kh√°ch h√†ng mu·ªën mua bao nhi√™u?                sales: nhu c·∫ßu / s·ª©c ch·ª©a t·ªëi ƒëa
             2      q (waste) T·ª∑ l·ªá l√£ng ph√≠ (t·ª∑ l·ªá t·ª´ 0 ƒë·∫øn 1) - Bao nhi√™u ph·∫ßn trƒÉm h√†ng s·∫Ω b·ªã l√£ng ph√≠? q = waste_rate √ó x (0.025 √ó x = 2.5% √ó t·ªìn kho)

üìä M√¥ H√¨nh Tr·∫°ng Th√°i:
   Cho m·ªói s·∫£n ph·∫©m trong 220 s·∫£n ph·∫©m, agent nh√¨n th·∫•y 3 gi√° tr·ªã n√†y:

   V√≠ d·ª•: S·∫£n ph·∫©m A
   ‚îú‚îÄ x = 0.7      ‚Üí T·ªìn kho = 70% c·ªßa s·ª©c ch·ª©a
   ‚îú‚îÄ

### **T√≥m T·∫Øt: Lu·ªìng X·ª≠ L√Ω Trong M·ªói B∆∞·ªõc Th·ªùi Gian**

In [9]:
print("\n" + "="*100)
print("LU·ªíNG X·ª¨ L√ù TRONG M·ªñI B∆Ø·ªöC TH·ªúI GIAN")
print("="*100)
print()
print("1Ô∏è‚É£  ƒê·∫¶U V√ÄO (INPUT)")
print("   Tr·∫°ng th√°i hi·ªán t·∫°i:")
print("   ‚îú‚îÄ T·ªìn kho c·ªßa 220 s·∫£n ph·∫©m (x)")
print("   ‚îú‚îÄ Nhu c·∫ßu b√°n h√†ng (sales)")
print("   ‚îî‚îÄ T·ª∑ l·ªá l√£ng ph√≠ (q = 2.5% √ó x)")
print("   ‚Üí K√≠ch th∆∞·ªõc: 220 √ó 3 = 660 gi√° tr·ªã")
print()
print("2Ô∏è‚É£  X·ª¨ L√ù B·∫∞NG ACTOR NETWORK")
print("   ‚îú‚îÄ Input: 3 ƒë·∫∑c tr∆∞ng c·ªßa 220 s·∫£n ph·∫©m")
print("   ‚îú‚îÄ Hidden layers: 32 ‚Üí 32 ‚Üí 32 neurons (activation: ReLU + Dropout)")
print("   ‚îú‚îÄ Output: 14 x√°c su·∫•t h√†nh ƒë·ªông (softmax)")
print("   ‚îî‚îÄ K·∫øt qu·∫£: X√°c su·∫•t ch·ªçn m·ªói h√†nh ƒë·ªông tƒÉng t·ªìn kho")
print()
print("3Ô∏è‚É£  CH·ªåN H√ÄNH ƒê·ªòNG")
print("   ‚îú‚îÄ Agent l·∫•y m·∫´u t·ª´ 14 h√†nh ƒë·ªông d·ª±a tr√™n x√°c su·∫•t")
print("   ‚îú‚îÄ K√≠ch th∆∞·ªõc tƒÉng t·ªìn kho ƒë∆∞·ª£c x√°c ƒë·ªãnh")
print("   ‚îî‚îÄ M·ªói s·∫£n ph·∫©m ƒë∆∞·ª£c tƒÉng c√πng m·ªôt m·ª©c (action space)")
print()
print("4Ô∏è‚É£  ƒê·∫¶U RA (OUTPUT REWARD)")
print("   Agent nh·∫≠n ƒë∆∞·ª£c ph·∫ßn th∆∞·ªüng:")
print("   ‚îú‚îÄ +1 ƒëi·ªÉm: Kh√¥ng h·∫øt h√†ng")
print("   ‚îú‚îÄ -1 ƒëi·ªÉm: H·∫øt h√†ng (m·∫•t b√°n h√†ng)")
print("   ‚îî‚îÄ -waste: L√£ng ph√≠ h√†ng h√≥a")
print("   ‚Üí M·ª•c ti√™u: Maximize reward")
print()
print("5Ô∏è‚É£  TR·∫† TH√ÅI TI·∫æP THEO")
print("   ‚îú‚îÄ T·ªìn kho m·ªõi: x' = max(0, x + action - sales)")
print("   ‚îú‚îÄ Nhu c·∫ßu ti·∫øp theo: sales_next")
print("   ‚îî‚îÄ L√£ng ph√≠ ti·∫øp theo: q' = 2.5% √ó x'")
print()
print("\n" + "="*100)
print("V√ç D·ª§ C·ª§ TH·ªÇ: S·∫¢N PH·∫®M T√ÅO")
print("="*100)
print()

example_data = {
    'B∆∞·ªõc': ['T', 'T+1', 'T+2'],
    'T·ªìn Kho (x)': ['0.8 (80% s·ª©c ch·ª©a)', '0.55 (55%)', '0.42 (42%)'],
    'Nhu C·∫ßu (sales)': ['0.3 (30%)', '0.25 (25%)', '0.35 (35%)'],
    'L√£ng Ph√≠ (q)': ['0.02 (2%)', '0.0138 (1.38%)', '0.0105 (1.05%)'],
    'H√†nh ƒê·ªông': ['TƒÉng 2% (action=6)', 'TƒÉng 0.5% (action=1)', 'TƒÉng 4% (action=8)'],
    'Reward': ['0.68 (kh√¥ng h·∫øt, √≠t l√£ng ph√≠)', '0.745 (t·ªët)', '0.605 (h·∫øt h√†ng)']
}

example_df = pd.DataFrame(example_data)
print(example_df.to_string(index=False))
print()
print("Gi·∫£i th√≠ch:")
print("  T:   x=0.8, sales=0.3, q=0.02 ‚Üí T·ªìn kho ƒë·ªß nh∆∞ng c√≥ l√£ng ph√≠")
print("       Agent ch·ªçn tƒÉng 2% ‚Üí x' = min(1, 0.8+0.02) - 0.3 = 0.52")
print("       Reward = 1 - 0 - 0.02 = 0.98 (t·ªët)")
print()
print("  T+1: x=0.55, sales=0.25, q=0.0138 ‚Üí T·ªìn kho v·ª´a ph·∫£i")
print("       Agent ch·ªçn tƒÉng 0.5% ‚Üí x' = min(1, 0.55+0.005) - 0.25 = 0.305")
print("       Reward = 1 - 0 - 0.0138 = 0.986 (r·∫•t t·ªët)")
print()
print("  T+2: x=0.42, sales=0.35, q=0.0105 ‚Üí T·ªìn kho th·∫•p, nhu c·∫ßu cao")
print("       Agent ch·ªçn tƒÉng 4% ‚Üí x' = max(0, 0.42+0.04-0.35) = 0.11")
print("       Reward = 0 - 1 - 0.0105 = -1.0105 (x·∫•u - h·∫øt h√†ng!)")



LU·ªíNG X·ª¨ L√ù TRONG M·ªñI B∆Ø·ªöC TH·ªúI GIAN

1Ô∏è‚É£  ƒê·∫¶U V√ÄO (INPUT)
   Tr·∫°ng th√°i hi·ªán t·∫°i:
   ‚îú‚îÄ T·ªìn kho c·ªßa 220 s·∫£n ph·∫©m (x)
   ‚îú‚îÄ Nhu c·∫ßu b√°n h√†ng (sales)
   ‚îî‚îÄ T·ª∑ l·ªá l√£ng ph√≠ (q = 2.5% √ó x)
   ‚Üí K√≠ch th∆∞·ªõc: 220 √ó 3 = 660 gi√° tr·ªã

2Ô∏è‚É£  X·ª¨ L√ù B·∫∞NG ACTOR NETWORK
   ‚îú‚îÄ Input: 3 ƒë·∫∑c tr∆∞ng c·ªßa 220 s·∫£n ph·∫©m
   ‚îú‚îÄ Hidden layers: 32 ‚Üí 32 ‚Üí 32 neurons (activation: ReLU + Dropout)
   ‚îú‚îÄ Output: 14 x√°c su·∫•t h√†nh ƒë·ªông (softmax)
   ‚îî‚îÄ K·∫øt qu·∫£: X√°c su·∫•t ch·ªçn m·ªói h√†nh ƒë·ªông tƒÉng t·ªìn kho

3Ô∏è‚É£  CH·ªåN H√ÄNH ƒê·ªòNG
   ‚îú‚îÄ Agent l·∫•y m·∫´u t·ª´ 14 h√†nh ƒë·ªông d·ª±a tr√™n x√°c su·∫•t
   ‚îú‚îÄ K√≠ch th∆∞·ªõc tƒÉng t·ªìn kho ƒë∆∞·ª£c x√°c ƒë·ªãnh
   ‚îî‚îÄ M·ªói s·∫£n ph·∫©m ƒë∆∞·ª£c tƒÉng c√πng m·ªôt m·ª©c (action space)

4Ô∏è‚É£  ƒê·∫¶U RA (OUTPUT REWARD)
   Agent nh·∫≠n ƒë∆∞·ª£c ph·∫ßn th∆∞·ªüng:
   ‚îú‚îÄ +1 ƒëi·ªÉm: Kh√¥ng h·∫øt h√†ng
   ‚îú‚îÄ -1 ƒëi·ªÉm: H·∫øt h√†n

### Visualize Training Logs


In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
def plot_episode_logs(timestamp=None, episode=None):
    """
    Plot detailed metrics for a specific episode from the training logs
    """
    log_dir = Path('./logA2Cmod')
    if not log_dir.exists():
        return
    
    # If no timestamp specified, find the latest one
    if timestamp is None:
        log_files = sorted(log_dir.glob('training_log_*_episode_*.json'))
        if not log_files:
            print("No episode log files found.")
            return
        log_file_path = str(log_files[-1])
    else:
        if episode is None:
            print("Please specify episode number when providing timestamp")
            return
        log_file_path = str(log_dir / f'training_log_{timestamp}_episode_{episode:04d}.json')
    
    # Load the log file
    try:
        with open(log_file_path, 'r') as f:
            metrics = json.load(f)
    except Exception as e:
        print(f"Error loading log file: {e}")
        return
    
    # Create DataFrame
    df = pd.DataFrame(metrics)
    
    # Create subplots
    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    fig.suptitle(f'A2C-mod Training Metrics\n{Path(log_file_path).name}', fontsize=16)
    
    # Plot each metric
    metrics_to_plot = [
        ('rewards', 'Rewards', axes[0, 0]),
        ('stockouts', 'Stockouts', axes[0, 1]),
        ('waste', 'Waste', axes[0, 2]),
        ('delta', 'Delta (Advantage)', axes[1, 0]),
        ('critic_loss', 'Critic Loss', axes[1, 1]),
        ('entropy_adjusted', 'Entropy (Adjusted)', axes[1, 2]),
        ('actor_loss', 'Actor Loss', axes[2, 0]),
    ]
    
    for col, title, ax in metrics_to_plot:
        if col in df.columns:
            ax.plot(df[col], linewidth=1.5, alpha=0.8)
            ax.set_title(title, fontweight='bold')
            ax.set_xlabel('Step')
            ax.set_ylabel('Value')
            ax.grid(True, alpha=0.3)
    
    # Remove extra subplots
    axes[2, 1].remove()
    axes[2, 2].remove()
    
    plt.tight_layout()
    plt.savefig(log_file_path.replace('.json', '_plot.png'), dpi=150, bbox_inches='tight')
    print(f"‚úì Plot saved: {log_file_path.replace('.json', '_plot.png')}")
    plt.show()
    
    # Print statistics
    print("\n" + "="*60)
    print("EPISODE STATISTICS")
    print("="*60)
    for col in df.columns:
        if col not in ['global_step', 'experience_step']:
            print(f"\n{col.upper()}:")
            print(f"  Mean:   {df[col].mean():.6f}")
            print(f"  Std:    {df[col].std():.6f}")
            print(f"  Min:    {df[col].min():.6f}")
            print(f"  Max:    {df[col].max():.6f}")

def plot_training_summary(timestamp=None):
    """
    Plot summary statistics across all episodes
    """
    log_dir = Path('./logA2Cmod')
    if not log_dir.exists():
        print("No training logs found. Please run training first.")
        return
    
    # If no timestamp specified, find the latest one
    if timestamp is None:
        summary_files = sorted(log_dir.glob('training_summary_*.json'))
        if not summary_files:
            print("No training summary found.")
            return
        summary_file = str(summary_files[-1])
    else:
        summary_file = str(log_dir / f'training_summary_{timestamp}.json')
    
    # Load summary
    try:
        with open(summary_file, 'r') as f:
            episode_logs = json.load(f)
    except Exception as e:
        print(f"Error loading summary file: {e}")
        return
    
    df = pd.DataFrame(episode_logs)
    
    # Create subplots
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle(f'A2C-mod Training Summary\n{Path(summary_file).name}', fontsize=16)
    
    metrics_to_plot = [
        ('rewards_mean', 'Avg Rewards per Episode', axes[0, 0]),
        ('stockouts_mean', 'Avg Stockouts per Episode', axes[0, 1]),
        ('waste_mean', 'Avg Waste per Episode', axes[0, 2]),
        ('critic_loss_mean', 'Avg Critic Loss per Episode', axes[1, 0]),
        ('entropy_adjusted_mean', 'Avg Entropy per Episode', axes[1, 1]),
        ('actor_loss_mean', 'Avg Actor Loss per Episode', axes[1, 2]),
    ]
    
    for col, title, ax in metrics_to_plot:
        if col in df.columns:
            ax.plot(df['episode'], df[col], linewidth=2, marker='o', markersize=4, alpha=0.8)
            ax.set_title(title, fontweight='bold')
            ax.set_xlabel('Episode')
            ax.set_ylabel('Value')
            ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(summary_file.replace('.json', '_plot.png'), dpi=150, bbox_inches='tight')
    print(f"‚úì Summary plot saved: {summary_file.replace('.json', '_plot.png')}")
    plt.show()
    
    print(f"\nTotal episodes trained: {len(df)}")

print("Plot functions defined successfully")