In [None]:
# Cell 1: Imports
import numpy as np
import random
import math
from collections import deque
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import trange

# Import the environment and classes provided by the assignment
from Inventory_env_class import InventoryManagementEnv, NormalizeObservation, ReplayBuffer, DQN


In [None]:
# Cell 2: Device & environment setup
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Build the assignment's environment and wrap it in NormalizeObservation
# (the assignment told us to use NormalizeObservation).
env = NormalizeObservation(InventoryManagementEnv())

# Network input/output sizes are read from the environment's spaces.
observation_shape = env.observation_space.shape
obs_dim = observation_shape[0]   # expected to be 6 for this assignment
n_actions = env.action_space.n   # expected to be 3 for this assignment

print("Observation dim:", obs_dim)
print("Action count:", n_actions)


In [None]:
# Cell 3: Hyperparameters (simple defaults you can tweak)

# --- Reproducibility -------------------------------------------------------
# Seed NumPy, Python's `random`, and PyTorch (in that order) with one value.
seed = 42
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(seed)

# --- Training --------------------------------------------------------------
num_episodes = 5_000          # start smaller (e.g. 2000) if using CPU
max_steps_per_episode = 500   # per-episode cap applied by the training loop
batch_size = 64               # transitions sampled per optimisation step
gamma = 0.99                  # discount factor used in the TD target
lr = 1e-3                     # Adam learning rate
replay_capacity = 20_000      # capacity passed to ReplayBuffer

# --- Epsilon-greedy exploration --------------------------------------------
eps_start = 1.0       # epsilon at step 0
eps_end = 0.05        # value epsilon decays towards
eps_decay = 3_000.0   # decay time constant; larger -> slower decay

# --- Target network --------------------------------------------------------
target_update_every = 1_000   # copy policy weights to the target every N steps

# --- Logging / plotting ----------------------------------------------------
print_interval = 50           # print a progress line every N episodes


In [None]:
# Cell 4: Initialize networks, optimizer, replay buffer
def _build_q_network():
    """Create a freshly initialised DQN on the selected device."""
    return DQN(obs_dim, n_actions).to(device)


# Online network (trained) and target network (used for the TD targets).
policy_net = _build_q_network()
target_net = _build_q_network()

# The target starts as an exact copy of the online network and stays in
# eval mode; it is only ever refreshed by copying weights across.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the online network's parameters are handed to the optimizer.
optimizer = optim.Adam(policy_net.parameters(), lr=lr)
replay_buffer = ReplayBuffer(capacity=replay_capacity)


In [None]:
# Cell 5: Helper functions - epsilon schedule, select_action, and optimize step

def get_epsilon(it):
    """Return the exploration rate for step counter ``it``.

    The value starts at ``eps_start`` for ``it == 0`` and decays
    exponentially towards ``eps_end`` with time constant ``eps_decay``.
    """
    decay_factor = math.exp(-it / eps_decay)
    exploration_span = eps_start - eps_end
    return eps_end + exploration_span * decay_factor

def select_action(state, epsilon):
    """Pick an action epsilon-greedily.

    Args:
        state: current observation (anything ``torch.tensor`` accepts).
        epsilon: probability of choosing a uniformly random action.

    Returns:
        int: a random action index with probability ``epsilon``, otherwise
        the index of the largest Q-value predicted by ``policy_net``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randrange(n_actions)

    # Greedy branch: evaluate the online network on a batch of one.
    with torch.no_grad():
        obs_batch = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        greedy_index = policy_net(obs_batch).argmax()
    return int(greedy_index.item())

# Loss criterion reused by every optimisation step.
mse_loss = nn.MSELoss()


def optimize_model():
    """Run one gradient step on a minibatch drawn from the replay buffer.

    Returns:
        float | None: the minibatch MSE loss, or ``None`` when the buffer
        holds fewer than ``batch_size`` transitions (no update is done).
    """
    if len(replay_buffer) < batch_size:
        return None

    def _as_tensor(data, dtype=torch.float32):
        # Copy sampled data into a tensor on the training device.
        return torch.tensor(data, dtype=dtype, device=device)

    states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
    state_batch = _as_tensor(states)
    # Actions, rewards and done flags get a trailing axis so they line up
    # with the (batch, 1) output of gather() below.
    action_col = _as_tensor(actions, torch.long).unsqueeze(1)
    reward_col = _as_tensor(rewards).unsqueeze(1)
    next_state_batch = _as_tensor(next_states)
    done_col = _as_tensor(dones).unsqueeze(1)

    # Q(s, a) of the actions that were actually taken.
    predicted_q = policy_net(state_batch).gather(1, action_col)

    # TD target: r + gamma * max_a' Q_target(s', a') * (1 - done),
    # computed without tracking gradients.
    with torch.no_grad():
        best_next_q = target_net(next_state_batch).max(dim=1, keepdim=True).values
        td_target = reward_col + (1.0 - done_col) * gamma * best_next_q

    loss = mse_loss(predicted_q, td_target)

    optimizer.zero_grad()
    loss.backward()
    # Clip the global gradient norm to 1.0 for stability.
    nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
    optimizer.step()

    return loss.item()


In [None]:
# Cell 6: Training loop
# Runs `num_episodes` episodes of epsilon-greedy interaction. After every
# environment step it stores the transition, performs one optimisation step
# (once the buffer holds a full batch) and periodically refreshes the target
# network. Per-episode reward / epsilon and per-update loss are recorded for
# the plots further down.
total_steps = 0       # environment steps taken across all episodes
loss_history = []     # one entry per optimisation step
reward_history = []   # one entry per episode (undiscounted return)
eps_history = []      # epsilon used on the last step of each episode

print("Starting training...")
for episode in trange(1, num_episodes + 1):
    state, _ = env.reset()
    episode_reward = 0.0
    episode_losses = []
    # Defined before the step loop so the logging below never hits an
    # undefined name, even if the loop body does not run.
    epsilon = get_epsilon(total_steps)

    for t in range(max_steps_per_episode):
        epsilon = get_epsilon(total_steps)
        action = select_action(state, epsilon)
        next_state, reward, terminated, truncated, info = env.step(action)

        # Only a genuine terminal state may switch off bootstrapping in the
        # TD target. A truncation (time limit) merely stops data collection:
        # the next state still has value, so it is stored with done = 0.
        replay_buffer.push(state, action, reward, next_state, float(terminated))

        state = next_state
        episode_reward += reward
        total_steps += 1

        loss_val = optimize_model()
        if loss_val is not None:
            episode_losses.append(loss_val)
            loss_history.append(loss_val)

        # update target network periodically
        if total_steps % target_update_every == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Either signal ends the episode itself.
        if terminated or truncated:
            break

    reward_history.append(episode_reward)
    eps_history.append(epsilon)

    # average loss for this episode (0.0 while the buffer is still warming up)
    avg_loss = np.mean(episode_losses) if episode_losses else 0.0

    if episode % print_interval == 0 or episode == 1:
        recent_rewards_avg = np.mean(reward_history[-print_interval:])
        print(f"Episode {episode:4d} | Reward: {episode_reward:8.2f} | AvgLast{print_interval}: {recent_rewards_avg:8.2f} | Epsilon: {epsilon:.3f} | AvgLoss: {avg_loss:.6f}")

print("Training finished.")


In [None]:
# Cell 7: Save the trained model
# Single source of truth for the checkpoint location, so the save call and
# the confirmation message cannot drift apart.
MODEL_PATH = "inventory_dqn_policy.pth"

# Only the weights (state_dict) of the online network are stored.
torch.save(policy_net.state_dict(), MODEL_PATH)
print(f"Saved policy to {MODEL_PATH}")


In [None]:
# Cell 8: Quick plots of reward and loss (useful to check convergence)
# Simple smoothing helper
def smooth(x, w=50):
    """Moving average of ``x`` over a window of ``w`` samples.

    Args:
        x: sequence of numbers.
        w: window length.

    Returns:
        ``x`` itself, unchanged, when it has fewer than ``w`` samples;
        otherwise a NumPy array of length ``len(x) - w + 1`` holding the
        mean of every full window ('valid' convolution).
    """
    if len(x) >= w:
        box_kernel = np.ones(w) / w
        return np.convolve(x, box_kernel, mode='valid')
    # Too short to fill even one window: hand the input back as-is.
    return x

# Smooth once and reuse the result for both the x and y data.
reward_smoothed = smooth(reward_history)
# smooth() returns only full-window averages, so its output is shorter than
# the raw series. Right-align it so that each smoothed point sits on the last
# episode of its window instead of being shifted towards episode 0.
smooth_offset = len(reward_history) - len(reward_smoothed)
smooth_x = range(smooth_offset, len(reward_history))

plt.figure(figsize=(12,4))

# Left panel: raw and smoothed episode reward.
plt.subplot(1,2,1)
plt.plot(reward_history, alpha=0.4, label='episode reward')
plt.plot(smooth_x, reward_smoothed, label='smoothed')
plt.title("Episode reward")
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.legend()

# Right panel: loss of every optimisation step (if any were recorded).
plt.subplot(1,2,2)
if loss_history:
    plt.plot(loss_history, alpha=0.6)
    plt.title("Training MSE loss (per update)")
    plt.xlabel("Update step")
    plt.ylabel("MSE loss")
else:
    plt.title("No loss data")
plt.tight_layout()
plt.show()


In [None]:
# Cell 9: Test the trained policy (render the env to observe behavior)
def test_policy(policy_model, episodes=10, render=True):
    policy_model.eval()
    results = []
    for ep in range(episodes):
        state, _ = env.reset()
        ep_reward = 0.0
        for t in range(env.max_steps):
            with torch.no_grad():
                st = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                action = int(policy_model(st).argmax().cpu().numpy())
            next_state, reward, terminated, truncated, info = env.step(action)
            ep_reward += reward
            if render:
                print(f"\n=== Test Episode {ep+1} | Step {t+1} ===")
                env.render()
                print(f"Action chosen: {action}, Reward: {reward:.2f}")
            state = next_state
            if terminated or truncated:
                break
        results.append(ep_reward)
        print(f"Test Episode {ep+1} finished | Total reward: {ep_reward:.2f}")
    return results

# Optional: restore a previously saved checkpoint before testing. Uncomment
# the line below to evaluate the weights stored on disk instead of the
# in-memory policy_net.
# policy_net.load_state_dict(torch.load("inventory_dqn_policy.pth", map_location=device))

# Run 5 greedy evaluation episodes. With render=True every step prints a
# header, the environment's render() output and the chosen action/reward,
# so expect a lot of output; pass render=False for just the per-episode totals.
test_results = test_policy(policy_net, episodes=5, render=True)
print("Test episode rewards:", test_results)
