In [1]:
import gymnasium as gym
import torch

# Import the required classes from the Inventory_env_class.py file that is provided
from Inventory_env_class import InventoryManagementEnv, NormalizeObservation, ReplayBuffer, DQN

#### Initializing the environment

In [2]:
# Build the environment directly from the provided class (no gym.make needed)
# and wrap it right away so the observations are normalized, which helps training.
env = NormalizeObservation(InventoryManagementEnv())

"NormalizeObservation" is a Gymnasium observation wrapper that scales each element of the observation vector by pre-defined factors. This helps with stabilizing training, especially when the different components of the state have diverse scales.

#### Checking out the observation and action spaces structure

In [3]:
# Grab both spaces once so the print statements below stay short
obs_space = env.observation_space
act_space = env.action_space

# Show the structure of each space
print(f"Observation space which is continuous:\n{obs_space}\n")
print(f"Action space which is discrete:\n{act_space}\n")

# Report the size of each space: length of the observation vector and number of discrete actions
print(f"Observation space dimensions: {obs_space.shape[0]}")
print(f"Action space dimensions: {act_space.n}")

Observation space which is continuous:
Box(0.0, inf, (6,), float32)

Action space which is discrete:
Discrete(3)

Observation space dimensions: 6
Action space dimensions: 3


#### Do a test run to get a sense of how this environment operates

In [4]:
# Begin from a fresh start state
observation, info = env.reset()

# Take a handful of random actions to see how the environment responds and what
# render() reports. Each pass through the loop is a single environment step,
# even though the printed label reads "Episode".
N_TEST_STEPS = 5
for step_number in range(1, N_TEST_STEPS + 1):
    action = env.action_space.sample()  # random action; replace with the agent's policy
    observation, reward, terminated, truncated, info = env.step(action)
    print(f"\nEpisode {step_number} | Action taken: {action}")
    env.render()

    # Start over from a new start state as soon as the current episode ends
    episode_over = terminated or truncated
    if episode_over:
        observation, info = env.reset()

env.close()


Episode 1 | Action taken: 1
Step: 1
Raw Inventory: 1.0
Product Inventory Before Sale: 0.0, After Sale: 0.0
Raw Price: 4.76, Product Price: 20.01
Demand: 10.10, Cash: 994.94

Episode 2 | Action taken: 1
Step: 2
Raw Inventory: 2.0
Product Inventory Before Sale: 0.0, After Sale: 0.0
Raw Price: 4.96, Product Price: 19.90
Demand: 8.64, Cash: 990.18

Episode 3 | Action taken: 0
Step: 3
Raw Inventory: 2.0
Product Inventory Before Sale: 0.0, After Sale: 0.0
Raw Price: 4.89, Product Price: 20.16
Demand: 9.63, Cash: 990.18

Episode 4 | Action taken: 0
Step: 4
Raw Inventory: 2.0
Product Inventory Before Sale: 0.0, After Sale: 0.0
Raw Price: 4.77, Product Price: 20.05
Demand: 6.15, Cash: 990.18

Episode 5 | Action taken: 1
Step: 5
Raw Inventory: 3.0
Product Inventory Before Sale: 0.0, After Sale: 0.0
Raw Price: 5.26, Product Price: 19.72
Demand: 10.70, Cash: 985.41


#### <u>When developing the DQN solution you need a few things</u> ####

1. The network architecture for the Q-network and the target network is provided in the class file, which is why DQN is imported.
    - Remember that the input to these networks is the "state" (observation space = 6) and the outputs are the Q-values, one per action (3).
    - So, we initialize them accordingly.
    - Also, the target network has to be the same as the Q-network when initialized.

Let us see how to do this.




In [5]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Network sizes are read from the environment's spaces
input_dim = env.observation_space.shape[0]  # length of the state vector (6 here)
output_dim = env.action_space.n             # number of discrete actions (3 here)

# Build the Q network first, then the target network, and move each to the device
Q_net = DQN(input_dim, output_dim)
Q_net = Q_net.to(device)
target_net = DQN(input_dim, output_dim)
target_net = target_net.to(device)

# Start the target network with exactly the same weights as the Q network
q_weights = Q_net.state_dict()
target_net.load_state_dict(q_weights)

# Switch the target network to evaluation ("no training") mode.
# Kept as the last expression so the notebook displays the network structure.
target_net.eval()

DQN(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=3, bias=True)
  )
)

2. The replay buffer is a simple fixed-capacity buffer used to store experience transitions (state, action, reward, next state, done). These transitions are later sampled in batches for training the DQN.
    - The "ReplayBuffer" class is already imported from the class file.
    - To initialize it, we simply have to specify the buffer capacity. A typical value is 10000, but it can be higher or lower depending on how much diversity you want in the data.

Let us see how we can initialize this.

In [6]:
# Capacity of the replay buffer (number of stored transitions) - the value is your choice
BUFFER_CAPACITY = 10_000
replay_buffer = ReplayBuffer(capacity=BUFFER_CAPACITY)

#### At this point you have all the components required to implement the training loop, which is the task of your Mini Project Part 2.