In [8]:
%load_ext autoreload
%autoreload 2

from MarkovDecisionProcess import MarkovDecisionProcess
from PolicyNetwork import PolicyNetwork
from ValueNetwork import ValueNetwork
from simulation import simulate

import numpy as np
import random
import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
def transition_function(state, action):
    """Return the successor state obtained by adding ``action`` to ``state``.

    The sum is computed with ``+``, so NumPy arrays are added element-wise
    and a new object is returned; neither input is modified.
    """
    return state + action

def reward_function(state, action):
    """Return +1 when ``action`` equals ``[1, 1]`` and -1 for any other action.

    ``state`` is accepted so the signature is (state, action), but it does
    not influence the reward.
    """
    return 1 if np.array_equal(action, np.array([1, 1])) else -1

def policy_function(state):
    """Pick ``[1, 1]`` or ``[-1, -1]`` from a single uniform draw on [0, 1].

    A draw strictly greater than 0.5 yields ``[1, 1]``; anything else
    (including exactly 0.5) yields ``[-1, -1]``. ``state`` is ignored.
    """
    draw = random.uniform(0, 1)
    sign = 1 if draw > 0.5 else -1
    return np.array([sign, sign])

In [10]:
# Build the process starting at the origin with a zero action, wired to the
# transition and reward functions defined above.
# NOTE(review): T=100 is presumably the horizon / number of steps — confirm
# against MarkovDecisionProcess.
example = MarkovDecisionProcess(
    state=np.array([0, 0]),
    action=np.array([0, 0]),
    transition_func=transition_function,
    reward_func=reward_function,
    T=100,
)

In [11]:
# policy_function draws from Python's global `random` generator; seed it here
# so the simulated trajectory (and the state printed afterwards) is the same
# on every Restart & Run All.
random.seed(0)
simulate(example, policy_function)

In [12]:
# Show the process's printable summary after the simulation: current state,
# action and reward, plus the signatures of the transition and reward functions.
print(example)

Current state: [6 6] 
Current action: [-1 -1] 
Current reward: -1 
Transition function: transition_function(state, action) 
Reward function: reward_function(state, action)


In [15]:
# Seed torch's RNG before anything random happens in this cell, so the probe
# input (and any random weight initialisation inside the networks) — and hence
# the printed numbers — are reproducible across runs.
torch.manual_seed(0)

value_net = ValueNetwork(example)
policy_net = PolicyNetwork(example)

# Random probe input with one entry per component of the current state.
dummy_state = torch.randn(len(example.get_state()))

value = value_net(dummy_state)
mean, std = policy_net(dummy_state)

# Check output shapes
print("Value:", value)
print("Mean:", mean)
print("Std:", std)
print("Mean shape:", mean.shape)
print("Std shape:", std.shape)

# Turn the visual check into a real one: the policy head should emit one
# standard deviation per mean component.
assert mean.shape == std.shape, "policy mean and std should have matching shapes"


Value: tensor([-0.5032], grad_fn=<AddBackward0>)
Mean: tensor([-0.9241,  0.5871], grad_fn=<AddBackward0>)
Std: tensor([0.7045, 1.1391], grad_fn=<ExpBackward0>)
Mean shape: torch.Size([2])
Std shape: torch.Size([2])
