In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.functional import normalize
from itertools import count
from torch.autograd import Variable
from environments.base_environment import OneHotEnv

In [160]:
class RiverSwimMDP_Agent(OneHotEnv):
    """Six-state, two-action chain MDP bundled with a primal-dual policy learner.

    Environment (tables built in ``__init__``):
      * action 0 moves deterministically from state ``s`` to ``s - 1``
        (state 0 stays where it is);
      * action 1 moves to ``s + 1`` with probability 0.9; the remaining mass
        is split between staying and slipping back to ``s - 1`` (0.1 stay in
        state 0, 0.05 / 0.05 in the middle states, 0.95 stay / 0.05 back in
        the last state);
      * rewards: 0.005 for staying in state 0 with action 0, and 1 for staying
        in the last state with action 1. Everything else is 0.

    Learner: three learnable tensors, each with its own Adam optimizer:
      * ``policy``    -- logits of shape (num_states, num_actions); a softmax
        over ALL entries yields one distribution ``mu`` over (state, action);
      * ``T``         -- logits of shape (num_states,); softmax gives a
        distribution over states;
      * ``potential`` -- one scalar ``f(s)`` per state.

    ``train`` alternates between minimising, w.r.t. ``policy`` and ``T``,

        -sum(R * mu) + sum_s f(s) softmax(T)_s - sum_{s,a} mu(s,a) E[f(s') | s,a]

    and maximising the last two terms w.r.t. ``potential``.

    NOTE(review): ``env_start`` / ``env_step`` / ``current_state`` handling is
    inherited from ``OneHotEnv``, which is not visible in this notebook, and
    ``OneHotEnv.__init__`` is never called here -- confirm that is intended.
    """

    # Number of gradient steps per outer round for each side of the saddle point.
    POLICY_STEPS_PER_ROUND = 10
    POTENTIAL_STEPS_PER_ROUND = 1

    def __init__(self):
        """Build the transition/reward tables and the learnable parameters."""
        self.states = []
        # NOTE(review): four action names are listed but num_actions is 2 and
        # nothing in this class reads this list -- confirm whether the base
        # class needs it.
        self.actions = ["up", "down", "left", "right"]
        # Not read anywhere in this class (the optimizers below hard-code
        # lr=0.001); kept for compatibility with any external user.
        self.lr = 0.2
        self.exp_rate = 0.3

        self.num_states = 6
        self.num_actions = 2

        # --- Environment parameters -------------------------------------
        self.current_state = 0
        self.reward_scale_factor = None
        # P[s, a, s'] = transition probability, R[s, a, s'] = reward.
        self.P = np.zeros((self.num_states, self.num_actions, self.num_states))
        self.R = np.zeros((self.num_states, self.num_actions, self.num_states))
        for s in range(self.num_states):
            if s == 0:
                # Leftmost state: action 0 stays put and earns the small reward.
                self.P[s, 0, s] = 1
                self.P[s, 1, s] = 0.1
                self.P[s, 1, s + 1] = 0.9
                self.R[s, 0, s] = 5. / 1000.
            elif s == self.num_states - 1:
                # Rightmost state: action 1 mostly stays and earns the big reward.
                self.P[s, 0, s - 1] = 1
                self.P[s, 1, s - 1] = 0.05
                self.P[s, 1, s] = 0.95
                self.R[s, 1, s] = 1
            else:
                self.P[s, 0, s - 1] = 1
                self.P[s, 1, s - 1] = 0.05
                self.P[s, 1, s] = 0.05
                self.P[s, 1, s + 1] = 0.9

        self.random_seed = 2023
        self.rand_generator = np.random.RandomState(self.random_seed)
        # The start state is drawn once, from the seeded generator.
        self.start_state = self.rand_generator.choice(self.num_states)
        # presumably [reward, observation, terminal] as consumed by the base
        # class -- TODO confirm against OneHotEnv.
        self.reward_obs_term = [0.0, None, False]

        transition_probs = torch.tensor(self.P, dtype=torch.float32)
        transition_rewards = torch.tensor(self.R, dtype=torch.float32)

        # Expected immediate reward per (state, action): sum over s' of R * P.
        self.R_tensor = (transition_rewards * transition_probs).sum(dim=2)

        # Flatten (s, a) into one row index (state-major) so that
        # P_tensor @ potential gives E[f(s') | s, a] for every pair at once.
        self.P_tensor = transition_probs.reshape(
            self.num_actions * self.num_states, self.num_states
        )

        # --- Agent parameters -------------------------------------------
        self.policy = torch.zeros(self.num_states, self.num_actions, requires_grad=True)
        self.policy_optimizer = torch.optim.Adam([self.policy], lr=0.001)

        self.T = torch.zeros(self.num_states, requires_grad=True)
        self.T_optimizer = torch.optim.Adam([self.T], lr=0.001)

        self.potential = torch.zeros(self.num_states, requires_grad=True)
        self.potential_optimizer = torch.optim.Adam([self.potential], lr=0.001)

        self.sm = nn.Softmax(dim=0)   # softmax over a 1-D vector
        self.psm = nn.Softmax(dim=1)  # row-wise softmax of a (states, actions) table

    def _joint_policy(self):
        """Return softmax over all policy logits, flattened state-major."""
        return self.sm(self.policy.flatten())

    def _constraint_gap(self, joint):
        """Return sum_s f(s) softmax(T)_s - sum_{s,a} joint(s,a) E[f(s') | s,a]."""
        t_part = self.potential * self.sm(self.T)
        # One entry per flattened (state, action) pair, same order as `joint`.
        expected_next_potential = torch.mv(self.P_tensor, self.potential)
        p_part = expected_next_potential * joint
        return t_part.sum() - p_part.sum()

    def train(self, rounds=10, log_every=100):
        """Run ``rounds`` rounds of alternating gradient updates.

        Each round takes ``POLICY_STEPS_PER_ROUND`` Adam steps on ``policy``
        and ``T`` (with ``potential`` frozen), followed by
        ``POTENTIAL_STEPS_PER_ROUND`` Adam steps on ``potential`` (with
        ``policy`` and ``T`` frozen).

        Args:
            rounds: number of outer rounds.
            log_every: print both losses and a sampled evaluation return every
                this many rounds; a falsy value disables logging.
        """
        policy_loss = None
        f_loss = None
        for step in range(rounds):
            # Phase 1: descend on policy and T with the potential held fixed.
            self.policy.requires_grad_(True)
            self.T.requires_grad_(True)
            self.potential.requires_grad_(False)
            for _ in range(self.POLICY_STEPS_PER_ROUND):
                joint = self._joint_policy()
                expected_reward = (self.R_tensor.flatten() * joint).sum()
                policy_loss = -expected_reward + self._constraint_gap(joint)

                self.policy_optimizer.zero_grad()
                self.T_optimizer.zero_grad()
                policy_loss.backward()
                self.policy_optimizer.step()
                self.T_optimizer.step()

            # Phase 2: ascend on the potential (minimise the negated gap)
            # with policy and T held fixed.
            self.policy.requires_grad_(False)
            self.T.requires_grad_(False)
            self.potential.requires_grad_(True)
            for _ in range(self.POTENTIAL_STEPS_PER_ROUND):
                f_loss = -self._constraint_gap(self._joint_policy())

                self.potential_optimizer.zero_grad()
                f_loss.backward()
                self.potential_optimizer.step()

            if log_every and step % log_every == 0:
                print('Policy loss: ', policy_loss.item())
                print('Potentials loss: ', f_loss.item())
                # Evaluate THIS instance, not whatever global happens to be
                # named `agent` in the notebook.
                rewards = self.evaluate_policy()
                print('Rewards', rewards)
                print('----------------------------------')

    def evaluate_policy(self, num_steps=100):
        """Sample a rollout from the current policy and return its total reward.

        Actions are drawn from the softmax of the current state's logit row,
        which equals the joint softmax used in ``train`` conditioned on that
        state.

        Args:
            num_steps: number of environment steps to take.

        Returns:
            Sum of ``env_step(action)[0]`` over the rollout.
        """
        rewards = 0
        self.env_start()
        for _ in range(num_steps):
            # assumes env_step keeps self.current_state up to date -- defined
            # in the base class, TODO confirm.
            prob = self.sm(self.policy[self.current_state])
            action = np.random.choice(self.num_actions, p=prob.detach().numpy())
            obs = self.env_step(action)
            rewards += obs[0]
        return rewards

In [161]:
# Fresh environment + learner; policy, T and potential all start as zero tensors.
agent = RiverSwimMDP_Agent()

In [162]:
# Run 10000 alternating optimisation rounds; losses and a sampled evaluation
# return are printed every 100 rounds. The recorded run below was stopped by
# hand (KeyboardInterrupt), so its log is incomplete.
agent.train(10000)

Policy loss:  -0.08089937269687653
Potentials loss:  -0.0
Rewards 7.015
----------------------------------
Policy loss:  -0.5130094289779663
Potentials loss:  0.00017523346468806267
Rewards 49.04500000000001
----------------------------------
Policy loss:  -0.8290524482727051
Potentials loss:  0.0018662847578525543
Rewards 66.01499999999999
----------------------------------
Policy loss:  -0.9030272364616394
Potentials loss:  0.0013418085873126984
Rewards 53.03
----------------------------------
Policy loss:  -0.9287384748458862
Potentials loss:  0.0038876160979270935
Rewards 37.015
----------------------------------
Policy loss:  -0.9343671798706055
Potentials loss:  -0.0018609948456287384
Rewards 20.04
----------------------------------
Policy loss:  -0.954905092716217
Potentials loss:  0.012672502547502518
Rewards 13.085
----------------------------------
Policy loss:  -0.9480862021446228
Potentials loss:  0.0029198825359344482
Rewards 88.0
----------------------------------
Policy 

KeyboardInterrupt: 

In [151]:
# Expected immediate reward per (state, action): rows are states, columns are
# actions (reward table weighted by transition probability, summed over s').
agent.R_tensor

tensor([[0.0050, 0.0000],
        [0.0000, 0.0000],
        [0.0000, 0.0000],
        [0.0000, 0.0000],
        [0.0000, 0.0000],
        [0.0000, 0.9500]])

In [163]:
# Results without using f and P in the objective: the raw policy logits next
# to their per-state (row-wise) softmax.
agent.policy, agent.psm(agent.policy), #agent.sm(agent.policy.flatten())

(tensor([[-4.3411, -4.3409],
         [-4.3281, -4.3418],
         [-4.3419, -4.3425],
         [-4.3424, -4.3708],
         [-4.3426, -4.3979],
         [-4.3736,  4.3533]], requires_grad=True),
 tensor([[4.9995e-01, 5.0005e-01],
         [5.0342e-01, 4.9658e-01],
         [5.0014e-01, 4.9986e-01],
         [5.0711e-01, 4.9289e-01],
         [5.1383e-01, 4.8617e-01],
         [1.6213e-04, 9.9984e-01]], grad_fn=<SoftmaxBackward>))

In [271]:
# Sanity-check the environment with 10 uniformly random actions, printing the
# reward and the arg-max index of the returned observation after each step.
# NOTE(review): the recorded output below shows next_state indices up to 8 and
# rewards of 2.0, while the class defined above has num_states = 6 and a
# maximum table reward of 1 -- the output appears to come from a different
# environment definition; re-run after Restart & Run All to confirm.
for i in range(10):
    action = np.random.choice(agent.num_actions)
    obs = agent.env_step(action)
    print('action:',action, '  reward:',obs[0].item(), '  next_state:',np.argmax(obs[1]))

action: 1   reward: 0.0   next_state: 5
action: 1   reward: 0.0   next_state: 6
action: 1   reward: 0.0   next_state: 7
action: 0   reward: 0.0   next_state: 8
action: 0   reward: 2.0   next_state: 0
action: 1   reward: 0.0   next_state: 5
action: 0   reward: 0.0   next_state: 6
action: 0   reward: 0.0   next_state: 7
action: 1   reward: 0.0   next_state: 8
action: 0   reward: 2.0   next_state: 0


In [275]:
# Learned state logits T (softmax of these weights the potential in the loss).
# NOTE(review): the recorded output has 9 entries although the class above
# sets num_states = 6 -- likely stale output from another environment.
agent.T

tensor([ 1.5785, -1.6616, 11.7198, 11.7198, 11.7198, -1.6616, 11.7198, 11.7198,
        11.7198])

In [274]:
# Learned per-state potential values.
# NOTE(review): the recorded output has 9 entries although the class above
# sets num_states = 6 -- likely stale output from another environment.
agent.potential

tensor([76.7428, 48.8321, 47.9522, 47.9522, 47.9522, 48.8321, 47.9522, 47.9522,
        47.9522], requires_grad=True)