In [2]:
import os


In [3]:
import torch.nn as nn
import torch.optim as optim
import torch as T
from torch.distributions.categorical import Categorical
import numpy as np



class AgentNetwork(nn.Module):
    def __init__(self,input_dims,action_dim,lr,layer1=256,layer2=256,weight_file='weightFiles/ppo_discrete'):
        super(AgentNetwork,self).__init__()
        self.checkpoint_file = os.path.join(weight_file,'ppo_actor_weight')
        #TOCHECK: *input_dims vs input_dims
        self.actor = nn.Sequential(
                nn.Linear(*input_dims,layer1),
                nn.ReLU(),
                nn.Linear(layer1,layer2),
                nn.ReLU(),
                nn.Linear(layer2,action_dim),
                nn.Softmax(dim=-1)               
        )
        
        self.optimizer = optim.Adam(self.parameters(),lr=lr)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    
    def forward(self,state):
        dist = self.actor(state)
        #TOCHECK: what does categorical do
        dist = Categorical(dist)
        return dist
    
    def save_checkpoint(self):
        T.save(self.state_dict(),self.checkpoint_file)
    
    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))

class CriticNetwork(nn.Module):
    def __init__(self,input_dims,lr,layer1=256,layer2=256,weight_file='weightFiles/ppo_discrete'):
        super(CriticNetwork,self).__init__()
        self.checkpoint_file = os.path.join(weight_file,'ppo_critic_weight')
        self.critic = nn.Sequential(
                nn.Linear(*input_dims,layer1),
                nn.ReLU(),
                nn.Linear(layer1,layer2),
                nn.ReLU(),
                nn.Linear(layer2,1)
        )
        self.optimizer = optim.Adam(self.parameters(),lr=lr)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
        
    def forward(self,state):
        value = self.critic(state)
        return value
    
    def save_checkpoint(self):
        T.save(self.state_dict(),self.checkpoint_file)
    
    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))
        
    
    
    
        
        
        
        
        


In [4]:
class ReplayMemory:
    def __init__(self,batch_size):
        self.states = []
        self.probs=[]
        self.vals=[]
        self.actions=[]
        self.rewards=[]
        self.dones=[]
        self.batch_size = batch_size
    
    def store_memory(self,state,action,prob,val,reward,done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(prob)
        self.vals.append(val)
        self.rewards.append(reward)
        self.dones.append(done)
        
    def clear_memory(self):
        self.states = []
        self.probs=[]
        self.vals=[]
        self.actions=[]
        self.rewards=[]
        self.dones=[]
        
    def generate_batches(self):
        n_states = len(self.states)
        batches=[]
        i=0
        indices = np.arange(n_states,dtype = np.int64)
        np.random.shuffle(indices)
        for i in range(n_states):
            batches.append(indices[i:i+self.batch_size])
            i+=self.batch_size
        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs), \
                np.array(self.vals),\
                np.array(self.rewards), \
                np.array(self.dones),\
                batches
    

In [17]:
class Agent:
    def __init__(self,n_actions,input_dims,gamma=0.99,lr=0.0003,lambda_factor=0.95,policy_clip=0.2,batch_size=64,n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.lambda_factor = lambda_factor
        self.actor = AgentNetwork(action_dim = n_actions,input_dims = input_dims,lr = lr)
        self.critic = CriticNetwork(input_dims,lr)
        self.memory = ReplayMemory(batch_size)
        
    def remember(self,state,action,prob,val,reward,done):
        self.memory.store_memory(state,action,prob,val,reward,done)
    
    def save_models(self):
        print("saving model file")
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
    
    def load_models(self):
        print("loading model file")
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        
    def choose_action(self,observation):
        state = T.tensor([observation],dtype=T.float).to(self.actor.device)
        dist = self.actor(state)
        value = self.critic(state)
        print(f"value {value}")
        #TOCHECK what is the datatype action 
        action = dist.sample()
        print(f"action = {action}")
        #TOCHECK what does squeeze do
        
        probs = T.squeeze(dist.log_prob(action)).item()
        print(f"log prob {dist.log_prob(action)} squeezed = {T.squeeze(dist.log_prob(action))} Probs = {probs}")
        action = T.squeeze(action).item()
        value  = T.squeeze(value).item()
        return action,probs,value
        
    def learn(self):
        for _ in range(self.n_epochs):
            state_arr,action_arr,probs_arr,vals_arr, \
            rewards_arr,dones_arr,batches = self.memory.generate_batches()
            
            advantages=np.zeros_like(rewards_arr)
            
            for t in reversed(range(len(state_arr)-1)):
                advantages[t] = rewards_arr[t]+self.gamma*vals_arr[t+1]*(1-int(dones_arr[t]))-vals_arr[t] + self.gamma*self.lambda_factor*advantages[t+1]
                
            advantages = T.tensor(advantages).to(self.actor.device)
            values = T.tensor(vals_arr).to(self.actor.device)
            
            for batch in batches:
                states = T.tensor(state_arr[batch],dtype = T.float).to(self.actor.device)
                actions = T.tensor(action_arr[batch],dtype = T.float).to(self.actor.device)
                old_probs = T.tensor(probs_arr[batch],dtype = T.float).to(self.actor.device)
                dist = self.actor(states)
                new_probs = dist.log_prob(actions)
                #TOCHECK: what do exp() do
                
                prob_ratio = new_probs.exp()/old_probs.exp()
                
                weighted_prob = advantages[batch]*(prob_ratio)
                
                weighted_clipped_probs = T.clamp(prob_ratio,1-self.policy_clip,1+self.policy_clip)*advantages[batch]
                
                actor_loss = - T.min(weighted_clipped_probs,weighted_prob).mean()
                
                critic_values = self.critic(states)
                #TOCHECK what does squeeze do here
                critic_values = T.squeeze(critic_values)
                
                desired_state_values = advantages[batch]+values[batch]
                critic_loss = (desired_state_values-critic_values)**2
                critic_loss = critic_loss.mean()
                
                total_loss = actor_loss+critic_loss*0.5
                
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()
        self.memory.clear_memory() 
    

In [18]:
import gym


env = gym.make('CartPole-v1')
print( env.observation_space.shape)
agent = Agent(n_actions = env.action_space.n,input_dims = env.observation_space.shape,batch_size=5,n_epochs=4)
N = 20
episodes = 3000
n_step=0
learn_iters=0
best_score = 0
score_history=[]

for ep in range(episodes):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action,prob,val = agent.choose_action(state)
        next_state,reward,done,_ = env.step(action)
        
        score+=reward
        n_step+=1
        agent.remember(state,action,prob,val,reward,done)
        state = next_state
        if n_step %N == 0:
            agent.learn()
            learn_iters+=1
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    
    if avg_score > best_score:
        best_score = avg_score
        agent.save_models()
    print(f"episode {ep} current score {score} avg score {avg_score} best_score {best_score}")
    
        
        
        


(4,)
value tensor([[0.0229]], device='cuda:0', grad_fn=<AddmmBackward0>)
action = tensor([1], device='cuda:0')
log prob tensor([-0.6421], device='cuda:0', grad_fn=<SqueezeBackward1>) squeezed = -0.6421285271644592 Probs = -0.6421285271644592
value tensor([[-0.0144]], device='cuda:0', grad_fn=<AddmmBackward0>)
action = tensor([0], device='cuda:0')
log prob tensor([-0.7557], device='cuda:0', grad_fn=<SqueezeBackward1>) squeezed = -0.7556757926940918 Probs = -0.7556757926940918
value tensor([[0.0235]], device='cuda:0', grad_fn=<AddmmBackward0>)
action = tensor([1], device='cuda:0')
log prob tensor([-0.6425], device='cuda:0', grad_fn=<SqueezeBackward1>) squeezed = -0.6424530148506165 Probs = -0.6424530148506165
value tensor([[-0.0142]], device='cuda:0', grad_fn=<AddmmBackward0>)
action = tensor([0], device='cuda:0')
log prob tensor([-0.7555], device='cuda:0', grad_fn=<SqueezeBackward1>) squeezed = -0.7555360198020935 Probs = -0.7555360198020935
value tensor([[0.0235]], device='cuda:0', gra

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [86]:
for ep in range(episodes):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action,prob,val = agent.choose_action(state)
        next_state,reward,done,_ = env.step(action)   
        score+=reward
        n_step+=1 
        state = next_state  
        env.render()
    print(f"ep score = {score}")


ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 483.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0
ep score = 500.0


KeyboardInterrupt: 