In [152]:
import gym
import torch
import torch.nn as nn
import numpy as np
import random
import matplotlib.pyplot as plt
from copy import deepcopy
from torch.distributions import Normal

# torch.manual_seed(42)

class SAC(nn.Module):
    def __init__(self, state_dim, action_dim, gamma=0.99, alpha=1e-4, tau=0.2, 
                 batch_size=128, pi_lr=1e-3, q_lr=1e-3, period=15):
        super().__init__()

        self.pi_model = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 2 * action_dim), nn.Tanh())

        self.q1_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.q2_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.batch_size = batch_size
        self.memory = []
        self.period = period
        self.counter = 0
        self.step = 1

        self.pi_optimizer = torch.optim.Adam(self.pi_model.parameters(), pi_lr)
        self.q1_optimizer = torch.optim.Adam(self.q1_model.parameters(), q_lr)
        self.q2_optimizer = torch.optim.Adam(self.q2_model.parameters(), q_lr)
        self.q1_target_model = deepcopy(self.q1_model)
        self.q2_target_model = deepcopy(self.q2_model)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action, _, log_stds = self.predict_actions(state)
        if self.step % 300 == 0:
            print(f'std: {torch.exp(log_stds - 1.0).detach().data.numpy().reshape(-1)}')
        return action.squeeze(1).detach().numpy()

    def fit(self, state, action, reward, done, next_state):
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            states, actions, rewards, dones, next_states = map(torch.FloatTensor, zip(*batch))
            rewards, dones = rewards.unsqueeze(1), dones.unsqueeze(1)

            next_actions, next_log_probs, _ = self.predict_actions(next_states)
            next_states_and_actions = torch.concatenate((next_states, next_actions), dim=1)
            next_q1_values = self.q1_target_model(next_states_and_actions)
            next_q2_values = self.q2_target_model(next_states_and_actions)
            next_min_q_values = torch.min(next_q1_values, next_q2_values)
            targets = rewards + self.gamma * (1 - dones) * (next_min_q_values - self.alpha * next_log_probs)

            states_and_actions = torch.concatenate((states, actions), dim=1)
            q1_loss = torch.mean((self.q1_model(states_and_actions) - targets.detach()) ** 2)
            q2_loss = torch.mean((self.q2_model(states_and_actions) - targets.detach()) ** 2)
            self.update_model(q1_loss, self.q1_optimizer, self.q1_model, self.q1_target_model)
            self.update_model(q2_loss, self.q2_optimizer, self.q2_model, self.q2_target_model)


            pred_actions, log_probs, _ = self.predict_actions(states)
            states_and_pred_actions = torch.concatenate((states, pred_actions), dim=1)
            q1_values = self.q1_model(states_and_pred_actions)
            q2_values = self.q2_model(states_and_pred_actions)
            min_q_values = torch.min(q1_values, q2_values)
            pi_loss = - torch.mean(min_q_values - self.alpha * log_probs)
            self.update_model(pi_loss, self.pi_optimizer)

            if self.step % 200 == 0:
                print(f'q1 loss: {q1_loss.data.numpy()}')
                print(f'q2 loss: {q2_loss.data.numpy()}')
                print(f'pi loss: {pi_loss.data.numpy()}')

            self.step += 1
            
    def update_model(self, loss, optimizer, model=None, target_model=None):
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if model != None and target_model != None and self.counter < self.period:
            for param, terget_param in zip(model.parameters(), target_model.parameters()):
                new_terget_param = (1 - self.tau) * terget_param + self.tau * param
                terget_param.data.copy_(new_terget_param)
            self.counter = 0
        self.counter += 1

    def predict_actions(self, states):
        means, log_stds = self.pi_model(states).T
        means, log_stds = means.unsqueeze(1), log_stds.unsqueeze(1)
        dists = Normal(means, torch.exp(log_stds - 1.0))
        actions = dists.rsample()
        log_probs = dists.log_prob(actions)
        return actions, log_probs, log_stds

In [None]:
import json
totals = []
for i in range(1):
    env = gym.make('Pendulum-v1')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    agent = SAC(state_dim, action_dim)
    
    episode_n = 100
    
    total_rewards = []
    for episode in range(episode_n):
    
        total_reward = 0
        state, info = env.reset()
        
        for t in range(200):
            action = agent.get_action(state)
            next_state, reward, terminated, truncated, _ = env.step(2 * action)
        
            agent.fit(state, action, reward, terminated or truncated, next_state)
        
            total_reward += reward
            state = next_state

            if terminated or truncated:
                break
    
        total_rewards.append(total_reward)
        print(f'total_reward: {total_reward}')
    
    plt.plot(total_rewards)
    plt.title('Total Rewards')
    plt.grid()
    plt.show()
    totals.append(total_rewards)
with open('SAC_Pendulum 3tr.json', 'w') as f:
    json.dump(totals, f)
        


In [198]:
import gym
import torch
import torch.nn as nn
import numpy as np
import random
import matplotlib.pyplot as plt
from copy import deepcopy
from torch.distributions import Normal

torch.autograd.set_detect_anomaly(False)

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(state_dim, 128), nn.ReLU(), 
            nn.Linear(128, 128), nn.ReLU(), 
        )
        self.mean_head = nn.Sequential(nn.Linear(128, action_dim), nn.Tanh())
        self.std_head = nn.Sequential(
            nn.Linear(128, action_dim),
            nn.Sigmoid()
        ) 

    def forward(self, input):
        input = self.body(input)
        means = self.mean_head(input)
        stds = self.std_head(input)
        log_stds = torch.log(stds + 1e-10)
        
        return means, log_stds


class SAC_many_dims(nn.Module):
    def __init__(self, state_dim, action_dim, gamma=0.99, alpha=5e-1, tau=0.5, 
                 batch_size=128, pi_lr=1e-3, q_lr=2e-3, a_lr=1e-4, period=10, episode_n=500):
        super().__init__()

        self.pi_model = Actor(state_dim, action_dim)

        self.q1_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.q2_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.pi_model.name = "policy"
        self.q1_model.name = "q1"
        self.q2_model.name = "q2"

        self.gamma = gamma
        # self.alpha = alpha
        self.tau = tau
        self.batch_size = batch_size
        self.memory = []
        self.period = period
        self.counter = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.epoch_n = 2
        self.step = 1
        self.episode_n = episode_n
        self.episode = 1

        self.pi_optimizer = torch.optim.Adam(self.pi_model.parameters(), pi_lr)
        self.q1_optimizer = torch.optim.Adam(self.q1_model.parameters(), q_lr)
        self.q2_optimizer = torch.optim.Adam(self.q2_model.parameters(), q_lr)
        self.pi_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.pi_optimizer, milestones=[int(0.4*episode_n), int(0.8*episode_n)], gamma=0.3)
        self.q1_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.q1_optimizer, milestones=[int(0.4*episode_n), int(0.8*episode_n)], gamma=0.3)
        self.q2_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.q2_optimizer, milestones=[int(0.4*episode_n), int(0.8*episode_n)], gamma=0.3)
        self.q1_target_model = deepcopy(self.q1_model)
        self.q2_target_model = deepcopy(self.q2_model)

        self.target_entropy = 0.98 * -np.log(1 / self.action_dim)
        self.log_alpha = torch.tensor(np.log(alpha), requires_grad=True)
        self.alpha = self.log_alpha
        self.alpha_optimiser = torch.optim.Adam([self.log_alpha], lr=a_lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action, _, log_stds = self.predict_actions(state)
        if self.step % 200 == 0:
            print(f'std: {torch.exp(log_stds).detach().data.numpy().reshape(-1)}')
            print(f'alpha: {round(self.alpha.detach().numpy().reshape(-1)[0] , 2)}')
            self.step += 1
        return action.squeeze(0).detach().numpy()

    def add_five(self, state, action, reward, done, next_state):
        if done:
            self.episode += 1
            if self.episode == 50 or self.episode == 150 or self.episode == 300:
                self.alpha *= 0.4
        self.memory.append([state, action, reward, int(done), next_state])

    def fit(self):
        mean_pi_loss = []
        if len(self.memory) > self.batch_size:
            for i in range(self.epoch_n):
                batch = random.sample(self.memory, self.batch_size)
                states, actions, rewards, dones, next_states = map(torch.FloatTensor, zip(*batch))
                rewards, dones = rewards.unsqueeze(1), dones.unsqueeze(1)
    
                next_actions, next_log_probs, _ = self.predict_actions(next_states)
                next_states_and_actions = torch.concatenate((next_states, next_actions), dim=1)
                next_q1_values = self.q1_target_model(next_states_and_actions)
                next_q2_values = self.q2_target_model(next_states_and_actions)
                next_min_q_values = torch.min(next_q1_values, next_q2_values)
                targets = rewards + self.gamma * (1 - dones) * (next_min_q_values - self.alpha * torch.sum(next_log_probs, dim=1).unsqueeze(1))
                # print(f'targets {targets}')
    
                states_and_actions = torch.concatenate((states, actions), dim=1)
                # print(f'st_and_act {states_and_actions.shape}')
                q1_loss = torch.mean((self.q1_model(states_and_actions) - targets.detach()) ** 2)
                q2_loss = torch.mean((self.q2_model(states_and_actions) - targets.detach()) ** 2)
                # print(f'q1_loss {q1_loss.shape}')
                self.update_model(q1_loss, self.q1_optimizer, self.q1_model, self.q1_target_model)
                self.update_model(q2_loss, self.q2_optimizer, self.q2_model, self.q2_target_model)

                self.q1_scheduler.step()
                self.q2_scheduler.step()

                if self.step % 100 == 0:
                    print(f'q1 loss: {q1_loss.data.numpy()}')
                    print(f'q2 loss: {q2_loss.data.numpy()}')

                if i % 2 == 0:
                    pred_actions, log_probs, log_stds = self.predict_actions(states)
                    # print(pred_actions.requires_grad)
                    # print(f'pred_act {pred_actions.shape}')
                    # print(f'log_probs {log_probs.shape}')
                    # print(f'log_stds {log_stds.shape}')
                    states_and_pred_actions = torch.concatenate((states, pred_actions), dim=1)

                    self.disable_grads(self.q1_model)
                    self.disable_grads(self.q2_model)
                    
                    q1_values = self.q1_model(states_and_pred_actions)
                    q2_values = self.q2_model(states_and_pred_actions)
                    min_q_values = torch.min(q1_values, q2_values)
                    pi_loss = - torch.mean(min_q_values + self.alpha * torch.sum(log_stds, dim=1).unsqueeze(1))
                    
                    if self.step % 100 == 0:
                        print(f'pi loss: {pi_loss.data.numpy()}')
                        self.step += 1
                    
                    mean_pi_loss.append(pi_loss.data.numpy())
                    self.update_model(pi_loss, self.pi_optimizer, self.pi_model)

                    self.pi_scheduler.step()

                    self.enable_grads(self.q1_model)
                    self.enable_grads(self.q2_model)

                    alpha_loss = -(self.log_alpha * (log_probs + self.target_entropy).detach()).mean()
                    alpha_loss.backward()
                    self.alpha_optimiser.step()
                    self.alpha = self.log_alpha.exp()
                    self.alpha_optimiser.zero_grad()
                    
                self.counter += 1
            
            self.step += 1
                
            return np.mean(mean_pi_loss)
        else:
            return 0
            
    def update_model(self, loss, optimizer, model=None, target_model=None):
        loss.backward()
        if model.name == "policy":
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
        else:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
        if model != None and target_model != None and self.counter % self.period == 0:
            for param, terget_param in zip(model.parameters(), target_model.parameters()):
                new_terget_param = (1 - self.tau) * terget_param + self.tau * param
                terget_param.data.copy_(new_terget_param)

    def predict_actions(self, states):
        means, log_stds = self.pi_model(states)
        # вычитаем единицу чтобы дать возможность больше сузить ДИ при желании нейросети
        dists = Normal(means, torch.exp(log_stds))
        actions = dists.rsample()
        actions = torch.tanh(actions)
        log_probs = dists.log_prob(actions)
        return actions, log_probs, log_stds

    def disable_grads(self, net):
        for param in net.parameters():
            param.requires_grad = False

    def enable_grads(self, net):
        for param in net.parameters():
            param.requires_grad = True

In [122]:
def test_SAC(alpha, lr):
    env = gym.make('LunarLander-v2', continuous=True)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    episode_n = 1000

    agent = SAC_many_dims(state_dim, action_dim, episode_n=episode_n)
    
    mean_pi_loss = []
    
    total_rewards = []
    counter = 0
    for episode in range(episode_n):
    
        total_reward = 0
        state, info = env.reset()

        pi_loss = []
        
        for i in range(700):
            action = agent.get_action(state)
    
            next_state, reward, terminated, truncated, info = env.step(action)
            
            agent.add_five(state, action, reward, terminated or truncated, next_state)
            counter += 1

            if counter % 4 == 0:
                _pi_loss = agent.fit()
                pi_loss.append(_pi_loss)
            
            total_reward += reward
            state = next_state
            if terminated or truncated:
                break

        if episode % 3 == 0:
            print(f'episode: {episode},  mean:  {np.mean(total_rewards[-3:])}')
        total_rewards.append(total_reward)
        mean_pi_loss.append(np.mean(pi_loss))
                
        
        
    
    plt.plot(total_rewards)
    plt.title(f'Total Rewards SAC alpha={alpha}, lr={lr}')
    plt.grid()
    plt.show()
    plt.plot(mean_pi_loss)
    plt.title(f'Mean policy loss SAC alpha={alpha}, lr={lr}')
    plt.grid()
    plt.show()

In [None]:
# начался рост. дошло до 110 пунктов в лунарлэндере.
# нормальные значения для лосса политики - чем меньше тем лучше. В данном случае дошло примерно до -20.
# нормальные значения для лосса критика - чем ближе к нулю тем лучше. Дошло до 0.5 что есть хорошо.
# использовалась фиксированная альфа равная 0.1. 
# gamma=0.99, alpha=1e-1, tau=0.5, 
# batch_size=64, pi_lr=1e-3, q_lr=1e-3, a_lr=1e-4, period=15
# от этих значений уже будем плясать

# первый вариант улучшения - постепенно понижать альфу + увеличить количество итераций чтобы можно было сойтись.
# Если не сойдется то можно добавить lr decay.

In [199]:
torch.manual_seed(42)
lrs = [1e-3]

for lr in lrs:
    test_SAC(alpha, lr)

episode: 0,  mean:  nan
episode: 3,  mean:  -213.54250026634932
q1 loss: 10.594131469726562
q2 loss: 10.150322914123535
pi loss: 15.074338912963867
episode: 6,  mean:  -751.3608013903363
std: [0.9965342  0.99970347]
alpha: 0.49
episode: 9,  mean:  -592.1580354654861
q1 loss: 63.68525695800781
q2 loss: 62.786319732666016
pi loss: 39.901187896728516
episode: 12,  mean:  -386.7966972718635
std: [0.8875459 0.9822687]
alpha: 0.48
episode: 15,  mean:  -325.8962036946081
q1 loss: 18.688411712646484
q2 loss: 21.683244705200195
pi loss: 30.45903205871582
episode: 18,  mean:  -481.3059039275843
episode: 21,  mean:  -576.912971081665
std: [0.9980971  0.99919873]
alpha: 0.47
episode: 24,  mean:  -461.0221037725937
episode: 27,  mean:  -501.47160655996055
q1 loss: 21.059650421142578
q2 loss: 18.780210494995117
pi loss: 54.23542785644531
episode: 30,  mean:  -215.54283383749473
std: [0.54262197 0.91355973]
alpha: 0.46
q1 loss: 54.86723327636719
q2 loss: 50.81185531616211
pi loss: 72.92225646972656
s

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor []], which is output 0 of ExpBackward0, is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [19]:
torch.arange(0, 2 * 2, 2)

tensor([0, 2])

In [56]:
import gym
import torch
import torch.nn as nn
import numpy as np
import random
import matplotlib.pyplot as plt
from copy import deepcopy
from torch.distributions import Normal
from torch.distributions.categorical import Categorical

torch.manual_seed(42)

class SAC_discrete(nn.Module):
    def __init__(self, state_dim, action_dim, gamma=0.99, alpha=1e-4, tau=0.2, 
                 batch_size=128, pi_lr=1e-3, q_lr=1e-3, period=15):
        super().__init__()

        self.pi_model = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, action_dim), nn.Softmax())

        self.q1_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.q2_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(), 
                                      nn.Linear(128, 128), nn.ReLU(), 
                                      nn.Linear(128, 1))

        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.batch_size = batch_size
        self.memory = []
        self.period = period
        self.counter = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.epoch_n = 20
        

        self.pi_optimizer = torch.optim.Adam(self.pi_model.parameters(), pi_lr)
        self.q1_optimizer = torch.optim.Adam(self.q1_model.parameters(), q_lr)
        self.q2_optimizer = torch.optim.Adam(self.q2_model.parameters(), q_lr)
        self.q1_target_model = deepcopy(self.q1_model)
        self.q2_target_model = deepcopy(self.q2_model)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        action, _ = self.predict_actions(state)
        return action.squeeze(0).detach().numpy()

    def add_five(self, state, action, reward, done, next_state):
        self.memory.append([state, action, reward, done, next_state])

    def fit(self):
        if len(self.memory) > self.batch_size:
            for _ in range(self.epoch_n):
                batch = random.sample(self.memory, self.batch_size)
                states, actions, rewards, dones, next_states = map(torch.FloatTensor, zip(*batch))
                rewards, dones = rewards.unsqueeze(1), dones.unsqueeze(1)
    
                next_actions, next_log_probs = self.predict_actions(next_states)
                next_states_and_actions = torch.concatenate((next_states, next_actions), dim=1)
                next_q1_values = self.q1_target_model(next_states_and_actions)
                next_q2_values = self.q2_target_model(next_states_and_actions)
                next_min_q_values = torch.min(next_q1_values, next_q2_values)
                targets = rewards + self.gamma * (1 - dones) * (next_min_q_values - self.alpha * next_log_probs)
    
                states_and_actions = torch.concatenate((states, actions), dim=1)
                q1_loss = torch.mean((self.q1_model(states_and_actions) - targets.detach()) ** 2)
                q2_loss = torch.mean((self.q2_model(states_and_actions) - targets.detach()) ** 2)
                self.update_model(q1_loss, self.q1_optimizer, self.q1_model, self.q1_target_model)
                self.update_model(q2_loss, self.q2_optimizer, self.q2_model, self.q2_target_model)
    
                pred_actions, log_probs = self.predict_actions(states)
                states_and_pred_actions = torch.concatenate((states, pred_actions), dim=1)
                q1_values = self.q1_model(states_and_pred_actions)
                q2_values = self.q2_model(states_and_pred_actions)
                min_q_values = torch.min(q1_values, q2_values)
                pi_loss = - torch.mean(min_q_values - self.alpha * log_probs)
                self.update_model(pi_loss, self.pi_optimizer)
            
    def update_model(self, loss, optimizer, model=None, target_model=None):
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if model != None and target_model != None and self.counter < self.period:
            for param, terget_param in zip(model.parameters(), target_model.parameters()):
                new_terget_param = (1 - self.tau) * terget_param + self.tau * param
                terget_param.data.copy_(new_terget_param)
            self.counter = 0
        self.counter += 1

    def predict_actions(self, states):
        probs = self.pi_model(states)
        print(probs)
        dists = Categorical(probs=probs)
        print(dists)
        actions = dists.sample()
        print(actions)
        log_probs = dists.log_prob(actions)
        return actions, log_probs

In [None]:
env = gym.make('Acrobot-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = SAC_discrete(state_dim, action_dim)

episode_n = 1

total_rewards = []
for episode in range(episode_n):

    total_reward = 0
    state = env.reset()
    
    for t in range(700):
        action = agent.get_action(state)
        print(action)

        next_state, reward, done, _ = env.step(action)
        
        
        agent.add_five(state, action, reward, done, next_state)
    
        total_reward += reward
        state = next_state
        if done:
            break
            
    agent.fit()
    total_rewards.append(total_reward)

plt.plot(total_rewards)
plt.title('Total Rewards')
plt.grid()
plt.show()