In [1]:
import gym
from collections import namedtuple
import itertools
from itertools import count
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.normal import Normal
import numpy as np
import collections
import random
import sys
import scipy.stats
import scipy.io as sio
import matplotlib.pyplot as plt
from simulation_env_new import Env
from tqdm import tqdm
from FourWI_EV_new import EV

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, x):
        x = F.relu(self.fc1(x))
        mu = F.tanh(self.fc_mu(x))
        std = F.softplus(self.fc_std(x)) + 0.01
        dist = Normal(mu, std)
        normal_sample = dist.rsample()  # rsample()是重参数化采样函数    
        log_prob = dist.log_prob(normal_sample)
        action = torch.tanh(normal_sample)  # 计算tanh_normal分布的对数概率密度
        log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7)
        action = action * self.action_bound
        return action, log_prob

class QValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        cat = torch.cat([x, a], dim=1)  # 拼接状态和动作
        x = F.relu(self.fc1(cat))
        return self.fc2(x)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

class SAC:
    ''' 处理连续动作的SAC算法 '''
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound,
                 actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma):
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim,
                               action_bound).to(device)  # 策略网络
        # 第一个Q网络
        self.critic_1 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # 第二个Q网络
        self.critic_2 = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic_1 = QValueNet(state_dim, hidden_dim,
                                         action_dim).to(device)  # 第一个目标Q网络
        self.target_critic_2 = QValueNet(state_dim, hidden_dim,
                                         action_dim).to(device)  # 第二个目标Q网络
        # 令目标Q网络的初始参数和Q网络一样
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # 使用alpha的log值,可以使训练结果比较稳定
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float)
        self.log_alpha.requires_grad = True  # 可以对alpha求梯度
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr)
        self.target_entropy = target_entropy  # 目标熵的大小
        self.gamma = gamma
        self.tau = tau

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(device)
        action = self.actor(state)[0]
        return [action.item()]

    def calc_target(self, rewards, next_states, dones):  # 计算目标Q值
        next_actions, log_prob = self.actor(next_states)
        entropy = -log_prob
        q1_value = self.target_critic_1(next_states, next_actions)
        q2_value = self.target_critic_2(next_states, next_actions)
        next_value = torch.min(q1_value,
                               q2_value) + self.log_alpha.exp() * entropy
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(),
                                       net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(device)
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.float).view(-1, 1).to(device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(device)
        rewards = (rewards + 1) / 1  # 对倒立摆环境的奖励进行重塑

        # 更新两个Q网络
        td_target = self.calc_target(rewards, next_states, dones)
        critic_1_loss = torch.mean(
            F.mse_loss(self.critic_1(states, actions), td_target.detach()))
        critic_2_loss = torch.mean(
            F.mse_loss(self.critic_2(states, actions), td_target.detach()))
        self.critic_1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic_1_optimizer.step()
        self.critic_2_optimizer.zero_grad()
        critic_2_loss.backward()
        self.critic_2_optimizer.step()

        # 更新策略网络
        new_actions, log_prob = self.actor(states)
        entropy = -log_prob
        q1_value = self.critic_1(states, new_actions)
        q2_value = self.critic_2(states, new_actions)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy -
                                torch.min(q1_value, q2_value))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # 更新alpha值
        alpha_loss = torch.mean(
            (entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)
        
    def save(self,episode):
        torch.save(self.actor.state_dict(), "./model/MBPO/mbpo_s3_actor{}.pth".format(episode))

    def load(self,episode):
        self.actor.load_state_dict(torch.load("./model/sac_actor{}.pth".format(episode)))

In [3]:
class Swish(nn.Module):
    ''' Swish激活函数 '''
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)


def init_weights(m):
    ''' 初始化模型权重 '''
    def truncated_normal_init(t, mean=0.0, std=0.01):
        torch.nn.init.normal_(t, mean=mean, std=std)
        while True:
            cond = (t < mean - 2 * std) | (t > mean + 2 * std)
            if not torch.sum(cond):
                break
            t = torch.where(
                cond,
                torch.nn.init.normal_(torch.ones(t.shape, device=device),
                                      mean=mean,
                                      std=std), t)
        return t

    if type(m) == nn.Linear or isinstance(m, FCLayer):
        truncated_normal_init(m.weight, std=1 / (2 * np.sqrt(m._input_dim)))
        m.bias.data.fill_(0.0)


class FCLayer(nn.Module):
    ''' 集成之后的全连接层 '''
    def __init__(self, input_dim, output_dim, ensemble_size, activation):
        super(FCLayer, self).__init__()
        self._input_dim, self._output_dim = input_dim, output_dim
        self.weight = nn.Parameter(
            torch.Tensor(ensemble_size, input_dim, output_dim).to(device))
        self._activation = activation
        self.bias = nn.Parameter(
            torch.Tensor(ensemble_size, output_dim).to(device))

    def forward(self, x):
        return self._activation(
            torch.add(torch.bmm(x, self.weight), self.bias[:, None, :]))

In [4]:
class EnsembleModel(nn.Module):
    ''' 环境模型集成 '''
    def __init__(self,
                 state_dim,
                 action_dim,
                 model_alpha,
                 ensemble_size=5,
                 learning_rate=1e-4):
        super(EnsembleModel, self).__init__()
        # 输出包括均值和方差,因此是状态与奖励维度之和的两倍
        self._output_dim = (state_dim + 1) * 2
        self._model_alpha = model_alpha  # 模型损失函数中加权时的权重
        self._max_logvar = nn.Parameter((torch.ones(
            (1, self._output_dim // 2)).float() / 2).to(device),
                                        requires_grad=False)
        self._min_logvar = nn.Parameter((-torch.ones(
            (1, self._output_dim // 2)).float() * 10).to(device),
                                        requires_grad=False)

        self.layer1 = FCLayer(state_dim + action_dim, 200, ensemble_size,
                              Swish())
        self.layer2 = FCLayer(200, 200, ensemble_size, Swish())
        self.layer3 = FCLayer(200, 200, ensemble_size, Swish())
        self.layer4 = FCLayer(200, 200, ensemble_size, Swish())
        self.layer5 = FCLayer(200, self._output_dim, ensemble_size,
                              nn.Identity())
        self.apply(init_weights)  # 初始化环境模型中的参数
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x, return_log_var=False):
        ret = self.layer5(self.layer4(self.layer3(self.layer2(
            self.layer1(x)))))
        mean = ret[:, :, :self._output_dim // 2]
        # 在PETS算法中,将方差控制在最小值和最大值之间
        logvar = self._max_logvar - F.softplus(
            self._max_logvar - ret[:, :, self._output_dim // 2:])
        logvar = self._min_logvar + F.softplus(logvar - self._min_logvar)
        return mean, logvar if return_log_var else torch.exp(logvar)

    def loss(self, mean, logvar, labels, use_var_loss=True):
        inverse_var = torch.exp(-logvar)
        if use_var_loss:
            mse_loss = torch.mean(torch.mean(torch.pow(mean - labels, 2) *
                                             inverse_var,
                                             dim=-1),
                                  dim=-1)
            var_loss = torch.mean(torch.mean(logvar, dim=-1), dim=-1)
            total_loss = torch.sum(mse_loss) + torch.sum(var_loss)
        else:
            mse_loss = torch.mean(torch.pow(mean - labels, 2), dim=(1, 2))
            total_loss = torch.sum(mse_loss)
        return total_loss, mse_loss

    def train(self, loss):
        self.optimizer.zero_grad()
        loss += self._model_alpha * torch.sum(
            self._max_logvar) - self._model_alpha * torch.sum(self._min_logvar)
        loss.backward()
        self.optimizer.step()


class EnsembleDynamicsModel:
    ''' 环境模型集成,加入精细化的训练 '''
    def __init__(self, state_dim, action_dim, model_alpha=0.01, num_network=5):
        self._num_network = num_network
        self._state_dim, self._action_dim = state_dim, action_dim
        self.model = EnsembleModel(state_dim,
                                   action_dim,
                                   model_alpha,
                                   ensemble_size=num_network)
        self._epoch_since_last_update = 0

    def train(self,
              inputs,
              labels,
              batch_size=256,
              holdout_ratio=0.1,
              max_iter=20):
        # 设置训练集与验证集
        permutation = np.random.permutation(inputs.shape[0])
        inputs, labels = inputs[permutation], labels[permutation]
        num_holdout = int(inputs.shape[0] * holdout_ratio)
        train_inputs, train_labels = inputs[num_holdout:], labels[num_holdout:]
        holdout_inputs, holdout_labels = inputs[:
                                                num_holdout], labels[:
                                                                     num_holdout]
        holdout_inputs = torch.from_numpy(holdout_inputs).float().to(device)
        holdout_labels = torch.from_numpy(holdout_labels).float().to(device)
        holdout_inputs = holdout_inputs[None, :, :].repeat(
            [self._num_network, 1, 1])
        holdout_labels = holdout_labels[None, :, :].repeat(
            [self._num_network, 1, 1])

        # 保留最好的结果
        self._snapshots = {i: (None, 1e10) for i in range(self._num_network)}

        for epoch in itertools.count():
            # 定义每一个网络的训练数据
            train_index = np.vstack([
                np.random.permutation(train_inputs.shape[0])
                for _ in range(self._num_network)
            ])
            # 所有真实数据都用来训练
            for batch_start_pos in range(0, train_inputs.shape[0], batch_size):
                batch_index = train_index[:, batch_start_pos:batch_start_pos +
                                          batch_size]
                train_input = torch.from_numpy(
                    train_inputs[batch_index]).float().to(device)
                train_label = torch.from_numpy(
                    train_labels[batch_index]).float().to(device)

                mean, logvar = self.model(train_input, return_log_var=True)
                loss, _ = self.model.loss(mean, logvar, train_label)
                self.model.train(loss)

            with torch.no_grad():
                mean, logvar = self.model(holdout_inputs, return_log_var=True)
                _, holdout_losses = self.model.loss(mean,
                                                    logvar,
                                                    holdout_labels,
                                                    use_var_loss=False)
                holdout_losses = holdout_losses.cpu()
                break_condition = self._save_best(epoch, holdout_losses)
                if break_condition or epoch > max_iter:  # 结束训练
                    break
        return loss

    def _save_best(self, epoch, losses, threshold=0.1):
        updated = False
        for i in range(len(losses)):
            current = losses[i]
            _, best = self._snapshots[i]
            improvement = (best - current) / best
            if improvement > threshold:
                self._snapshots[i] = (epoch, current)
                updated = True
        self._epoch_since_last_update = 0 if updated else self._epoch_since_last_update + 1
        return self._epoch_since_last_update > 5

    def predict(self, inputs, batch_size=64):
        inputs = np.tile(inputs, (self._num_network, 1, 1))
        inputs = torch.tensor(inputs, dtype=torch.float).to(device)
        mean, var = self.model(inputs, return_log_var=False)
        return mean.detach().cpu().numpy(), var.detach().cpu().numpy()


class FakeEnv:
    def __init__(self, model):
        self.model = model

    def step(self, obs, act):
        inputs = np.concatenate((obs, act), axis=-1)
        ensemble_model_means, ensemble_model_vars = self.model.predict(inputs)
        ensemble_model_means[:, :, 1:] += obs
        ensemble_model_stds = np.sqrt(ensemble_model_vars)
        ensemble_samples = ensemble_model_means + np.random.normal(
            size=ensemble_model_means.shape) * ensemble_model_stds

        num_models, batch_size, _ = ensemble_model_means.shape
        models_to_use = np.random.choice(
            [i for i in range(self.model._num_network)], size=batch_size)
        batch_inds = np.arange(0, batch_size)
        samples = ensemble_samples[models_to_use, batch_inds]
        rewards, next_obs = samples[:, :1][0][0], samples[:, 1:][0]
        return rewards, next_obs

In [5]:
class variableK:
    def __init__(self,env_pool, model_pool):
        self.env_pool = env_pool
        self.model_pool = model_pool

    def KLdivergence(self, batch_size=100, epsilon = 1e-6):
        env_obs, env_action, env_reward, env_next_obs, env_done = self.env_pool.sample(batch_size) 
        model_obs, model_action, model_reward, model_next_obs, model_done = self.model_pool.sample(batch_size)
        KL_obs = scipy.stats.entropy(abs(env_next_obs) + epsilon, abs(model_next_obs) + epsilon)
        KL_reward = scipy.stats.entropy((np.array(env_reward)+3)/3 + epsilon, (np.array(model_reward)+3)/3 + epsilon)
        return KL_obs, KL_reward 

In [6]:
class MBPO:
    def __init__(self, env, agent, fake_env, env_pool, model_pool,rollout_K, rollout_batch_size, real_ratio, num_episode):

        self.env = env
        self.agent = agent
        self.fake_env = fake_env
        self.env_pool = env_pool
        self.model_pool = model_pool
        self.rollout_batch_size = rollout_batch_size
        self.real_ratio = real_ratio
        self.num_episode = num_episode
        self.rollout_K = rollout_K

    def rollout_model(self):
        observations, _, _, _, _ = self.env_pool.sample(
            self.rollout_batch_size)
        if self.model_pool.size() > 100:
            KL_obs, KL_reward = self.rollout_K.KLdivergence()
            K = np.clip(round(2.5/ max(KL_obs[0],KL_obs[1],KL_obs[2],KL_reward) ), 0, 5)
        else:
            K = 1
  
        for obs in observations:
            for i in range(K):
                action = self.agent.take_action(obs)
                reward, next_obs = self.fake_env.step(obs, action)
                self.model_pool.add(obs, action, reward, next_obs, False)
                obs = next_obs
        return K

    def update_agent(self, policy_train_batch_size=64):
        env_batch_size = int(policy_train_batch_size * self.real_ratio)
        model_batch_size = policy_train_batch_size - env_batch_size
        for epoch in range(10):
            env_obs, env_action, env_reward, env_next_obs, env_done = self.env_pool.sample(
                env_batch_size)
            if self.model_pool.size() > 0:
                model_obs, model_action, model_reward, model_next_obs, model_done = self.model_pool.sample(
                    model_batch_size)
                obs = np.concatenate((env_obs, model_obs), axis=0)
                env_action = env_action[:,np.newaxis]
                action = np.concatenate((env_action, model_action), axis=0)
                next_obs = np.concatenate((env_next_obs, model_next_obs),
                                          axis=0)
                reward = np.concatenate((env_reward, model_reward), axis=0)
                done = np.concatenate((env_done, model_done), axis=0)
            else:
                obs, action, next_obs, reward, done = env_obs, env_action, env_next_obs, env_reward, env_done
            transition_dict = {
                'states': obs,
                'actions': action,
                'next_states': next_obs,
                'rewards': reward,
                'dones': done
            }
            self.agent.update(transition_dict)
            

    def train_model(self):
        obs, action, reward, next_obs, done = self.env_pool.return_all_samples()
        action = action[:,np.newaxis]
        inputs = np.concatenate((obs, action), axis=-1)
        reward = np.array(reward)
        labels = np.concatenate(
            (np.reshape(reward, (reward.shape[0], -1)), next_obs - obs),
            axis=-1)
        loss = self.fake_env.model.train(inputs, labels)
        return loss

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.buffer)

    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            return self.return_all_samples()
        else:
            transitions = random.sample(self.buffer, batch_size)
            state, action, reward, next_state, done = zip(*transitions)
            return np.array(state), np.array(action), reward, np.array(next_state), done

    def return_all_samples(self):
        all_transitions = list(self.buffer)
        state, action, reward, next_state, done = zip(*all_transitions)
        return np.array(state), np.array(action), reward, np.array(next_state), done

In [7]:
CA = False
total_episode = 200
TTC_threshold = 3.001
base_name = f'MBPO_VarK_Energy_s3' 

real_ratio = 0.8
actor_lr = 0.001
critic_lr = 0.001
alpha_lr = 3e-4
hidden_dim = 32
gamma = 0.9
tau = 0.005  # 软更新参数
MEMORY_CAPACITY = 20000
batch_size = 1024
target_entropy = -1
model_alpha = 0.01  # 模型损失函数中的加权权重

In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

env = Env(TTC_threshold)

# load training data
# train = sio.loadmat('trainSet.mat')['calibrationData']
# test = sio.loadmat('testSet.mat')['validationData']
train = sio.loadmat('calibrationData_new.mat')['calibrationData_new']
test = sio.loadmat('validationData_new.mat')['validationData_new']
trainNum = train.shape[0]
testNum = test.shape[0]
print('Number of training samples:', trainNum)
print('Number of validate samples:', testNum)

random.seed(3)
np.random.seed(3)
torch.manual_seed(3)

target_entropy = -env.n_actions
s_dim = env.n_features
a_dim = env.n_actions
a_bound = env.action_Bound

rollout_batch_size = 100
rollout_length = 5  # 推演长度k,推荐更多尝试
model_pool_size = rollout_batch_size * rollout_length

n_run = 3
rolling_window = 10  # 100 car following events, average score
result = []
Ev = EV()

Number of training samples: 1073
Number of validate samples: 268


In [None]:
for run in [base_name]:
    # name is the name of the experiment, CA is whether use collision avoidance
    pointer = 0
    sac = SAC(s_dim, hidden_dim, a_dim, a_bound, actor_lr,critic_lr, alpha_lr, target_entropy, tau, gamma)
    
    model = EnsembleDynamicsModel(s_dim, a_dim, model_alpha)
    fake_env = FakeEnv(model)
    env_pool = ReplayBuffer(MEMORY_CAPACITY)
    model_pool = ReplayBuffer(model_pool_size)
    rollout_K = variableK(env_pool, model_pool)
    mbpo = MBPO(env, sac, fake_env, env_pool, model_pool, rollout_K,
                rollout_batch_size, real_ratio, total_episode)

    # training part
    max_rolling_score = np.float('-inf')
    max_score = np.float('-inf')
    collision_train = 0
    episode_score = np.zeros(total_episode)  # average score of each car following event
    rolling_score = np.zeros(total_episode)
    cum_collision_num = np.zeros(total_episode)

    score_safe = np.zeros(total_episode)
    score_efficiency = np.zeros(total_episode)
    score_comfort = np.zeros(total_episode)
    score_energy = np.zeros(total_episode)
    loss_list = np.zeros(total_episode)
    K_list = np.zeros(total_episode)
    
    
    # 随机探索采取数据
    return_list = []
    explore_return = 0
    car_fol_id2 = random.randint(0, trainNum - 1)
    data2 = train[car_fol_id2, 0]
    obs, done, episode_return = env.reset(data2), False, 0
    while not done:
        action = sac.take_action(obs)[0]
        next_obs, reward, done, _ = env.step(action)
        env_pool.add(obs, action, reward, next_obs, done)
        obs = next_obs
        episode_return += reward
    return_list.append(explore_return)

    for i in tqdm(range(total_episode)):
        car_fol_id = random.randint(0, trainNum - 1)
        data = train[car_fol_id, 0]
        s = env.reset(data)
        step = 0
        SOC_data = []
        SOC = 0.92
        SOC_origin = SOC
        para = {}
        para['k']= 0.5
        para['k2'] = 0.5
        para['speed'] = s[1]
        para['SOC'] = SOC_origin
        score = 0
        score_s, score_e, score_c, score_eng = 0, 0, 0, 0  # part objective scores

        while True:
            if step % 20 == 0:
                    loss = mbpo.train_model()
                    K = mbpo.rollout_model()
                    
            a = sac.take_action(s)[0]

            if CA:
                # add collision avoidance guidance
                space, svSpd, relSpd = s
                lvSpd = svSpd + relSpd
                RT = 1  # reaction time
                SD = svSpd * RT + (svSpd ** 2 - lvSpd ** 2) / (2 * a_bound)

                if space < SD:
                    a = - a_bound

            para['acc'] = a
            SOC_new, cost, INB, out = EV().run(para)
            price_elec = cost
            r_eng = - 5 * price_elec
            
            s_, r, done, r_info = env.step(a)
            r += r_eng
            
            env_pool.add(s, a, r, s_, done)
            SOC_data.append(SOC_new)

            s = s_
            
            para['speed'] = s[1]
            para['SOC'] = SOC_new
            
            score += r
            score_s += r_info[3]
            score_e += r_info[4]
            score_c += r_info[5]
            score_eng += r_eng
            
            mbpo.update_agent()
            step += 1

            if done:
                duration = data.shape[0]
                score /= duration  # normalize with respect to car-following length
                score_s /= duration
                score_e /= duration
                score_c /= duration
                score_eng /= duration

                if env.isCollision == 1:
                    collision_train += 1
                break

        # record episode results
        episode_score[i] = score
        score_safe[i] = score_s
        score_efficiency[i] = score_e
        score_comfort[i] = score_c
        score_energy[i] = score_eng
        rolling_score[i] = np.mean(episode_score[max(0, i - rolling_window + 1):i + 1])
        cum_collision_num[i] = collision_train
        loss_list[i] = loss
        K_list[i] = K

        if max_score < score:
            max_score = score

        if rolling_score[i] > max_rolling_score:
            max_rolling_score = rolling_score[i]
            # save network parameters
#             sac.savenet(f'model_{run}')
#             mbpo.save(i)
        if i > total_episode-10:
            sac.save(i)
            
        sys.stdout.write(
            f'''\r Run {run}, Episode {i}, Score: {score:.2f}, Rolling score: {rolling_score[i]:.2f}, loss: {loss_list[i]:.2f}, Max rolling score: {max_rolling_score:.2f}, collisions: {collision_train}   ''')
        sys.stdout.flush()

    # save results
    result.append([episode_score, rolling_score, cum_collision_num, score_safe, score_efficiency, score_comfort,score_energy,loss_list,K_list])

np.save(f'result_{run}.npy', result)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_rolling_score = np.float('-inf')
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_score = np.float('-inf')
  state = torch.tensor([state], dtype=torch.float).to(device)
  0%|          | 0/200 [00:00<?, ?it/s]

rollout_length =  1
rollout_length =  1
KL_obs =  [0.58681862 0.02504021 0.5773291 ]
KL_reward =  0.01970073635733089
rollout_length =  4
KL_obs =  [0.55035491 0.02215195 0.52674147]
KL_reward =  0.01132509053454503
rollout_length =  5
KL_obs =  [0.49539817 0.02443335 0.55782507]
KL_reward =  0.01478904062178926
rollout_length =  4
KL_obs =  [0.47249402 0.03265186 0.61688119]
KL_reward =  0.011616814261570211
rollout_length =  4
KL_obs =  [0.38460578 0.0290349  0.44609332]
KL_reward =  0.011463287989510022
rollout_length =  5
KL_obs =  [0.35782233 0.0296381  0.55326049]
KL_reward =  0.010542309019758599
rollout_length =  5
KL_obs =  [0.27605276 0.03321809 0.87998401]
KL_reward =  0.00963067933304209
rollout_length =  3
KL_obs =  [0.50381067 0.03104865 0.71854988]
KL_reward =  0.0141887632241195
rollout_length =  3
KL_obs =  [0.66775817 0.03854353 0.90435841]
KL_reward =  0.016132433851149937
rollout_length =  3
KL_obs =  [0.65889665 0.05196795 0.68470233]
KL_reward =  0.012415164326972

  0%|          | 1/200 [00:34<1:53:45, 34.30s/it]

KL_obs =  [0.98405255 0.04208452 0.72957973]
KL_reward =  0.0168762458583127
rollout_length =  3
KL_obs =  [1.02060211 0.03717879 0.67751131]
KL_reward =  0.01743794142418442
rollout_length =  2
KL_obs =  [0.99213364 0.0476702  1.02025568]
KL_reward =  0.017497621119096356
rollout_length =  2
KL_obs =  [1.12534165 0.05165295 0.92233434]
KL_reward =  0.015296125939404668
rollout_length =  2
KL_obs =  [0.6977383  0.04602783 0.61495233]
KL_reward =  0.016755690132341808
rollout_length =  4
KL_obs =  [1.17410576 0.05280331 0.77346104]
KL_reward =  0.013881892027296927
rollout_length =  2
KL_obs =  [1.09322237 0.0490224  0.90346081]
KL_reward =  0.01709206863580337
rollout_length =  2
KL_obs =  [0.81112803 0.04162937 0.90189095]
KL_reward =  0.0174644065239225
rollout_length =  3
KL_obs =  [0.95723008 0.0462748  1.05797319]
KL_reward =  0.014715013938196844
rollout_length =  2
KL_obs =  [0.83848393 0.04212767 0.96401854]
KL_reward =  0.012793243670941371
rollout_length =  3
KL_obs =  [1.042

  1%|          | 2/200 [00:56<1:30:22, 27.39s/it]

KL_obs =  [1.00947599 0.0438737  0.84223216]
KL_reward =  0.015978650892441562
rollout_length =  2
KL_obs =  [0.88298114 0.04712305 1.20166325]
KL_reward =  0.014537220380026455
rollout_length =  2
KL_obs =  [1.1250732  0.03535629 1.07559251]
KL_reward =  0.014896252569183557
rollout_length =  2
KL_obs =  [0.97468861 0.03553974 1.14877175]
KL_reward =  0.016690220939080153
rollout_length =  2
KL_obs =  [0.90789169 0.04097191 1.28209019]
KL_reward =  0.01326738197698474
rollout_length =  2
KL_obs =  [0.95827027 0.04278179 1.34372631]
KL_reward =  0.01962334845176841
rollout_length =  2
KL_obs =  [0.91974747 0.03395227 1.14623868]
KL_reward =  0.011260606418690046
rollout_length =  2
KL_obs =  [0.74262254 0.03840177 1.02273492]
KL_reward =  0.01376876462529977
rollout_length =  2
KL_obs =  [0.75478975 0.03610871 1.14605218]
KL_reward =  0.0117292880492935
rollout_length =  2
 Run MBPO_VarK_Energy_s3, Episode 2, Score: 0.66, Rolling score: 0.42, loss: -26.28, Max rolling score: 0.42, coll

  2%|▏         | 3/200 [01:15<1:16:55, 23.43s/it]

KL_obs =  [0.65849319 0.04432892 0.93880604]
KL_reward =  0.01282092186853956
rollout_length =  3
KL_obs =  [0.75756284 0.04679615 1.3923751 ]
KL_reward =  0.014068628668669064
rollout_length =  2
KL_obs =  [0.96737047 0.04178633 1.03156325]
KL_reward =  0.015284423348658881
rollout_length =  2
KL_obs =  [0.89662535 0.04407797 1.62039136]
KL_reward =  0.01625614030875735
rollout_length =  2
KL_obs =  [1.08948173 0.03804774 0.85263817]
KL_reward =  0.018270885346384234
rollout_length =  2
KL_obs =  [0.78543878 0.04248486 0.91898532]
KL_reward =  0.012972137949547788
rollout_length =  3
KL_obs =  [0.6132616  0.03384596 0.94120754]
KL_reward =  0.011442076744001741
rollout_length =  3
KL_obs =  [0.63445015 0.03552499 1.44980072]
KL_reward =  0.010337575296504696
rollout_length =  2
KL_obs =  [0.71492917 0.03548438 1.52562621]
KL_reward =  0.013724212748234417
rollout_length =  2
KL_obs =  [0.90050083 0.03634218 1.47548629]
KL_reward =  0.01773223700039844
rollout_length =  2
KL_obs =  [0.

  2%|▏         | 4/200 [01:42<1:21:20, 24.90s/it]

KL_obs =  [0.85433451 0.03772742 1.34240516]
KL_reward =  0.017532560022000078
rollout_length =  2
KL_obs =  [0.53403249 0.03480015 1.03948487]
KL_reward =  0.010507846831246588
rollout_length =  2
KL_obs =  [0.74795063 0.04222816 1.93283735]
KL_reward =  0.014117286145548288
rollout_length =  1
KL_obs =  [0.68210744 0.03594713 1.4032465 ]
KL_reward =  0.010719579064773548
rollout_length =  2
KL_obs =  [0.51506319 0.03195116 1.1350857 ]
KL_reward =  0.012670010158796168
rollout_length =  2
KL_obs =  [0.95765011 0.03442498 1.50547   ]
KL_reward =  0.018724308691844348
rollout_length =  2
KL_obs =  [0.62171348 0.03916013 1.20384521]
KL_reward =  0.012017898196420244
rollout_length =  2
KL_obs =  [0.73052389 0.03743156 1.53025288]
KL_reward =  0.01299245049606719
rollout_length =  2
KL_obs =  [0.78458849 0.03727534 1.83737363]
KL_reward =  0.014139011655894299
rollout_length =  1
KL_obs =  [0.75396486 0.0324425  1.36995986]
KL_reward =  0.01309486465357456
rollout_length =  2
KL_obs =  [1

  2%|▎         | 5/200 [02:16<1:31:05, 28.03s/it]

KL_obs =  [0.39781595 0.03438127 1.4376697 ]
KL_reward =  0.0077167279967779315
rollout_length =  2
KL_obs =  [0.6523296  0.03524223 1.44836988]
KL_reward =  0.008357203329799102
rollout_length =  2
KL_obs =  [0.46895127 0.02781543 1.79506   ]
KL_reward =  0.010201678994622761
rollout_length =  1
KL_obs =  [0.64722072 0.02237877 1.72024843]
KL_reward =  0.012447059785862859
rollout_length =  1
KL_obs =  [0.46658461 0.03901114 1.54931086]
KL_reward =  0.011559873095710828
rollout_length =  2
KL_obs =  [0.49004389 0.03246124 1.65966701]
KL_reward =  0.008576958298683313
rollout_length =  2
KL_obs =  [0.56976302 0.03399233 1.84610572]
KL_reward =  0.011278365481826041
rollout_length =  1
KL_obs =  [0.54826911 0.03779974 1.17674009]
KL_reward =  0.008372687871229648
rollout_length =  2
KL_obs =  [0.58254902 0.03645715 1.52261784]
KL_reward =  0.01501646358594223
rollout_length =  2
KL_obs =  [0.63771397 0.04304373 1.55065261]
KL_reward =  0.00961204444967621
rollout_length =  2
KL_obs =  [

  3%|▎         | 6/200 [02:35<1:20:28, 24.89s/it]

In [None]:
plt.plot(rolling_score)

In [None]:
plt.plot(episode_score)

In [None]:
plt.plot(score_safe)

In [None]:
plt.plot(score_efficiency)

In [None]:
plt.plot(score_comfort)

In [None]:
plt.plot(score_energy)