In [1]:
import random
import gym
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import scipy.io as sio
import pickle as pk
import sys
import os
from simulation_env_new import Env
from FourWI_EV_new import EV
import rl_utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CA = False
total_episode = 500
TTC_threshold = 3.001
base_name = f'DDPG' 

actor_lr = 0.001
critic_lr = 0.001
hidden_dim = 32
hidden2_dim = 16
gamma = 0.9
tau = 0.005  # 软更新参数
MEMORY_CAPACITY = 20000
# minimal_size = 10000
batch_size = 1024
sigma = 0.01  # 高斯噪声标准差

In [3]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim,hidden2_dim, action_dim, action_bound):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden2_dim)
        self.fc3 = torch.nn.Linear(hidden2_dim, action_dim)
        self.action_bound = action_bound  # action_bound是环境可以接受的动作最大值

    def forward(self, x):
        x = F.relu(self.fc2(F.relu(self.fc1(x))))
        return torch.tanh(self.fc3(x)) * self.action_bound


class QValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        cat = torch.cat([x, a], dim=1) # 拼接状态和动作
        x = F.relu(self.fc1(cat))
        x = F.relu(self.fc2(x))
        return self.fc_out(x)

In [4]:
class DDPG:
    ''' DDPG算法 '''
    def __init__(self, state_dim, hidden_dim,hidden2_dim, action_dim, action_bound, sigma, actor_lr, critic_lr, tau, gamma, device):
        self.actor = PolicyNet(state_dim, hidden_dim,hidden2_dim, action_dim, action_bound).to(device)
        self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        self.target_actor = PolicyNet(state_dim, hidden_dim,hidden2_dim, action_dim, action_bound).to(device)
        self.target_critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)
        # 初始化目标价值网络并设置和价值网络相同的参数
        self.target_critic.load_state_dict(self.critic.state_dict())
        # 初始化目标策略网络并设置和策略相同的参数
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.sigma = sigma  # 高斯噪声的标准差,均值直接设为0
        self.tau = tau  # 目标网络软更新参数
        self.action_dim = action_dim
        self.device = device
        self.pointer = 0
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        action = self.actor(state).item()
        return action

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)
            
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'], dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1, 1).to(self.device)

        next_q_values = self.target_critic(next_states, self.target_actor(next_states))
        q_targets = rewards + self.gamma * next_q_values * (1 - dones)
        critic_loss = torch.mean(F.mse_loss(self.critic(states, actions), q_targets))
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -torch.mean(self.critic(states, self.actor(states)))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.actor, self.target_actor)  # 软更新策略网络
        self.soft_update(self.critic, self.target_critic)  # 软更新价值网络
        
    def save(self,episode):
        torch.save(self.actor.state_dict(), "./model/DDPG/ddpg_s7_actor{}.pth".format(episode))
#         torch.save(self.actor.state_dict(), "./model/ddpg_actor.pth")

    def load(self,episode):
        self.actor.load_state_dict(torch.load("./model/ddpg_actor{}.pth".format(episode)))

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

env = Env(TTC_threshold)

# load training data
# train = sio.loadmat('trainSet.mat')['calibrationData']
# test = sio.loadmat('testSet.mat')['validationData']
train = sio.loadmat('calibrationData_new.mat')['calibrationData_new']
test = sio.loadmat('validationData_new.mat')['validationData_new']
trainNum = train.shape[0]
testNum = test.shape[0]
print('Number of training samples:', trainNum)
print('Number of validate samples:', testNum)

random.seed(14)
np.random.seed(14)
torch.manual_seed(14)

s_dim = env.n_features
a_dim = env.n_actions
a_bound = env.action_Bound

# # Stop distance collision avoidance
n_run = 3
rolling_window = 10  # 100 car following events, average score
result = []


for run in [base_name]:
    # name is the name of the experiment, CA is whether use collision avoidance
    pointer = 0
    replay_buffer = rl_utils.ReplayBuffer(MEMORY_CAPACITY)
    ddpg = DDPG(s_dim, hidden_dim,hidden2_dim, a_dim, a_bound, sigma, actor_lr, critic_lr, tau, gamma, device)

    # training part
    max_rolling_score = np.float('-inf')
    max_score = np.float('-inf')
    collision_train = 0
    episode_score = np.zeros(total_episode)  # average score of each car following event
    rolling_score = np.zeros(total_episode)
    cum_collision_num = np.zeros(total_episode)
    var = 3

    score_safe = np.zeros(total_episode)
    score_efficiency = np.zeros(total_episode)
    score_comfort = np.zeros(total_episode)
    score_energy = np.zeros(total_episode)


    for i in tqdm(range(total_episode)):
        car_fol_id = random.randint(0, trainNum - 1)
        data = train[car_fol_id, 0]
        s = env.reset(data)
        SOC_data = []
        SOC = 0.92
        SOC_origin = SOC
        para = {}
        para['k']= 0.5
        para['k2'] = 0.5
        para['speed'] = s[1]
        para['SOC'] = SOC_origin
        score = 0
        score_s, score_e, score_c, score_eng = 0, 0, 0, 0  # part objective scores

        while True:
            a = ddpg.take_action(s)
            a = np.clip(np.random.normal(a, var), -a_bound, a_bound)

            if CA:
                # add collision avoidance guidance
                space, svSpd, relSpd = s
                lvSpd = svSpd + relSpd
                RT = 1  # reaction time
                SD = svSpd * RT + (svSpd ** 2 - lvSpd ** 2) / (2 * a_bound)

                if space < SD:
                    a = - a_bound

            para['acc'] = a
            SOC_new, cost, INB, out = EV().run(para)
            price_elec = cost
            r_eng = - 5 * price_elec
            
            s_, r, done, r_info = env.step(a)
            
            r += r_eng
            
            SOC_data.append(SOC_new)
            replay_buffer.add(s, a, r, s_, done)
            pointer += 1 

            replace = False
            if pointer > MEMORY_CAPACITY:
                var *= .9995
                b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                transition_dict = {'states': b_s, 'actions': b_a, 'next_states': b_ns, 'rewards': b_r, 'dones': b_d}
                ddpg.update(transition_dict)
                
            s = s_
                    
            para['speed'] = s[1]
            para['SOC'] = SOC_new
            
            score += r
            score_s += r_info[3]
            score_e += r_info[4]
            score_c += r_info[5]
            score_eng += r_eng
                
            if done:
                duration = data.shape[0]
                score /= duration  # normalize with respect to car-following length
                score_s /= duration
                score_e /= duration
                score_c /= duration
                score_eng /= duration

                if env.isCollision == 1:
                    collision_train += 1
                break

        # record episode results
        episode_score[i] = score
        score_safe[i] = score_s
        score_efficiency[i] = score_e
        score_comfort[i] = score_c
        score_energy[i] = score_eng
        rolling_score[i] = np.mean(episode_score[max(0, i - rolling_window + 1):i + 1])
        cum_collision_num[i] = collision_train

        if max_score < score:
            max_score = score

        if rolling_score[i] > max_rolling_score:
            max_rolling_score = rolling_score[i]
            # save network parameters
#             ddpg.save()

        if i > total_episode-10:
            ddpg.save(i)
            
        sys.stdout.write(
            f'''\r Run {run}, Episode {i}, Score: {score:.2f}, Rolling score: {rolling_score[i]:.2f}, Max score: {max_score:.2f}, Max rolling score: {max_rolling_score:.2f}, collisions: {collision_train}   ''')
        sys.stdout.flush()

    # save results
    result.append([episode_score, rolling_score, cum_collision_num, score_safe, score_efficiency, score_comfort,score_energy])

np.save(f'result_{run}.npy', result)

# 保存结果为MAT格式
result_dict = {
    'episode_score': episode_score,
    'rolling_score': rolling_score,
    'cum_collision_num': cum_collision_num,
    'score_safe': score_safe,
    'score_efficiency': score_efficiency,
    'score_comfort': score_comfort,
    'score_energy': score_energy
}

# 保存为MAT文件
sio.savemat(f'result_{run}.mat', result_dict)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_rolling_score = np.float('-inf')
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_score = np.float('-inf')


Number of training samples: 1073
Number of validate samples: 268


  state = torch.tensor([state], dtype=torch.float).to(self.device)


 Run DDPG_Energy_s14, Episode 0, Score: -0.09, Rolling score: -0.09, Max score: -0.09, Max rolling score: -0.09, collisions: 0   

  0%|          | 1/500 [00:00<05:21,  1.55it/s]

 Run DDPG_Energy_s14, Episode 1, Score: -0.03, Rolling score: -0.06, Max score: -0.03, Max rolling score: -0.06, collisions: 0   

  0%|          | 2/500 [00:01<06:13,  1.33it/s]

 Run DDPG_Energy_s14, Episode 2, Score: -0.07, Rolling score: -0.06, Max score: -0.03, Max rolling score: -0.06, collisions: 0   

  1%|          | 3/500 [00:02<06:32,  1.26it/s]

 Run DDPG_Energy_s14, Episode 3, Score: -0.31, Rolling score: -0.12, Max score: -0.03, Max rolling score: -0.06, collisions: 1   

  1%|          | 4/500 [00:02<05:15,  1.57it/s]

 Run DDPG_Energy_s14, Episode 4, Score: -0.09, Rolling score: -0.12, Max score: -0.03, Max rolling score: -0.06, collisions: 1   

  1%|          | 5/500 [00:03<05:10,  1.60it/s]

 Run DDPG_Energy_s14, Episode 5, Score: -0.06, Rolling score: -0.11, Max score: -0.03, Max rolling score: -0.06, collisions: 1   

  1%|          | 6/500 [00:04<06:15,  1.32it/s]

 Run DDPG_Energy_s14, Episode 6, Score: -0.58, Rolling score: -0.18, Max score: -0.03, Max rolling score: -0.06, collisions: 2   

  1%|▏         | 7/500 [00:04<05:12,  1.58it/s]

 Run DDPG_Energy_s14, Episode 7, Score: -0.65, Rolling score: -0.23, Max score: -0.03, Max rolling score: -0.06, collisions: 3   

  2%|▏         | 8/500 [00:05<04:40,  1.75it/s]

 Run DDPG_Energy_s14, Episode 8, Score: -0.09, Rolling score: -0.22, Max score: -0.03, Max rolling score: -0.06, collisions: 3   

  2%|▏         | 9/500 [00:06<06:25,  1.27it/s]

In [None]:
plt.plot(cum_collision_num)

In [None]:
plt.plot(rolling_score)

In [None]:
plt.plot(episode_score)

In [None]:
plt.plot(score_safe)

In [None]:
plt.plot(score_efficiency)

In [None]:
plt.plot(score_comfort)

In [None]:
plt.plot(score_energy)

In [None]:
import pandas as pd
score_df = pd.DataFrame()
score_df['efficiency'] = score_efficiency
score_df['safe'] = score_safe
score_df['comfort'] = score_comfort

# rolling_window = 20
# conduct rolling window
%matplotlib inline
plt.plot(score_df.efficiency.rolling(window=rolling_window).mean(), label = 'Efficiency')
plt.plot(score_df.safe.rolling(window=rolling_window).mean(), label = 'Safety')
plt.plot(score_df.comfort.rolling(window=rolling_window).mean(), label = 'Comfort')
plt.legend()