In [161]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import torch.nn.functional as F
from collections import deque
import random

In [162]:

# class QuadrotorEnv(gym.Env):
#     metadata = {'render.modes': ['console']}

#     def __init__(self):
#         super(QuadrotorEnv, self).__init__()

#         # Constants
#         self.g = 9.81  # gravity
#         self.m = 1.0   # mass of the UAV
#         self.mu = 0.05 # damping factor
#         self.dt = 0.02 # time step

#         # Define action and observation space
#         # Actions are thrust T, angle phi, angle theta
#         self.action_space = spaces.Box(low=np.array([0, -np.pi, -np.pi]), 
#                                        high=np.array([20, np.pi, np.pi]), dtype=np.float32)

#         # Observation space: x, y, z, vx, vy, vz
#         self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32)

#         # State initialization
#         self.state = None
#         self.reset()

#     def step(self, action):
#         T, phi, theta = action
#         x, y, z, vx, vy, vz = self.state

#         # Calculate accelerations
#         ax = (-0.7071 * np.cos(phi) * np.sin(theta) - 0.7071 * np.sin(phi)) * T / self.m
#         ay = (-0.7071 * np.cos(phi) * np.sin(theta) - 0.7071 * np.sin(phi)) * T / self.m
#         az = self.g - (np.cos(phi) * np.cos(theta)) * T / self.m

#         # Update velocities
#         vx += (ax - self.mu * vx) * self.dt
#         vy += (ay - self.mu * vy) * self.dt
#         vz += (az - self.mu * vz) * self.dt

#         # Update positions
#         x += vx * self.dt
#         y += vy * self.dt
#         z += vz * self.dt

#         # Update state
#         self.state = np.array([x, y, z, vx, vy, vz])

#         # Calculate reward (placeholder)
#         reward = -np.sqrt((x - 5 * np.cos(1.2 * self.current_step * self.dt))**2 + (y - 5 * np.sin(1.2 * self.current_step * self.dt))**2 + (z + 20)**2)

#         # Check if UAV is within the reasonable bounds (this is a simple check)
#         done = z < -25 or self.current_step > 1000

#         self.current_step += 1

#         # Optionally we could add more info
#         info = {}

#         return self.state, reward, done, info

#     def reset(self):
#         # Reset the state
#         self.state = np.array([0.0, 0.0, 0.0, 1.0, -1.0, 0.0], dtype=np.float32)
#         self.current_step = 0
#         return self.state

#     def render(self, mode='console'):
#         if mode == 'console':
#             print(f'State: {self.state}')


In [163]:
# Define the Quadrotor Environment
class QuadrotorEnv(gym.Env):
    metadata = {'render.modes': ['console']}
 
    def __init__(self):
        super(QuadrotorEnv, self).__init__()
        self.g = 9.81  
        self.m = 1.0   
        self.mu = 0.05
        self.dt = 0.02
        self.action_space = gym.spaces.Box(low=np.array([0, -np.pi, -np.pi]), 
                                           high=np.array([20, np.pi, np.pi]), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32)
        self.reset()
 
    def step(self, action):
        T, phi, theta = action
        x, y, z, vx, vy, vz = self.state
        ax = (-0.7071 * np.cos(phi) * np.sin(theta) - 0.7071 * np.sin(phi)) * T / self.m
        ay = (-0.7071 * np.cos(phi) * np.sin(theta) - 0.7071 * np.sin(phi)) * T / self.m
        az = self.g - (np.cos(phi) * np.cos(theta)) * T / self.m
        vx += (ax - self.mu * vx) * self.dt
        vy += (ay - self.mu * vy) * self.dt
        vz += (az - self.mu * vz) * self.dt
 
        x += vx * self.dt
        y += vy * self.dt
        z += vz * self.dt
 
        self.state = np.array([x, y, z, vx, vy, vz])
        reward = -np.sqrt((x - 5 * np.cos(1.2 * self.current_step * self.dt))**2 + 
                          (y - 5 * np.sin(1.2 * self.current_step * self.dt))**2 + (z + 20)**2)
        done = z < -50 or self.current_step > 1000
        self.current_step += 1
        return self.state, reward, done, {}
 
    def reset(self):
        self.state = np.array([0.0, 0.0, 0.0, 1.0, -1.0, 0.0], dtype=np.float32)
        self.current_step = 0
        return self.state
 
    def render(self, mode='console'):
        if mode == 'console':
            print(f'State: {self.state}')

In [164]:

 
# # Actor Network
# class Actor(nn.Module):
#     def __init__(self, state_dim, action_dim, action_bound):
#         super(Actor, self).__init__()
#         self.layer1 = nn.Linear(state_dim, 256)
#         self.layer2 = nn.Linear(256, 256)
#         self.mean = nn.Linear(256, action_dim)
#         self.log_std = nn.Linear(256, action_dim)
#         self.action_bound = action_bound
 
#     def forward(self, state):
#         x = torch.relu(self.layer1(state))
#         x = torch.relu(self.layer2(x))
#         mean = self.mean(x)
#         log_std = self.log_std(x)
#         log_std = torch.clamp(log_std, -20, 2)
#         return mean, log_std
 
#     def sample(self, state):
#         mean, log_std = self.forward(state)
#         std = torch.exp(log_std)
#         normal = Normal(mean, std)
#         z = normal.rsample()
#         action = torch.tanh(z) * self.action_bound
#         log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + 1e-6)
#         log_prob = log_prob.sum(-1, keepdim=True)
#         return action, log_prob
 

In [165]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, action_bound):
        super(Actor, self).__init__()
        self.layer1 = nn.Linear(state_dim, 256)
        self.layer2 = nn.Linear(256, 256)
        self.mean = nn.Linear(256, action_dim)
        self.log_std = nn.Linear(256, action_dim)
        self.action_bound = action_bound
 
    def forward(self, state):
        # x = torch.relu(self.layer1(state))
        # x = torch.relu(self.layer2(x))
        # x = self.bn1(torch.relu(self.layer1(state)))
        # x = self.bn2(torch.relu(self.layer2(x)))
        x = torch.nn.functional.leaky_relu(self.layer1(state), 0.01)
        x = torch.nn.functional.leaky_relu(self.layer2(x), 0.01)
        mean = self.mean(x)
        log_std = self.log_std(x)
        log_std = torch.clamp(log_std, min=-4, max=1)
        std = torch.exp(log_std) + 1e-6  # Adding a small epsilon to avoid std = 0
        # print("Mean: ",mean)
        # print("State:", state)
        # print("std:  ", std)
        return mean, std
 
    def sample(self, state):
        # print("State:", state)
        mean, std = self.forward(state)
        normal = Normal(mean, std)
        z = normal.rsample()
        action = torch.tanh(z) * self.action_bound
        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + 1e-6)  # Enforcing action bounds
        log_prob = log_prob.sum(-1, keepdim=True)
        return action, log_prob

In [166]:
# Critic Network
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.layer1 = nn.Linear(state_dim + action_dim, 256)
        self.layer2 = nn.Linear(256, 256)
        self.value = nn.Linear(256, 1)
 
    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        value = self.value(x)
        return value
 


In [167]:
# Replay Buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
 
    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
 
    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.array(state), np.array(action), np.array(reward, dtype=np.float32), np.array(next_state), np.array(done, dtype=np.float32)
 
    def __len__(self):
        return len(self.buffer)

In [168]:
# SAC Agent
class SACAgent:
    def __init__(self, state_dim, action_dim, action_bound):
        self.actor = Actor(state_dim, action_dim, action_bound).to(device)
        self.critic_1 = Critic(state_dim, action_dim).to(device)
        self.critic_2 = Critic(state_dim, action_dim).to(device)
        self.target_critic_1 = Critic(state_dim, action_dim).to(device)
        self.target_critic_2 = Critic(state_dim, action_dim).to(device)
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(list(self.critic_1.parameters()) + list(self.critic_2.parameters()), lr=3e-4)
        self.replay_buffer = ReplayBuffer(1000000)
        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
 
    def select_action(self, state):
        state = torch.FloatTensor(state).to(device)
        action, _ = self.actor.sample(state)
        return action.cpu().data.numpy()
 
    def update(self, batch_size):
        if len(self.replay_buffer) < batch_size:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
        state = torch.FloatTensor(state).to(device)
        action = torch.FloatTensor(action).to(device)
        reward = torch.FloatTensor(reward).to(device).unsqueeze(1)
        next_state = torch.FloatTensor(next_state).to(device)
        done = torch.FloatTensor(done).to(device).unsqueeze(1)
 
        with torch.no_grad():
            next_action, next_log_prob = self.actor.sample(next_state)
            target_Q1 = self.target_critic_1(next_state, next_action)
            target_Q2 = self.target_critic_2(next_state, next_action)
            target_Q = reward + (1 - done) * self.gamma * (torch.min(target_Q1, target_Q2) - self.alpha * next_log_prob)
 
        current_Q1 = self.critic_1(state, action)
        current_Q2 = self.critic_2(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
 
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
 
        action_new, log_prob_new = self.actor.sample(state)
        Q1_new = self.critic_1(state, action_new)
        Q2_new = self.critic_2(state, action_new)
        actor_loss = (self.alpha * log_prob_new - torch.min(Q1_new, Q2_new)).mean()
        # Gradient clipping    
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(self.critic_1.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(self.critic_2.parameters(), 1)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
 
        # Soft update the target network
        for target_param, param in zip(self.target_critic_1.parameters(), self.critic_1.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.target_critic_2.parameters(), self.critic_2.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
 

In [169]:
# Main training loop
def train():
    env = QuadrotorEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high[0]
    agent = SACAgent(state_dim, action_dim, action_bound)
    episodes = 100
    steps_per_episode = 1000
    batch_size = 256
    gradient_steps = 50
 
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        for _ in range(steps_per_episode):
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break
        # for _ in range(gradient_steps):
        #     agent.update(batch_size)
 
        print(f'Episode: {episode}, Reward: {episode_reward}')
 
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train()

Episode: 0, Reward: -706561.5831671506
Episode: 1, Reward: -661784.7763244238
Episode: 2, Reward: -771798.0514773278
Episode: 3, Reward: -689760.0092211156
Episode: 4, Reward: -783810.0809612436
Episode: 5, Reward: -696290.7614280793
Episode: 6, Reward: -754590.7176755683
Episode: 7, Reward: -750984.1425293714
Episode: 8, Reward: -792090.2312769819
Episode: 9, Reward: -669703.3240580729
Episode: 10, Reward: -716995.9191947443
Episode: 11, Reward: -683971.056110923
Episode: 12, Reward: -851412.9264123343
Episode: 13, Reward: -618961.2701145584
Episode: 14, Reward: -716244.2421027536
Episode: 15, Reward: -726142.0686088497
Episode: 16, Reward: -711013.6603602632
Episode: 17, Reward: -822775.7896523795
Episode: 18, Reward: -706572.4552782599
Episode: 19, Reward: -789808.6768011305
Episode: 20, Reward: -743524.2279347946
Episode: 21, Reward: -669322.4387733996
Episode: 22, Reward: -702775.372946844
Episode: 23, Reward: -717872.7816126568
Episode: 24, Reward: -696189.6912042585
Episode: 25,