In [2]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
device

device(type='cpu')

In [11]:
class MaskVelocityWrapper(gym.ObservationWrapper):
    """
    Gym environment observation wrapper used to mask velocity terms in
    observations. The intention is to make the MDP partially observable.

    Parameters
    ----------
    env : gym.Env
        Environment to wrap.
    env_id : str, optional
        Environment id used to pick the mask. When omitted, the module-level
        ``ENV`` variable is read instead, so ``MaskVelocityWrapper(env)``
        keeps working exactly as before.

    Raises
    ------
    NotImplementedError
        If no mask is defined for the resolved environment id.
    """

    # One multiplier per observation component: 1 keeps the value, 0 zeroes it.
    VELOCITY_MASKS = {
        "CartPole-v1": (1., 0., 1., 0.),
        "Pendulum-v0": (1., 1., 0.),
        "LunarLander-v2": (1., 1., 0., 0., 1., 0., 1., 1.),
        "LunarLanderContinuous-v2": (1., 1., 0., 0., 1., 0., 1., 1.),
    }

    def __init__(self, env, env_id=None):
        super().__init__(env)
        if env_id is None:
            # Backward-compatible fallback: the notebook defines ENV globally.
            env_id = ENV
        if env_id not in self.VELOCITY_MASKS:
            raise NotImplementedError(
                f"No velocity mask defined for environment {env_id!r}"
            )
        self.mask = np.array(self.VELOCITY_MASKS[env_id])

    def observation(self, observation):
        """Return the observation with the masked components set to zero."""
        return observation * self.mask

In [4]:
import numpy as np
import random
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import json
import torch.nn.functional as F



class Qfunction(nn.Module):
    """Fully connected network that outputs one value per action for each state."""

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_dim = 64
        # Kept as a single Sequential stored in `self.model` so parameter names
        # (model.0.*, model.2.*, model.4.*) stay the same.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, states):
        """Map `states` to a tensor whose last dimension has `action_dim` entries."""
        return self.model(states)

class DuelingQfunction(nn.Module):
    """
    Dueling Q-network: a shared feature layer feeds a state-value head and an
    advantage head, which are combined as ``Q = V + A - mean(A)``.

    Parameters
    ----------
    state_dim : int
        Size of the last dimension of the input states.
    action_dim : int
        Number of actions, i.e. the size of the last dimension of the output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Shared feature extractor.
        self.model = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.ReLU()
        )
        # State-value head: one scalar per state.
        self.v_head = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

        # Advantage head: one value per action.
        self.a_head = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim)
        )

    def forward(self, states):
        """
        Compute Q-values for `states`.

        Accepts a batch ``(batch, state_dim)`` or a single state
        ``(state_dim,)``; the output has the same leading shape with
        ``action_dim`` as its last dimension.
        """
        inp = self.model(states)
        v = self.v_head(inp)
        a = self.a_head(inp)

        # Average over the action axis (always the last one). Using -1 instead
        # of a hard-coded 1 is identical for 2-D batches and also supports
        # unbatched input.
        q = v + a - a.mean(dim=-1, keepdim=True)
        return q

class DuelingQfunctionWithEncoder(nn.Module):
    """
    Dueling Q-network that additionally holds an `encoder` object.

    The network itself is a shared feature layer feeding a state-value head
    and an advantage head, combined as ``Q = V + A - mean(A)``.

    NOTE(review): `encoder` is stored on the module but `forward` never calls
    it, so states reach the network unencoded. Confirm whether the encoder is
    meant to be applied (and what its output size is) before relying on it.

    Parameters
    ----------
    state_dim : int
        Size of the last dimension of the tensors passed to `forward`.
    action_dim : int
        Number of actions, i.e. the size of the last dimension of the output.
    encoder : object
        Stored as ``self.encoder``; currently unused by `forward`.
    """

    def __init__(self, state_dim, action_dim, encoder):
        super().__init__()
        self.encoder = encoder
        # Shared feature extractor.
        self.model = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.ReLU()
        )
        # State-value head: one scalar per state.
        self.v_head = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

        # Advantage head: one value per action.
        self.a_head = nn.Sequential(
            nn.Linear(64, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim)
        )

    def forward(self, states):
        """
        Compute Q-values for `states`.

        Accepts a batch ``(batch, state_dim)`` or a single state
        ``(state_dim,)``; the output has the same leading shape with
        ``action_dim`` as its last dimension.
        """
        inp = self.model(states)
        v = self.v_head(inp)
        a = self.a_head(inp)

        # Average over the action axis (always the last one). Using -1 instead
        # of a hard-coded 1 is identical for 2-D batches and also supports
        # unbatched input.
        q = v + a - a.mean(dim=-1, keepdim=True)
        return q


class DQN_double:
    """
    Double DQN agent built on `DuelingQfunction` with epsilon-greedy exploration.

    The online network (`q_function`) picks the best next action and the
    target network (`freezing_q_function`) evaluates it. The target network is
    a copy of the online one, refreshed every `period` calls to `fit`.

    Parameters
    ----------
    state_dim, action_dim : int
        Input and output sizes of the Q-networks.
    gamma : float
        Discount factor applied to the bootstrapped next-state value.
    lr : float
        Learning rate of the Adam optimizer.
    batch_size : int
        Number of transitions sampled from the replay memory per update.
    epsilon_decrease : float
        Stored on the instance. NOTE(review): the decay step in `fit` is
        computed from `epsilon_min` only and does not read this value.
    epsilon_min : float
        Lower bound for the exploration rate.
    period : int
        Number of `fit` updates between target-network refreshes.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, lr=1e-3, batch_size=64, epsilon_decrease=0.0001, epsilon_min=0.0001, period=100):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = DuelingQfunction(state_dim, action_dim).to(device)
        self.freezing_q_function = DuelingQfunction(self.state_dim, self.action_dim).to(device)
        # Start the target network as an exact copy of the online network so
        # the first `period` updates do not bootstrap from unrelated weights.
        self.freezing_q_function.load_state_dict(self.q_function.state_dict())
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 0.5
        self.epsilon_decrease = epsilon_decrease
        self.epsilon_min = epsilon_min
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)
        # Backward-compatible alias for the original (misspelled) attribute name.
        self.optimzaer = self.optimizer
        self.counter = 1
        self.period = period

    def get_action(self, state):
        """Sample an action epsilon-greedily with respect to the online Q-values."""
        state_tensor = torch.as_tensor(
            np.asarray(state), dtype=torch.float32, device=device
        ).unsqueeze(dim=0)
        with torch.no_grad():
            q_values = self.q_function(state_tensor).squeeze()

        argmax_action = int(torch.argmax(q_values))
        # Spread epsilon uniformly over all actions and give the remaining
        # probability mass to the greedy action.
        probs = self.epsilon * np.ones(self.action_dim) / self.action_dim
        probs[argmax_action] += 1.0 - self.epsilon
        # Periodic diagnostics.
        if self.counter % 10_000 == 0:
            print(probs)
            print(self.epsilon)
            print(q_values)
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def add(self, state, action, reward, done, next_state):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, int(done), next_state])

    def fit(self):
        """
        Run one gradient update on a random minibatch from the replay memory.

        Does nothing until the memory holds more than 128 transitions (and at
        least `batch_size` of them, so sampling cannot fail).
        """
        if len(self.memory) <= 128 or len(self.memory) < self.batch_size:
            return

        # Refresh the target network every `period` updates.
        if self.counter % self.period == 0:
            self.freezing_q_function.load_state_dict(self.q_function.state_dict())

        self.counter += 1

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Stack through NumPy first: building a tensor directly from a tuple
        # of arrays is much slower.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=device).unsqueeze(1)
        rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32, device=device).unsqueeze(1)
        dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32, device=device).unsqueeze(1)

        with torch.no_grad():
            # Double DQN: the online network chooses the next action, the
            # target network supplies its value.
            best_next_actions = torch.argmax(self.q_function(next_states), dim=1, keepdim=True)
            next_values = self.freezing_q_function(next_states).gather(1, best_next_actions)
            targets = rewards + self.gamma * (1 - dones) * next_values

        predictions = self.q_function(states).gather(1, actions)
        loss = F.mse_loss(predictions, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.counter % 10_000 == 0:
            print(f"loss: {loss.item()}")
        if self.epsilon > self.epsilon_min:
            # Linear decay, clamped so epsilon never drops below epsilon_min.
            self.epsilon = max(
                self.epsilon_min,
                self.epsilon - (0.1 - self.epsilon_min) / 20_000,
            )
            


In [5]:
def DQN_learning(env, agent, episode_n = 100, t_max = 500, N=2):
    """
    Train `agent` on `env`, storing one replay transition per group of up to
    `N` consecutive environment steps.

    For each group, the transition stored is (state before the first step,
    first action, sum of the rewards in the group, terminal flag, state after
    the last step). `agent.fit()` is called once per group.

    Parameters
    ----------
    env : object
        Environment whose `reset()` returns ``(observation, info)`` and whose
        `step(action)` returns
        ``(observation, reward, terminated, truncated, info)``.
    agent : object
        Must provide `get_action(state)`,
        `add(state, action, reward, done, next_state)` and `fit()`.
    episode_n : int
        Number of episodes to run.
    t_max : int
        Bound on the outer loop; ``range(1, t_max)`` yields at most
        ``t_max - 1`` groups per episode.
    N : int
        Maximum number of environment steps per stored transition (>= 1).

    Returns
    -------
    list
        Undiscounted total reward of each episode.

    Raises
    ------
    ValueError
        If `N` is smaller than 1.
    """
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    total_rewards = []
    for episode in range(episode_n):
        total_reward = 0

        state, info = env.reset()

        for t in range(1, t_max):

            start_state = state
            start_action = None
            next_state = None
            terminated = False
            truncated = False
            cum_reward = 0

            for i in range(N):
                action = agent.get_action(state)
                if i == 0:
                    start_action = action

                next_state, reward, terminated, truncated, info = env.step(action)
                cum_reward += reward

                state = next_state

                if terminated or truncated:
                    break

            total_reward += cum_reward

            # Only a true termination is stored as `done`: a time-limit
            # truncation ends the episode but the next state still has value,
            # so the agent must keep bootstrapping from it.
            agent.add(start_state, start_action, cum_reward, terminated, next_state)

            agent.fit()

            if terminated or truncated:
                break

        total_rewards.append(total_reward)
        if episode % 10 == 0:
            print(f'episode: {episode}, total_reward: {np.mean(total_rewards[-10:])}')

    return total_rewards

In [7]:
import gym

# Seed every random source the run depends on: torch (network
# initialization), `random` (replay sampling) and NumPy (epsilon-greedy
# action choice).
SEED = 43
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

env = gym.make('LunarLander-v2')
# Seed the environment's own RNG once; later reset() calls without a seed
# continue from it.
env.reset(seed=SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

episode_n=500

agent = DQN_double(state_dim, action_dim)

total_rewards = DQN_learning(env, agent, episode_n=episode_n, N=2)


episode: 0, total_reward: -642.5320195837588
episode: 10, total_reward: -232.2707062217134


KeyboardInterrupt: 

In [14]:
import gym

# Seed every random source the run depends on: torch (network
# initialization), `random` (replay sampling) and NumPy (epsilon-greedy
# action choice).
SEED = 43
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# ENV is also read by MaskVelocityWrapper to pick the observation mask.
ENV = "CartPole-v1"
env = gym.make(ENV)
env = MaskVelocityWrapper(env)
# Seed the environment's own RNG once; later reset() calls without a seed
# continue from it.
env.reset(seed=SEED)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

episode_n=1000

agent = DQN_double(state_dim, action_dim)

total_rewards = DQN_learning(env, agent, episode_n=episode_n, N=3)


episode: 0, total_reward: 16.0
episode: 10, total_reward: 23.7
episode: 20, total_reward: 17.1
episode: 30, total_reward: 14.6
episode: 40, total_reward: 19.4
episode: 50, total_reward: 16.5
episode: 60, total_reward: 24.5
episode: 70, total_reward: 32.7
episode: 80, total_reward: 27.4
episode: 90, total_reward: 21.1
episode: 100, total_reward: 17.0
episode: 110, total_reward: 41.3
episode: 120, total_reward: 29.8
episode: 130, total_reward: 41.9
episode: 140, total_reward: 44.1
episode: 150, total_reward: 45.6
episode: 160, total_reward: 30.9
episode: 170, total_reward: 35.6
episode: 180, total_reward: 40.9
episode: 190, total_reward: 45.5
episode: 200, total_reward: 42.6
episode: 210, total_reward: 33.2
episode: 220, total_reward: 32.5
episode: 230, total_reward: 41.3
episode: 240, total_reward: 42.6
episode: 250, total_reward: 34.6
episode: 260, total_reward: 31.3
episode: 270, total_reward: 45.8
episode: 280, total_reward: 39.1
episode: 290, total_reward: 38.1
episode: 300, total_r

KeyboardInterrupt: 