In [4]:
# `torch` is imported here as well: this cell sits above the notebook's main
# import cell, so without it a fresh "Restart & Run All" fails with NameError.
import torch

# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
# Display the selected device (a bare last expression is rendered as the cell output).
device

device(type='cpu')

In [58]:
import numpy as np
import random
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import json
import torch.nn.functional as F



class Qfunction(nn.Module):
    """MLP that maps a batch of states to `n_levels` values for every action.

    The network is `state_dim -> 64 -> 64 -> n_levels * action_dim` with ReLU
    activations; `forward` reshapes the flat output to
    `(batch, action_dim, n_levels)`.
    """

    def __init__(self, state_dim, action_dim, n_levels):
        super().__init__()
        hidden = 64
        # Layers are created in input-to-output order and stored under
        # `self.model`, so parameter names are `model.0.*`, `model.2.*`, `model.4.*`.
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_levels * action_dim),
        ]
        self.model = nn.Sequential(*layers)
        self.action_dim = action_dim
        self.n_levels = n_levels

    def forward(self, states):
        """Return a tensor of shape `(states.shape[0], action_dim, n_levels)`."""
        flat = self.model(states)
        out_shape = (states.shape[0], self.action_dim, self.n_levels)
        return flat.reshape(out_shape)


class DQN_double:
    """Double-DQN agent whose Q-network predicts `n_levels` quantiles per action.

    The online network (`q_function`) selects the greedy next action, the
    periodically synchronised copy (`freezing_q_function`) evaluates it, and
    the online network is trained with a quantile-regression loss. The Q-value
    of an action is the mean of its predicted quantiles.

    Args:
        state_dim: size of the observation vector.
        action_dim: number of discrete actions.
        gamma: discount factor.
        lr: Adam learning rate.
        batch_size: number of transitions sampled per training step.
        epsilon_decrease: amount subtracted from epsilon by `decrease_params`.
        epsilon_min: lower bound for epsilon.
        period: number of training steps between target-network syncs.
        n_levels: number of quantiles predicted per action.
        device: torch device (or device string) for networks and batches.
            `None` selects "cuda:0" when available, otherwise "cpu".
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.95,
                 lr=1e-3,
                 batch_size=2,
                 epsilon_decrease=0.005,
                 epsilon_min=0.0001,
                 period=100,
                 n_levels=5,
                 device=None,
                 ):
        # Resolve the device locally so the class does not depend on a
        # notebook-level global being defined by another cell.
        if device is None:
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device(device)

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.q_function = Qfunction(state_dim, action_dim, n_levels).to(self.device)
        self.freezing_q_function = Qfunction(state_dim, action_dim, n_levels).to(self.device)
        self.freezing_q_function.load_state_dict(self.q_function.state_dict())
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = 1.0
        self.epsilon_decrease = epsilon_decrease
        self.epsilon_min = epsilon_min
        # NOTE: the replay memory is an unbounded list; it grows for the whole run.
        self.memory = []
        self.optimizer = torch.optim.Adam(self.q_function.parameters(), lr=lr)
        self.optimzaer = self.optimizer  # legacy (misspelled) attribute name kept as an alias
        self.counter = 1
        self.period = period
        self.N = 25_000  # debug tensors are printed whenever counter % N == 0

        self.n_levels = n_levels
        self.taus = self.get_taus()

    def get_taus(self):
        """Return the quantile midpoints (2i + 1) / (2 * n_levels) as a NumPy array."""
        return (np.arange(self.n_levels) + 0.5) / self.n_levels

    def _to_tensor(self, values, dtype=torch.float32):
        """Stack a sequence of scalars/arrays into one tensor on the agent's device."""
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def get_action(self, obs, show=False):
        """Sample an epsilon-greedy action for a single observation.

        Args:
            obs: one observation (array-like of length `state_dim`).
            show: when true, print the observation, Q-values and probabilities.

        Returns:
            The sampled action index.
        """
        with torch.no_grad():
            state = self._to_tensor(obs).unsqueeze(dim=0)
            # Drop only the batch axis: a bare squeeze() would also remove the
            # action or quantile axis when it has size 1.
            quantiles = self.q_function(state)[0]  # (action_dim, n_levels)
            q_values = quantiles.mean(dim=-1).cpu().numpy()

        if show:
            print(f"state: {obs}")
            print(f"q_value: {q_values}")

        max_action = np.argmax(q_values)

        # Uniform exploration mass epsilon, the remainder on the greedy action.
        probs = self.epsilon * np.ones(self.action_dim) / self.action_dim
        probs[max_action] += 1.0 - self.epsilon
        if show or self.counter % self.N == 0:
            print(f"probs: {probs}")
        return np.random.choice(np.arange(self.action_dim), p=probs)

    def _quantile_loss(self, target_distrs, distrs, debug=False):
        """Quantile-regression loss between target samples and predicted quantiles.

        For predicted quantile i and target sample j, with u = target_j - theta_i,
        the term is |tau_i - 1{u < 0}| * |u|. Terms are averaged over the batch
        and the target samples, then summed over the predicted quantiles.

        Args:
            target_distrs: (batch, n_levels) target samples (no gradient needed).
            distrs: (batch, n_levels) predicted quantiles of the taken actions.
            debug: when true, print the per-level intermediate tensors.
        """
        taus = torch.as_tensor(self.taus, dtype=distrs.dtype, device=distrs.device).view(1, -1, 1)
        # diff[b, i, j] = target[b, j] - theta[b, i]
        diff = target_distrs.unsqueeze(1) - distrs.unsqueeze(2)
        # Indicator of the target lying BELOW the prediction. The previous code
        # used `target > theta`, which trained atom i toward quantile 1 - tau_i.
        below = (diff < 0).to(distrs.dtype)
        weights = torch.abs(taus - below)

        if debug:
            for i in range(self.n_levels):
                print(f"ttau: {taus[0, i, 0]}")
                print(f"identity: {below[:, i, :]}")
                print(f"diff: {diff[:, i, :]}")

        return (weights * diff.abs()).mean(dim=(0, 2)).sum()

    def fit(self, state, action, reward, done, next_state, show=False):
        """Store one transition and, once enough are collected, run one training step.

        Training starts when the memory holds more than `10 * batch_size`
        transitions. Every `period` training steps the target network is
        re-synchronised with the online network.
        """
        self.memory.append([state, action, reward, int(done), next_state])

        if len(self.memory) <= 10 * self.batch_size:
            return

        if self.counter % self.period == 0:
            self.freezing_q_function.load_state_dict(self.q_function.state_dict())

        self.counter += 1
        debug = show or self.counter % self.N == 0

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, dones, next_states = zip(*batch)

        # Every batch tensor lives on self.device (rewards and dones used to
        # stay on the CPU, which breaks on a CUDA device).
        states = self._to_tensor(states)                    # (B, state_dim)
        next_states = self._to_tensor(next_states)          # (B, state_dim)
        rewards = self._to_tensor(rewards).unsqueeze(dim=1)  # (B, 1)
        dones = self._to_tensor(dones).unsqueeze(dim=1)      # (B, 1)
        actions = self._to_tensor(actions, dtype=torch.long)  # (B,)

        with torch.no_grad():
            # Double DQN: the online network picks the next action ...
            next_q_values = self.q_function(next_states).mean(dim=2)  # (B, action_dim)
            max_next_action = next_q_values.argmax(dim=1)
            max_next_action = max_next_action.view(-1, 1, 1).expand(-1, 1, self.n_levels)

            # ... and the frozen network evaluates its quantiles.
            next_action_rewards = self.freezing_q_function(next_states)  # (B, action_dim, n_levels)
            # squeeze(1) removes only the gathered action axis, so B == 1 still works.
            reward_distrs = next_action_rewards.gather(1, max_next_action).squeeze(1)

            if debug:
                print(f"shape next_action_rewards: {next_action_rewards.shape}")
                print(f"shape reward_distrs: {reward_distrs.shape}")
                print(f"next_q_values: {next_q_values}")
                print(f"max_next_action: {max_next_action}")

            target_distrs = rewards + self.gamma * (1 - dones) * reward_distrs

        # There used to be a bug here: the argmax actions were passed instead
        # of the actions that were actually taken.
        action_index = actions.view(-1, 1, 1).expand(-1, 1, self.n_levels)
        distrs = self.q_function(states).gather(1, action_index).squeeze(1)  # (B, n_levels)

        loss = self._quantile_loss(target_distrs, distrs, debug)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if show:
            print(f"loss: {loss}")

    def decrease_params(self):
        """Lower epsilon by `epsilon_decrease`, never going below `epsilon_min`."""
        self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_min)
            


In [51]:
def DQN_learning(env, agent, episode_n = 100, t_max = 500):
    """Run `episode_n` training episodes and return the per-episode total rewards.

    Each episode resets `env`, then for at most `t_max` steps asks the agent
    for an action, steps the environment, and hands the transition to
    `agent.fit`. The episode stops early when the environment reports
    `terminated` or `truncated`. After every episode `agent.decrease_params()`
    is called, and every 10th episode the mean of the last (up to) 10 totals
    is printed.
    """
    # agent.epsilon_decrease = 1.0 / (0.75*episode_n)

    # Verbose tracing is switched off for every episode.
    show = False

    episode_returns = []
    for episode_idx in range(episode_n):
        state, _info = env.reset()
        episode_return = 0

        for step_idx in range(t_max):
            action = agent.get_action(state, show)
            next_state, reward, terminated, truncated, _info = env.step(action)
            episode_return += reward

            if show:
                print(f"iteration: {step_idx}")

            finished = terminated or truncated
            agent.fit(state, action, reward, finished, next_state, show)
            state = next_state

            if finished:
                break

        episode_returns.append(episode_return)
        if episode_idx % 10 == 0:
            print(f'episode: {episode_idx}, total_reward: {np.mean(episode_returns[-10:])}')

        agent.decrease_params()

    return episode_returns

In [21]:
### The implementation bug was that new layers had been added to the model, but I was not copying them: I copied only the model's shared backbone, as in the old version.

In [58]:
# Inspect the agent's current exploration rate.
# NOTE(review): the recorded output below is negative, whereas decrease_params
# clamps epsilon at epsilon_min (default 0.0001) - presumably this output comes
# from an earlier version of the class; re-run the cell to confirm.
agent.epsilon

-8.81239525796218e-16

In [None]:
import gym
from gym.wrappers import TransformReward

# Seed every RNG the run draws from, not just torch: the agent samples replay
# batches with `random.sample` and picks actions with `np.random.choice`.
SEED = 43
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make('CartPole-v1')
# env = TransformReward(env, lambda r: v_min + r * (v_max - v_min) / 500.0)

# Seed the environment's own RNG once; the later unseeded `env.reset()` calls
# in the training loop then continue from this seeded generator.
env.reset(seed=SEED)
env.action_space.seed(SEED)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(action_dim)

episode_n=3000


agent = DQN_double(state_dim, action_dim)

total_rewards = DQN_learning(env, agent, episode_n=episode_n)
