In [1]:
import gym
from scipy.stats import multivariate_normal
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

env = gym.make('CarRacing-v0')



In [2]:
class CNN_Network(nn.Module):
    def __init__(self, output_size):
        super(CNN_Network, self).__init__()

        self.conv2d_0 = nn.Conv2d(1, 8, kernel_size=4, stride=2)
        self.relu_0 = nn.ReLU()

        self.conv2d_1 = nn.Conv2d(8, 16, kernel_size=3, stride=2)
        self.relu_1 = nn.ReLU()

        self.conv2d_2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        self.relu_2 = nn.ReLU()

        self.conv2d_3 = nn.Conv2d(32, 64, kernel_size=3, stride=2)
        self.relu_3 = nn.ReLU()

        self.conv2d_4 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.relu_4 = nn.ReLU()

        self.conv2d_5 = nn.Conv2d(128, 256, kernel_size=3, stride=2)
        self.relu_5 = nn.ReLU()

        self.action_linear_1 = nn.Linear(256, 128)
        self.relu_action_1 = nn.ReLU()
        self.action_linear_2 = nn.Linear(128, output_size)

        self.value_linear_1 = nn.Linear(256, 128)
        self.relu_value_1 = nn.ReLU()
        self.value_linear_2 = nn.Linear(128, 1)


    def forward(self, x):
        x = self.relu_0(self.conv2d_0(x))
        x = self.relu_1(self.conv2d_1(x))
        x = self.relu_2(self.conv2d_2(x))
        x = self.relu_3(self.conv2d_3(x))
        x = self.relu_4(self.conv2d_4(x))
        x = self.relu_5(self.conv2d_5(x))
        x = x.view(x.shape[0], -1)

        x_action = self.relu_action_1(self.action_linear_1(x))
        x_action = self.action_linear_2(x_action)

        x_value = self.relu_value_1(self.value_linear_1(x))
        x_value = self.value_linear_2(x_value)
        return x_action, x_value

agent = CNN_Network(env.action_space.shape[0]).to(device)

In [3]:
print(agent)

CNN_Network(
  (conv2d_0): Conv2d(1, 8, kernel_size=(4, 4), stride=(2, 2))
  (relu_0): ReLU()
  (conv2d_1): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2))
  (relu_1): ReLU()
  (conv2d_2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
  (relu_2): ReLU()
  (conv2d_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2))
  (relu_3): ReLU()
  (conv2d_4): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu_4): ReLU()
  (conv2d_5): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2))
  (relu_5): ReLU()
  (action_linear_1): Linear(in_features=256, out_features=128, bias=True)
  (relu_action_1): ReLU()
  (action_linear_2): Linear(in_features=128, out_features=3, bias=True)
  (value_linear_1): Linear(in_features=256, out_features=128, bias=True)
  (relu_value_1): ReLU()
  (value_linear_2): Linear(in_features=128, out_features=1, bias=True)
)


In [4]:

# class OrnsteinUhlenbeckActionNoise():
#     def __init__(self, mu, sigma=0.3, theta=.10, dt=1e-2, x0=None):
#         self.theta = theta
#         self.mu = mu
#         self.sigma = sigma
#         self.dt = dt
#         self.x0 = x0
#         self.reset()

#     def __call__(self):
#         x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
#                 self.sigma * self.dt**(1/2) * torch.normal(mean=0.0, std=1.0, size=self.mu.shape, device=device)
#         self.x_prev = x
#         return x

#     def reset(self):
#         self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu, device=device)

#     def __repr__(self):
#         return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [5]:
def to_torch(tensor):
    return torch.tensor(tensor.copy(), dtype=torch.float32, device=device)

cov_exploration_matrix = torch.diag(torch.full((3,), 0.2)).to(device)
def get_action(state):
    actions, values = agent(state)
    actions = actions[0]
    distribution = MultivariateNormal(actions, cov_exploration_matrix)
    log_actions = distribution.log_prob(actions)

    return actions.cpu().numpy(), log_actions.cpu().numpy()

GAMMA = 0.975
def rollout(noise_factor, render):

    state = env.reset()
    memory = []
    timesteps = 0
    done = False
    while not done:
        if render:
            env.render()
        state = state / 255.
        action, log_action = get_action(to_torch(state).mean(dim=2).reshape(1, 1, state.shape[0], state.shape[1]))

        next_state, reward, done, _ = env.step(action)
        memory.append([state, action, reward, log_action])
        timesteps += 1
        state = next_state

    states, actions, rewards, log_actions = map(np.array, zip(*memory))
    plt.show()

    discounted_rewards = np.zeros((len(rewards)))
    discount = 0
    # Discounts rewards in reverse
    for i in reversed(range(len(rewards))):

        # Discount fowards from the future for previous states
        discount = rewards[i] + discount*GAMMA 
        discounted_rewards[i] = discount

    return to_torch(states).mean(dim=3).unsqueeze(dim=1), to_torch(actions), to_torch(discounted_rewards), to_torch(log_actions), timesteps, np.sum(rewards)

In [6]:
CLIP_EPSILON = 0.2
value_loss_fn = nn.SmoothL1Loss()

def get_log_probs_and_value(states):
    actions, values = agent(states)
    distribution = MultivariateNormal(actions, cov_exploration_matrix)
    log_actions = distribution.log_prob(actions)


    return log_actions, values

def compute_losses(states, actions, rewards, log_actions):
    # Compute policy loss first
    # Compute ratios 
    log_new_actions, values = get_log_probs_and_value(states)
    ratios = torch.exp(log_new_actions - log_actions)

    advantages = rewards.unsqueeze(dim=1) - values.detach()

    policy_loss = torch.min(ratios*advantages, torch.clip(ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)*advantages)
    policy_loss = -torch.mean(policy_loss)

    # Compute value loss
    value_loss = value_loss_fn(rewards, values)

    # Compute entropy loss
    # TODO
    
    return policy_loss, value_loss

In [7]:
# TODOS
# 1. new_actions / actions - > en log_probs (log_new_actions - log_actions)
# 2. Trouver des meilleurs manières pour explorer
# 3. tweak value_factor, GAMMA, exploration_factor, lr 
# 4. Tweak le modèle pytorch, activations


def train():
    n_time_steps = 1000000
    n_updates_per_episode = 2
    
    value_factor = 0.3

    current_time_step = 0
    exploration_factor = 0.1
    exploration_decay = 0.995

    agent_optimizer = optim.Adam(agent.parameters(), lr=0.0003)
    scores = []
    while current_time_step < n_time_steps:
        # try:
        with torch.no_grad():
            states, actions, rewards, log_actions, timesteps, episode_score = rollout(exploration_factor*exploration_decay**len(scores), len(scores)%20==0)
        # except:
        #     continue
        scores.append(episode_score)
        print(f"Current score: {scores[-1]}")
        print(f"Total timesteps: {current_time_step}")
        current_time_step += timesteps

        for _ in range(n_updates_per_episode):

            agent_optimizer.zero_grad()
            policy_loss, value_loss = compute_losses(states, actions, rewards, log_actions)

            loss = policy_loss + value_factor*value_loss #- 10 * torch.mean(torch.std(actions, dim=0))
            print(f'loss: {loss}')
            loss.backward()

            agent_optimizer.step()



In [8]:
train()

Track generation: 1132..1419 -> 287-tiles track
Current score: -72.02797202797214
Total timesteps: 0


NameError: name 'state' is not defined