In [1]:
import gym
from scipy.stats import multivariate_normal
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

env = gym.make('CarRacing-v0')



In [2]:
class CNN_Network(nn.Module):
    def __init__(self, output_size):
        super(CNN_Network, self).__init__()

        self.conv2d_0 = nn.Conv2d(1, 16, kernel_size=4, stride=2)
        self.relu_0 = nn.LeakyReLU()

        self.conv2d_1 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        self.relu_1 = nn.LeakyReLU()

        # self.conv2d_2 = nn.Conv2d(16, 32, kernel_size=3, stride=2)
        # self.relu_2 = nn.LeakyReLU()

        self.conv2d_3 = nn.Conv2d(32, 64, kernel_size=3, stride=2)
        self.relu_3 = nn.LeakyReLU()

        self.conv2d_4 = nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1)
        self.relu_4 = nn.LeakyReLU()

        # self.conv2d_5 = nn.Conv2d(128, 256, kernel_size=3, stride=2)
        # self.relu_5 = nn.LeakyReLU()

        self.action_linear_1 = nn.Linear(4608, 256)
        self.relu_action_1 = nn.LeakyReLU()
        self.action_linear_2 = nn.Linear(256, output_size)
        self.activation_action1 = nn.Tanh()
        self.activation_action2 = nn.ReLU()

        self.value_linear_1 = nn.Linear(4608, 256)
        self.relu_value_1 = nn.LeakyReLU()
        self.value_linear_2 = nn.Linear(256, 1)

        for layer in [self.conv2d_0 , self.conv2d_1,self.conv2d_3, self.conv2d_4, self.action_linear_1, self.action_linear_2, self.value_linear_1, self.value_linear_2]:
            torch.nn.init.xavier_normal_(layer.weight)
            torch.nn.init.zeros_(layer.bias)

    def forward(self, x):
        x = self.relu_0(self.conv2d_0(x))
        x = self.relu_1(self.conv2d_1(x))
        # x = self.relu_2(self.conv2d_2(x))
        x = self.relu_3(self.conv2d_3(x))
        x = self.relu_4(self.conv2d_4(x))
        # x = self.relu_5(self.conv2d_5(x))
        x = x.view(x.shape[0], -1)

        x_action = self.action_linear_1(x)
        x_action = self.action_linear_2(x_action)
        # x_action = self.activation_action1(x_action)

        x_value = self.relu_value_1(self.value_linear_1(x))
        x_value = self.value_linear_2(x_value)
        return x_action, x_value

agent = CNN_Network(2).to(device)

In [3]:
print(agent)

CNN_Network(
  (conv2d_0): Conv2d(1, 16, kernel_size=(4, 4), stride=(2, 2))
  (relu_0): LeakyReLU(negative_slope=0.01)
  (conv2d_1): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
  (relu_1): LeakyReLU(negative_slope=0.01)
  (conv2d_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2))
  (relu_3): LeakyReLU(negative_slope=0.01)
  (conv2d_4): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu_4): LeakyReLU(negative_slope=0.01)
  (action_linear_1): Linear(in_features=4608, out_features=256, bias=True)
  (relu_action_1): LeakyReLU(negative_slope=0.01)
  (action_linear_2): Linear(in_features=256, out_features=2, bias=True)
  (activation_action1): Tanh()
  (activation_action2): ReLU()
  (value_linear_1): Linear(in_features=4608, out_features=256, bias=True)
  (relu_value_1): LeakyReLU(negative_slope=0.01)
  (value_linear_2): Linear(in_features=256, out_features=1, bias=True)
)


In [4]:

# class OrnsteinUhlenbeckActionNoise():
#     def __init__(self, mu, sigma=0.3, theta=.10, dt=1e-2, x0=None):
#         self.theta = theta
#         self.mu = mu
#         self.sigma = sigma
#         self.dt = dt
#         self.x0 = x0
#         self.reset()

#     def __call__(self):
#         x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
#                 self.sigma * self.dt**(1/2) * torch.normal(mean=0.0, std=1.0, size=self.mu.shape, device=device)
#         self.x_prev = x
#         return x

#     def reset(self):
#         self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu, device=device)

#     def __repr__(self):
#         return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [5]:
def to_torch(tensor):
    return torch.tensor(tensor.copy(), dtype=torch.float32, device=device)

cov_exploration_matrix = torch.tensor([0.25, 0.08], device=device)
cov_mat = torch.diag(cov_exploration_matrix)
def get_action(state):
    actions, values = agent(state)
    actions = actions[0]
    distribution = MultivariateNormal(actions, cov_mat)
    actions_with_exploration = distribution.sample()
    log_actions = distribution.log_prob(actions_with_exploration)

    return actions_with_exploration.cpu().numpy(), log_actions.cpu().numpy()

GAMMA = 0.99
def rollout(render, exploration_scale):
    base_exploration = [0.25, 0.1]
    cov_exploration_matrix = torch.tensor(base_exploration, device=device) * min(exploration_scale, 0.5)

    state = env.reset() / 255.
    memory = []
    timesteps = 0
    done = False
    streak = 0
    while not done:
        
        action, log_action = get_action(to_torch(state).mean(dim=2).reshape(1, 1, state.shape[0], state.shape[1]))

        for i in range(1):
            if render or timesteps%100==0:
                env.render()
            
            action_raw = [action[0], action[1], 0]


            next_state, reward, done, _ = env.step(action_raw)

            

            if np.mean(next_state[:, :, 1]) > 185.0:
                reward -= 0.05

            if reward < 0:
                streak +=1
                if streak > 300:
                    done = True
            else:
                streak = 0

            # reward += 0.1*np.clip(action[1], 0, 1)

            next_state = next_state / 255.
            memory.append([state, action, reward, log_action])
            timesteps += 1
            state = next_state

            
            if done: 
                break

    states, actions, rewards, log_actions = map(np.array, zip(*memory))

    discounted_rewards = np.zeros((len(rewards)))
    discount = 0
    # Discounts rewards in reverse
    for i in reversed(range(len(rewards))):

        # Discount fowards from the future for previous states
        discount = rewards[i] + discount*GAMMA 
        discounted_rewards[i] = discount

    return to_torch(states).mean(dim=3).unsqueeze(dim=1), to_torch(actions), to_torch(discounted_rewards).reshape(-1,1), to_torch(log_actions), timesteps, np.sum(rewards)

In [6]:
CLIP_EPSILON = 0.2
value_loss_fn = nn.SmoothL1Loss()

def get_log_probs_and_value(states, old_actions):
    actions, values = agent(states)
    distribution = MultivariateNormal(actions, cov_mat)
    log_actions = distribution.log_prob(old_actions)

    return actions, log_actions, values, distribution.entropy()

def compute_advantages(states, rewards):
    _, values = agent(states)

    advantages = rewards - values.detach()
    advantages = (advantages - advantages.mean()) / \
                (advantages.std() + 1e-8) 

    return advantages

def compute_losses(states, actions, rewards, log_actions, advantages):
    # Compute policy loss first
    # Compute ratios 
    new_actions, log_new_actions, values, entropy = get_log_probs_and_value(states, actions)
    ratios = torch.exp(log_new_actions - log_actions) 

    policy_loss = torch.min(ratios*advantages, torch.clip(ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)*advantages)
    policy_loss = -torch.mean(policy_loss)

    # Compute value loss
    value_loss = value_loss_fn(rewards, values)

    # Compute entropy loss
    entropy_loss = -torch.mean(entropy)
    
    return policy_loss, value_loss, entropy_loss

In [7]:
# TODOS
# 1. new_actions / actions - > en log_probs (log_new_actions - log_actions)
# 2. Trouver des meilleurs manières pour explorer
# 3. tweak value_factor, GAMMA, exploration_factor, lr 
# 4. Tweak le modèle pytorch, activations


def train():
    n_time_steps = 1000000
    n_updates_per_episode = 5
    
    value_factor = 0.3
    entropy_factor = 0.01

    current_time_step = 0
    exploration_factor = 0.995

    agent_optimizer = optim.Adam(agent.parameters(), lr=0.0001)
    scores = []
    while current_time_step < n_time_steps:
        # try:
        with torch.no_grad():
            agent.eval()
            states, actions, rewards, log_actions, timesteps, episode_score = rollout(len(scores)%5==0, exploration_factor**len(scores))
        # except:
        #     continue
        scores.append(episode_score)
        print(f"Current score: {scores[-1]}")
        print(f"Total timesteps: {current_time_step}")
        current_time_step += timesteps
        print(cov_exploration_matrix)
        advantages = compute_advantages(states, rewards)

        agent.train()
        for _ in range(n_updates_per_episode):
            agent_optimizer.zero_grad()
            policy_loss, value_loss, entropy_loss = compute_losses(states, actions, rewards, log_actions, advantages)

            loss = policy_loss + value_factor*value_loss #+ entropy_factor*entropy_loss
            print(f'loss: {loss}')
            loss.backward()

            agent_optimizer.step()
            agent_optimizer.zero_grad()



In [8]:
train()

Track generation: 1155..1448 -> 293-tiles track
Current score: -9.878082191780841
Total timesteps: 0
tensor([0.2500, 0.0800], device='cuda:0')
loss: 1.719394326210022
loss: 1.707392930984497
loss: 1.695741057395935
loss: 1.6841288805007935
loss: 1.6726922988891602
Track generation: 1191..1493 -> 302-tiles track
Current score: -105.49966777408642
Total timesteps: 398
tensor([0.2500, 0.0800], device='cuda:0')
loss: 9.552884101867676
loss: 9.537296295166016
loss: 9.520057678222656
loss: 9.500947952270508
loss: 9.48011302947998
Track generation: 1175..1473 -> 298-tiles track
Current score: -19.727946127946087
Total timesteps: 749
tensor([0.2500, 0.0800], device='cuda:0')
loss: 1.4913465976715088
loss: 1.474423885345459
loss: 1.4561309814453125
loss: 1.436176061630249
loss: 1.414711833000183
Track generation: 1004..1265 -> 261-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1136..1424 -> 288-tiles track
Current score: 5.713937282