In [1]:
import gym
from gym.wrappers import Monitor
from scipy.stats import multivariate_normal
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal, Beta, Cauchy, Normal, LogNormal
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import json
from scipy.special import expit

# Run on the first CUDA GPU when torch reports one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create the CarRacing-v0 environment and wrap it in a Monitor writing to ./video.
# video_callable returns True for episode ids that are multiples of 20 (0, 20, 40, ...).
# NOTE(review): force=True presumably lets Monitor reuse an existing ./video directory — confirm against the gym docs.
env = gym.make('CarRacing-v0')
env = Monitor(env, './video', video_callable=lambda episode_id: episode_id%20==0, force=True)



In [2]:
class Gaussian(nn.Module):
    """Parameter-free policy head built on an element-wise Normal distribution.

    Given per-dimension means and standard deviations it samples an action and
    scores either that sample or a caller-supplied action.
    """

    def __init__(self):
        super().__init__()

    def forward(self, mean_actions, std_actions, old_actions):
        """Sample from Normal(mean_actions, std_actions) and score an action.

        Args:
            mean_actions: location parameter of the distribution.
            std_actions: scale parameter of the distribution.
            old_actions: actions to score; when None the fresh sample is scored.

        Returns:
            Tuple ``(mean, sample, log_prob, entropy)``. ``log_prob`` and
            ``entropy`` are element-wise (one value per action dimension).
        """
        policy = Normal(mean_actions, std_actions)
        # A sample is always drawn, even when old actions are being scored,
        # so the random stream advances identically on both paths.
        sampled = policy.sample()

        scored = sampled if old_actions is None else old_actions
        log_prob = policy.log_prob(scored)

        return policy.mean, sampled, log_prob, policy.entropy()

class CNN_Network(nn.Module):
    """Convolutional actor-critic with a shared trunk.

    Six stride-2 convolutions (each followed by ReLU) feed three linear heads:
    the action mean, the action standard deviation (passed through Softplus)
    and a scalar state value. The heads expect the flattened trunk output to
    have exactly 256 features.
    """

    def __init__(self, output_size):
        """Build the layers; ``output_size`` is the number of action dimensions."""
        super().__init__()

        # (in_channels, out_channels, kernel_size, padding); every stage uses stride 2.
        conv_specs = (
            (1, 8, 4, 0),
            (8, 16, 3, 0),
            (16, 32, 3, 0),
            (32, 64, 3, 0),
            (64, 128, 3, 1),
            (128, 256, 3, 0),
        )
        # Submodules are registered as conv2d_i, relu_i, conv2d_{i+1}, ... so the
        # attribute names and the order reported by children() stay the same.
        for index, (c_in, c_out, kernel, pad) in enumerate(conv_specs):
            setattr(self, f'conv2d_{index}',
                    nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=2, padding=pad))
            setattr(self, f'relu_{index}', nn.ReLU())

        # Policy heads: mean is left unbounded, Softplus keeps the std positive.
        self.action_mean = nn.Linear(256, output_size)
        self.action_std = nn.Linear(256, output_size)
        self.std_activation = nn.Softplus()

        self.gaussian = Gaussian()

        # Value head.
        self.value_linear_2 = nn.Linear(256, 1)

        # Re-initialise every weighted layer: Xavier-normal weights, zero biases.
        weighted_layers = [getattr(self, f'conv2d_{index}') for index in range(len(conv_specs))]
        weighted_layers += [self.action_mean, self.action_std, self.value_linear_2]
        for module in weighted_layers:
            nn.init.xavier_normal_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x, old_actions=None):
        """Run the trunk and all heads.

        Args:
            x: input batch with a single channel (conv2d_0 has in_channels=1).
            old_actions: optional actions to score instead of the fresh sample.

        Returns:
            Tuple ``(mean, actions, log_actions, entropy, value)``.
        """
        stages = (
            (self.conv2d_0, self.relu_0),
            (self.conv2d_1, self.relu_1),
            (self.conv2d_2, self.relu_2),
            (self.conv2d_3, self.relu_3),
            (self.conv2d_4, self.relu_4),
            (self.conv2d_5, self.relu_5),
        )
        features = x
        for conv, activation in stages:
            features = activation(conv(features))
        flat = features.view(features.shape[0], -1)

        action_mean = self.action_mean(flat)
        action_std = self.std_activation(self.action_std(flat))
        mean, actions, log_actions, entropy = self.gaussian(action_mean, action_std, old_actions)

        state_value = self.value_linear_2(flat)
        return mean, actions, log_actions, entropy, state_value

# Instantiate the actor-critic network with 3 action outputs and move it to the selected device.
agent = CNN_Network(3).to(device)

In [3]:
# Sanity check: print the parameters of the agent's second-to-last registered child module.
print(list(list(agent.children())[-2].parameters()))

[]


In [4]:

# class OrnsteinUhlenbeckActionNoise():
#     def __init__(self, mu, sigma=0.3, theta=.10, dt=1e-2, x0=None):
#         self.theta = theta
#         self.mu = mu
#         self.sigma = sigma
#         self.dt = dt
#         self.x0 = x0
#         self.reset()

#     def __call__(self):
#         x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
#                 self.sigma * self.dt**(1/2) * torch.normal(mean=0.0, std=1.0, size=self.mu.shape, device=device)
#         self.x_prev = x
#         return x

#     def reset(self):
#         self.x_prev = self.x0 if self.x0 is not None else torch.zeros_like(self.mu, device=device)

#     def __repr__(self):
#         return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

In [5]:
def to_torch(tensor):
    """Copy ``tensor`` (anything exposing ``.copy()``) into a float32 tensor on ``device``."""
    snapshot = tensor.copy()
    return torch.tensor(snapshot, device=device, dtype=torch.float32)

def get_action(state):
    """Query the agent for one state and return numpy results for the first batch entry.

    Returns:
        Tuple ``(mean, sampled_action, log_prob)`` of numpy arrays taken from
        index 0 of the agent's outputs.
    """
    outputs = agent(state)
    # The agent also returns entropy and value; only the first three outputs are used here.
    mean, sampled, log_prob = outputs[0], outputs[1], outputs[2]
    return tuple(batch[0].cpu().numpy() for batch in (mean, sampled, log_prob))

# Discount factor applied per step when rollout() accumulates rewards into discounted returns.
GAMMA = 0.98
def rollout(test=False):
    """Play one full episode in ``env`` with the current agent and package it for training.

    Args:
        test: when true, the policy mean is sent to the environment (and stored)
            instead of the sampled action.

    Returns:
        Tuple ``(states, actions, discounted_rewards, log_actions, timesteps, score)``:
        states averaged over their last axis with a channel axis inserted at dim 1,
        the stored actions, the discounted returns as a column, the stored
        log-probabilities (all float32 tensors on ``device``), the number of
        recorded steps and the undiscounted sum of recorded rewards.
    """
    state = env.reset() / 255.
    transitions = []
    timesteps = 0
    negative_streak = 0
    done = False
    while not done:
        height, width = state.shape[0], state.shape[1]
        # Collapse the last axis of the frame by averaging and add batch/channel axes.
        grey_frame = to_torch(state).mean(dim=2).reshape(1, 1, height, width)
        mean_action, action, log_action = get_action(grey_frame)
        if test:
            action = mean_action
        env_action = action.copy()

        step_rewards = []
        for _repeat in range(1):
            next_state, reward, done, _ = env.step(env_action)
            step_rewards.append(reward)

            if reward < 0:
                negative_streak += 1
                if negative_streak > 250:
                    # Too many consecutive negative rewards: keep repeating the
                    # same action until the environment reports the episode is
                    # over; rewards from these steps are not recorded.
                    while not done:
                        _, _, done, _ = env.step(env_action)
            else:
                negative_streak = 0

            if done:
                break
        reward = np.mean(step_rewards)

        transitions.append([state, action, reward, log_action])
        timesteps += 1
        state = next_state / 255.

    states = np.array([step[0] for step in transitions])
    actions = np.array([step[1] for step in transitions])
    rewards = np.array([step[2] for step in transitions])
    log_actions = np.array([step[3] for step in transitions])

    # Walk backwards so each return is its reward plus the discounted return that follows it.
    discounted_rewards = np.zeros(len(rewards))
    running_return = 0
    for index in range(len(rewards) - 1, -1, -1):
        running_return = rewards[index] + running_return*GAMMA
        discounted_rewards[index] = running_return

    state_batch = to_torch(states).mean(dim=3).unsqueeze(dim=1)
    return_batch = to_torch(discounted_rewards).reshape(-1,1)
    return state_batch, to_torch(actions), return_batch, to_torch(log_actions), timesteps, np.sum(rewards)

In [6]:
# Half-width of the clipping interval: compute_losses clips probability ratios to
# [1 - CLIP_EPSILON, 1 + CLIP_EPSILON].
CLIP_EPSILON = 0.2
# Smooth L1 loss used by compute_losses to compare discounted rewards with predicted values.
value_loss_fn = nn.SmoothL1Loss()

def get_log_probs_and_value(states, old_actions):
    """Evaluate ``old_actions`` under the current agent.

    Returns:
        Tuple ``(sampled_actions, log_probs_of_old_actions, values, entropy)``.
        Note that values and entropy are swapped relative to the agent's own
        output order.
    """
    outputs = agent(states, old_actions)
    # Drop the leading policy mean; keep the remaining four outputs.
    sampled, log_probs, entropy, values = outputs[1:]
    return sampled, log_probs, values, entropy

def compute_advantages(states, rewards):
    """Compute normalised advantages ``rewards - V(states)`` for one batch.

    Args:
        states: batch of states accepted by ``agent``.
        rewards: discounted returns, shaped to broadcast against the agent's
            value output.

    Returns:
        Advantages detached from the autograd graph, standardised to zero mean
        and unit standard deviation. A batch with fewer than two elements is
        returned unnormalised, because its sample standard deviation is
        undefined (NaN) and would otherwise poison the loss.
    """
    # The value estimates are used as constants, so skip autograd bookkeeping
    # for this forward pass instead of building a graph and detaching.
    with torch.no_grad():
        _, _, _, _, values = agent(states)

    advantages = rewards - values
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / \
                    (advantages.std() + 1e-8)

    return advantages

def compute_losses(states, actions, rewards, log_actions, advantages):
    """Compute the clipped-surrogate policy loss, the value loss and the entropy loss.

    Args:
        states: batch of states accepted by ``agent``.
        actions: actions taken during the rollout, one row per step.
        rewards: discounted returns, one row per step.
        log_actions: per-dimension log-probabilities of ``actions`` under the
            policy that collected them.
        advantages: advantage estimates, one row per step.

    Returns:
        Tuple ``(policy_loss, value_loss, entropy_loss)`` of scalar tensors.
    """
    _, log_actions_new_policy, values, entropy = get_log_probs_and_value(states, actions)

    # The action dimensions are independent Normals, so the log-probability of
    # the whole action is the sum over the action axis. The ratio must be built
    # from that joint probability; otherwise each dimension is clipped on its
    # own and the objective is no longer the clipped surrogate of pi(a|s).
    joint_log_prob_new = log_actions_new_policy.sum(dim=-1, keepdim=True)
    joint_log_prob_old = log_actions.sum(dim=-1, keepdim=True)
    ratios = torch.exp(joint_log_prob_new - joint_log_prob_old)

    unclipped = ratios*advantages
    clipped = torch.clip(ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)*advantages
    policy_loss = -torch.mean(torch.min(unclipped, clipped))

    # Value loss in conventional (input, target) order: predictions vs. returns.
    value_loss = value_loss_fn(values, rewards)

    # Negated so that minimising the total loss increases entropy.
    entropy_loss = -torch.mean(entropy)

    return policy_loss, value_loss, entropy_loss

In [7]:
# TODOs
# 1. new_actions / actions -> express as log-probs (log_new_actions - log_actions)
# 2. Find better ways to explore
# 3. Tweak value_factor, GAMMA, exploration_factor, lr
# 4. Tweak the PyTorch model and its activations


def train(n_episodes=3000, n_updates_per_episode=5, value_factor=0.5,
          entropy_factor=0.01, learning_rate=0.0005):
    """Train the global ``agent`` one episode at a time and save the results.

    Each iteration collects one episode with ``rollout()``, computes advantages
    once, then performs ``n_updates_per_episode`` Adam steps on the combined
    loss. After the last episode the scores are written to ``results.json`` and
    the whole agent object is saved to ``agent.pt``.

    Args:
        n_episodes: number of episodes to collect and train on.
        n_updates_per_episode: gradient steps taken on each collected episode.
        value_factor: weight of the value loss in the total loss.
        entropy_factor: weight of the entropy loss in the total loss.
        learning_rate: learning rate passed to Adam.
    """
    agent_optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
    scores = []
    while len(scores) < n_episodes:
        # Collect experience without tracking gradients.
        with torch.no_grad():
            agent.eval()
            states, actions, rewards, log_actions, _, episode_score = rollout()

        scores.append(episode_score)
        print(f"Current score: {scores[-1]}")
        print(f"episode: {len(scores)}")

        # Advantages are computed once, before any update, and reused for every
        # update on this episode.
        advantages = compute_advantages(states, rewards)

        agent.train()
        for _ in range(n_updates_per_episode):
            agent_optimizer.zero_grad()
            policy_loss, value_loss, entropy_loss = compute_losses(states, actions, rewards, log_actions, advantages)

            loss = policy_loss + value_factor*value_loss + entropy_factor*entropy_loss
            print(f'loss: {loss}')
            loss.backward()

            agent_optimizer.step()

    with open('results.json', 'w') as f:
        json.dump({"scores": scores}, f)

    torch.save(agent, 'agent.pt')



In [8]:
# Run the full training loop; on completion it writes results.json and agent.pt.
train()

Track generation: 1196..1499 -> 303-tiles track
Current score: -56.95364238410687
episode: 1
loss: 1.250899314880371
loss: 1.2247835397720337
loss: 1.1965214014053345
loss: 1.160024642944336
loss: 1.1088812351226807
Track generation: 1188..1489 -> 301-tiles track
Current score: -40.000000000000554
episode: 2
loss: 0.6766636967658997
loss: 0.5941058397293091
loss: 0.4928376078605652
loss: 0.3848145306110382
loss: 0.2935178875923157
Track generation: 1193..1495 -> 302-tiles track
Current score: -18.532890365448385
episode: 3
loss: 0.7066059112548828
loss: 0.753675103187561
loss: 0.8116005659103394
loss: 0.8133165240287781
loss: 0.7821494936943054
Track generation: 1212..1520 -> 308-tiles track
Current score: -17.0120521172637
episode: 4
loss: 0.7820501327514648
loss: 0.7405053973197937
loss: 0.7163819670677185
loss: 0.7076880931854248
loss: 0.7087618708610535
Track generation: 953..1198 -> 245-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Tra

In [9]:
# Reload the agent saved by train(). The checkpoint is a pickled module object,
# and unpickling can execute arbitrary code, so only load files you trust.
# map_location puts the weights on the same device that to_torch() uses, which
# also lets a checkpoint saved from a CUDA run load on a CPU-only machine.
agent = torch.load('agent.pt', map_location=device)

# Play 10 evaluation episodes with the policy mean (test=True) and no gradient tracking.
with torch.no_grad():
    for _ in range(10):
        rollout(test=True)