In [2]:
import gym
from gym.wrappers import Monitor
from scipy.stats import multivariate_normal
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import MultivariateNormal, Beta, Cauchy, Normal, LogNormal
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
import json
from scipy.special import expit

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# CarRacing environment wrapped in a Monitor that writes under ./video; the
# video_callable selects every episode whose id is a multiple of 50.
# NOTE(review): force=True presumably lets the Monitor overwrite earlier output
# in ./video — confirm against the gym Monitor documentation.
env = gym.make('CarRacing-v0')
env = Monitor(env, './video', video_callable=lambda episode_id: episode_id%50==0, force=True)



In [3]:
class Gaussian(nn.Module):
    """Parameter-free head that turns per-dimension means/stds into a Normal policy."""

    def __init__(self):
        super().__init__()

    def forward(self, mean_actions, std_actions, old_actions):
        """Sample from Normal(mean_actions, std_actions) and score an action.

        Args:
            mean_actions: Tensor of means, one per action dimension.
            std_actions: Tensor of standard deviations, same shape as the means.
            old_actions: Actions to evaluate under the distribution, or ``None``
                to evaluate the freshly drawn sample instead.

        Returns:
            Tuple ``(mean, sample, log_prob, entropy)``. ``log_prob`` and
            ``entropy`` are element-wise (one value per action dimension);
            nothing is summed across dimensions here.
        """
        policy = Normal(mean_actions, std_actions)

        # A sample is always drawn, even when only old_actions are being scored.
        sampled = policy.sample()

        scored = sampled if old_actions is None else old_actions
        log_prob = policy.log_prob(scored)

        return policy.mean, sampled, log_prob, policy.entropy()

class CNN_Network(nn.Module):
    """Actor-critic network: shared conv trunk, Gaussian policy head and value head.

    The trunk is six stride-2 convolutions (1 -> 8 -> 16 -> 32 -> 64 -> 128 -> 256
    channels), each followed by a ReLU. Both heads are ``Linear(256, ...)``, so
    the flattened trunk output must have exactly 256 features per sample.
    """

    def __init__(self, output_size):
        """Build the layers and initialise them.

        Args:
            output_size: Number of action dimensions produced by the policy head.
        """
        super(CNN_Network, self).__init__()

        # (in_channels, out_channels, kernel_size, padding); stride is 2 throughout.
        conv_specs = (
            (1, 8, 4, 0),
            (8, 16, 3, 0),
            (16, 32, 3, 0),
            (32, 64, 3, 0),
            (64, 128, 3, 1),
            (128, 256, 3, 0),
        )
        # Register conv/relu pairs under their original attribute names and in
        # their original order so state_dict keys and pickles stay compatible.
        for index, (c_in, c_out, kernel, pad) in enumerate(conv_specs):
            setattr(self, f'conv2d_{index}',
                    nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=2, padding=pad))
            setattr(self, f'relu_{index}', nn.ReLU())

        # Policy head: unbounded mean, Softplus-activated standard deviation.
        self.action_mean = nn.Linear(256, output_size)
        self.action_std = nn.Linear(256, output_size)
        self.std_activation = nn.Softplus()

        self.gaussian = Gaussian()

        # Value head: a single linear layer on the shared features.
        self.value_linear_2 = nn.Linear(256, 1)

        # Xavier-normal weights and zero biases for every conv and linear layer.
        trainable_layers = [getattr(self, f'conv2d_{index}') for index in range(len(conv_specs))]
        trainable_layers += [self.action_mean, self.action_std, self.value_linear_2]
        for module in trainable_layers:
            torch.nn.init.xavier_normal_(module.weight)
            torch.nn.init.zeros_(module.bias)

    def forward(self, x, old_actions=None):
        """Run the trunk and both heads.

        Args:
            x: Batch of single-channel images, shape ``(batch, 1, H, W)``.
                H and W must reduce to 256 flattened features after the trunk
                (presumably 96x96 CarRacing frames — TODO confirm).
            old_actions: Optional actions whose log-probability should be
                evaluated instead of that of the fresh sample.

        Returns:
            Tuple ``(mean, actions, log_actions, entropy, value)``; the first
            four come from the Gaussian head, ``value`` from the value head.
        """
        trunk = (
            (self.conv2d_0, self.relu_0),
            (self.conv2d_1, self.relu_1),
            (self.conv2d_2, self.relu_2),
            (self.conv2d_3, self.relu_3),
            (self.conv2d_4, self.relu_4),
            (self.conv2d_5, self.relu_5),
        )
        features = x
        for conv, activation in trunk:
            features = activation(conv(features))
        features = features.view(features.shape[0], -1)

        action_mean = self.action_mean(features)
        action_std = self.std_activation(self.action_std(features))
        mean, actions, log_actions, entropy = self.gaussian(action_mean, action_std, old_actions)

        state_value = self.value_linear_2(features)
        return mean, actions, log_actions, entropy, state_value

# Module-level actor-critic network with a 3-dimensional action output, moved to
# the selected device. The helper functions below read this global directly.
agent = CNN_Network(3).to(device)

In [4]:
def to_torch(tensor):
    """Convert an array into a float32 torch tensor on the module-level ``device``.

    Args:
        tensor: Object exposing ``.copy()`` (a numpy array in this notebook).

    Returns:
        A new ``torch.float32`` tensor built from a copy of the input.
    """
    snapshot = tensor.copy()
    return torch.tensor(snapshot, device=device, dtype=torch.float32)

def get_action(state):
    """Query the global ``agent`` for one state and return numpy results.

    Args:
        state: Batched input tensor for the agent; only the first batch entry
            of each output is returned.

    Returns:
        Tuple ``(mean, action, log_action)`` of numpy arrays: the policy mean,
        the sampled action and the log-probability of that sample.
    """
    mean, actions, log_actions, _, _ = agent(state)

    # detach() before numpy(): the mean and log-probabilities depend on network
    # parameters, so outside a torch.no_grad() block they carry autograd history
    # and a bare .numpy() call would raise.
    return (
        mean[0].detach().cpu().numpy(),
        actions[0].detach().cpu().numpy(),
        log_actions[0].detach().cpu().numpy(),
    )

GAMMA = 0.98
def rollout(test=False):
    """Play one episode in the global ``env`` and package it for training.

    Each step averages the current frame over its last (channel) axis, asks
    ``get_action`` for a sampled action and steps the environment with it. The
    sampled action is always the one executed, including when ``test`` is true;
    the policy mean returned by ``get_action`` is not used.

    When ``test`` is false, two shaping rules can end data collection early:
      * once the accumulated raw reward exceeds 900, the step reward is
        replaced by 100;
      * after more than 100 consecutive negative-reward steps, the step reward
        is replaced by -100.
    In both cases the environment is then stepped with the same action until it
    reports ``done``; those extra steps are neither stored nor scored.

    Args:
        test: When true, disables the reward shaping / early stopping above.

    Returns:
        Tuple ``(states, actions, discounted_rewards, log_actions, timesteps,
        total_reward)``. The first four are float32 tensors: states averaged
        over the last axis with a channel axis inserted at dim 1, the sampled
        actions, the GAMMA-discounted returns reshaped to ``(-1, 1)`` and the
        log-probabilities of the sampled actions. ``timesteps`` is the number of
        stored transitions and ``total_reward`` the sum of the raw (unshaped)
        rewards of those steps.
    """
    observation = env.reset() / 255.
    transitions = []
    step_count = 0
    episode_over = False
    negative_streak = 0
    score = 0

    while not episode_over:
        height, width = observation.shape[0], observation.shape[1]
        grey_frame = to_torch(observation).mean(dim=2).reshape(1, 1, height, width)
        _, sampled_action, sampled_log_prob = get_action(grey_frame)

        env_action = sampled_action.copy()

        raw_next, step_reward, episode_over, _ = env.step(env_action)
        score += step_reward

        if not test:
            stop_collecting = False
            if score > 900:
                # Good enough: give a terminal bonus and stop collecting data.
                step_reward = 100
                stop_collecting = True
            elif step_reward < 0:
                negative_streak += 1
                if negative_streak > 100:
                    # Long run of negative rewards: penalise and stop collecting.
                    step_reward = -100
                    stop_collecting = True
            else:
                negative_streak = 0

            if stop_collecting:
                # Keep stepping with the last action until the env reports done.
                while not episode_over:
                    _, _, episode_over, _ = env.step(env_action)

        transitions.append((observation, sampled_action, step_reward, sampled_log_prob))
        step_count += 1
        observation = raw_next / 255.

    states, actions, rewards, log_actions = [np.array(column) for column in zip(*transitions)]

    # Accumulate discounted returns backwards from the last stored step.
    discounted_rewards = np.zeros((len(rewards)))
    running_return = 0
    for idx in range(len(rewards) - 1, -1, -1):
        running_return = rewards[idx] + running_return*GAMMA
        discounted_rewards[idx] = running_return

    state_batch = to_torch(states).mean(dim=3).unsqueeze(dim=1)
    return_batch = to_torch(discounted_rewards).reshape(-1, 1)
    return state_batch, to_torch(actions), return_batch, to_torch(log_actions), step_count, score

In [5]:
# Half-width of the interval [1 - CLIP_EPSILON, 1 + CLIP_EPSILON] that
# compute_losses clips the probability ratio to.
CLIP_EPSILON = 0.2
# Loss applied between the discounted returns and the predicted state values.
value_loss_fn = nn.SmoothL1Loss()

def get_log_probs_and_value(states, old_actions):
    """Evaluate ``old_actions`` under the current policy of the global ``agent``.

    Args:
        states: Batch of states passed to the agent.
        old_actions: Previously taken actions to score.

    Returns:
        Tuple ``(actions, log_probs, values, entropy)``: the agent's outputs
        with the mean dropped and values/entropy swapped relative to the
        agent's own return order.
    """
    agent_outputs = agent(states, old_actions)
    _mean, sampled_actions, new_log_probs, policy_entropy, state_values = agent_outputs

    return sampled_actions, new_log_probs, state_values, policy_entropy

def compute_advantages(states, rewards):
    """Compute normalised advantages for a batch of states.

    The advantage is the discounted return minus the global ``agent``'s value
    estimate, standardised to zero mean and unit standard deviation over the
    batch (with a 1e-8 term guarding against division by zero).

    Args:
        states: Batch of states passed to the agent.
        rewards: Discounted returns, broadcastable against the agent's values.

    Returns:
        Tensor of normalised advantages with no autograd history.
    """
    # Advantages are constants for the policy update, so skip graph construction
    # entirely instead of building a graph and detaching afterwards.
    with torch.no_grad():
        _, _, _, _, values = agent(states)

        advantages = rewards - values
        advantages = (advantages - advantages.mean()) / \
                    (advantages.std() + 1e-8)

    return advantages

def compute_losses(states, actions, rewards, log_actions, advantages):
    """Compute the clipped-surrogate policy loss, value loss and entropy loss.

    Args:
        states: Batch of states.
        actions: Actions taken during the rollout.
        rewards: Discounted returns used as the value target.
        log_actions: Log-probabilities of ``actions`` under the policy that
            collected the rollout.
        advantages: Advantage estimates, broadcastable against the ratios.

    Returns:
        Tuple ``(policy_loss, value_loss, entropy_loss)`` of scalar tensors.
        ``entropy_loss`` is the negated mean entropy, so adding it to the total
        loss with a positive weight rewards higher entropy.
    """
    _, log_actions_new_policy, values, entropy = get_log_probs_and_value(states, actions)

    # Probability ratio between the current policy and the rollout policy.
    # NOTE(review): the ratio has whatever shape the log-probabilities have; if
    # they are per action dimension, clipping is applied per dimension rather
    # than on the joint action probability — confirm this is intended.
    ratios = torch.exp(log_actions_new_policy - log_actions)

    unclipped = ratios*advantages
    clipped = torch.clamp(ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)*advantages
    policy_loss = -torch.mean(torch.min(unclipped, clipped))

    # Loss modules take (input, target): the prediction comes first so that the
    # gradient flows through the input argument as the API intends.
    value_loss = value_loss_fn(values, rewards)

    entropy_loss = -torch.mean(entropy)

    return policy_loss, value_loss, entropy_loss

In [6]:

def train(n_episodes=10000, n_updates_per_episode=5, value_factor=0.5,
          entropy_factor=0.005, learning_rate=0.00015, checkpoint_every=1000):
    """Train the global ``agent``, one rollout per iteration.

    Each iteration collects one episode with ``rollout()``, computes normalised
    advantages once, then performs ``n_updates_per_episode`` Adam steps on
    ``policy_loss + value_factor*value_loss + entropy_factor*entropy_loss``.

    Side effects: prints the score, episode number and loss values; every
    ``checkpoint_every`` episodes writes ``agent_checkpoint_<k>.pt`` and
    ``results_checkpoint_<k>.json``; at the end writes ``results.json`` and
    ``agent.pt``. The whole module is saved (not just its state dict), which is
    what the evaluation cell loads.

    Args:
        n_episodes: Number of episodes to train for.
        n_updates_per_episode: Gradient steps taken on each collected episode.
        value_factor: Weight of the value loss in the total loss.
        entropy_factor: Weight of the entropy loss in the total loss.
        learning_rate: Adam learning rate.
        checkpoint_every: Episode interval between checkpoints.
    """
    agent_optimizer = optim.Adam(agent.parameters(), lr=learning_rate)
    scores = []

    while len(scores) < n_episodes:
        # Data collection needs no gradients.
        with torch.no_grad():
            agent.eval()
            states, actions, rewards, log_actions, _, episode_score = rollout()

        scores.append(episode_score)
        print(f"Current score: {scores[-1]}")
        print(f"episode: {len(scores)}")

        # Advantages are computed once and reused for every update on this batch.
        advantages = compute_advantages(states, rewards)

        agent.train()
        for _ in range(n_updates_per_episode):
            agent_optimizer.zero_grad()
            policy_loss, value_loss, entropy_loss = compute_losses(states, actions, rewards, log_actions, advantages)

            loss = policy_loss + value_factor*value_loss + entropy_factor*entropy_loss
            print(f'loss: {loss}')
            loss.backward()

            agent_optimizer.step()

        if len(scores) % checkpoint_every == 0:
            checkpoint_id = len(scores) // checkpoint_every
            torch.save(agent, f'agent_checkpoint_{checkpoint_id}.pt')
            with open(f'results_checkpoint_{checkpoint_id}.json', 'w') as f:
                json.dump({"scores": scores}, f)

    with open('results.json', 'w') as f:
        json.dump({"scores": scores}, f)

    torch.save(agent, 'agent.pt')



In [9]:
# Run the full training loop; progress is printed per episode and per update.
train()

Track generation: 1075..1348 -> 273-tiles track
Current score: 491.74117647058426
episode: 1
loss: 7.5612382888793945
loss: 7.427149772644043
loss: 7.298557281494141
loss: 7.172558784484863
loss: 7.048154830932617
Track generation: 1127..1413 -> 286-tiles track
Current score: 847.8333333333165
episode: 2
loss: 5.688533306121826
loss: 5.694731712341309
loss: 5.680471420288086
loss: 5.650676250457764
loss: 5.608922958374023
Track generation: 1123..1408 -> 285-tiles track
Current score: 902.6732394366035
episode: 3
loss: 5.490977764129639
loss: 5.471651077270508
loss: 5.428076267242432
loss: 5.368774890899658
loss: 5.2995171546936035
Track generation: 1126..1411 -> 285-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1044..1309 -> 265-tiles track
Current score: 900.8969696969565
episode: 4
loss: 6.399260520935059
loss: 6.2752251625061035
loss: 6.136133670806885
loss: 5.986415863037109
loss: 5.830905437469482
Track generation: 10

In [7]:
# Evaluation

# torch.load unpickles the whole saved module, so only load checkpoints produced
# by this notebook. map_location lets a checkpoint saved on a GPU be restored on
# whichever device this session selected (including CPU-only machines).
# NOTE(review): recent PyTorch releases may need weights_only=False to unpickle
# a full module — confirm against the installed version.
agent = torch.load('agent.pt', map_location=device)
agent.eval()

with torch.no_grad():
    scores = []
    for i in range(100):
        # Only the last element of the rollout tuple (the raw episode score) is
        # needed; test=True disables reward shaping and early stopping.
        episode_score = rollout(test=True)[-1]
        scores.append(episode_score)
    print(f'Mean scores: {np.mean(scores)}')
    print(f'Median scores: {np.median(scores)}')

Track generation: 1167..1471 -> 304-tiles track
Track generation: 1186..1487 -> 301-tiles track
Track generation: 1172..1469 -> 297-tiles track
Track generation: 1198..1502 -> 304-tiles track
Track generation: 1063..1338 -> 275-tiles track
Track generation: 1183..1483 -> 300-tiles track
Track generation: 979..1228 -> 249-tiles track
Track generation: 1080..1354 -> 274-tiles track
Track generation: 1111..1393 -> 282-tiles track
Track generation: 1129..1411 -> 282-tiles track
Track generation: 1151..1443 -> 292-tiles track
Track generation: 1124..1409 -> 285-tiles track
Track generation: 1167..1463 -> 296-tiles track
Track generation: 1013..1270 -> 257-tiles track
Track generation: 984..1240 -> 256-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1102..1381 -> 279-tiles track
Track generation: 1223..1533 -> 310-tiles track
Track generation: 1220..1529 -> 309-tiles track
Track generation: 1097..1375 -> 278-tiles track
Track gene