In [1]:
# optional Google drive integration - this will allow you to save and resume training, and may speed up redownloading the dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**DQN**

In [48]:
# this is a Deep Q Learning (DQN) agent including replay memory and a target network 
# you can write a brief 8-10 line abstract detailing your submission and experiments here
# the code is based on https://github.com/seungeunrho/minimalRL/blob/master/dqn.py, which is released under the MIT licesne
# make sure you reference any code you have studied as above, with one comment line per reference

# imports
import gym
import collections
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# hyperparameters
learning_rate = 0.0005
gamma = 0.98
buffer_limit = 50000
batch_size = 32
video_every = 50
print_every = 5

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        # input expands to a flat vector
        self.fc1 = nn.Linear(np.array(env.observation_space.shape).prod(), 256)
        self.fc2 = nn.Linear(256, 84)
        self.fc3 = nn.Linear(84, env.action_space.n)

    def forward(self, x):
        x = x.view(x.size(0),-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,17)
        else : 
            return out.argmax().item()
            
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        q_out = q(s)
        q_a = q_out.gather(1,a)  # gather: select value according to the index
        # print(q_a.shape)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        # print(max_q_prime.shape)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        # print(loss)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


**DQN Train**

In [50]:
# setup the Gravitar ram environment, and record a video every 50 episodes. You can use the non-ram version here if you prefer
env = gym.make('Gravitar-ram-v0')
env = gym.wrappers.Monitor(env, "drive/My Drive/rl/video", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)

# reproducible environment and action spaces, do not change lines 6-11 here (tools > settings > editor > show line numbers)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

q = QNetwork()
q_target = QNetwork()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

score    = 0.0
marking  = []
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

for n_episode in range(int(1e32)):
    epsilon = max(0.01, 0.08 - 0.01*(n_episode/200)) # linear annealing from 8% to 1%
    s = env.reset()
    # print(len(s))
    done = False
    score = 0.0

    while True:
        # print(torch.from_numpy(s).float().shape)
        a = q.sample_action(torch.from_numpy(s).float().unsqueeze(0), epsilon)
        # print(a)
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r/100.0,s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break
        
    if memory.size()>2000:
        train(q, q_target, memory, optimizer)
        # q.update_critic(memory)
        # q.update_policy(memory)

    # do not change lines 44-48 here, they are for marking the submission log
    marking.append(score)
    if n_episode%100 == 0:
        print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        marking = []

    # you can change this part, and print any data you like (so long as it doesn't start with "marking")
    if n_episode%print_every==0 and n_episode!=0:
        q_target.load_state_dict(q.state_dict())
        # hard_update(q.target_policy, q.policy)
        # hard_update(q.target_critic, q.critic)
        print("episode: {}, score: {:.1f}, epsilon: {:.2f}".format(n_episode, score, epsilon))

marking, episode: 0, score: 250.0, mean_score: 250.00, std_score: 0.00
episode: 5, score: 0.0, epsilon: 0.08
episode: 10, score: 100.0, epsilon: 0.08
episode: 15, score: 0.0, epsilon: 0.08
episode: 20, score: 0.0, epsilon: 0.08
episode: 25, score: 250.0, epsilon: 0.08
episode: 30, score: 100.0, epsilon: 0.08
episode: 35, score: 200.0, epsilon: 0.08
episode: 40, score: 0.0, epsilon: 0.08
episode: 45, score: 100.0, epsilon: 0.08
episode: 50, score: 0.0, epsilon: 0.08
episode: 55, score: 0.0, epsilon: 0.08
episode: 60, score: 250.0, epsilon: 0.08
episode: 65, score: 700.0, epsilon: 0.08
episode: 70, score: 350.0, epsilon: 0.08
episode: 75, score: 0.0, epsilon: 0.08
episode: 80, score: 0.0, epsilon: 0.08
episode: 85, score: 250.0, epsilon: 0.08
episode: 90, score: 500.0, epsilon: 0.08
episode: 95, score: 0.0, epsilon: 0.08
marking, episode: 100, score: 0.0, mean_score: 125.50, std_score: 188.35
episode: 100, score: 0.0, epsilon: 0.07
episode: 105, score: 250.0, epsilon: 0.07
episode: 110, 

KeyboardInterrupt: ignored

**SAC**

In [13]:
# this is a Deep Q Learning (DQN) agent including replay memory and a target network 
# you can write a brief 8-10 line abstract detailing your submission and experiments here
# the code is based on https://github.com/seungeunrho/minimalRL/blob/master/dqn.py, which is released under the MIT licesne
# make sure you reference any code you have studied as above, with one comment line per reference

# TODO: SAC baseline ?

# imports
import gym
import collections
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# hyperparameters
learning_rate = 0.0005
gamma = 0.9
buffer_limit = 50000
batch_size = 32
video_every = 50
print_every = 5

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)


def hard_update(target, source):
    """
    Copy network parameters from source to target
    Inputs:
        target (torch.nn.Module): Net to copy parameters to
        source (torch.nn.Module): Net whose parameters to copy
    """
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def disable_gradients(module):
    for p in module.parameters():
        p.requires_grad = False

def enable_gradients(module):
    for p in module.parameters():
        p.requires_grad = True

class SAC(nn.Module):
    def __init__(self):
        super(SAC, self).__init__()
        self.reward_scale = 100.0
        self.policy = nn.Sequential(
            nn.Linear(np.array(env.observation_space.shape).prod(), 256),
            nn.LeakyReLU(inplace=True),
            nn.Linear(256, 84),
            nn.LeakyReLU(inplace=True),
            nn.Linear(84, env.action_space.n)
        )

        self.target_policy = nn.Sequential(
            nn.Linear(np.array(env.observation_space.shape).prod(), 256),
            nn.LeakyReLU(inplace=True),
            nn.Linear(256, 84),
            nn.LeakyReLU(inplace=True),
            nn.Linear(84, env.action_space.n)
        )

        hard_update(self.target_policy, self.policy)
        input_size = int(np.array(env.observation_space.shape).prod()+np.array(env.action_space.shape).prod())
        self.critic = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.LeakyReLU(inplace=True),
            nn.Linear(256, 84),
            nn.LeakyReLU(inplace=True),
            nn.Linear(84, 1)
        )

        self.target_critic = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.LeakyReLU(inplace=True),
            nn.Linear(256, 84),
            nn.LeakyReLU(inplace=True),
            nn.Linear(84, 1)
        )
        hard_update(self.target_critic, self.critic)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)

    def update_critic(self, memory, soft=True):
      for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)
        # print(s.shape, a.shape)

        x = torch.from_numpy(np.concatenate((s,a), axis=1)).float()
        # print(x.shape)
        q_out = self.critic(x)
        # print(q_out.shape)
        # q_a = q_out.gather(1,a)  # gather: select value according to the index
        s_prime = s_prime.view(s_prime.size(0),-1)
        a_prime = self.target_policy(s_prime).argmax(1).unsqueeze(1)
        # print(a_prime.shape)
        out = self.target_policy(s_prime)
        log_pi = (F.log_softmax(out, dim=1)*F.softmax(out, dim=1)).sum(1).unsqueeze(1)
        # print(log_pi.shape)
        x_prime = torch.from_numpy(np.concatenate((s_prime,a_prime), axis=1)).float()
        # print(x_prime.shape)
        x_prime = x_prime.view(x_prime.size(0),-1)

        max_q_prime = self.target_critic(x_prime)
        target = r + gamma * max_q_prime * done_mask
        if soft:
          target -= log_pi / self.reward_scale

        loss_fn = nn.MSELoss()
        loss = 0.5*loss_fn(q_out, target)
        # print(loss)
        
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

    def update_policy(self, memory):
      for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)
        x = torch.from_numpy(np.concatenate((s,a), axis=1)).float()

        q_out = self.critic(x)
        # s_prime = s_prime.view(s_prime.size(0),-1)
        # a_prime = self.target_policy(s_prime).argmax(1).unsqueeze(1)
        # # print(a_prime.shape)
        # # print(log_pi)
        # x_prime = torch.from_numpy(np.concatenate((s_prime,a_prime), axis=1)).float()
        # # print(x_prime.shape)
        # x_prime = x_prime.view(x_prime.size(0),-1)

        # max_q_prime = self.target_critic(x_prime)
        # pol_target = r + gamma * max_q_prime * done_mask
        # v?
        pol_target = q_out
        # print(q_out.shape, q_out[0])
        out = self.policy(s)
        # print(out.shape)
        # xx = torch.zeros(a.shape).long()
        log_pi = (F.log_softmax(out, dim=1)*F.softmax(out, dim=1)).sum(1).unsqueeze(1)
        # print(log_pi.shape)
        # pol_loss = (log_pi * (log_pi / self.reward_scale - pol_target)).mean()
        pol_loss = (log_pi / self.reward_scale - pol_target).mean()
        # print(pol_loss)
        self.policy_optimizer.zero_grad()
        disable_gradients(self.critic)
        pol_loss.backward()
        enable_gradients(self.critic)
        self.policy_optimizer.step()
      
    def sample_action(self, obs, epsilon):
        obs = obs.view(obs.size(0),-1)
        # print(obs.shape)
        out = self.policy(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,17)
        else : 
            return out.argmax().item()
        # va = []
        # for i in range(18):
        #   a = torch.tensor(i).unsqueeze(0).unsqueeze(0)
        #   # print(a.shape)
        #   x = torch.from_numpy(np.concatenate((obs,a), axis=1)).float()
        #   q_out = self.critic(x)
        #   va.append(q_out.item())
        # # print("sdsadasdas")
        # print(np.argmax(np.array(va)))
        # return out.argmax().item()
            
# def train(q, q_target, memory, optimizer):
#     for i in range(10):
#         s,a,r,s_prime,done_mask = memory.sample(batch_size)

#         q_out = q(s)
#         q_a = q_out.gather(1,a)  # gather: select value according to the index
#         max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
#         target = r + gamma * max_q_prime * done_mask
#         loss = F.smooth_l1_loss(q_a, target)
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()


**SAC Train**

← You can download the videos from the videos folder in the files on the left

In [17]:
# setup the Gravitar ram environment, and record a video every 50 episodes. You can use the non-ram version here if you prefer、
import logging
env = gym.make('Gravitar-ram-v0')
env = gym.wrappers.Monitor(env, "drive/My Drive/rl/video1", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)

# reproducible environment and action spaces, do not change lines 6-11 here (tools > settings > editor > show line numbers)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

q = SAC()
# q_target = QNetwork()
# q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

score    = 0.0
marking  = []
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

for n_episode in range(int(1e32)):
    epsilon = 0.06
    s = env.reset()
    # print(len(s))
    done = False
    score = 0.0

    while True:
        # print(torch.from_numpy(s).float().shape)
        a = q.sample_action(torch.from_numpy(s).float().unsqueeze(0), epsilon)
        # print(a)
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r/100.0,s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break
        
    if memory.size()>2000:
        # train(q, q_target, memory, optimizer)
        q.update_critic(memory)
        q.update_policy(memory)

    # do not change lines 44-48 here, they are for marking the submission log
    marking.append(score)
    if n_episode%100 == 0:
        print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        logging.basicConfig(level=logging.INFO, 
                  filename='gravitar-log.txt',
                  filemode='a',  
                  format='%(message)s')
                # print(len(batch['utilities']))
        logging.info("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        marking = []

    # you can change this part, and print any data you like (so long as it doesn't start with "marking")
    if n_episode%print_every==0 and n_episode!=0:
        # q_target.load_state_dict(q.state_dict())
        hard_update(q.target_policy, q.policy)
        hard_update(q.target_critic, q.critic)
        print("episode: {}, score: {:.1f}, epsilon: {:.2f}".format(n_episode, score, epsilon))

marking, episode: 0, score: 100.0, mean_score: 100.00, std_score: 0.00
episode: 5, score: 0.0, epsilon: 0.06
episode: 10, score: 250.0, epsilon: 0.06
episode: 15, score: 0.0, epsilon: 0.06
episode: 20, score: 100.0, epsilon: 0.06
episode: 25, score: 0.0, epsilon: 0.06
episode: 30, score: 100.0, epsilon: 0.06
episode: 35, score: 0.0, epsilon: 0.06
episode: 40, score: 0.0, epsilon: 0.06
episode: 45, score: 350.0, epsilon: 0.06
episode: 50, score: 100.0, epsilon: 0.06
episode: 55, score: 200.0, epsilon: 0.06
episode: 60, score: 0.0, epsilon: 0.06
episode: 65, score: 100.0, epsilon: 0.06
episode: 70, score: 350.0, epsilon: 0.06
episode: 75, score: 100.0, epsilon: 0.06
episode: 80, score: 350.0, epsilon: 0.06
episode: 85, score: 250.0, epsilon: 0.06
episode: 90, score: 450.0, epsilon: 0.06
episode: 95, score: 600.0, epsilon: 0.06
marking, episode: 100, score: 250.0, mean_score: 184.00, std_score: 216.09
episode: 100, score: 250.0, epsilon: 0.06
episode: 105, score: 250.0, epsilon: 0.06
epis

KeyboardInterrupt: ignored