In [None]:
#%pip install https://github.com/chovanecm/python-genetic-algorithm/archive/master.zip#egg=mchgenalg
#%pip install torch

In [1]:
import torch
import gym 
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
import numpy as np
import matplotlib.pyplot as plt
from skimage.transform import resize
from tqdm import tqdm
torch.cuda.is_available()

True

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
env = gym_super_mario_bros.make('SuperMarioBros-v3')
env = JoypadSpace(env, COMPLEX_MOVEMENT)

Model parameters, I use a Genetic Algorithm to find the best parameters for the model.

In [4]:
def preprocess_frame(frame, new_size=(42,42), to_gray=True):
    if to_gray:
        return resize(frame, new_size, anti_aliasing=True).max(axis=2)
    else:
        return resize(frame, new_size, anti_aliasing=True)

In [5]:
from collections import deque
from torch import nn
from torch import optim
import torch.nn.functional as F

def prepare_state(state):
    return torch.from_numpy(preprocess_frame(state, to_gray=True)).float().unsqueeze(0)

def prepare_multi_states(state1, state2):
    state1 = state1.clone()
    temp = torch.from_numpy(preprocess_frame(state2, to_gray=True)).float()
    state1[0][0] = state1[0][1]
    state1[0][1] = state1[0][2]
    state1[0][2] = temp
    return state1

def prepare_initial_state(state, N=4):
    state_ = torch.from_numpy(preprocess_frame(state, to_gray=True)).float()
    tmp = state_.repeat((N, 1, 1))
    return tmp.unsqueeze(0)

In [6]:
# Policy definition

def policy(qvalues, eps=None):
    if eps is not None:
        if torch.rand(1) < eps:
            return torch.randint(low=0, high=qvalues.shape[1], size=(1,))
        else:
            return torch.argmax(qvalues)
    else:
        return torch.multinomial(F.softmax(F.normalize(qvalues)), num_samples=1)

In [7]:
# Experience replay memory in order to sample mini batches of experiences for training
from random import shuffle

class ExperienceReplayMemory:
    def __init__(self, N=500, batch_size=100):
        self.N = N
        self.batch_size = batch_size
        self.memory = []
        self.counter = 0

    def add_memory(self, state1, action, reward, state2):
        self.counter += 1
        if self.counter % self.N == 0:
            self.shuffle_memory()
        if(len(self.memory) < self.N):
            self.memory.append((state1, action, reward, state2))
        else:
            rand_idx = np.random.randint(0, self.N - 1)
            self.memory[rand_idx] = (state1, action, reward, state2)

    def shuffle_memory(self):
        shuffle(self.memory)

    def get_batch(self):
        if len(self.memory) < self.batch_size:
            batch_size = len(self.memory)
        else:
            batch_size = self.batch_size
        if len(self.memory) < 1:
            print("Error: Memory is empty")
            return None
        
        ind = np.random.choice(np.arange(len(self.memory)), batch_size, replace=False)
        batch = [self.memory[i] for i in ind]
        state1_batch = torch.stack([x[0].squeeze(0) for x in batch], dim=0)
        state1_batch.to(device)
        action_batch = torch.Tensor([x[1] for x in batch]).long()
        action_batch.to(device)
        reward_batch = torch.Tensor([x[2] for x in batch])
        reward_batch.to(device)
        state2_batch = torch.stack([x[3].squeeze(0) for x in batch], dim=0)
        state2_batch.to(device)
        return state1_batch, action_batch, reward_batch, state2_batch


In [8]:
# Intrinsic curiosity module: 3 diverse nn networks (forward, inverse, encoder)

class Phi(nn.Module): # Encoder
    def __init__(self):
        super(Phi, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=(3,3), stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=(3,3), stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=(3,3), stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, kernel_size=(3,3), stride=2, padding=1)
    
    def forward(self, x):
        x = F.normalize(x)
        y = F.elu(self.conv1(x))
        y = F.elu(self.conv2(y))
        y = F.elu(self.conv3(y))
        y = F.elu(self.conv4(y))
        y = y.flatten(start_dim=1)
        return y
    
class Gnet(nn.Module): # Inverse model
    def __init__(self):
        super(Gnet, self).__init__()
        self.fc1 = nn.Linear(576, 256)
        self.fc2 = nn.Linear(256, env.action_space.n)

    def forward(self, state1, state2):
        x = torch.cat((state1, state2), dim=1)
        y = F.relu(self.fc1(x))
        y = self.fc2(y)
        y = F.softmax(y, dim=1)
        return y
    
class Fnet(nn.Module): # Forward model
    def __init__(self):
        super(Fnet, self).__init__()
        self.fc1 = nn.Linear(300, 256)
        self.fc2 = nn.Linear(256, 288)

    def forward(self, state, action):
        action_ = torch.zeros((action.shape[0], env.action_space.n)).cuda()
        indices = torch.stack((torch.arange(action.shape[0]).cuda(), action.squeeze().cuda()), dim=0).cuda()
        indices = indices.tolist()
        action_[indices] = 1
        x = torch.cat((state, action_), dim=1)
        y = F.relu(self.fc1(x))
        y = self.fc2(y)
        return y

In [9]:
# Q network

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(3,3), stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2, padding=1)
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2, padding=1)
        self.fc1 = nn.Linear(288, 100)
        self.fc2 = nn.Linear(100, env.action_space.n)

    def forward(self, x):
        x = F.normalize(x)
        y = F.elu(self.conv1(x))
        y = F.elu(self.conv2(y))
        y = F.elu(self.conv3(y))
        y = F.elu(self.conv4(y))
        y = y.flatten(start_dim=2)
        y = y.view(y.shape[0], -1, 32)
        y = y.flatten(start_dim=1)
        y = F.elu(self.fc1(y))
        y = self.fc2(y)
        return y

## Hyperparameters:

In [10]:
# hyperparameters
hyperparams = {
    'batch_size': [
        32, 64, 128, 256, 512], # from 32 to 512
    'beta': 0.2,
    'lambda': 0.1,
    'eta': 1.0,
    'gamma': 0.2,
    'max_episode_length': 200,
    'min_progress': 15,
    'action_repeats': 6,
    'frames_per_state': 4,
    'learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01],
    'skip_frames': 4
} 

In [11]:
def loss_fn(q_loss, forward_loss, inverse_loss, beta, lambda_value):
    loss_ = (1 - beta)*inverse_loss
    loss_ += hyperparams['beta']*forward_loss
    loss_ = loss_.sum() / loss_.flatten().shape[0]
    loss_ += lambda_value*q_loss
    return loss_

def reset_env():
    env.reset()
    state1 = prepare_initial_state(env.render(mode='rgb_array'))
    return state1

In [12]:
qnet = Qnet()
qnet.cuda()
encoder = Phi()
encoder.cuda()
forward_model = Fnet()
forward_model.cuda()
inverse_model = Gnet()
inverse_model.cuda()
forward_loss = nn.MSELoss(reduction='none')
forward_loss.cuda()
inverse_loss = nn.CrossEntropyLoss(reduction='none')
inverse_loss.cuda()
qloss = nn.MSELoss()
qloss.cuda()
all_model_params = list(qnet.parameters()) + list(encoder.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())

In [13]:
def ICM(state1, action, state2, forward_scale = 1., inverse_scale = 1e4):
    state1_hat = encoder(state1.cuda())
    state2_hat = encoder(state2.cuda())
    state2_hat_pred = forward_model(state1_hat.detach(), action.detach())
    forward_pred_err = forward_scale * forward_loss(state2_hat_pred, state2_hat.detach()).sum(dim=1).unsqueeze(dim=1)
    pred_action = inverse_model(state1_hat, state2_hat)
    inverse_pred_err = inverse_scale * inverse_loss(pred_action, action.detach().flatten()).unsqueeze(dim=1)
    return forward_pred_err, inverse_pred_err

def minibatch_train(use_explicit=True, gamma = hyperparams['gamma']):
    state1_batch, action_batch, reward_batch, state2_batch = replay.get_batch()
    action_batch = action_batch.view(action_batch.shape[0], 1)
    reward_batch = reward_batch.view(reward_batch.shape[0], 1)
    forward_pred_err, inverse_pred_err = ICM(state1_batch.cuda(), action_batch.cuda(), state2_batch)
    i_reward = (1./hyperparams['eta'])*forward_pred_err 
    reward = i_reward.detach() 
    if use_explicit:
        reward += reward_batch
    qvals = qnet(state2_batch.cuda())
    reward += gamma*torch.max(qvals)
    reward_pred = qnet(state1_batch.cuda()) 
    reward_target = reward_pred.clone()
    indices = torch.stack((torch.arange(action_batch.shape[0]), action_batch.squeeze()), dim=0).cuda()
    indices = indices.tolist()
    reward_target[indices] = reward.squeeze()
    q_loss = 1e5 * qloss(F.normalize(reward_pred), F.normalize(reward_target.detach()))
    return forward_pred_err, inverse_pred_err, q_loss

In [16]:
def train(epochs, lambda_value=hyperparams['lambda'], beta=hyperparams['beta'], gamma=hyperparams['gamma'], eps = 0.15, batch_size=None):
    env.reset()
    state1 = prepare_initial_state(env.render(mode='rgb_array'))
    state1.to(device)
    eps = eps
    losses = []
    episode_length = 0
    switch_to_eps_greedy = 1000
    state_deque = deque(maxlen=hyperparams['frames_per_state'])
    env.reset()
    _, _, _,info_0 = env.step(0)
    env.reset()
    last_x_pos = info_0['x_pos']
    e_reward = 0
    ep_lengths = []
    #use_explicit = False
    for i in tqdm(range(epochs)):
        optimizer.zero_grad()
        episode_length += 1
        q_val_pred = qnet(state1.cuda())
        if i > switch_to_eps_greedy:
            action = int(policy(q_val_pred, eps))
        else:
            action = int(policy(q_val_pred))
        for j in range(hyperparams['action_repeats']):
            for k in range(hyperparams['skip_frames']):
                state2, e_reward_, done, info = env.step(action)
                e_reward += e_reward_
                if done:
                    state1 = reset_env()
                    break 
            state_deque.append(prepare_state(state2))
        state2 = torch.stack(list(state_deque), dim=1)
        replay.add_memory(state1, action, e_reward, state2)
        e_reward = 0
        if episode_length > hyperparams['max_episode_length']:
            if (info['x_pos'] - last_x_pos) < hyperparams['min_progress']:
                done = True
            else:
                last_x_pos = info['x_pos']
        if done:
            ep_lengths.append(episode_length)
            episode_length = 0
            state1 = reset_env()
            last_x_pos = info_0['x_pos']
        else:
            state1 = state2
        if len(replay.memory) < batch_size:
            continue
        forward_pred_err, inverse_pred_err, q_loss = minibatch_train(use_explicit = False, gamma=gamma)
        loss = loss_fn(q_loss, forward_pred_err, inverse_pred_err, lambda_value=lambda_value, beta=beta)
        loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(), inverse_pred_err.flatten().mean())
        losses.append(loss_list)
        loss.backward()
        optimizer.step()
    return ep_lengths, losses


### Hyperparameters tuning:

In [19]:
# Finding best hyperparameters:
for i in range(5):
    for j in range(5):
        replay = ExperienceReplayMemory(N=1500, batch_size=hyperparams['batch_size'][i])
        optimizer = optim.Adam(all_model_params, lr=hyperparams['learning_rate'][j])
        ep_lengths, losses = train(epochs=1000, batch_size=hyperparams['batch_size'][i])
        losses_q = [x[0].detach() for x in losses]
        losses_f = [x[1].detach() for x in losses]
        losses_i = [x[2].detach() for x in losses]
        plt.figure(figsize=(8,6))
        plt.plot(np.log(losses_q), label='Q loss')
        plt.plot(np.log(losses_f), label='Forward loss')
        plt.plot(np.log(losses_i), label='Inverse loss')
        plt.legend()
        plt.savefig('losses_'+str(i)+'_'+str(j)+'.png')
        plt.close()

  0%|          | 0/1000 [00:00<?, ?it/s]

  return torch.multinomial(F.softmax(F.normalize(qvalues)), num_samples=1)
100%|██████████| 1000/1000 [01:32<00:00, 10.77it/s]


AttributeError: 'list' object has no attribute 'cpu'

In [11]:
replay = ExperienceReplayMemory(N=1500, batch_size=hyperparams['batch_size'])
qnet = Qnet()
qnet.cuda()
encoder = Phi()
encoder.cuda()
forward_model = Fnet()
forward_model.cuda()
inverse_model = Gnet()
inverse_model.cuda()
forward_loss = nn.MSELoss(reduction='none')
forward_loss.cuda()
inverse_loss = nn.CrossEntropyLoss(reduction='none')
inverse_loss.cuda()
qloss = nn.MSELoss()
qloss.cuda()
all_model_params = list(qnet.parameters()) + list(encoder.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
optimizer = optim.Adam(all_model_params, lr=hyperparams['learning_rate'])

In [20]:
def train(epochs, lambda_value=hyperparams['lambda'], beta=hyperparams['beta'], gamma=hyperparams['gamma'], eps = 0.15):
    env.reset()
    state1 = prepare_initial_state(env.render(mode='rgb_array'))
    state1.to(device)
    eps = eps
    losses = []
    episode_length = 0
    switch_to_eps_greedy = 1000
    state_deque = deque(maxlen=hyperparams['frames_per_state'])
    env.reset()
    _, _, _,info_0 = env.step(0)
    env.reset()
    last_x_pos = info_0['x_pos']
    e_reward = 0
    ep_lengths = []
    #use_explicit = False
    for i in tqdm(range(epochs)):
        optimizer.zero_grad()
        episode_length += 1
        q_val_pred = qnet(state1.cuda())
        if i > switch_to_eps_greedy:
            action = int(policy(q_val_pred, eps))
        else:
            action = int(policy(q_val_pred))
        for j in range(hyperparams['action_repeats']):
            for k in range(hyperparams['skip_frames']):
                state2, e_reward_, done, info = env.step(action)
                e_reward += e_reward_
                if done:
                    state1 = reset_env()
                    break 
            state_deque.append(prepare_state(state2))
        state2 = torch.stack(list(state_deque), dim=1)
        replay.add_memory(state1, action, e_reward, state2)
        e_reward = 0
        if episode_length > hyperparams['max_episode_length']:
            if (info['x_pos'] - last_x_pos) < hyperparams['min_progress']:
                done = True
            else:
                last_x_pos = info['x_pos']
        if done:
            ep_lengths.append(episode_length)
            episode_length = 0
            state1 = reset_env()
            last_x_pos = info_0['x_pos']
        else:
            state1 = state2
        if len(replay.memory) < hyperparams['batch_size']:
            continue
        forward_pred_err, inverse_pred_err, q_loss = minibatch_train(use_explicit = False, gamma=gamma)
        loss = loss_fn(q_loss, forward_pred_err, inverse_pred_err, lambda_value=lambda_value, beta=beta)
        loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(), inverse_pred_err.flatten().mean())
        losses.append(loss_list)
        loss.backward()
        optimizer.step()
    return ep_lengths, losses


In [21]:
ep_len, losses_plot =train(epochs=50000, lambda_value=hyperparams['lambda'], beta=hyperparams['beta'], gamma=hyperparams['gamma'], eps = 0.15)
losses_q = [x[0].detach().numpy() for x in losses_plot]
losses_f = [x[1].detach().numpy() for x in losses_plot]
losses_i = [x[2].detach().numpy() for x in losses_plot]
plt.figure(figsize=(8,6))
plt.plot(np.log(losses_q), label='Q loss')
plt.plot(np.log(losses_f), label='Forward loss')
plt.plot(np.log(losses_i), label='Inverse loss')
plt.legend()
plt.show()


  0%|          | 0/50000 [00:00<?, ?it/s]

  return torch.multinomial(F.softmax(F.normalize(qvalues)), num_samples=1)
  3%|▎         | 1339/50000 [02:55<1:51:11,  7.29it/s]

In [None]:
eps = 0.1
done = True
state_deque = deque(maxlen=hyperparams['frames_per_state'])
for step in range(5000):
    if done: 
        env.reset()
        state1 = prepare_initial_state(env.render(mode='rgb_array'))
    q_val_pred = qnet(state1)
    action = int(policy(q_val_pred, eps))
    state2, reward, done, info = env.step(action)
    state2 = prepare_multi_states(state1, state2)
    state1 = state2
    env.render()
env.close()

