In [3]:
# #only run once
# #!pip install nes-py==0.2.6
# !brew update
# !brew install ffmpeg
# !brew install libsm
# !brew install libxext
# !brew install mesa
# !pip install opencv-python
# !pip install gym-super-mario-bros
# !pip install gym

In [1]:
import torch
import torch.nn as nn
import random
import gym
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros import SuperMarioBrosEnv
from tqdm import tqdm
import pickle 
import gym
import numpy as np
import collections 
import cv2
import matplotlib.pyplot as plt
import time
import datetime

In [2]:
from toolkit.gym_env import *
from toolkit.marlios_model import *
from toolkit.constants import *

CONSECUTIVE_ACTIONS = 2

%load_ext autoreload
%autoreload 2

In [3]:
def show_state(env, ep=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("Episode: %d %s" % (ep, info))
    plt.axis('off')

    # display.clear_output(wait=True)
    # display.display(plt.gcf())
    display(plt.gcf(), clear=True)

In [4]:
def make_env(env, actions=ACTION_SPACE):
    env = MaxAndSkipEnv(env)
    env = ProcessFrame84(env)
    env = ImageToPyTorch(env)
    env = BufferWrapper(env, 4)
    env = ScaledFloatFrame(env)
    return JoypadSpace(env, actions)

def generate_epoch_time_id():
    epoch_time = int(time.time())
    return str(epoch_time)

In [5]:
def save_checkpoint(agent, total_rewards, terminal_info, run_id):
    with open(f"ending_position-{run_id}.pkl", "wb") as f:
        pickle.dump(agent.ending_position, f)
    with open(f"num_in_queue-{run_id}.pkl", "wb") as f:
        pickle.dump(agent.num_in_queue, f)
    with open(f"total_rewards-{run_id}.pkl", "wb") as f:
        pickle.dump(total_rewards, f)
    with open(f"terminal_info-{run_id}.pkl", "wb") as f:
        pickle.dump(terminal_info, f)
    if agent.double_dq:
        torch.save(agent.local_net.state_dict(), f"dq1-{run_id}.pt")
        torch.save(agent.target_net.state_dict(), f"dq2-{run_id}.pt")
    else:
        torch.save(agent.dqn.state_dict(), f"dq-{run_id}.pt")  
    # torch.save(agent.STATE_MEM,  f"STATE_MEM-{run_id}.pt")
    # torch.save(agent.ACTION_MEM, f"ACTION_MEM-{run_id}.pt")
    # torch.save(agent.REWARD_MEM, f"REWARD_MEM-{run_id}.pt")
    # torch.save(agent.STATE2_MEM, f"STATE2_MEM-{run_id}.pt")
    # torch.save(agent.DONE_MEM,   f"DONE_MEM-{run_id}.pt")
    # torch.save(agent.SPACE_MEM,   f"SPACE_MEM-{run_id}.pt")

In [11]:
# Testing the act
env = gym.make('SuperMarioBros-1-1-v0')
env = make_env(env, ACTION_SPACE) # we always need to declare this with ACTION_SPACE
agent = DQNAgent(state_space=env.observation_space.shape,
                     action_space=TWO_ACTIONS_SET,
                     max_memory_size=30000,
                     batch_size=32,
                     gamma=0.90,
                     lr=0.00025,
                     dropout=0.,
                     exploration_max=0.02,
                     exploration_min=0.02,
                     exploration_decay=.99,
                     double_dq=True,
                     pretrained=False,
                     run_id=None,
                     n_actions=12)

state = env.reset()
state = torch.Tensor([state])
two_actions_index = agent.act(state)
two_actions_vector = agent.cur_action_space[two_actions_index]
two_actions = toolkit.action_utils.vec_to_action(two_actions_vector.cpu()) # tuple of actions

In [14]:
print(two_actions[0], ACTION_TO_INDEX[two_actions[0]])


('B', 'down') 13


In [6]:
def run(training_mode=True, pretrained=False, lr=0.0001, gamma=0.90, exploration_decay=0.99, exploration_min=0.02,
        mario_env='SuperMarioBros-1-1-v0', action_space=TWO_ACTIONS_SET, num_episodes=1000, run_id=None, n_actions=5, consecutiveActions = 2):
   
    run_id = run_id or generate_epoch_time_id()
    fh = open(f'progress-{run_id}.txt', 'a')
    env = gym.make(mario_env)
    #env = gym_super_mario_bros.make('SuperMarioBros-v0')
    
    #env = make_env(env)  # Wraps the environment so that frames are grayscale 
    #env = SuperMarioBrosEnv()
    env = make_env(env, ACTION_SPACE)
    observation_space = env.observation_space.shape
    
    # Change this to be either the training / validation / test set
    action_space = TWO_ACTIONS_SET

    #todo: add agent params as a setting/create different agents in diff functions to run 

    agent = DQNAgent(state_space=observation_space,
                     action_space=action_space,
                     max_memory_size=30000,
                     batch_size=32,
                     gamma=gamma,
                     lr=lr,
                     dropout=0.,
                     exploration_max=1,
                     exploration_min=0.02,
                     exploration_decay=exploration_decay,
                     double_dq=True,
                     pretrained=pretrained,
                     run_id=run_id,
                     n_actions=n_actions)
    
    
    # num_episodes = 10
    env.reset()
    total_rewards = []
    total_info = []
    
    for ep_num in tqdm(range(num_episodes)):
        state = env.reset()
        state = torch.Tensor([state])
        total_reward = 0
        steps = 0
        while True:
            if not training_mode:
                show_state(env, ep_num)


            two_actions_index = agent.act(state)
            two_actions_vector = agent.cur_action_space[two_actions_index]

            two_actions = toolkit.action_utils.vec_to_action(two_actions_vector.cpu()) # tuple of actions

            steps += 1
            reward = 0
            info = None
            terminal = False
            for action in two_actions: 
                if not terminal:
                    # compute index into ACTION_SPACE of our action
                    step_action = ACTION_TO_INDEX[action]

                    state_next, cur_reward, terminal, info = env.step(step_action)
                    total_reward += cur_reward
                    reward += cur_reward
                    state_next = torch.Tensor([state_next])

            reward = torch.tensor([reward]).unsqueeze(0)        
            terminal = torch.tensor([int(terminal)]).unsqueeze(0)
            
            if training_mode:
                agent.remember(state, two_actions_index, reward, state_next, terminal)
                agent.experience_replay()
            
            state = state_next
            if terminal:
                break

        total_info.append(info)
        total_rewards.append(total_reward)

        if training_mode and (ep_num % 300) == 0:
            save_checkpoint(agent, total_rewards, total_info, run_id)

        with open(f'total_reward-{run_id}.txt', 'a') as f:
            f.write("Total reward after episode {} is {}\n".format(ep_num + 1, total_rewards[-1]))
            if (ep_num%100 == 0):
                f.write("==================\n")
                f.write("{} current time at episode {}\n".format(datetime.datetime.now(), ep_num+1))
                f.write("==================\n")
            #print("Total reward after episode {} is {}".format(ep_num + 1, total_rewards[-1]))
            num_episodes += 1
    
    if training_mode:
        save_checkpoint(agent, total_rewards, run_id)
    
    env.close()
    fh.close()
    
    if num_episodes > 500:
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        plt.plot([0 for _ in range(500)] + 
                 np.convolve(total_rewards, np.ones((500,))/500, mode="valid").tolist())
        plt.show()



# test it here
run(action_space=SUFFICIENT_ACTIONS, n_actions=10)

  logger.warn(
  deprecation(
  deprecation(
  state = torch.Tensor([state])
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  5%|▍         | 48/1000 [30:07<35:29:38, 134.22s/it]