In [1]:
from reinforcement_learning import EpsilonGreedy
from reinforcement_learning import NeuralNetwork
from reinforcement_learning import ReplayMemory
from reinforcement_learning import LinearControlSignal

import numpy as np
import random

In [2]:
class Environment:
    """Toy environment that tracks a count of pending user queries.

    The observable state is the number of outstanding queries, exposed as
    a NumPy array of shape (1, 1). Two actions are available (see
    ``get_action_meanings``): action 0 earns the positive reward when more
    than ``QUERY_THRESHOLD`` queries are pending, action 1 earns it
    otherwise.
    """

    # Query count above which action 0 is the rewarded choice.
    QUERY_THRESHOLD = 5

    # Inclusive upper bound on the number of queries arriving per step.
    MAX_NEW_QUERIES = 5

    # Magnitude of the reward; the sign depends on the action/state pairing.
    REWARD = 0.8

    def __init__(self):
        """Start with no pending queries."""
        self.n_queries = 0

    def getState(self):
        """Return the current query count as an array of shape (1, 1)."""
        return np.array([[self.n_queries]])

    @staticmethod
    def _checkAction(action):
        """Raise ValueError unless ``action`` is one of the two known actions."""
        if action != 0 and action != 1:
            raise ValueError(
                "Unknown action {!r}; expected 0 or 1.".format(action))

    def calculateReward(self, action):
        """Score ``action`` against the current number of pending queries.

        Args:
            action: 0 or 1, indexing into ``get_action_meanings()``.

        Returns:
            ``+REWARD`` when action 0 is taken with more than
            ``QUERY_THRESHOLD`` queries pending, or action 1 is taken with
            at most that many; ``-REWARD`` in the two remaining cases.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1. Previously this
                case fell through and silently returned ``None``.
        """
        self._checkAction(action)

        many_pending = self.n_queries > self.QUERY_THRESHOLD
        chose_first_action = bool(action == 0)

        if chose_first_action == many_pending:
            return self.REWARD
        return -self.REWARD

    def step(self, action):
        """Advance the environment one tick and score ``action``.

        Args:
            action: 0 or 1, indexing into ``get_action_meanings()``.

        Returns:
            A list ``[state, reward, end_episode, info]`` where ``state`` is
            the new (1, 1) query-count array, ``reward`` is the value from
            ``calculateReward``, ``end_episode`` is always 0 and ``info`` is
            an empty string.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1; the query count is
                left untouched in that case.
        """
        # Validate before mutating so a bad action cannot corrupt the state.
        self._checkAction(action)

        # More queries show up.
        self.n_queries += random.randint(0, self.MAX_NEW_QUERIES)

        # User answers some (possibly all, possibly none) of them.
        self.n_queries -= random.randint(0, self.n_queries)

        state = self.getState()

        # NOTE(review): the reward is computed from the query count *after*
        # the random transition, not from the state that was visible when
        # the action was chosen -- confirm this is intended.
        reward = self.calculateReward(action)

        # Placeholder: episodes never terminate.
        end_episode = 0

        info = ''

        return [state, reward, end_episode, info]

    def get_action_meanings(self):
        """Return the human-readable names of actions 0 and 1, in order."""
        return ['Fully Autonomous', 'Not Autonomous']

In [3]:
# Create the environment. This is the hand-written Environment class from
# this notebook, not an OpenAI Gym environment.
env = Environment()

# List of string-names for the actions in the environment.
action_names = env.get_action_meanings()

# The number of possible actions that the agent may take in every step.
# Derived from the environment so the two cannot drift apart.
num_actions = len(action_names)

# Whether we are training (True) or testing (False).
training = True

# Whether to use logging during training.
use_logging = False

if use_logging and training:
    # Imported here because the notebook's import cell does not bring these
    # names into scope; without this the branch raises NameError.
    # Assumes the local reinforcement_learning module exposes both classes
    # -- TODO confirm.
    from reinforcement_learning import LogQValues, LogReward

    # Used for logging Q-values and rewards during training.
    log_q_values = LogQValues()
    log_reward = LogReward()
else:
    log_q_values = None
    log_reward = None

# Epsilon-greedy policy for selecting an action from the Q-values.
# During training the epsilon is decreased linearly over the given
# number of iterations. During testing the fixed epsilon is used.
epsilon_greedy = EpsilonGreedy(start_value=1.0,
                               end_value=0.1,
                               num_iterations=1e6,
                               num_actions=num_actions,
                               epsilon_testing=0.01)

if training:
    # The following control-signals are only used during training.

    # The learning-rate for the optimizer decreases linearly.
    learning_rate_control = LinearControlSignal(start_value=1e-3,
                                                end_value=1e-5,
                                                num_iterations=5e6)

    # The loss-limit is used to abort the optimization whenever the
    # mean batch-loss falls below this limit.
    loss_limit_control = LinearControlSignal(start_value=0.1,
                                             end_value=0.015,
                                             num_iterations=5e6)

    # The maximum number of epochs to perform during optimization,
    # increased linearly from 5 to 10 over the course of training.
    # NOTE(review): these values were tuned for an Atari Breakout agent
    # and have not been re-tuned for this environment -- confirm.
    max_epochs_control = LinearControlSignal(start_value=5.0,
                                             end_value=10.0,
                                             num_iterations=5e6)

    # The fraction of the replay-memory to be used.
    # Early in the training, we want to optimize more frequently
    # so the Neural Network is trained faster and the Q-values
    # are learned and updated more often. Later in the training,
    # we need more samples in the replay-memory to have sufficient
    # diversity, otherwise the Neural Network will over-fit.
    replay_fraction = LinearControlSignal(start_value=0.1,
                                          end_value=1.0,
                                          num_iterations=5e6)

    # The replay-memory is only created when training the agent.
    # NOTE(review): the size of 200000 was chosen for image-frame states;
    # here each state is a single number (state_shape = [1] below), so the
    # memory footprint is presumably far smaller -- confirm against
    # ReplayMemory.
    replay_memory = ReplayMemory(size=200000,
                                 num_actions=num_actions)
else:
    # We set these objects to None when they will not be used.
    learning_rate_control = None
    loss_limit_control = None
    max_epochs_control = None
    replay_fraction = None
    replay_memory = None

# Create the Neural Network used for estimating Q-values.
model = NeuralNetwork(num_actions=num_actions,
                      replay_memory=replay_memory,
                      checkpoint_dir='checkpoints',
                      use_pretty_tensor=False,
                      state_shape=[1])

# Log of the rewards obtained in each episode during calls to run()
episode_rewards = []

Trying to restore last checkpoint ...
('Failed to restore checkpoint from:', None)
Initializing variables instead.


In [4]:
# Ask the network for the Q-values (one per action) of the environment's
# current state, before any step has been taken.
model.get_q_values(env.getState())

array([[ 0.,  0.]], dtype=float32)

In [5]:
# Show the current state: the pending-query count as a (1, 1) array.
env.getState()

array([[0]])

In [6]:
# Advance the environment one step with action 0 ('Fully Autonomous') and
# unpack the resulting [state, reward, end_episode, info] list.
state, reward, end, info = env.step(0)

In [7]:
# Q-values the network assigns to the state returned by the step above.
model.get_q_values(state)

array([[ 0.00053891,  0.00037644]], dtype=float32)

In [8]:
# Show the state returned by env.step; it varies between runs because the
# environment draws from the unseeded `random` module.
state

array([[2]])