# Notebook cell In [2]:
"""
The original code is from https://github.com/dennybritz/reinforcement-learning/tree/master/TD
"""

import sys
import numpy as np
import itertools
import pickle
from collections import defaultdict
from game import Game


# The agent has three actions: stay, go-left and go-right.
def get_action_num():
    """Return the number of discrete actions available to the agent."""
    action_count = 3
    return action_count


## Build the policy function that picks action probabilities from Q values.
def make_policy(Q, epsilon, nA):
    """
    Create an epsilon-greedy policy over the action-value table ``Q``.

    Every action receives a base probability of ``epsilon / nA``; the action
    with the highest Q value for the observed state additionally receives the
    remaining ``1 - epsilon``.
    (Check dennybritz's repository for detail)

    You may change the policy function for the given task.

    Args:
        Q: mapping from state to an array of action values.
        epsilon: probability mass spread uniformly over all actions.
        nA: number of actions.

    Returns:
        A function taking an observation (state key) and returning an array
        of ``nA`` action probabilities.
    """
    def policy_fn(observation):
        # Start from the uniform exploration share for every action.
        probabilities = np.ones(nA, dtype=float)
        probabilities *= epsilon
        probabilities /= nA
        # Give the greedy action the rest of the probability mass.
        greedy_action = np.argmax(Q[observation])
        probabilities[greedy_action] += 1.0 - epsilon
        return probabilities

    return policy_fn

	
## Build the learning state from the information the game hands over.
def get_state(counter, score, game_info):
    """
    Reduce the raw game information to a compact, hashable state.

    The signature cannot change because game.py calls this function with the
    same parameters; ``counter`` and ``score`` are accepted but not used.

    Args:
        counter: unused.
        score: unused.
        game_info: pair ``(basket_location, item_location)`` where
            ``item_location`` is a non-empty sequence of items indexable as
            ``item[0]``, ``item[1]``, ``item[2]``.
            NOTE(review): presumably (type, horizontal position, height) -
            confirm against game.py.

    Returns:
        ``(basket_location, tracked_item, clock)``. ``tracked_item`` is the
        obtainable item with the smallest ``9 - item[2]`` (ties go to the
        later item), or the first listed item when nothing is obtainable.
        ``clock`` is the last obtainable item of type 2 that was not chosen
        as the tracked item, or ``(3, 0, 0)`` when there is none.

    Raises:
        IndexError: if ``item_location`` is empty.
    """
    basket_location, item_location = game_info

    no_clock = (3, 0, 0)  # placeholder used when the state holds no clock
    clock = no_clock
    tracked_item = tuple(item_location[0])
    smallest_drop = 9

    for entry in item_location:
        remaining_drop = 9 - entry[2]
        horizontal_gap = abs(3 * entry[1] + 2 - 4 * basket_location)

        # Skip the items that we cannot obtain.
        obtainable = horizontal_gap <= 8 * remaining_drop
        if not obtainable:
            continue

        if remaining_drop <= smallest_drop:
            smallest_drop = remaining_drop
            tracked_item = tuple(entry)
        elif entry[0] == 2:
            clock = tuple(entry)

    return (basket_location, tracked_item, clock)


## Compute the shaped reward for taking an action in the given state.
def get_reward(current_state, action):
    """
    Score ``action`` by whether it moves toward the item being chased.

    The chased target is the tracked item (magnitude 2) when the state's
    clock slot starts with 3 (no clock), or the clock itself (magnitude 3)
    when it starts with 2. Any other clock marker yields 0.

    Args:
        current_state: ``(basket_location, item, clock)`` as produced by
            ``get_state``.
        action: action index. Only 1 and 2 are ever rewarded or penalised.
            NOTE(review): presumably 1 = go-left and 2 = go-right - confirm
            against game.py.

    Returns:
        ``+magnitude`` when the target lies strictly beyond the basket edge
        on the side the action moves to, ``-magnitude`` when it lies strictly
        on the other side of that edge, and 0 otherwise.
    """
    basket_location, item, clock = current_state

    if clock[0] == 3:
        # No clock in the state: chase the closest item.
        target, magnitude = item, 2
    elif clock[0] == 2:
        # A clock is present: the aim is the clock, not the closest item.
        target, magnitude = clock, 3
    else:
        return 0

    if action == 1:
        anchor = 3 * target[1] + 2
        left_edge = 4 * basket_location
        if anchor < left_edge:
            return magnitude
        if anchor > left_edge:
            return -magnitude
        return 0

    if action == 2:
        anchor = 3 * target[1] + 2
        right_edge = 4 * basket_location + 8
        if anchor > right_edge:
            return magnitude
        if anchor < right_edge:
            return -magnitude
        return 0

    return 0


def save_q(Q, num_episode, params, filename="model_q.pkl"):
    """
    Pickle the Q-table together with its training metadata.

    Args:
        Q: mapping from state to action values; stored as a plain ``dict``.
        num_episode: number of episodes the table was trained for.
        params: training parameters to store alongside the table.
        filename: path of the pickle file to (over)write.
    """
    snapshot = {
        "num_episode": num_episode,
        "params": params,
        "q_table": dict(Q),
    }
    with open(filename, "wb") as out_file:
        serialized = pickle.dumps(snapshot)
        out_file.write(serialized)

        
def load_q(filename="model_q.pkl"):
    """
    Load a Q-table and its metadata from a pickle file.

    The file must contain a dict with the keys ``"q_table"``,
    ``"num_episode"`` and ``"params"``.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.

    Args:
        filename: path of the pickle file to read.

    Returns:
        ``(Q, num_episode, params)`` where ``Q`` is a ``defaultdict`` that
        yields ``np.zeros(3)`` for states missing from the stored table.
    """
    with open(filename, "rb") as model_file:
        payload = pickle.loads(model_file.read())

    q_table = defaultdict(lambda: np.zeros(3), payload["q_table"])
    return q_table, payload["num_episode"], payload["params"]


def q_learning(game, num_episodes, params):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy.
    You can edit those parameters, please specify your changes in the report.

    Args:
        game: game environment. ``game.reset()`` must return a 4-tuple whose
            last three entries are ``(counter, score, game_info)``, and
            ``game.step(action)`` must return
            ``(done, counter, score, game_info)``.
        num_episodes: Number of episodes to run for.
        params: sequence ``(epsilon, alpha, discount_factor)``:
            epsilon: Chance to sample a random action. Float between 0 and 1.
            alpha: TD learning rate.
            discount_factor: Gamma discount factor.

    Returns:
        Q: the learned action-value function, a defaultdict mapping
        state -> array of action values.
    """
    epsilon, alpha, discount_factor = params
    n_actions = get_action_num()

    # The final action-value function: state -> array of per-action values.
    # Unseen states start with all-zero values.
    Q = defaultdict(lambda: np.zeros(n_actions))

    # The policy we're following. It reads Q lazily, so it always reflects
    # the latest updates.
    policy = make_policy(Q, epsilon, n_actions)
    action_ids = np.arange(n_actions)

    for i_episode in range(num_episodes):
        # Reset the environment and derive the starting state.
        _, counter, score, game_info = game.reset()
        state = get_state(counter, score, game_info)

        while True:
            # Sample an action from the epsilon-greedy distribution and step.
            action_probs = policy(state)
            action = np.random.choice(action_ids, p=action_probs)
            done, counter, score, game_info = game.step(action)

            # The reward is shaped from the state the action was taken in.
            reward = get_reward(state, action)
            # Build the successor state from the post-step counter and score.
            next_state = get_state(counter, score, game_info)

            # TD update (you may change this part for the given task):
            # bootstrap from the greedy value of the successor state.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("Episode {}/{} (Score: {})\n".format(i_episode + 1, num_episodes, score), end="")
            sys.stdout.flush()

    return Q

def train(num_episodes, params):
    """
    Run Q-learning on a freshly created ``Game(False)`` and return the Q-table.

    Args:
        num_episodes: number of training episodes.
        params: ``(epsilon, alpha, discount_factor)`` forwarded to
            ``q_learning``.
    """
    return q_learning(Game(False), num_episodes, params)


## This function will be called in the game.py
def get_action(Q, counter, score, game_info, params):
    """
    Sample an action for the current game situation from a trained Q-table.

    Args:
        Q: mapping from state to an array of action values.
        counter: forwarded to ``get_state``.
        score: forwarded to ``get_state``.
        game_info: forwarded to ``get_state``.
        params: sequence whose first element is epsilon, the probability
            mass spread uniformly over all actions.

    Returns:
        The sampled action index.
    """
    epsilon = params[0]
    # Use the shared action count so play-time and training policies agree.
    policy = make_policy(Q, epsilon, get_action_num())
    action_probs = policy(get_state(counter, score, game_info))
    return np.random.choice(np.arange(len(action_probs)), p=action_probs)

def main(argv=None):
    """
    Command-line entry point: train a Q-table and save it to disk.

    Args:
        argv: optional list of command-line arguments. When ``None``
            (the default) argparse reads ``sys.argv[1:]``. Passing a list
            allows calling this from a notebook or another script.

    Raises:
        SystemExit: raised by argparse when the required ``-n`` option is
            missing or an argument is invalid.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--num_episode", help="# of the episode (size of training data)",
                        type=int, required=True)
    parser.add_argument("-e", "--epsilon", help="the probability of random movement, 0~1",
                        type=float, default=0.1)
    parser.add_argument("-lr", "--learning_rate", help="learning rate of training",
                        type=float, default=0.1)

    # -n is required, so argparse prints usage and exits if it is missing.
    args = parser.parse_args(argv)

    # You can pass your parameters as a list or dictionary.
    # Fix the corresponding parts if you want to change the parameters.
    num_episodes = args.num_episode
    discount_factor = 0.9
    params = [args.epsilon, args.learning_rate, discount_factor]

    Q = train(num_episodes, params)
    save_q(Q, num_episodes, params)

    # To reload a saved model instead: Q, n, params = load_q()

# Run training only when this file is executed as a script, not when it is
# imported (e.g. by game.py, which calls get_action).
if __name__ == "__main__":
    main()

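# Captured notebook output: the cell was run without the required -n argument,
# so argparse printed the usage message and exited with status 2.
#
# usage: ipykernel_launcher.py [-h] -n NUM_EPISODE [-e EPSILON]
#                              [-lr LEARNING_RATE]
# ipykernel_launcher.py: error: the following arguments are required: -n/--num_episode
#
#
# SystemExit: 2
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)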
