## General

In [141]:
# General imports
import operator
import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)})

# Global variables
WORLD = -1. * np.ones((18,14))
STARTS = [(17,3), (17,4), (17,5), (17,6)]
GOALS = [(1,13), (2,13), (3,13), (4,13), (5,13)]
BEG_ = [(6,11),(7,9),(8,8),(9,8),(10,8),(11,8),(12,8),(13,7),(14,7),(15,7),(16,7),(17,7)]
END_ = [(0,13),(1,4),(2,0),(3,0),(4,0),(5,0),(6,1),(7,1),(8,1),(9,2),(10,2),(11,3),(12,3),(13,3),(14,3),(15,2),(16,2),(17,2)]

# Populate world
for coup in END_:
    WORLD[coup[0], 0:coup[1]+1] = -10
    
for coup in BEG_:
    WORLD[coup[0], coup[1]:14] = -10

for cell in GOALS:
    WORLD[cell] = -1

print(WORLD)

[[-10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0 -10.0
  -10.0 -10.0]
 [-10.0 -10.0 -10.0 -10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
  -1.0]
 [-10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0]
 [-10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0]
 [-10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0]
 [-10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0]
 [-10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0
  -10.0]
 [-10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0 -10.0 -10.0
  -10.0]
 [-10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0 -10.0 -10.0 -10.0
  -10.0]
 [-10.0 -10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0 -10.0 -10.0
  -10.0 -10.0]
 [-10.0 -10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0 -10.0 -10.0
  -10.0 -10.0]
 [-10.0 -10.0 -10.0 -10.0 -1.0 -1.0 -1.0 -1.0 -10.0 -10.0 -10.0 -10.0
  -10.0 -10.0]
 [-10.0 -10.0 -10.0 -10.0 -1.0 -1.0 -

## Common Routines

In [151]:
# Global variables
MIN = 1
MAX = 5
EPS = 0.6

def generate_actions():
    actions = []
    for i in range(MAX+1):
        for j in range(MAX+1):
            if MIN <= (i + j) <= MAX:
                actions.append((i,j))
    return actions

def terminal(state, allow=False):
    # Out of bounds
    if ((state['y'] < 0 or state['y'] >= WORLD.shape[0]) or 
        (state['x'] < 0 or state['x'] >= WORLD.shape[1])):
        return True
    
    # Goal state
    elif ((state['y'], state['x']) in GOALS) and not allow:
        return True
    
    # Obstacle hit
    elif WORLD[state['y'], state['x']] == -10:
        return True
    
    else:
        return False

def move(state, action):
    new_state = state.copy()
    
    # One of the velocity is 0
    if action[0] == 0 or action[1] == 0:        
        # Index of non-zero velocity
        idx = 0 if action[1] == 0 else 1
        
        # Dictionary entry based on idx
        k = "x" if action[1] == 0 else "y"
        
        # Simulate trajectory
        for x in range(action[idx]):
            if k == "x":
                new_state[k] += 1
            else:
                new_state[k] -= 1
            
            # Check if terminal
            if terminal(new_state):
                return new_state
            
    # Diagonal movement
    else:
        if action[0] >= action[1]:
            # Simulate trajectory
            for x in range(action[0]):
                new_state['y'] -= 1
                new_state['x'] += 1

                # Check if terminal
                if terminal(new_state):
                    return new_state
        else:
            # Simulate trajectory
            for x in range(action[0]):
                new_state['y'] -= 1
                new_state['x'] += 1

                # Check if terminal
                if terminal(new_state):
                    return new_state
            
            for x in range(action[1] - action[0]):
                new_state['y'] -= 1
                
                # Check if terminal
                if terminal(new_state):
                    return new_state
    
    return new_state

def encode(state, action, R, Q, Pi):
    # Enconding key
    key_s = ','.join(str(x) for x in state.values())
    key_a = ','.join(str(x) for x in action)

    # Check if key not already in dict
    if key_s not in R.keys():
        R[key_s] = {}
    
    if key_s not in Q.keys():
        Q[key_s] = {}
        
    if key_s not in Pi.keys():
        Pi[key_s] = {}
        
    if key_a not in R[key_s].keys():
        R[key_s][key_a] = []
        
    return key_s, key_a

def action_selection(action, actions):
    # Random velocity increments
    x_increment = np.random.randint(-1,2)
    y_increment = np.random.randint(-1,2)
    
    # Compute new velocity
    new_action = (action[0] + x_increment, 
                  action[1] + y_increment)
    
    return new_action if new_action in actions else action

def update_episode(episode, state, action, R, Q, Pi):
    # Encode state and action
    key_s, key_a = encode(state, action, R, Q, Pi)
    
    # Add new episode
    episode.append((key_s, key_a))

def admissible_actions(actions, key_s):
    # Get state from key
    s_list = key_s.split(',')
    state = {"y": int(s_list[0]), "x": int(s_list[1])}
    
    # Admissible actions
    admissible = []
        
    for action in actions:        
        # Get new state with action
        next_state = move(state, action)
        
        # Check if state is admissible
        if not terminal(next_state, allow=True):
            admissible.append(action)
    
    return admissible

def world_admissible_actions(WORLD):
    # Admissible actions
    A = {}
    
    # Possible actions
    actions = generate_actions()
    
    for y in range(WORLD.shape[0]):
        for x in range(WORLD.shape[1]):
            if (y,x) not in GOALS and WORLD[y,x] != -10:
                # Creat key for state
                key_s = str(y) + "," + str(x)
                A[key_s] = admissible_actions(actions, key_s)
    
    return A

def monte_carlo(loops=1000, uniform=False):
    # Value functions
    R = {}
    Q = {}
    Pi = {}
    V = np.zeros_like(WORLD)
    
    # Possible actions
    actions = generate_actions()
    
    # World admissible actions
    A = world_admissible_actions(WORLD)
    
    for count in range(loops):        
        # Starting state
        state = {'y': 17, 'x': np.random.randint(3,7)}
        
        # Episode states
        episode = []
        
        # Initial action (i.e -> [0,1])
        action = actions[0]
        
        # Update first episode
        update_episode(episode, state, action, R, Q, Pi)
        
        ## (a)
        while True:
            # Action choice
            if uniform:
                action = action_selection(action, actions)
            else: 
                key_s = ','.join(str(x) for x in state.values())
                
                if key_s in Q.keys() and Q[key_s]:
                    # Greedy choice (soft policy)
                    if len(A[key_s]) == 0:
                        print(A[key_s])
                        print(key_s)
                    if np.random.uniform(0.0, 1.0) >= (EPS / len(A[key_s])): 
                        a_star = max(Q[key_s].items(), key=operator.itemgetter(1))[0]
                        a_list = a_star.split(',')
                        action = (int(a_list[0]), int(a_list[1]))
                    else:
                        action = action_selection(action, actions)
                else:
                    action = action_selection(action, actions)
            
            # Update episode with action
            update_episode(episode, state, action, R, Q, Pi)
            
            # Apply move
            new_state = move(state, action)
            
            # Check if new state terminal
            if not terminal(new_state):
                # Update state
                state = new_state
            else:
                break
        
        # (b)
        for key_s, key_a in episode:
            # Get state
            s_list = key_s.split(',')
            state = (int(s_list[0]), int(s_list[1]))

            # Get action
            a_list = key_a.split(',')
            action = (int(a_list[0]), int(a_list[1]))
            
            # Get resulting state with given action
            state_dict = {"y": int(s_list[0]), "x": int(s_list[1])}
            next_state = move(state_dict, action)
            
            # Update return dict
            R[key_s][key_a].append(WORLD[next_state["y"], next_state["x"]])
            Q[key_s][key_a] = np.mean(R[key_s][key_a])
            
        # (c)
        if not uniform:
            for key_s, _ in episode:
                a_star = max(Q[key_s].items(), key=operator.itemgetter(1))[0]
                admissible = A[key_s]
                a_len = len(admissible)
                for action in admissible:
                    key_a = ','.join(str(x) for x in action)
                    Pi[key_s][key_a] = (1 - EPS + EPS / a_len) if a_star == key_a else EPS / a_len
    
    # Compute value state table
    if uniform:
        for key in Q.keys():
            # Get state
            s_list = key.split(',')
            state = (int(s_list[0]), int(s_list[1]))
                        
            # Values for the state
            values = Q[key].values()
                        
            # Update table
            V[int(s_list[0]), int(s_list[1])] = sum(values) * (1/len(values))
        
    return Pi if not uniform else V

## Task 4.1

In [152]:
V = monte_carlo(loops=1000, uniform=True)

In [153]:
print("State-Value table for 1000 episodes with u.a.r action selection: \n\n", V)

State-Value table for 1000 episodes with u.a.r action selection: 

 [[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 0.0 -10.0 -10.0 -10.0 -10.0 -9.0 -8.7 -10.0 -10.0 0.0]
 [0.0 0.0 0.0 0.0 -10.0 -8.2 -8.0 -8.2 -7.3 -7.0 -4.6 -4.6 -2.0 0.0]
 [0.0 0.0 0.0 0.0 -7.0 -5.5 -4.6 -5.5 -4.8 -1.0 -2.8 -2.5 -1.0 0.0]
 [0.0 0.0 0.0 0.0 -4.6 -4.0 -3.6 -1.8 -2.6 -2.1 -1.0 -1.0 -1.0 0.0]
 [0.0 0.0 0.0 0.0 -3.2 -2.0 -1.0 -1.7 -1.0 -1.0 -1.0 -1.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -1.0 -1.0 -1.0 -2.5 -10.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -1.0 -1.6 -2.8 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -1.0 -2.4 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -1.9 -7.8 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -3.8 -7.4 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -3.5 -7.8 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.0 -3.2 -8.2 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1.0 -1.9 -5.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0]
 [0.0 0.0 0.0 0.0 -1

## Task 4.2

In [156]:
Pi = monte_carlo(loops=1000, uniform=False)

In [157]:
trajectories = []

for state_ in STARTS:
    # Episode
    episode = []
    
    # States
    state = {'y': state_[0], 'x': state_[1]}
    
    while True:
        # Transform state to key
        key_s = ','.join(str(x) for x in [state['y'],state['x']])
        
        # Fetch best action
        a_star = max(Pi[key_s].items(), key=operator.itemgetter(1))[0]
        a_list = a_star.split(',')
        action = (int(a_list[0]), int(a_list[1]))
        
        # Apply move to action
        new_state = move(state, action)

        # Enconding keys
        key_s = ','.join(str(x) for x in state.values())
        key_a = ','.join(str(x) for x in action)

        episode.append((key_s, key_a))

        # Check new state terminal
        if not terminal(new_state):
            # Update state
            state = new_state
        else:
            # Enconding keys goal
            key_s = ','.join(str(x) for x in new_state.values())
            key_a = ','.join(str(x) for x in action)

            episode.append((key_s, key_a))
            break
            
    trajectories.append(episode)

In [158]:
for count, trajectory in enumerate(trajectories):
    print("Trajectory " + str(count) + " :\n\n", trajectory, "\n")

Trajectory 0 :

 [('17,3', '0,1'), ('16,3', '1,1'), ('15,4', '0,1'), ('14,4', '0,1'), ('13,4', '1,2'), ('11,5', '2,2'), ('9,7', '0,2'), ('7,7', '1,1'), ('6,8', '2,2'), ('4,10', '3,2'), ('1,13', '3,2')] 

Trajectory 1 :

 [('17,4', '0,1'), ('16,4', '0,1'), ('15,4', '0,1'), ('14,4', '0,1'), ('13,4', '1,2'), ('11,5', '2,2'), ('9,7', '0,2'), ('7,7', '1,1'), ('6,8', '2,2'), ('4,10', '3,2'), ('1,13', '3,2')] 

Trajectory 2 :

 [('17,5', '1,0'), ('17,6', '0,1'), ('16,6', '0,1'), ('15,6', '0,1'), ('14,6', '0,1'), ('13,6', '0,1'), ('12,6', '0,1'), ('11,6', '0,1'), ('10,6', '1,1'), ('9,7', '0,2'), ('7,7', '1,1'), ('6,8', '2,2'), ('4,10', '3,2'), ('1,13', '3,2')] 

Trajectory 3 :

 [('17,6', '0,1'), ('16,6', '0,1'), ('15,6', '0,1'), ('14,6', '0,1'), ('13,6', '0,1'), ('12,6', '0,1'), ('11,6', '0,1'), ('10,6', '1,1'), ('9,7', '0,2'), ('7,7', '1,1'), ('6,8', '2,2'), ('4,10', '3,2'), ('1,13', '3,2')] 



## Task 4.3