<a href="https://colab.research.google.com/github/Dhanush-adk/reinforcement_learning/blob/main/first_approach_mc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [245]:
class MoverEnvironment:
    """Grid world in which a mover walks towards a true goal while an
    "eater" consumes the resources attached to a true and a fake goal.

    States are ``(x, y)`` tuples clamped to ``0 .. size - 1`` on both axes.
    Actions are integer indices into ``moves`` (Up, Down, Left, Right).

    Args:
        size: Side length of the square grid.
        eater_actions: Sequence of ``(fake_share, true_share)`` pairs. On
            every step the first component is subtracted from the fake
            goal's resources and the second from the true goal's.
        eater_policy: Mapping ``(cell_index, action) -> index into
            eater_actions``. Missing keys fall back to index 0.
    """

    # (dx, dy) per action, in the same order as ``self.moves``.
    _MOVE_OFFSETS = ((0, 1), (0, -1), (-1, 0), (1, 0))

    # Cell whose visit earns the "fake goal" bonus and unlocks the big
    # true-goal reward.
    # NOTE(review): this is (2, 5) while ``fake_goal`` is (1, 5); kept as in
    # the original, confirm which cell is intended.
    _DECEIVE_CELL = (2, 5)

    def __init__(self, size=7, eater_actions=None, eater_policy=None):
        self.size = size
        # NOTE(review): both goals are fixed and do not scale with ``size``;
        # with size < 7 the true goal cannot be reached.
        self.true_goal = (6, 6)
        self.fake_goal = (1, 5)
        self.initial_resources = 100  # starting resource value for both goals
        self.eater_actions = eater_actions or [(1, 0), (0, 1), (0.5, 0.5), (0.2, 0.8), (0.6, 0.4),
                                               (0.8, 0.2), (0.4, 0.6), (0.7, 0.3), (0.3, 0.7)]
        self.eater_policy = eater_policy or {}
        self.moves = ['Up', 'Down', 'Left', 'Right']
        # Sets ``state``, ``resources`` and ``visited_fake_goal``.
        self.reset()

    def reset(self):
        """Restore the start position, full resources and the visit flag.

        Returns:
            The initial state ``(0, 0)``.
        """
        self.state = (0, 0)
        self.resources = {self.true_goal: self.initial_resources, self.fake_goal: self.initial_resources}
        self.visited_fake_goal = False
        return self.state

    def step(self, curr_state, action):
        """Apply ``action`` from ``curr_state`` and let the eater consume.

        Args:
            curr_state: ``(x, y)`` position to move from. The stored
                ``self.state`` is not consulted, only overwritten.
            action: Index into ``moves`` (0=Up, 1=Down, 2=Left, 3=Right).

        Returns:
            ``(new_state, reward, done)``; ``done`` is True only when the
            new state is the true goal.

        Raises:
            IndexError: If ``action`` is not a valid index. Negative values
                are rejected too, because they would otherwise silently
                wrap around to another move.
        """
        if not 0 <= action < len(self._MOVE_OFFSETS):
            raise IndexError(
                f"action must be in 0..{len(self._MOVE_OFFSETS) - 1}, got {action!r}"
            )
        dx, dy = self._MOVE_OFFSETS[action]

        # Clamp to the grid so that moving into a wall leaves the mover in place.
        limit = self.size - 1
        new_state = (
            max(0, min(curr_state[0] + dx, limit)),
            max(0, min(curr_state[1] + dy, limit)),
        )

        # Row-major, 0-based linear index of the destination cell.
        # NOTE(review): the eater policy used in this file has keys 1..49,
        # which looks 1-based; confirm the intended indexing.
        index = new_state[1] * self.size + new_state[0]

        # Eater move for this (cell, action); defaults to eater action 0.
        eater_index = self.eater_policy.get((index, action), 0)
        consumption = self.eater_actions[eater_index]
        self.resources[self.true_goal] -= consumption[1]
        self.resources[self.fake_goal] -= consumption[0]

        self.state = new_state
        reward = -1  # default step cost

        # Bonus for standing on the deceive cell: larger the first time.
        if self.state == self._DECEIVE_CELL:
            if self.visited_fake_goal:
                reward += 40
            else:
                self.visited_fake_goal = True
                reward += 50

        # The true goal only pays off once the deceive cell has been visited.
        done = self.state == self.true_goal
        if done:
            reward += 815 if self.visited_fake_goal else -100

        # Penalty on every step taken while either resource is exhausted;
        # this does not end the episode.
        if self.resources[self.true_goal] <= 0 or self.resources[self.fake_goal] <= 0:
            reward -= 100

        return self.state, reward, done


In [None]:
import numpy as np
import random

def generate_episode(env, policy, epsilon=0.1, max_steps=None):
    """Roll out one epsilon-greedy episode and return its transitions.

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(state, action) -> (next_state, reward, done)``.
        policy: Mapping ``state -> action``. States missing from the mapping
            get a uniformly random action out of the four moves.
        epsilon: Probability of taking a uniformly random action instead of
            the policy's action.
        max_steps: Optional cap on the episode length. ``None`` (the
            default) keeps running until the environment reports ``done``,
            which may never happen if the policy cycles and exploration does
            not break the cycle (e.g. ``epsilon=0``).

    Returns:
        List of ``(state, action, reward)`` tuples in visiting order.
    """
    episode = []
    current_state = env.reset()
    done = False
    while not done and (max_steps is None or len(episode) < max_steps):
        if random.random() < epsilon:
            # Explore.
            action = random.choice(range(4))
        elif current_state in policy:
            # Exploit the current policy.
            action = policy[current_state]
        else:
            # No entry for this state yet: fall back to a random move.
            action = random.choice(range(4))
        next_state, reward, done = env.step(current_state, action)
        episode.append((current_state, action, reward))
        current_state = next_state
    return episode

import numpy as np
import random


def first_visit_mc(env, num_episodes, epsilon=0.1, gamma=0.99, log_every=1):
    """On-policy first-visit Monte Carlo control on a square grid.

    Args:
        env: Environment passed to ``generate_episode``. Its ``size``
            attribute, when present, sets the grid side length (default 7).
        num_episodes: Number of episodes to sample.
        epsilon: Exploration rate forwarded to ``generate_episode``.
        gamma: Discount factor applied to future rewards.
        log_every: Print the episode number every ``log_every`` episodes;
            a falsy value disables the progress output.

    Returns:
        ``(policy, Q)`` where ``policy`` maps ``(x, y) -> action`` and ``Q``
        maps ``((x, y), action) -> mean of the sampled returns``.
    """
    n_actions = 4
    size = getattr(env, "size", 7)

    Q = {}
    returns = {}
    policy = {}
    for x in range(size):
        for y in range(size):
            state = (x, y)
            for action in range(n_actions):
                Q[(state, action)] = 0
                returns[(state, action)] = []
            # Start from an arbitrary policy.
            policy[state] = random.choice(list(range(4)))

    for episode_number in range(num_episodes):
        if log_every and episode_number % log_every == 0:
            print(episode_number)
        episode = generate_episode(env, policy, epsilon)

        # Index of the first occurrence of every (state, action) pair. The
        # backward sweep below must only record the return at that index;
        # a plain "seen" set on a reversed walk would record the return
        # from the LAST occurrence instead.
        first_seen = {}
        for t, (state, action, _reward) in enumerate(episode):
            first_seen.setdefault((state, action), t)

        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + gamma * G
            key = (state, action)
            if first_seen[key] != t:
                continue
            returns[key].append(G)
            Q[key] = np.mean(returns[key])
            # Greedy improvement; ties resolve to the highest action index.
            policy[state] = max((Q[(state, a)], a) for a in range(n_actions))[1]

    return policy, Q
# Training run: build the environment with a fixed eater policy, then learn a
# mover policy with first-visit MC (8500 episodes, epsilon = 0.1).
# The eater policy maps (cell index, mover action) -> index into the
# environment's eater_actions list; keys here cover cell indices 1..49 and
# actions 0..3.
env = MoverEnvironment(
    eater_policy=
{(1, 0): 7, (1, 1): 7, (1, 2): 7, (1, 3): 7, (2, 0): 7, (2, 1): 7, (2, 2): 7, (2, 3): 7, (3, 0): 7, (3, 1): 7, (3, 2): 7, (3, 3): 7, (4, 0): 7, (4, 1): 7, (4, 2): 7, (4, 3): 7, (5, 0): 7, (5, 1): 7, (5, 2): 7, (5, 3): 7, (6, 0): 7, (6, 1): 7, (6, 2): 7, (6, 3): 7, (7, 0): 7, (7, 1): 7, (7, 2): 7, (7, 3): 7, (8, 0): 7, (8, 1): 7, (8, 2): 7, (8, 3): 7, (9, 0): 7, (9, 1): 7, (9, 2): 7, (9, 3): 7, (10, 0): 7, (10, 1): 7, (10, 2): 7, (10, 3): 7, (11, 0): 7, (11, 1): 7, (11, 2): 7, (11, 3): 7, (12, 0): 5, (12, 1): 5, (12, 2): 5, (12, 3): 5, (13, 0): 5, (13, 1): 5, (13, 2): 5, (13, 3): 5, (14, 0): 5, (14, 1): 5, (14, 2): 5, (14, 3): 5, (15, 0): 5, (15, 1): 5, (15, 2): 5, (15, 3): 5, (16, 0): 5, (16, 1): 5, (16, 2): 5, (16, 3): 5, (17, 0): 5, (17, 1): 5, (17, 2): 5, (17, 3): 5, (18, 0): 5, (18, 1): 5, (18, 2): 5, (18, 3): 5, (19, 0): 5, (19, 1): 5, (19, 2): 5, (19, 3): 5, (20, 0): 5, (20, 1): 5, (20, 2): 5, (20, 3): 5, (21, 0): 5, (21, 1): 5, (21, 2): 5, (21, 3): 5, (22, 0): 5, (22, 1): 5, (22, 2): 5, (22, 3): 5, (23, 0): 5, (23, 1): 5, (23, 2): 5, (23, 3): 5, (24, 0): 5, (24, 1): 5, (24, 2): 5, (24, 3): 5, (25, 0): 5, (25, 1): 5, (25, 2): 5, (25, 3): 5, (26, 0): 5, (26, 1): 5, (26, 2): 5, (26, 3): 5, (27, 0): 5, (27, 1): 5, (27, 2): 5, (27, 3): 5, (28, 0): 5, (28, 1): 5, (28, 2): 5, (28, 3): 5, (29, 0): 5, (29, 1): 5, (29, 2): 5, (29, 3): 5, (30, 0): 5, (30, 1): 5, (30, 2): 5, (30, 3): 5, (31, 0): 5, (31, 1): 5, (31, 2): 5, (31, 3): 5, (32, 0): 0, (32, 1): 5, (32, 2): 5, (32, 3): 5, (33, 0): 0, (33, 1): 5, (33, 2): 5, (33, 3): 5, (34, 0): 0, (34, 1): 5, (34, 2): 5, (34, 3): 5, (35, 0): 0, (35, 1): 5, (35, 2): 5, (35, 3): 5, (36, 0): 0, (36, 1): 5, (36, 2): 5, (36, 3): 5, (37, 0): 0, (37, 1): 5, (37, 2): 5, (37, 3): 5, (38, 0): 0, (38, 1): 5, (38, 2): 5, (38, 3): 5, (39, 0): 0, (39, 1): 5, (39, 2): 5, (39, 3): 8, (40, 0): 5, (40, 1): 5, (40, 2): 8, (40, 3): 8, (41, 0): 5, (41, 1): 4, (41, 2): 8, (41, 3): 8, (42, 0): 2, (42, 1): 8, (42, 2): 8, (42, 3): 8, (43, 0): 8, (43, 
1): 8, (43, 2): 8, (43, 3): 8, (44, 0): 8, (44, 1): 8, (44, 2): 8, (44, 3): 8, (45, 0): 8, (45, 1): 8, (45, 2): 8, (45, 3): 8, (46, 0): 8, (46, 1): 8, (46, 2): 8, (46, 3): 8, (47, 0): 8, (47, 1): 8, (47, 2): 8, (47, 3): 8, (48, 0): 8, (48, 1): 8, (48, 2): 8, (48, 3): 3, (49, 0): 8, (49, 1): 8, (49, 2): 3, (49, 3): 1})
# Returns the learned state -> action mapping and the (state, action) value table.
estimated_policy, Q_values = first_visit_mc(env, 8500, 0.1)


In [258]:
# Raw dump of the learned policy: (x, y) -> action index (0..3).
print(estimated_policy)

{(0, 0): 3, (0, 1): 3, (0, 2): 2, (0, 3): 1, (0, 4): 0, (0, 5): 3, (0, 6): 2, (1, 0): 3, (1, 1): 1, (1, 2): 1, (1, 3): 0, (1, 4): 0, (1, 5): 1, (1, 6): 1, (2, 0): 3, (2, 1): 3, (2, 2): 1, (2, 3): 1, (2, 4): 1, (2, 5): 0, (2, 6): 3, (3, 0): 0, (3, 1): 3, (3, 2): 0, (3, 3): 1, (3, 4): 1, (3, 5): 1, (3, 6): 1, (4, 0): 2, (4, 1): 3, (4, 2): 1, (4, 3): 2, (4, 4): 1, (4, 5): 3, (4, 6): 0, (5, 0): 3, (5, 1): 0, (5, 2): 3, (5, 3): 1, (5, 4): 1, (5, 5): 3, (5, 6): 3, (6, 0): 0, (6, 1): 0, (6, 2): 0, (6, 3): 0, (6, 4): 0, (6, 5): 0, (6, 6): 1}


In [259]:
# Print the learned policy with action indices translated to move names,
# one "(x, y) Move" line per state.
moves = ['Up', 'Down', 'Left', 'Right']
for i, action_id in estimated_policy.items():
    print(i, moves[action_id])

(0, 0) Right
(0, 1) Right
(0, 2) Left
(0, 3) Down
(0, 4) Up
(0, 5) Right
(0, 6) Left
(1, 0) Right
(1, 1) Down
(1, 2) Down
(1, 3) Up
(1, 4) Up
(1, 5) Down
(1, 6) Down
(2, 0) Right
(2, 1) Right
(2, 2) Down
(2, 3) Down
(2, 4) Down
(2, 5) Up
(2, 6) Right
(3, 0) Up
(3, 1) Right
(3, 2) Up
(3, 3) Down
(3, 4) Down
(3, 5) Down
(3, 6) Down
(4, 0) Left
(4, 1) Right
(4, 2) Down
(4, 3) Left
(4, 4) Down
(4, 5) Right
(4, 6) Up
(5, 0) Right
(5, 1) Up
(5, 2) Right
(5, 3) Down
(5, 4) Down
(5, 5) Right
(5, 6) Right
(6, 0) Up
(6, 1) Up
(6, 2) Up
(6, 3) Up
(6, 4) Up
(6, 5) Up
(6, 6) Down


In [260]:
# Evaluation run: rebuild the environment with the same eater policy and
# follow the learned policy greedily (no exploration), printing every step.
env = MoverEnvironment(
    eater_policy=
{(1, 0): 7, (1, 1): 7, (1, 2): 7, (1, 3): 7, (2, 0): 7, (2, 1): 7, (2, 2): 7, (2, 3): 7, (3, 0): 7, (3, 1): 7, (3, 2): 7, (3, 3): 7, (4, 0): 7, (4, 1): 7, (4, 2): 7, (4, 3): 7, (5, 0): 7, (5, 1): 7, (5, 2): 7, (5, 3): 7, (6, 0): 7, (6, 1): 7, (6, 2): 7, (6, 3): 7, (7, 0): 7, (7, 1): 7, (7, 2): 7, (7, 3): 7, (8, 0): 7, (8, 1): 7, (8, 2): 7, (8, 3): 7, (9, 0): 7, (9, 1): 7, (9, 2): 7, (9, 3): 7, (10, 0): 7, (10, 1): 7, (10, 2): 7, (10, 3): 7, (11, 0): 7, (11, 1): 7, (11, 2): 7, (11, 3): 7, (12, 0): 5, (12, 1): 5, (12, 2): 5, (12, 3): 5, (13, 0): 5, (13, 1): 5, (13, 2): 5, (13, 3): 5, (14, 0): 5, (14, 1): 5, (14, 2): 5, (14, 3): 5, (15, 0): 5, (15, 1): 5, (15, 2): 5, (15, 3): 5, (16, 0): 5, (16, 1): 5, (16, 2): 5, (16, 3): 5, (17, 0): 5, (17, 1): 5, (17, 2): 5, (17, 3): 5, (18, 0): 5, (18, 1): 5, (18, 2): 5, (18, 3): 5, (19, 0): 5, (19, 1): 5, (19, 2): 5, (19, 3): 5, (20, 0): 5, (20, 1): 5, (20, 2): 5, (20, 3): 5, (21, 0): 5, (21, 1): 5, (21, 2): 5, (21, 3): 5, (22, 0): 5, (22, 1): 5, (22, 2): 5, (22, 3): 5, (23, 0): 5, (23, 1): 5, (23, 2): 5, (23, 3): 5, (24, 0): 5, (24, 1): 5, (24, 2): 5, (24, 3): 5, (25, 0): 5, (25, 1): 5, (25, 2): 5, (25, 3): 5, (26, 0): 5, (26, 1): 5, (26, 2): 5, (26, 3): 5, (27, 0): 5, (27, 1): 5, (27, 2): 5, (27, 3): 5, (28, 0): 5, (28, 1): 5, (28, 2): 5, (28, 3): 5, (29, 0): 5, (29, 1): 5, (29, 2): 5, (29, 3): 5, (30, 0): 5, (30, 1): 5, (30, 2): 5, (30, 3): 5, (31, 0): 5, (31, 1): 5, (31, 2): 5, (31, 3): 5, (32, 0): 0, (32, 1): 5, (32, 2): 5, (32, 3): 5, (33, 0): 0, (33, 1): 5, (33, 2): 5, (33, 3): 5, (34, 0): 0, (34, 1): 5, (34, 2): 5, (34, 3): 5, (35, 0): 0, (35, 1): 5, (35, 2): 5, (35, 3): 5, (36, 0): 0, (36, 1): 5, (36, 2): 5, (36, 3): 5, (37, 0): 0, (37, 1): 5, (37, 2): 5, (37, 3): 5, (38, 0): 0, (38, 1): 5, (38, 2): 5, (38, 3): 5, (39, 0): 0, (39, 1): 5, (39, 2): 5, (39, 3): 8, (40, 0): 5, (40, 1): 5, (40, 2): 8, (40, 3): 8, (41, 0): 5, (41, 1): 4, (41, 2): 8, (41, 3): 8, (42, 0): 2, (42, 1): 8, (42, 2): 8, (42, 3): 8, (43, 0): 8, (43, 
1): 8, (43, 2): 8, (43, 3): 8, (44, 0): 8, (44, 1): 8, (44, 2): 8, (44, 3): 8, (45, 0): 8, (45, 1): 8, (45, 2): 8, (45, 3): 8, (46, 0): 8, (46, 1): 8, (46, 2): 8, (46, 3): 8, (47, 0): 8, (47, 1): 8, (47, 2): 8, (47, 3): 8, (48, 0): 8, (48, 1): 8, (48, 2): 8, (48, 3): 3, (49, 0): 8, (49, 1): 8, (49, 2): 3, (49, 3): 1})
current_mover_state =env.reset()

# Cap the rollout at 100 steps so a cycling greedy policy cannot loop forever.
for i in range(100):
    # Raises KeyError if the rollout reaches a state missing from the policy.
    action_index = estimated_policy[current_mover_state]
    next_state, reward, done = env.step(current_mover_state, action_index)
    print(f"Step {i+1}: State: {current_mover_state},  action: {env.moves[action_index]}, Next State: {next_state}, Reward: {reward}, resources : {env.resources}")
    current_mover_state = next_state
    # The environment reports done once the true goal is reached.
    if done:
      break


Step 1: State: (0, 0),  action: Right, Next State: (1, 0), Reward: -1, resources : {(6, 6): 99.7, (1, 5): 99.3}
Step 2: State: (1, 0),  action: Right, Next State: (2, 0), Reward: -1, resources : {(6, 6): 99.4, (1, 5): 98.6}
Step 3: State: (2, 0),  action: Right, Next State: (3, 0), Reward: -1, resources : {(6, 6): 99.10000000000001, (1, 5): 97.89999999999999}
Step 4: State: (3, 0),  action: Up, Next State: (3, 1), Reward: -1, resources : {(6, 6): 98.80000000000001, (1, 5): 97.19999999999999}
Step 5: State: (3, 1),  action: Right, Next State: (4, 1), Reward: -1, resources : {(6, 6): 98.50000000000001, (1, 5): 96.49999999999999}
Step 6: State: (4, 1),  action: Right, Next State: (5, 1), Reward: -1, resources : {(6, 6): 98.30000000000001, (1, 5): 95.69999999999999}
Step 7: State: (5, 1),  action: Up, Next State: (5, 2), Reward: -1, resources : {(6, 6): 98.10000000000001, (1, 5): 94.89999999999999}
Step 8: State: (5, 2),  action: Right, Next State: (6, 2), Reward: -1, resources : {(6, 6): 

In [261]:
import numpy as np
from tabulate import tabulate

# Arrow glyphs indexed by action id, in the order Up, Down, Left, Right.
moves = ['↑', '↓', '←', '→']

data = estimated_policy

# Side length of the grid; must cover every (x, y) key in ``data``.
matrix_size = 7

# Despite its name this is a policy grid, not a confusion matrix: one cell
# per state, initially empty, filled with the arrow of the chosen action.
confusion_matrix = np.full((matrix_size, matrix_size), '', dtype=object)

for (x, y), value in data.items():
    # Rows are indexed by y and columns by x; unknown action ids are
    # rendered as 'Invalid' instead of raising.
    confusion_matrix[y, x] = moves[value] if value in range(len(moves)) else 'Invalid'

# Flip vertically so that y = 0 is drawn on the bottom row.
confusion_matrix = np.flipud(confusion_matrix)

# After the flip, the top-right cell is state (6, 6): mark it as terminal.
confusion_matrix[0, 6] = 'Done'

labels = list(map(str, range(matrix_size)))

# Column headers run 0..6 left to right; row labels run 6..0 top to bottom.
table = tabulate(confusion_matrix, headers=labels, tablefmt="grid", showindex=labels[::-1])

print(table)


+----+-----+-----+-----+-----+-----+-----+------+
|    | 0   | 1   | 2   | 3   | 4   | 5   | 6    |
|  6 | ←   | ↓   | →   | ↓   | ↑   | →   | Done |
+----+-----+-----+-----+-----+-----+-----+------+
|  5 | →   | ↓   | ↑   | ↓   | →   | →   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
|  4 | ↑   | ↑   | ↓   | ↓   | ↓   | ↓   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
|  3 | ↓   | ↑   | ↓   | ↓   | ←   | ↓   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
|  2 | ←   | ↓   | ↓   | ↑   | ↓   | →   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
|  1 | →   | ↓   | →   | →   | →   | ↑   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
|  0 | →   | →   | →   | ↑   | ←   | →   | ↑    |
+----+-----+-----+-----+-----+-----+-----+------+
