<a href="https://colab.research.google.com/github/CodeWithDharan/AI-ASSISTANT/blob/main/Cleaning_robot_MDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

def deterministic_robot_cleaning_v1():
    # Initialization
    states = [0, 1, 2, 3, 4, 5]                # Set of states
    actions = [-1, 1]                         # Set of actions
    Q = np.zeros((len(states), len(actions)))  # Initial Q can be chosen arbitrarily
    Qold = np.copy(Q)                         # Save a backup to compare later
    L = 15                                   # Number of iterations
    gamma = 0.5                              # Discounting factor
    epsilon = 0.001                          # Final error to stop the algorithm

    # Deterministic Q-iteration algorithm
    for l in range(1, L + 1):
        print(f'iteration: {l}')
        for ii in range(len(states)):
            for jj in range(len(actions)):
                next_state = model(states[ii], actions[jj])
                Q[ii, jj] = reward(states[ii], actions[jj]) + gamma * np.max(Q[next_state, :])

        if np.abs(np.sum(Q - Qold)) < epsilon:
            print('Epsilon criteria satisfied!')
            break
        else:
            Qold = np.copy(Q)

    # Show the final Q matrix
    print('Q matrix (optimal):')
    print(Q)

    C = np.argmax(Q, axis=1)                   # Finding the max values
    print('Q(optimal):')
    print(C)
    print('Optimal Policy:')
    print('*')
    print([actions[C[s]] if s not in [0, 5] else 'Terminal' for s in range(len(states))])
    print('*')

# This function is the transition model of the robot
# The inputs are: the current state, and the chosen action
# The output is the next state
def model(x, u):
    next_state = x + u
    if next_state < 0:
        return 0
    elif next_state > 5:
        return 5
    return next_state

# This function is the reward function for the task
# The inputs are: the current state, and the chosen action
# The output is the expected reward
def reward(x, u):
    if x == 5 and u == 1:
        return 5
    elif x == 0 and u == -1:
        return 1
    else:
        return 0

# Call the main function
deterministic_robot_cleaning_v1()

iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
Q matrix (optimal):
[[1.99993896 0.49996948]
 [0.99996948 0.62469578]
 [0.49998474 1.24969578]
 [0.62484789 2.49969578]
 [1.24984789 4.99969578]
 [2.49984789 9.99969578]]
Q(optimal):
[0 0 1 1 1 1]
Optimal Policy:
*
['Terminal', -1, 1, 1, 1, 'Terminal']
*
