<a href="https://colab.research.google.com/github/BRamya14/Reinforcement-Learning/blob/main/simple_randomwalk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# Function to choose an action based on epsilon-greedy policy
def choose_action(state, q_table, epsilon):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action is returned
    (explore); otherwise an action with the highest Q-value in
    ``q_table[state]`` is returned (exploit).

    Args:
        state: Row index into ``q_table``.
        q_table: 2-D array of Q-values indexed as ``[state, action]``.
        epsilon: Exploration probability; ``0`` never explores and ``1``
            always explores.

    Returns:
        The chosen action index as an ``int``.
    """
    action_values = np.asarray(q_table[state])

    if np.random.uniform() < epsilon:
        # Explore: size the draw from the table row instead of assuming
        # exactly two actions.
        return int(np.random.randint(0, action_values.size))

    # Exploit: break ties at random. ``np.argmax`` always returns the first
    # maximum, so with an all-zero Q-table it would select action 0 forever
    # and the agent would never discover the other actions.
    best_actions = np.flatnonzero(action_values == action_values.max())
    if best_actions.size == 0:
        # Only reachable when the row contains NaN (NaN != NaN); fall back
        # to plain argmax rather than sampling from an empty set.
        return int(np.argmax(action_values))
    return int(np.random.choice(best_actions))

# Function to update Q-values based on the action taken and reward received
def update_q_table(state, action, reward, next_state, alpha, gamma, q_table):
    """Apply one Q-learning update to ``q_table`` in place.

    The entry ``q_table[state, action]`` is moved a fraction ``alpha`` of
    the way toward the target ``reward + gamma * max(q_table[next_state])``.

    Args:
        state: Index of the state the action was taken in.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Index of the state reached after the action.
        alpha: Learning rate (step size of the update).
        gamma: Discount factor applied to the next state's best Q-value.
        q_table: Array of Q-values indexed as ``[state, action]``; modified
            in place.
    """
    current_estimate = q_table[state, action]
    # Target: immediate reward plus the discounted best value of next_state.
    td_target = reward + gamma * np.max(q_table[next_state])
    q_table[state, action] = current_estimate + alpha * (td_target - current_estimate)

# Function to run the random walk simulation
def run_random_walk(num_episodes, alpha, gamma, epsilon):
    """Train a Q-table on a 7-state random walk and return it.

    States are numbered 0-6. Every episode starts in state 3 and ends when
    the agent reaches state 0 or state 6. Action 0 moves one state backward
    and any other action moves one state forward. Stepping into state 0
    yields a reward of -1, stepping into state 6 yields +1, and every other
    step yields 0. The total reward of each episode is printed.

    Args:
        num_episodes: Number of episodes to simulate.
        alpha: Learning rate passed to ``update_q_table``.
        gamma: Discount factor passed to ``update_q_table``.
        epsilon: Exploration rate passed to ``choose_action``.

    Returns:
        The learned Q-table, an array of shape ``(7, 2)``.
    """
    left_terminal, right_terminal = 0, 6

    # One row per state (0-6), one column per action (0: backward, 1: forward)
    q_table = np.zeros((7, 2))

    for episode_number in range(1, num_episodes + 1):
        # Each episode begins in the middle of the walk
        state = 3
        episode_return = 0

        while True:
            action = choose_action(state, q_table, epsilon)

            # Apply the move; only a step into a terminal state is rewarded
            if action == 0:
                next_state = state - 1
                reward = -1 if next_state == left_terminal else 0
            else:
                next_state = state + 1
                reward = 1 if next_state == right_terminal else 0

            # Learn from the transition before moving on
            update_q_table(state, action, reward, next_state, alpha, gamma, q_table)

            episode_return += reward
            state = next_state

            # The episode is over once either end of the walk is reached
            if state in (left_terminal, right_terminal):
                break

        print("Episode {}: Total Reward = {}".format(episode_number, episode_return))

    return q_table

# Hyperparameters for the Q-learning run
num_episodes = 10  # Number of episodes to simulate
alpha = 0.5        # Learning rate
gamma = 0.9        # Discount factor
epsilon = 0.1      # Exploration rate

# Train the agent on the random walk and keep the resulting Q-table
q_table = run_random_walk(
    num_episodes=num_episodes,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
)

# Show the learned Q-values, one row per state
print("\nLearned Q-table:")
print(q_table)


Episode 1: Total Reward = -1
Episode 2: Total Reward = -1
Episode 3: Total Reward = -1
Episode 4: Total Reward = -1
Episode 5: Total Reward = -1
Episode 6: Total Reward = -1
Episode 7: Total Reward = -1
Episode 8: Total Reward = -1
Episode 9: Total Reward = -1
Episode 10: Total Reward = -1

Learned Q-table:
[[ 0.          0.        ]
 [-0.99902344  0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]
