<a href="https://colab.research.google.com/github/Bhoomi059/RL/blob/main/Practical_9_Monte_Carlo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MONTE CARLO**

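Sample episodes used below. Each step is written as a state name immediately followed by the signed reward received at that step (e.g. `A+3` means state A with reward +3): <br>
Ep1: A+3 A+2 B-4 A+4 B-3 <br>
Ep2: B-2 A+3 B-3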

## **FIRST VISIT**

In [1]:
def generate_episode(i):
    """Prompt for one episode and parse it into ``(state, reward)`` pairs.

    The user types whitespace-separated steps, each a state name immediately
    followed by an integer reward, e.g. ``A+3 A+2 B-4``.

    Args:
        i: Episode number shown in the prompt.

    Returns:
        A list of ``(state, reward)`` tuples in the order typed; ``state`` is
        a string and ``reward`` an int.

    Raises:
        ValueError: If the reward part of a step is not an integer.
    """
    episode = []
    for step in input(f"Enter the episode {i}: ").split():
        # Split at the last sign so state names longer than one character
        # ("S1+3") parse; a step without a sign after its first character
        # ("A3") keeps the one-character-state rule.
        sign_at = max(step.rfind("+"), step.rfind("-"))
        cut = sign_at if sign_at >= 1 else 1
        try:
            reward = int(step[cut:])
        except ValueError:
            raise ValueError(
                f"Invalid step {step!r}: expected <state><reward>, e.g. 'A+3'"
            ) from None
        episode.append((step[:cut], reward))
    return episode

def monte_carlo_first_visit(episodes):
    """Estimate state values with undiscounted first-visit Monte Carlo.

    Within each episode only the first occurrence of a state contributes a
    sample: the sum of all rewards from that step to the end of the episode.
    A state's value is the mean of its samples across all episodes.

    Args:
        episodes: Iterable of episodes, each a sequence of
            ``(state, reward)`` pairs. States must be hashable.

    Returns:
        Dict mapping every visited state to its estimated value (a float).
        Empty episodes contribute nothing.
    """
    returns = {}
    state_values = {}

    for episode in episodes:
        if not episode:
            # zip(*[]) yields nothing to unpack; an empty episode has no visits.
            continue
        states, rewards = zip(*episode)

        # Index of each state's first occurrence, built once per episode so
        # the first-visit test is a lookup instead of a rescan of states[:t].
        first_seen = {}
        for t, state in enumerate(states):
            first_seen.setdefault(state, t)

        total_return = 0
        # Walk backwards so total_return is always the reward sum from t on.
        for t in range(len(states) - 1, -1, -1):
            state = states[t]
            total_return += rewards[t]
            if first_seen[state] == t:
                samples = returns.setdefault(state, [])
                samples.append(total_return)
                state_values[state] = sum(samples) / len(samples)

    return state_values

def main():
    """Read episodes from the user, run first-visit Monte Carlo, print results."""
    episode_total = int(input("Enter the number of episodes: "))
    # Episode prompts are numbered from 1.
    episodes = [
        generate_episode(number) for number in range(1, episode_total + 1)
    ]

    estimates = monte_carlo_first_visit(episodes)

    print("Estimated state values:")
    for state in estimates:
        print(f"State {state}: {estimates[state]}")

if __name__ == "__main__":
    main()


Enter the number of episodes: 2
Enter the episode 1: A+3 A+2 B-4 A+4 B-3
Enter the episode 2: B-2 A+3 B-3
Estimated state values:
State B: -2.5
State A: 1.0


## **EVERY VISIT**

In [2]:
import numpy as np

gamma = 1.0

def generate_episode(i):
    """Prompt for one episode and parse it into ``(state, reward)`` pairs.

    The user types whitespace-separated steps, each a state name immediately
    followed by an integer reward, e.g. ``A+3 A+2 B-4``.

    Args:
        i: Episode number shown in the prompt.

    Returns:
        A list of ``(state, reward)`` tuples in the order typed; ``state`` is
        a string and ``reward`` an int.

    Raises:
        ValueError: If the reward part of a step is not an integer.
    """
    episode = []
    for step in input(f"Enter the episode {i}: ").split():
        # Split at the last sign so state names longer than one character
        # ("S1+3") parse; a step without a sign after its first character
        # ("A3") keeps the one-character-state rule.
        sign_at = max(step.rfind("+"), step.rfind("-"))
        cut = sign_at if sign_at >= 1 else 1
        try:
            reward = int(step[cut:])
        except ValueError:
            raise ValueError(
                f"Invalid step {step!r}: expected <state><reward>, e.g. 'A+3'"
            ) from None
        episode.append((step[:cut], reward))
    return episode

def calculate_return(episode, t):
    """Return the discounted reward sum from step ``t`` to the episode's end.

    The reward at step ``i`` is weighted by ``gamma ** (i - t)``, where
    ``gamma`` is read from module scope.

    Args:
        episode: Sequence of ``(state, reward)`` pairs.
        t: Index of the step the return is measured from.

    Returns:
        The accumulated discounted return; ``0`` when no steps remain.
    """
    discounted_total = 0
    # offset counts steps elapsed since t, i.e. the exponent of gamma.
    for offset, position in enumerate(range(t, len(episode))):
        step_reward = episode[position][1]
        discounted_total = discounted_total + (gamma ** offset) * step_reward
    return discounted_total

def monte_carlo_every_visit(episodes):
    """Estimate state values with every-visit Monte Carlo.

    Every occurrence of a state in every episode contributes one sample: the
    return from that step onward, as computed by ``calculate_return``. A
    state's value is the mean of all its samples.

    Args:
        episodes: Iterable of episodes, each a sequence of
            ``(state, reward)`` pairs.

    Returns:
        Dict mapping every visited state to the mean of its sampled returns.
    """
    visit_counts = {}
    return_totals = {}
    estimates = {}

    for episode in episodes:
        for step_index, step in enumerate(episode):
            state = step[0]
            visit_counts[state] = visit_counts.get(state, 0) + 1

            sampled_return = calculate_return(episode, step_index)
            return_totals[state] = return_totals.get(state, 0) + sampled_return

            # Refresh the running mean after each new sample.
            estimates[state] = return_totals[state] / visit_counts[state]

    return estimates

def main():
    """Read episodes from the user, run every-visit Monte Carlo, print results."""
    episode_total = int(input("Enter the number of episodes: "))
    # Episode prompts are numbered from 1.
    episodes = [
        generate_episode(number) for number in range(1, episode_total + 1)
    ]

    estimates = monte_carlo_every_visit(episodes)

    print("Estimated state values:")
    for state in estimates:
        print(f"State {state}: {estimates[state]}")

if __name__ == "__main__":
    main()

# A+3 A+2 B-4 A+4 B-3
# B-2 A+3 B-3

Enter the number of episodes: 2
Enter the episode 1: A+3 A+2 B-4 A+4 B-3
Enter the episode 2: B-2 A+3 B-3
Estimated state values:
State A: 0.5
State B: -2.75


## **BOTH (FIRST VISIT OR EVERY VISIT, CHOSEN AT RUNTIME)**

In [4]:
import numpy as np

gamma = 1.0

def generate_episode(i):
    """Prompt for one episode and parse it into ``(state, reward)`` pairs.

    The user types whitespace-separated steps, each a state name immediately
    followed by an integer reward, e.g. ``A+3 A+2 B-4``.

    Args:
        i: Episode number shown in the prompt.

    Returns:
        A list of ``(state, reward)`` tuples in the order typed; ``state`` is
        a string and ``reward`` an int.

    Raises:
        ValueError: If the reward part of a step is not an integer.
    """
    episode = []
    for step in input(f"Enter the episode {i}: ").split():
        # Split at the last sign so state names longer than one character
        # ("S1+3") parse; a step without a sign after its first character
        # ("A3") keeps the one-character-state rule.
        sign_at = max(step.rfind("+"), step.rfind("-"))
        cut = sign_at if sign_at >= 1 else 1
        try:
            reward = int(step[cut:])
        except ValueError:
            raise ValueError(
                f"Invalid step {step!r}: expected <state><reward>, e.g. 'A+3'"
            ) from None
        episode.append((step[:cut], reward))
    return episode

def monte_carlo_first_visit(episodes, discount=None):
    """Estimate state values with first-visit Monte Carlo.

    Within each episode only the first occurrence of a state contributes a
    sample: the discounted sum of rewards from that step to the end of the
    episode. A state's value is the mean of its samples across all episodes.

    Args:
        episodes: Iterable of episodes, each a sequence of
            ``(state, reward)`` pairs. States must be hashable.
        discount: Per-step discount factor. ``None`` (the default) uses the
            module-level ``gamma``, so this estimator discounts the same way
            as the every-visit one defined alongside it.

    Returns:
        Dict mapping every visited state to its estimated value (a float).
        Empty episodes contribute nothing.
    """
    if discount is None:
        discount = gamma

    returns = {}
    state_values = {}

    for episode in episodes:
        if not episode:
            # zip(*[]) yields nothing to unpack; an empty episode has no visits.
            continue
        states, rewards = zip(*episode)

        # Index of each state's first occurrence, built once per episode so
        # the first-visit test is a lookup instead of a rescan of states[:t].
        first_seen = {}
        for t, state in enumerate(states):
            first_seen.setdefault(state, t)

        total_return = 0
        # Walk backwards: G_t = r_t + discount * G_{t+1}.
        for t in range(len(states) - 1, -1, -1):
            state = states[t]
            total_return = rewards[t] + discount * total_return
            if first_seen[state] == t:
                samples = returns.setdefault(state, [])
                samples.append(total_return)
                state_values[state] = sum(samples) / len(samples)

    return state_values

def calculate_return(episode, t):
    """Return the discounted reward sum from step ``t`` to the episode's end.

    The reward at step ``i`` is weighted by ``gamma ** (i - t)``, where
    ``gamma`` is read from module scope.

    Args:
        episode: Sequence of ``(state, reward)`` pairs.
        t: Index of the step the return is measured from.

    Returns:
        The accumulated discounted return; ``0`` when no steps remain.
    """
    discounted_total = 0
    # offset counts steps elapsed since t, i.e. the exponent of gamma.
    for offset, position in enumerate(range(t, len(episode))):
        step_reward = episode[position][1]
        discounted_total = discounted_total + (gamma ** offset) * step_reward
    return discounted_total

def monte_carlo_every_visit(episodes):
    """Estimate state values with every-visit Monte Carlo.

    Every occurrence of a state in every episode contributes one sample: the
    return from that step onward, as computed by ``calculate_return``. A
    state's value is the mean of all its samples.

    Args:
        episodes: Iterable of episodes, each a sequence of
            ``(state, reward)`` pairs.

    Returns:
        Dict mapping every visited state to the mean of its sampled returns.
    """
    visit_counts = {}
    return_totals = {}
    estimates = {}

    for episode in episodes:
        for step_index, step in enumerate(episode):
            state = step[0]
            visit_counts[state] = visit_counts.get(state, 0) + 1

            sampled_return = calculate_return(episode, step_index)
            return_totals[state] = return_totals.get(state, 0) + sampled_return

            # Refresh the running mean after each new sample.
            estimates[state] = return_totals[state] / visit_counts[state]

    return estimates

def main():
    """Read episodes, let the user pick an estimator, and print state values.

    Prompts for the number of episodes and each episode, then asks whether to
    run first-visit (1) or every-visit (2) Monte Carlo. Any other answer,
    including non-numeric text, prints an error message and returns without
    computing anything.
    """
    num_episodes = int(input("Enter the number of episodes: "))
    episodes = [generate_episode(i + 1) for i in range(num_episodes)]

    estimators = {
        1: monte_carlo_first_visit,
        2: monte_carlo_every_visit,
    }
    try:
        visit = int(input("Choose either 1. First Visit or 2. Every Visit: "))
    except ValueError:
        # Non-numeric answers take the same error path as out-of-range ones
        # instead of aborting with a traceback.
        visit = None

    estimator = estimators.get(visit)
    if estimator is None:
        print("Error: choose either 1. First Visit or 2. Every Visit.")
        return

    state_values = estimator(episodes)

    print("Estimated state values:")
    for state, value in state_values.items():
        print(f"State {state}: {value}")

if __name__ == "__main__":
    main()

# A+3 A+2 B-4 A+4 B-3
# B-2 A+3 B-3

Enter the number of episodes: 2
Enter the episode 1: A+3 A+2 B-4 A+4 B-3
Enter the episode 2: B-2 A+3 B-3
Choose either 1. First Visit or 2. Every Visit: 2
Estimated state values:
State A: 0.5
State B: -2.75
