In [None]:
!pip install numpy
!pip install matplotlib
!pip install gymnasium[box2d] 

In [None]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
from collections import defaultdict
import time

class DiscretizedAcrobot:
    """Wrap Gymnasium's ``Acrobot-v1`` with binned observations and a shaped reward.

    Every observation component is mapped to an integer bin index so that a
    state is a hashable tuple usable as a dictionary key. Each step's reward
    is augmented with a small bonus proportional to the height of the free
    end of the second link.

    Attributes:
        env: The underlying Gymnasium environment.
        bins: Number of bins used for every observation dimension.
        num_actions: Size of the environment's discrete action space.
        observation_space_high: Upper edge of the binning range per dimension.
        observation_space_low: Lower edge of the binning range per dimension.
    """

    def __init__(self, bins=10, render_mode=None):
        """Create the environment and the binning range.

        Args:
            bins: Number of bins per observation dimension.
            render_mode: Forwarded to ``gym.make`` (e.g. ``"human"`` or None).
        """
        self.env = gym.make("Acrobot-v1", render_mode=render_mode)
        self.bins = bins
        self.num_actions = self.env.action_space.n

        # Order: cos(theta1), sin(theta1), cos(theta2), sin(theta2),
        # angular velocity of theta1, angular velocity of theta2.
        # Values outside this range are clipped into the first/last bin.
        self.observation_space_high = np.array([
            1.0, 1.0, 1.0, 1.0, 12.0, 28.0
        ])
        self.observation_space_low = np.array([
            -1.0, -1.0, -1.0, -1.0, -12.0, -28.0
        ])

    def reset(self, seed=None):
        """Reset the environment.

        Args:
            seed: Optional seed forwarded to the underlying ``reset``.

        Returns:
            A ``(discrete_state, info)`` pair.
        """
        obs, info = self.env.reset(seed=seed)
        return self._discretize_observation(obs), info

    def step(self, action):
        """Advance one step and return the discretized state with shaped reward.

        Args:
            action: Action index passed to the underlying environment.

        Returns:
            ``(discrete_state, shaped_reward, terminated, truncated, info)``
            where ``shaped_reward`` is the environment reward plus
            ``0.1 * tip_height``.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        enhanced_reward = reward + 0.1 * self._tip_height(obs)
        return self._discretize_observation(obs), enhanced_reward, terminated, truncated, info

    @staticmethod
    def _tip_height(observation):
        """Return the height of the free end above its lowest position.

        The value is ``-cos(theta1) - cos(theta1 + theta2) + 2``, which is 0
        when both links hang straight down and 4 when both point straight up.

        Args:
            observation: Sequence whose first four entries are
                ``cos(theta1), sin(theta1), cos(theta2), sin(theta2)``.

        Returns:
            The tip height as a Python float.
        """
        cos1, sin1, cos2, sin2 = (float(v) for v in observation[:4])
        # Angle-sum identity. Recovering the angles with arccos would lose
        # their sign (arccos only returns [0, pi]) and give the wrong value
        # for cos(theta1 + theta2) when the two sines have opposite signs.
        cos_sum = cos1 * cos2 - sin1 * sin2
        return -cos1 - cos_sum + 2.0

    def _discretize_observation(self, observation):
        """Map a continuous observation to a tuple of bin indices.

        Args:
            observation: Array with one entry per observation dimension.

        Returns:
            Tuple of Python ints, each in ``[0, bins - 1]``.
        """
        span = self.observation_space_high - self.observation_space_low
        scaled = (observation - self.observation_space_low) / span * self.bins

        # Truncate to an index, then clip so out-of-range values (including a
        # value exactly on the upper edge) land in the first/last bin.
        discretized = np.clip(scaled.astype(np.int32), 0, self.bins - 1)

        return tuple(discretized.tolist())

    def render(self):
        """Render the underlying environment and return its result."""
        return self.env.render()

    def close(self):
        """Close the underlying environment."""
        self.env.close()

class MonteCarloAgent:
    """Tabular epsilon-greedy agent trained with first-visit Monte Carlo returns.

    Attributes:
        env: Environment exposing ``reset``, ``step``, ``close``, ``bins`` and
            ``num_actions`` (as ``DiscretizedAcrobot`` does).
        epsilon: Probability of taking a uniformly random action while exploring.
        gamma: Discount factor applied when accumulating returns.
        num_actions: Number of discrete actions.
        Q: Maps state -> array of action-value estimates (zeros by default).
        returns_sum: Maps (state, action) -> sum of sampled returns.
        returns_count: Maps (state, action) -> number of sampled returns.
        policy: Maps state -> action-probability array (uniform by default).
    """

    def __init__(self, env, epsilon=0.1, gamma=0.99):
        """Store hyper-parameters and create empty value tables.

        Args:
            env: The environment to learn in.
            epsilon: Initial exploration probability.
            gamma: Discount factor.
        """
        self.env = env
        self.epsilon = epsilon
        self.gamma = gamma
        self.num_actions = env.num_actions

        self.Q = defaultdict(lambda: np.zeros(self.num_actions))
        self.returns_sum = defaultdict(float)
        self.returns_count = defaultdict(int)
        self.policy = defaultdict(lambda: np.ones(self.num_actions) / self.num_actions)

    def select_action(self, state, exploring=True):
        """Pick an action epsilon-greedily with respect to ``Q``.

        Args:
            state: Hashable state key.
            exploring: When False the greedy action is always returned.

        Returns:
            The chosen action index. Ties in ``Q`` resolve to the lowest index.
        """
        if exploring and np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        return np.argmax(self.Q[state])

    def update_policy(self, state):
        """Make the stored policy for ``state`` greedy with respect to ``Q``.

        Args:
            state: Hashable state key whose policy row is overwritten in place.
        """
        best_action = np.argmax(self.Q[state])
        action_probs = self.policy[state]
        action_probs[:] = 0.0
        action_probs[best_action] = 1.0

    def generate_episode(self, max_steps=500, exploring=True):
        """Roll out one episode with the current action-selection rule.

        Args:
            max_steps: Upper bound on the number of steps taken.
            exploring: Forwarded to ``select_action``.

        Returns:
            List of ``(state, action, reward)`` tuples in time order.
        """
        episode = []
        state, _ = self.env.reset()

        for _ in range(max_steps):
            action = self.select_action(state, exploring)
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            episode.append((state, action, reward))

            if terminated or truncated:
                break

            state = next_state

        return episode

    def _update_from_episode(self, episode_data, improve_policy=False):
        """Fold one episode's first-visit returns into the running ``Q`` averages.

        Args:
            episode_data: List of ``(state, action, reward)`` tuples in time order.
            improve_policy: When True, also make the stored policy greedy for
                every state whose estimate was updated.
        """
        # Earliest time step at which each (state, action) pair occurs.
        first_visit = {}
        for t, (state, action, _) in enumerate(episode_data):
            first_visit.setdefault((state, action), t)

        G = 0.0
        # Walk backwards so the discounted return is built incrementally.
        for t in range(len(episode_data) - 1, -1, -1):
            state, action, reward = episode_data[t]
            G = self.gamma * G + reward

            state_action = (state, action)
            # Credit only the earliest occurrence. Skipping pairs "already
            # seen" during this backward walk would credit the last
            # occurrence instead.
            if first_visit[state_action] != t:
                continue

            self.returns_sum[state_action] += G
            self.returns_count[state_action] += 1
            self.Q[state][action] = (
                self.returns_sum[state_action] / self.returns_count[state_action]
            )

            if improve_policy:
                self.update_policy(state)

    def monte_carlo_prediction(self, num_episodes=100):
        """Estimate ``Q`` for the current epsilon-greedy behaviour.

        Args:
            num_episodes: Number of episodes to sample.
        """
        for episode in range(num_episodes):
            if episode % 10 == 0:
                print(f"Prediction Episode {episode}/{num_episodes}")

            self._update_from_episode(self.generate_episode())

    def monte_carlo_control(self, num_episodes=1000):
        """Train with Monte Carlo control under a linearly decaying epsilon.

        ``self.epsilon`` is overwritten every episode, moving linearly from its
        current value to 0.01, and keeps its final value afterwards.

        Args:
            num_episodes: Number of training episodes.

        Returns:
            ``(episode_rewards, episode_lengths)``, one entry per episode.
        """
        episode_rewards = []
        episode_lengths = []
        epsilon_schedule = np.linspace(self.epsilon, 0.01, num_episodes)

        for episode in range(num_episodes):
            self.epsilon = epsilon_schedule[episode]

            episode_data = self.generate_episode(exploring=True)
            episode_rewards.append(sum(r for _, _, r in episode_data))
            episode_lengths.append(len(episode_data))

            if episode % 20 == 0:
                # A [-20:] slice already copes with fewer than 20 entries.
                avg_reward = np.mean(episode_rewards[-20:])
                avg_length = np.mean(episode_lengths[-20:])
                print(f"Episode {episode}/{num_episodes}, Avg Reward: {avg_reward:.2f}, Avg Length: {avg_length:.1f}")

            self._update_from_episode(episode_data, improve_policy=True)

        return episode_rewards, episode_lengths

    def evaluate(self, num_episodes=10, render=False):
        """Run the greedy policy and report average reward and episode length.

        Args:
            num_episodes: Number of evaluation episodes; must be positive.
            render: When True, a separate ``"human"``-mode environment is
                created for the run and closed afterwards. The agent's own
                environment is left untouched.

        Returns:
            ``(average_reward, average_length)``.

        Raises:
            ValueError: If ``num_episodes`` is not positive.
        """
        if num_episodes <= 0:
            raise ValueError("num_episodes must be a positive integer")

        if render:
            eval_env = DiscretizedAcrobot(bins=self.env.bins, render_mode="human")
        else:
            eval_env = self.env

        total_rewards = 0
        episode_lengths = []

        try:
            for _ in range(num_episodes):
                state, _ = eval_env.reset()
                episode_reward = 0
                steps = 0
                done = False
                truncated = False

                while not (done or truncated):
                    action = self.select_action(state, exploring=False)
                    next_state, reward, done, truncated, _ = eval_env.step(action)
                    episode_reward += reward
                    state = next_state
                    steps += 1

                    if render:
                        eval_env.render()
                        time.sleep(0.01)

                total_rewards += episode_reward
                episode_lengths.append(steps)
        finally:
            # Only the environment created here is ours to close.
            if render:
                eval_env.close()

        avg_reward = total_rewards / num_episodes
        avg_length = np.mean(episode_lengths)
        return avg_reward, avg_length

    @staticmethod
    def _plot_learning_curve(ax, values, title, ylabel, window=50):
        """Draw one per-episode series plus its moving average on ``ax``.

        Args:
            ax: Matplotlib axes to draw on.
            values: Per-episode values.
            title: Axes title.
            ylabel: Label for the y axis.
            window: Moving-average width; the average is drawn only when at
                least ``window`` values are available.
        """
        ax.plot(values)
        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(ylabel)
        ax.grid(True)

        if len(values) >= window:
            rolling_mean = np.convolve(values, np.ones(window) / window, mode='valid')
            # 'valid' mode yields len(values) - window + 1 points; align each
            # with the last episode of its window.
            ax.plot(range(window - 1, len(values)), rolling_mean, 'r-', linewidth=2,
                    label=f'{window}-episode Moving Avg')
            ax.legend()

    def plot_results(self, rewards, lengths):
        """Plot reward and episode-length learning curves and show the figure.

        Args:
            rewards: Per-episode total rewards.
            lengths: Per-episode step counts.
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10))

        self._plot_learning_curve(ax1, rewards, 'Learning Curve - Rewards', 'Episode Reward')
        self._plot_learning_curve(ax2, lengths, 'Learning Curve - Episode Lengths',
                                  'Steps per Episode')

        plt.tight_layout()
        plt.show()

def main():
    """Train a Monte Carlo agent on discretized Acrobot, plot, and evaluate it.

    Runs 100 prediction episodes, then 2000 control episodes, shows the
    learning curves, and finally evaluates the greedy policy for 10 rendered
    episodes, printing the average reward and episode length.
    """
    env = DiscretizedAcrobot(bins=8)

    try:
        agent = MonteCarloAgent(env, epsilon=0.2, gamma=0.99)

        print("Running Monte Carlo Prediction on initial policy...")
        agent.monte_carlo_prediction(num_episodes=100)

        print("\nTraining with Monte Carlo Control...")
        rewards, lengths = agent.monte_carlo_control(num_episodes=2000)

        agent.plot_results(rewards, lengths)

        print("\nEvaluating learned policy...")
        avg_reward, avg_length = agent.evaluate(num_episodes=10, render=True)
        print(f"Average Reward: {avg_reward:.2f}")
        print(f"Average Episode Length: {avg_length:.1f}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

# Run the pipeline only when this module is the entry point, not when imported.
if __name__ == "__main__":
    main()

Running Monte Carlo Prediction on initial policy...
Prediction Episode 0/100
Prediction Episode 10/100
Prediction Episode 20/100
Prediction Episode 30/100
Prediction Episode 40/100
Prediction Episode 50/100
Prediction Episode 60/100
Prediction Episode 70/100
Prediction Episode 80/100
Prediction Episode 90/100

Training with Monte Carlo Control...
Episode 0/2000, Avg Reward: -433.03, Avg Length: 500.0
Episode 20/2000, Avg Reward: -438.45, Avg Length: 483.2
Episode 40/2000, Avg Reward: -454.51, Avg Length: 497.4
Episode 60/2000, Avg Reward: -453.32, Avg Length: 500.0
Episode 80/2000, Avg Reward: -434.74, Avg Length: 485.8
Episode 100/2000, Avg Reward: -445.31, Avg Length: 496.3
Episode 120/2000, Avg Reward: -441.33, Avg Length: 490.2
Episode 140/2000, Avg Reward: -453.17, Avg Length: 500.0
Episode 160/2000, Avg Reward: -448.17, Avg Length: 497.6
Episode 180/2000, Avg Reward: -449.05, Avg Length: 500.0
Episode 200/2000, Avg Reward: -444.61, Avg Length: 490.5
Episode 220/2000, Avg Reward: 