# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

class ArmSimulator:
    """Planar serial arm driven by discrete single-joint rotations.

    The arm is anchored at the origin; each joint angle is measured relative
    to the previous segment.  An action rotates exactly one joint by
    ``rotation_increment`` radians in one direction, and the reward is the
    negative end-effector distance to ``goal_position`` plus a bonus once the
    end effector is within ``goal_tolerance`` of the goal.

    Args:
        segment_sizes: Length of each arm segment, base to tip.
        goal_position: Target ``(x, y)``; defaults to ``(0.9, 0.7)``.
    """

    def __init__(self, segment_sizes=(0.9, 0.7), goal_position=None):
        # The default is an immutable tuple so it can never be shared and
        # mutated across instances; lists passed by callers still work.
        self.segment_sizes = np.array(segment_sizes, dtype=float)
        self.joint_count = len(segment_sizes)
        self.rotation_increment = 0.15
        self.angle_bounds = (-np.pi, np.pi)
        self.goal_tolerance = 0.15    # distance at which an episode terminates
        self.goal_bonus = 15.0        # extra reward granted on termination
        self.state_resolution = 0.25  # width (radians) of one state bin
        self.joint_angles = np.zeros(self.joint_count)
        self.goal_position = (
            np.array([0.9, 0.7]) if goal_position is None else np.array(goal_position)
        )
        self.max_reach = np.sum(self.segment_sizes)

    def initialize(self):
        """Draw uniformly random joint angles and return the discrete state."""
        low, high = self.angle_bounds
        self.joint_angles = np.random.uniform(low, high, self.joint_count)
        return self._discretize_state()

    def execute_action(self, action_id):
        """Apply one action and return ``(state, reward, terminal)``.

        ``action_id // 2`` selects the joint; even ids rotate it by
        ``+rotation_increment`` and odd ids by ``-rotation_increment``.  The
        resulting angle is clipped to ``angle_bounds``.  Ids that do not map
        to an existing joint leave the arm unchanged, but the reward and
        terminal flag are still computed for the current pose.
        """
        joint_id = action_id // 2
        direction = 1 if action_id % 2 == 0 else -1

        if 0 <= joint_id < self.joint_count:
            low, high = self.angle_bounds
            moved = self.joint_angles[joint_id] + direction * self.rotation_increment
            self.joint_angles[joint_id] = np.clip(moved, low, high)

        end_effector = self._compute_position(self.joint_angles)
        distance = np.linalg.norm(end_effector - self.goal_position)

        reward = -distance
        terminal = distance < self.goal_tolerance
        if terminal:
            reward += self.goal_bonus

        return self._discretize_state(), reward, terminal

    def _discretize_state(self):
        """Return the joint angles as a hashable tuple of integer bin indices.

        ``np.floor`` keeps every bin ``state_resolution`` wide.  A plain
        ``astype(int)`` truncates toward zero, which would merge all angles
        in ``(-resolution, resolution)`` into one double-width bin 0 and make
        small positive and negative angles indistinguishable.
        """
        bins = np.floor(self.joint_angles / self.state_resolution).astype(int)
        return tuple(int(index) for index in bins)

    def _joint_coordinates(self, angles):
        """Return the base and every joint/tip position for ``angles``.

        The result has shape ``(joint_count + 1, 2)``; row 0 is the origin
        and the last row is the end effector.
        """
        relative = np.asarray(angles, dtype=float)[:self.joint_count]
        headings = np.cumsum(relative)  # absolute heading of each segment
        offsets = self.segment_sizes[:, None] * np.column_stack(
            (np.cos(headings), np.sin(headings))
        )
        return np.vstack((np.zeros(2), np.cumsum(offsets, axis=0)))

    def _compute_position(self, angles):
        """Return the end-effector ``(x, y)`` for the given joint angles."""
        return self._joint_coordinates(angles)[-1]

    def visualize(self):
        """Plot the current arm pose and the goal with matplotlib."""
        plt.figure(figsize=(8, 8))

        coordinates = self._joint_coordinates(self.joint_angles)

        plt.plot(coordinates[:, 0], coordinates[:, 1], 'b-', linewidth=4)
        plt.plot(coordinates[:, 0], coordinates[:, 1], 'ko', markersize=8)
        plt.plot(self.goal_position[0], self.goal_position[1], 'r*', markersize=16)

        # Leave a 20% margin around the fully extended arm.
        workspace_radius = self.max_reach * 1.2
        plt.xlim([-workspace_radius, workspace_radius])
        plt.ylim([-workspace_radius, workspace_radius])
        plt.grid(True)
        plt.title('Articulated Arm Simulation')
        plt.xlabel('X Position')
        plt.ylabel('Y Position')
        plt.show()


class MonteCarloLearner:
    """Tabular first-visit Monte Carlo control with epsilon-greedy exploration.

    The simulator must expose ``joint_count``, ``initialize()`` returning a
    hashable state, and ``execute_action(action)`` returning
    ``(next_state, reward, done)``.

    Args:
        simulator: Environment to learn in.
        exploration_rate: Probability of choosing a uniformly random action
            while exploring.
        discount_factor: Discount applied to future rewards.
    """

    def __init__(self, simulator, exploration_rate=0.15, discount_factor=0.92):
        self.simulator = simulator
        self.exploration_rate = exploration_rate
        self.discount_factor = discount_factor
        self.action_count = simulator.joint_count * 2
        # All three tables are keyed by state and hold one slot per action.
        self.state_action_values = defaultdict(self._zero_action_vector)
        self.returns_sum = defaultdict(self._zero_action_vector)
        self.returns_count = defaultdict(self._zero_action_vector)
        # Per-state action-probability vectors.  They are recorded here but
        # select_action() does not sample from them.
        self.behavior_policy = {}

    def _zero_action_vector(self):
        """Return a fresh all-zero vector with one entry per action."""
        return np.zeros(self.action_count)

    def select_action(self, state, explore=True):
        """Choose an action for ``state``.

        With ``explore`` set, a uniformly random action is returned with
        probability ``exploration_rate``; otherwise the action with the
        highest estimated value is returned.
        """
        if state not in self.behavior_policy:
            self.behavior_policy[state] = np.ones(self.action_count) / self.action_count

        if explore and np.random.random() < self.exploration_rate:
            return np.random.randint(self.action_count)
        return np.argmax(self.state_action_values[state])

    def collect_trajectory(self, max_steps=150, explore=True):
        """Run one episode and return its ``(state, action, reward)`` steps.

        The episode ends when the simulator reports termination or after
        ``max_steps`` actions, whichever comes first.
        """
        trajectory = []
        state = self.simulator.initialize()

        for _ in range(max_steps):
            action = self.select_action(state, explore)
            next_state, reward, done = self.simulator.execute_action(action)
            trajectory.append((state, action, reward))
            state = next_state
            if done:
                break

        return trajectory

    def update_behavior_policy(self, state):
        """Make the recorded policy for ``state`` greedy w.r.t. current values."""
        best_action = np.argmax(self.state_action_values[state])
        self.behavior_policy[state] = np.zeros(self.action_count)
        self.behavior_policy[state][best_action] = 1.0

    def _learn_from_trajectory(self, trajectory, improve):
        """Fold one episode's first-visit returns into the value table.

        Returns are accumulated backwards through the episode, but only the
        earliest occurrence of each ``(state, action)`` pair is credited.
        When ``improve`` is true the recorded policy of every updated state
        is made greedy afterwards.
        """
        # Index of the first time each pair appears in the episode.
        first_visit = {}
        for step, (state, action, _) in enumerate(trajectory):
            first_visit.setdefault((state, action), step)

        returns = 0.0
        for step in range(len(trajectory) - 1, -1, -1):
            state, action, reward = trajectory[step]
            returns = self.discount_factor * returns + reward

            if first_visit[(state, action)] != step:
                continue  # a later repeat of the pair: not the first visit

            # Scalar slots: assigning a whole per-action vector into a single
            # Q entry would raise ValueError.
            self.returns_sum[state][action] += returns
            self.returns_count[state][action] += 1
            self.state_action_values[state][action] = (
                self.returns_sum[state][action] / self.returns_count[state][action]
            )
            if improve:
                self.update_behavior_policy(state)

    def evaluate_policy(self, episodes=500):
        """Update value estimates from ``episodes`` exploratory episodes."""
        for _ in range(episodes):
            trajectory = self.collect_trajectory(explore=True)
            self._learn_from_trajectory(trajectory, improve=False)

    def improve_policy(self, episodes=3000):
        """Run Monte Carlo control for ``episodes`` episodes.

        Every 300 episodes progress is printed and the greedy policy is
        assessed.
        """
        for i in range(episodes):
            trajectory = self.collect_trajectory(explore=True)
            self._learn_from_trajectory(trajectory, improve=True)

            if (i+1) % 300 == 0:
                print(f"Training progress: {i+1}/{episodes} episodes completed")
                self.assess_performance()

    def assess_performance(self, trials=5, steps_limit=100, show_visualization=False):
        """Run greedy episodes and return ``(average reward, success rate)``.

        A trial counts as a success when the simulator reports termination
        within ``steps_limit`` actions.  The metrics are also printed.  If
        ``show_visualization`` is set, the simulator's final pose is plotted.
        """
        success_counter = 0
        reward_total = 0

        for _trial in range(trials):
            state = self.simulator.initialize()
            episode_reward = 0

            for _step in range(steps_limit):
                action = self.select_action(state, explore=False)
                state, reward, terminal = self.simulator.execute_action(action)
                episode_reward += reward

                if terminal:
                    success_counter += 1
                    break

            reward_total += episode_reward

        if show_visualization:
            self.simulator.visualize()

        avg_reward = reward_total / trials
        success_rate = success_counter / trials
        print(f"Performance metrics - Average reward: {avg_reward:.2f}, Success rate: {success_rate:.2f}")

        return avg_reward, success_rate


def run_experiment():
    """Train a Monte Carlo agent on a two-segment arm and report the result.

    Builds the simulator and learner, runs 4000 training episodes, then
    evaluates the greedy policy over 10 trials and plots the final pose.
    """
    simulator = ArmSimulator(goal_position=[1.2, 0.5], segment_sizes=[1.1, 0.75])
    learner = MonteCarloLearner(simulator, discount_factor=0.9, exploration_rate=0.25)

    print("Starting Monte Carlo training procedure...")
    learner.improve_policy(episodes=4000)

    print("\nFinal performance evaluation:")
    learner.assess_performance(show_visualization=True, trials=10)

# Run the experiment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    run_experiment()