In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install ipywidgets matplotlib


In [None]:
# Q-learning parameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 5000  # Total number of episodes
epsilon_decay = 0.995  # Decay rate for epsilon
total_rewards = []

In [520]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import os
import matplotlib.colors as mcolors


class MountainCarAgent:
    """Initialize the agen, Q_table, create the state_grid"""
    def __init__(self, alpha=0.1, gamma=0.99, epsilon=0.15, epsilon_decay=0.9, min_epsilon=0.01, num_episodes=1001, num_states=[20, 20]):
        self.env = gym.make('MountainCar-v0')
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.num_episodes = num_episodes
        self.num_states = num_states
        self.state_bounds = list(zip(self.env.observation_space.low, self.env.observation_space.high))
        self.state_grid = [np.linspace(bound[0], bound[1], num_states[i]) for i, bound in enumerate(self.state_bounds)]
        self.Q_table = np.zeros(self.num_states + [self.env.action_space.n])
        self.total_rewards = []
        self.episode_lengths = []

    """Convert a continuous state to its discretized form."""
    def discretize_state(self, state):
        discretized_state = []
        for s, grid in zip(state, self.state_grid):
            index = np.digitize(s, grid) - 1
            index = max(0, min(index, len(grid) - 1))
            discretized_state.append(index)
        return tuple(discretized_state)

    """Return an action based on the epsilon-greedy policy."""
    def epsilon_greedy_policy(self, state):
        if np.random.random() < epsilon:
            return self.env.action_space.sample()  # Explore
        else:
            return np.argmax(self.Q_table[state])  # Exploit

    """Create a directory if it doesn't exist."""
    def create_directory(self, dir_path):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    """Save the Q_table for later use."""
    def save_Q_table(self, filename, folder="mountain_car/q_tables"):
        self.create_directory(folder)
        np.save(os.path.join(folder, filename), self.Q_table)

    """Load the Q_table to be used."""
    def load_Q_table(self, filename, folder="mountain_car/q_tables"):
            self.Q_table = np.load(filename)

    """Train the agent using Q-learning."""
    def train(self):
        for episode in range(self.num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            total_reward, steps = 0, 0

            done = False
            while not done:
                action = self.epsilon_greedy_policy(state)
                next_state_raw, reward, done, _, _ = self.env.step(action)
                next_state = self.discretize_state(next_state_raw)

                # Q-table update
                best_next_action = np.argmax(self.Q_table[next_state])
                td_target = reward + self.gamma * self.Q_table[next_state][best_next_action]
                self.Q_table[state][action] += self.alpha * (td_target - self.Q_table[state][action])

                state = next_state
                total_reward += reward
                steps += 1

            self.total_rewards.append(total_reward)
            self.episode_lengths.append(steps)
            
            # Decay epsilon
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.min_epsilon)

            if self.num_episodes < 1500 and episode % 100 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-100:])}")
            elif self.num_episodes < 10500 and episode % 500 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-500:])}")
            elif self.num_episodes > 10500 and episode % 2500 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-2500:])}")

        print("Training complete!")

    """ Plot the Q_table graphics and save them"""
    def plot_and_save_Q_table_analysis(self, test_id = 0, folder="mountain_car/tests/learning"):
        self.create_directory(folder)
        num_positions = len(self.state_grid[0])
        num_velocities = len(self.state_grid[1])
        policy = np.argmax(self.Q_table, axis=2)

        positions = np.linspace(-1.2, 0.6, num_positions)
        velocities = np.linspace(-0.07, 0.07, num_velocities)
        pos_grid, vel_grid = np.meshgrid(positions, velocities)
        action_colors = {0: 'red', 1: 'green', 2: 'blue'}
        color_array = np.array([[action_colors[action] for action in row] for row in policy])

        plt.figure(figsize=(18, 24))
        
        # Subplot 1: Learning Curve
        plt.subplot(5, 2, 1)
        plt.plot(self.total_rewards)  
        plt.title('Learning Curve: Total Reward per Episode')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')

        # Subplot 2: Episode Length Over Time
        plt.subplot(5, 2, 2)
        plt.plot(self.episode_lengths)  
        plt.title('Episode Length Over Time')
        plt.xlabel('Episode')
        plt.ylabel('Length of Episode')

        # Subplot 3: Heatmap for First Action
        selected_action = 0  
        plt.subplot(5, 2, 3)
        sns.heatmap(self.Q_table[:, :, selected_action])  # Replace with your Q_table data
        plt.title(f'Heatmap of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Subplot 4: 3D Surface Plot for the Same Action
        ax = plt.subplot(5, 2, 4, projection='3d')
        X, Y = np.meshgrid(np.arange(self.Q_table.shape[0]), np.arange(self.Q_table.shape[1]))
        Z = self.Q_table[:, :, selected_action]  # Replace with your Q_table data
        ax.plot_surface(X, Y, Z, cmap='viridis')
        ax.set_title(f'3D Surface Plot of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')
        ax.set_zlabel('Q-Value')

        # Subplot 5: Heatmap for Second Action
        selected_action = 1
        plt.subplot(5, 2, 5)
        sns.heatmap(self.Q_table[:, :, selected_action])  # Replace with your Q_table data
        plt.title(f'Heatmap of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Subplot 6: 3D Surface Plot for the Same Action
        ax = plt.subplot(5, 2, 6, projection='3d')
        X, Y = np.meshgrid(np.arange(self.Q_table.shape[0]), np.arange(self.Q_table.shape[1]))
        Z = self.Q_table[:, :, selected_action]  
        ax.plot_surface(X, Y, Z, cmap='viridis')
        ax.set_title(f'3D Surface Plot of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')
        ax.set_zlabel('Q-Value')

        # Subplot 7: Heatmap for Third Action
        selected_action = 2
        plt.subplot(5, 2, 7)
        sns.heatmap(self.Q_table[:, :, selected_action]) 
        plt.title(f'Heatmap of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Subplot 8: 3D Surface Plot for the Same Action
        ax = plt.subplot(5, 2, 8, projection='3d')
        X, Y = np.meshgrid(np.arange(self.Q_table.shape[0]), np.arange(self.Q_table.shape[1]))
        Z = self.Q_table[:, :, selected_action]  
        ax.plot_surface(X, Y, Z, cmap='viridis')
        ax.set_title(f'3D Surface Plot of Q-Values for Action {selected_action}')
        plt.xlabel('Position')
        plt.ylabel('Velocity')
        ax.set_zlabel('Q-Value')

        # Subplot 9: Policy Map (Color-Coded Actions)
        plt.subplot(5, 2, 9)
        cmap_colors = [action_colors[i] for i in sorted(action_colors)]
        cmap = sns.color_palette(cmap_colors, as_cmap=True)
        policy_heatmap = sns.heatmap(policy, cmap=cmap, annot=False)  # 'policy' and 'cmap' as defined earlier
        plt.title("Policy Map (Color-Coded Actions)")
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Subplot 10: Position and Velocity Policy Visualization
        plt.subplot(5, 2, 10)
        plt.scatter(pos_grid, vel_grid, color=color_array.ravel()) 
        plt.title('Policy Visualization for Mountain Car')
        plt.xlabel('Position')
        plt.ylabel('Velocity')
        patches = [mpatches.Patch(color=color, label=label) for label, color in action_colors.items()]  
        plt.legend(handles=patches, title="Actions", title_fontsize='13', loc='best')

        # # Adjust layout and show
        # plt.tight_layout()
        # plt.show()
        
        # Save the figure based on the test_id
        plt.tight_layout()
        filename = f"Q_table_analysis_test_{test_id}.png"
        plt.savefig(os.path.join(folder, filename))
        plt.close()
        
    """Test the trained policy over a number of episodes."""        
    def test_policy(self, num_episodes=1000, max_steps=1000):
        success_count = 0
        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            step = 0
            while not done and step < max_steps:
                action = np.argmax(self.Q_table[state])
                state_raw, _, done, _, _ = self.env.step(action)
                state = self.discretize_state(state_raw)
                step += 1
                if done and state_raw[0] >= 0.5:  # Check if goal is reached
                    success_count += 1

        success_rate = success_count / num_episodes
        return success_rate
    
    """Test the policy and gather detailed analysis data."""
    def test_policy_analyzed(self, num_episodes=10000, max_steps=1000):
        success_count = 0
        episode_lengths = []
        episode_rewards = []
        grid_x, grid_y = self.num_states
        state_visits = np.zeros(grid_x * grid_y)
        episode_end_states = []
        episode_outcomes = []
        failures = []

        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            success = False
            step = 0
            total_reward = 0

            while not done and step < max_steps:
                action = np.argmax(self.Q_table[state])
                state_raw, reward, done, _, _ = self.env.step(action)
                new_state = self.discretize_state(state_raw)
                total_reward += reward
                x, y = new_state
                single_index = y * grid_x + x
                state_visits[single_index] += 1

                if done and state_raw[0] < 0.5:
                    failures.append((episode, state_raw, step))

                state = new_state
                step += 1

                if done and state_raw[0] >= 0.5:
                    success = True
                    success_count += 1

            episode_lengths.append(step)
            episode_rewards.append(total_reward)
            episode_end_states.append(state_raw)
            episode_outcomes.append(success)

        success_rate = success_count / num_episodes
        analysis_data = {
            'success_rate': success_rate,
            'episode_lengths': episode_lengths,
            'episode_rewards': episode_rewards,
            'state_visits': state_visits,
            'episode_end_states': episode_end_states,
            'failures': failures,
            'episode_outcomes': episode_outcomes,
            'episodes' : num_episodes
        }
        return analysis_data
    
    """Generate and save plots based on analysis data."""
    def plot_analysis_data(self, analysis_data, test_id = 0, folder="mountain_car/tests/analysis"):
        self.create_directory(folder)
        plt.figure(figsize=(15, 20))

        # Scatter Plot of End States Color-Coded by Success or Failure
        end_states = np.array(analysis_data['episode_end_states'])
        outcomes = np.array(analysis_data['episode_outcomes'])
        cmap = mcolors.ListedColormap(['red', 'green'])
        binary_outcomes = np.where(outcomes, 1, 0)
        plt.subplot(3, 2, 1)
        plt.scatter(end_states[:, 0], end_states[:, 1], c=binary_outcomes, cmap=cmap)
        plt.title('End States Scatter Plot (Red=Failure, Green=Succes)')
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Histogram of Episode Lengths
        plt.subplot(3, 2, 2)
        plt.hist(analysis_data['episode_lengths'], bins=30, color='skyblue')
        plt.title('Histogram of Episode Lengths')
        plt.xlabel('Episode Length')
        plt.ylabel('Frequency')

        # Line Plot of Episode Rewards
        plt.subplot(3, 2, 3)
        plt.plot(analysis_data['episode_rewards'], color='green')
        plt.title('Episode Rewards Over Time')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')

        # Heatmap of State Visits
        plt.subplot(3, 2, 4)
        state_visits_reshaped = np.reshape(analysis_data['state_visits'], self.num_states)
        sns.heatmap(state_visits_reshaped, cmap='viridis', annot=False)
        plt.title('Heatmap of State Visits')
        plt.xlabel('Position')
        plt.ylabel('Velocity')

        # Simple plot to display success or failure for each episode
        plt.subplot(3, 2, 5)
        plt.plot(binary_outcomes, marker='o', linestyle='', color='blue')
        plt.title('Success (1) or Failure (0) Per Episode')
        plt.xlabel('Episode')
        plt.ylabel('Outcome (1=Success, 0=Failure)')
        plt.yticks([0, 1], ['Failure', 'Success'])
        plt.grid(True)

        # Line plot for episode lengths
        plt.subplot(3, 2, 6)
        plt.plot(analysis_data['episode_lengths'], color='purple')
        plt.title('Episode Lengths Over Time')
        plt.xlabel('Episode')
        plt.ylabel('Length of Episode')
        plt.grid(True)
        
        # # Adjust layout and show
        # plt.tight_layout()
        # plt.show()

        plt.tight_layout()
        filename = f"analysis_plots_test_{test_id}_{analysis_data['episodes']}_episodes.png"
        plt.savefig(os.path.join(folder, filename))
        plt.close()
        
    """Visualize the agent's policy for a specified number of episodes."""
    def test_policy_with_visualization(self, num_episodes=5):
        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            step = 0

            while not done:
                action = np.argmax(self.Q_table[state])
                action_taken = "Turn Left" if action == 0 else ("Do Nothing" if action == 1 else "Turn Right")
                self.custom_render(step, action_taken)
                state_raw, _, done, _, _ = self.env.step(action)
                state = self.discretize_state(state_raw)
                step += 1
           
        """Custom render function to visualize the state of the environment."""
    def custom_render(self, step, action_taken):
        plt.figure(figsize=(8, 4))
        positions = np.linspace(self.env.unwrapped.min_position, self.env.unwrapped.max_position, 100)
        mountain_heights = np.sin(3 * positions)
        plt.plot(positions, mountain_heights, color='gray', linewidth=2)
        car_position = self.env.unwrapped.state[0]
        plt.scatter(car_position, np.sin(3 * car_position), c='blue', marker='o', s=200)
        plt.xlabel("Position")
        plt.ylabel("Mountain Height")
        plt.title(f"MountainCarEnv - Step {step} - Action {action_taken}")
        plt.xlim(self.env.unwrapped.min_position, self.env.unwrapped.max_position)
        plt.ylim(-1.2, 1.2)
        display.clear_output(wait=True)
        display.display(plt.gcf())



In [None]:
# # Usage with default values
# default_agent = MountainCarAgent()
# default_agent.train()
# default_agent.plot_and_save_Q_table_analysis()
# #default_agent.plot_results()

config = {
    'alpha': 0.1,
    'gamma': 0.99,
    'epsilon': 0.3,
    'epsilon_decay': 0.9,
    'min_epsilon': 0.01,
    'num_episodes': 50001,
    'num_states': [100,100]
}

custom_agent = MountainCarAgent(**config)
custom_agent.train()
custom_agent.plot_and_save_Q_table_analysis(10)

In [None]:
succes_rate = custom_agent.test_policy(num_episodes = 100)
print(f"Success rate of the learned policy: {success_rate * 100:.2f}%")

In [None]:
analysis_data = custom_agent.test_policy_analyzed(num_episodes = 50)
custom_agent.plot_analysis_data(analysis_data)

In [None]:
custom_agent.test_policy_with_visualization()

In [521]:
# Tests for comparation between different approaches

configs = [
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [10, 10]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [10, 10]},
    
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [10, 10]},  
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [10, 10]},  
    
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [20, 20]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [20, 20]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [20, 20]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 25000, 'num_states': [20, 20]},
    
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [20, 20]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [20, 20]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [20, 20]},  
    
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [50, 50]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [50, 50]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},
    
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [50, 50]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [50, 50]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.5, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.9, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},
    
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [100, 100]},  
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 25000, 'num_states': [100, 100]},
    
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.4, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 100000, 'num_states': [50, 50]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [20, 20]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [10, 10]},
    {'alpha': 0.25, 'gamma': 0.5, 'epsilon': 0.4, 'epsilon_decay': 0.9, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},

]


# Test for each configuration
for i, config in enumerate(configs):
    print(f"Testing configuration {i+1}: {config}")

    # Initialize and train the agent
    agent = MountainCarAgent(**config)
    agent.train()

    # Save Q-table plot
    agent.plot_and_save_Q_table_analysis(test_id=i+1)

    # Run a simple test with 1000 episodes
    success_rate = agent.test_policy(num_episodes=1000)
    print(f"Success rate with 1000 episodes: {success_rate * 100:.2f}%")
    
    # Save the Q-table
    q_table_filename = f"q_table_config_{i+1}.npy"
    agent.save_Q_table(q_table_filename)
    print(f"Q-table for configuration {i+1} saved.")

    # Run analysis tests with different number of episodes
    for num_episodes in [100, 500, 1000, 10000]:
        analysis_data = agent.test_policy_analyzed(num_episodes=num_episodes)
        agent.plot_analysis_data(analysis_data, test_id=i+1)
        print(f"Analysis for {num_episodes} episodes saved.")
        print(f"Success Rate: {analysis_data['success_rate'] * 100:.2f}%, "
              f"Average Episode Length: {np.mean(analysis_data['episode_lengths']):.2f} steps, "
              f"Min Episode Length: {np.min(analysis_data['episode_lengths'])} steps, "
              f"Average Reward: {np.mean(analysis_data['episode_rewards']):.2f}, "
              f"Failures: {len(analysis_data['failures'])}, "
              f"Total Episodes: {analysis_data['episodes']}")
    
    print("----------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

Testing configuration 1: {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [10, 10]}


  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -110099.0
Episode: 100, Average Reward: -656.27
Episode: 200, Average Reward: -1145.66
Episode: 300, Average Reward: -1177.25
Episode: 400, Average Reward: -1358.67
Training complete!
Success rate with 1000 episodes: 72.00%
Q-table for configuration 1 saved.
Analysis for 100 episodes saved.
Success Rate: 70.00%, Average Episode Length: 508.58 steps, Min Episode Length: 270 steps, Average Reward: -508.58, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 73.60%, Average Episode Length: 484.08 steps, Min Episode Length: 268 steps, Average Reward: -484.08, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 69.70%, Average Episode Length: 513.39 steps, Min Episode Length: 268 steps, Average Reward: -513.39, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 70.22%, Average Episode Length: 508.65 steps, Min Episode Length: 268 steps, Average Reward: -508.65, Failures: 0, T

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -36786.0
Episode: 100, Average Reward: -766.64
Episode: 200, Average Reward: -754.84
Episode: 300, Average Reward: -2231.87
Episode: 400, Average Reward: -408.26
Episode: 500, Average Reward: -1137.34
Episode: 600, Average Reward: -2355.37
Episode: 700, Average Reward: -529.97
Episode: 800, Average Reward: -1182.44
Episode: 900, Average Reward: -652.03
Training complete!
Success rate with 1000 episodes: 83.00%
Q-table for configuration 2 saved.
Analysis for 100 episodes saved.
Success Rate: 87.00%, Average Episode Length: 275.50 steps, Min Episode Length: 136 steps, Average Reward: -275.50, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 82.60%, Average Episode Length: 306.96 steps, Min Episode Length: 134 steps, Average Reward: -306.96, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 81.80%, Average Episode Length: 314.53 steps, Min Episode Length: 134 steps, Average Reward: -314.53, Failures:

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -18399.0
Episode: 100, Average Reward: -1528.89
Episode: 200, Average Reward: -2407.71
Episode: 300, Average Reward: -1101.89
Episode: 400, Average Reward: -1452.7
Training complete!
Success rate with 1000 episodes: 4.30%
Q-table for configuration 3 saved.
Analysis for 100 episodes saved.
Success Rate: 5.00%, Average Episode Length: 982.97 steps, Min Episode Length: 328 steps, Average Reward: -982.97, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 4.20%, Average Episode Length: 982.70 steps, Min Episode Length: 325 steps, Average Reward: -982.70, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 4.50%, Average Episode Length: 981.43 steps, Min Episode Length: 322 steps, Average Reward: -981.43, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 4.30%, Average Episode Length: 982.78 steps, Min Episode Length: 322 steps, Average Reward: -982.78, Failures: 0, Total E

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -11663.0
Episode: 100, Average Reward: -1550.78
Episode: 200, Average Reward: -1154.64
Episode: 300, Average Reward: -1272.4
Episode: 400, Average Reward: -1082.71
Episode: 500, Average Reward: -1038.01
Episode: 600, Average Reward: -2030.8
Episode: 700, Average Reward: -1318.63
Episode: 800, Average Reward: -1518.96
Episode: 900, Average Reward: -1795.34
Training complete!
Success rate with 1000 episodes: 82.60%
Q-table for configuration 4 saved.
Analysis for 100 episodes saved.
Success Rate: 83.00%, Average Episode Length: 292.25 steps, Min Episode Length: 135 steps, Average Reward: -292.25, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 82.40%, Average Episode Length: 301.31 steps, Min Episode Length: 133 steps, Average Reward: -301.31, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 83.80%, Average Episode Length: 294.12 steps, Min Episode Length: 133 steps, Average Reward: -294.12, Failur

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -19256.0
Episode: 100, Average Reward: -1120.31
Episode: 200, Average Reward: -434.65
Episode: 300, Average Reward: -393.43
Episode: 400, Average Reward: -303.47
Training complete!
Success rate with 1000 episodes: 97.40%
Q-table for configuration 5 saved.
Analysis for 100 episodes saved.
Success Rate: 96.00%, Average Episode Length: 302.70 steps, Min Episode Length: 174 steps, Average Reward: -302.70, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 97.60%, Average Episode Length: 289.32 steps, Min Episode Length: 177 steps, Average Reward: -289.32, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 97.10%, Average Episode Length: 296.98 steps, Min Episode Length: 146 steps, Average Reward: -296.98, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 97.40%, Average Episode Length: 291.84 steps, Min Episode Length: 146 steps, Average Reward: -291.84, Failures: 0, Tota

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -14602.0
Episode: 100, Average Reward: -1262.75
Episode: 200, Average Reward: -369.03
Episode: 300, Average Reward: -292.81
Episode: 400, Average Reward: -351.58
Episode: 500, Average Reward: -322.06
Episode: 600, Average Reward: -456.59
Episode: 700, Average Reward: -285.37
Episode: 800, Average Reward: -235.94
Episode: 900, Average Reward: -226.87
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 6 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 163.20 steps, Min Episode Length: 143 steps, Average Reward: -163.20, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 100.00%, Average Episode Length: 161.67 steps, Min Episode Length: 139 steps, Average Reward: -161.67, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 100.00%, Average Episode Length: 160.97 steps, Min Episode Length: 139 steps, Average Reward: -160.97, Failures

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -18658.0
Episode: 500, Average Reward: -517.608
Episode: 1000, Average Reward: -217.332
Episode: 1500, Average Reward: -194.744
Episode: 2000, Average Reward: -163.912
Episode: 2500, Average Reward: -184.978
Episode: 3000, Average Reward: -184.69
Episode: 3500, Average Reward: -153.91
Episode: 4000, Average Reward: -171.318
Episode: 4500, Average Reward: -143.348
Episode: 5000, Average Reward: -141.568
Episode: 5500, Average Reward: -135.624
Episode: 6000, Average Reward: -140.532
Episode: 6500, Average Reward: -143.336
Episode: 7000, Average Reward: -165.14
Episode: 7500, Average Reward: -152.522
Episode: 8000, Average Reward: -144.08
Episode: 8500, Average Reward: -140.748
Episode: 9000, Average Reward: -138.82
Episode: 9500, Average Reward: -145.726
Training complete!
Success rate with 1000 episodes: 98.60%
Q-table for configuration 7 saved.
Analysis for 100 episodes saved.
Success Rate: 98.00%, Average Episode Length: 226.53 steps, Min Episode Length: 18

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -21296.0
Episode: 2500, Average Reward: -256.8376
Episode: 5000, Average Reward: -164.9872
Episode: 7500, Average Reward: -152.2168
Episode: 10000, Average Reward: -148.7288
Episode: 12500, Average Reward: -150.9288
Episode: 15000, Average Reward: -135.1324
Episode: 17500, Average Reward: -154.532
Episode: 20000, Average Reward: -139.752
Episode: 22500, Average Reward: -147.5408
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 8 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 140.78 steps, Min Episode Length: 133 steps, Average Reward: -140.78, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 100.00%, Average Episode Length: 140.86 steps, Min Episode Length: 133 steps, Average Reward: -140.86, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 100.00%, Average Episode Length: 140.80 steps, Min Episode Length: 133 steps, Ave

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -9049.0
Episode: 100, Average Reward: -714.13
Episode: 200, Average Reward: -338.8
Episode: 300, Average Reward: -337.24
Episode: 400, Average Reward: -249.1
Training complete!
Success rate with 1000 episodes: 0.00%
Q-table for configuration 9 saved.
Analysis for 100 episodes saved.
Success Rate: 0.00%, Average Episode Length: 1000.00 steps, Min Episode Length: 1000 steps, Average Reward: -1000.00, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 0.00%, Average Episode Length: 1000.00 steps, Min Episode Length: 1000 steps, Average Reward: -1000.00, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 0.00%, Average Episode Length: 1000.00 steps, Min Episode Length: 1000 steps, Average Reward: -1000.00, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 0.00%, Average Episode Length: 1000.00 steps, Min Episode Length: 1000 steps, Average Reward: -1000.00, Failures: 0, T

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -16553.0
Episode: 100, Average Reward: -849.8
Episode: 200, Average Reward: -307.06
Episode: 300, Average Reward: -228.56
Episode: 400, Average Reward: -189.28
Episode: 500, Average Reward: -206.42
Episode: 600, Average Reward: -218.02
Episode: 700, Average Reward: -225.16
Episode: 800, Average Reward: -228.87
Episode: 900, Average Reward: -220.71
Training complete!
Success rate with 1000 episodes: 84.50%
Q-table for configuration 10 saved.
Analysis for 100 episodes saved.
Success Rate: 83.00%, Average Episode Length: 325.30 steps, Min Episode Length: 158 steps, Average Reward: -325.30, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 82.40%, Average Episode Length: 330.18 steps, Min Episode Length: 158 steps, Average Reward: -330.18, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 83.20%, Average Episode Length: 319.21 steps, Min Episode Length: 158 steps, Average Reward: -319.21, Failures: 0, 

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -15080.0
Episode: 500, Average Reward: -335.912
Episode: 1000, Average Reward: -198.934
Episode: 1500, Average Reward: -181.296
Episode: 2000, Average Reward: -210.906
Episode: 2500, Average Reward: -162.274
Episode: 3000, Average Reward: -194.584
Episode: 3500, Average Reward: -177.422
Episode: 4000, Average Reward: -168.234
Episode: 4500, Average Reward: -185.092
Episode: 5000, Average Reward: -169.988
Episode: 5500, Average Reward: -167.73
Episode: 6000, Average Reward: -183.094
Episode: 6500, Average Reward: -196.53
Episode: 7000, Average Reward: -193.986
Episode: 7500, Average Reward: -169.534
Episode: 8000, Average Reward: -192.718
Episode: 8500, Average Reward: -186.394
Episode: 9000, Average Reward: -192.512
Episode: 9500, Average Reward: -168.442
Training complete!
Success rate with 1000 episodes: 96.70%
Q-table for configuration 11 saved.
Analysis for 100 episodes saved.
Success Rate: 94.00%, Average Episode Length: 273.47 steps, Min Episode Length

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -54727.0
Episode: 100, Average Reward: -1910.23
Episode: 200, Average Reward: -943.97
Episode: 300, Average Reward: -624.45
Episode: 400, Average Reward: -641.75
Training complete!
Success rate with 1000 episodes: 18.40%
Q-table for configuration 12 saved.
Analysis for 100 episodes saved.
Success Rate: 30.00%, Average Episode Length: 865.48 steps, Min Episode Length: 259 steps, Average Reward: -865.48, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 20.00%, Average Episode Length: 903.17 steps, Min Episode Length: 259 steps, Average Reward: -903.17, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 20.30%, Average Episode Length: 901.98 steps, Min Episode Length: 259 steps, Average Reward: -901.98, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 18.80%, Average Episode Length: 910.89 steps, Min Episode Length: 256 steps, Average Reward: -910.89, Failures: 0, Tot

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -45079.0
Episode: 100, Average Reward: -1940.69
Episode: 200, Average Reward: -876.78
Episode: 300, Average Reward: -760.61
Episode: 400, Average Reward: -538.3
Episode: 500, Average Reward: -583.2
Episode: 600, Average Reward: -527.75
Episode: 700, Average Reward: -527.09
Episode: 800, Average Reward: -438.7
Episode: 900, Average Reward: -442.7
Training complete!
Success rate with 1000 episodes: 99.80%
Q-table for configuration 13 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 293.83 steps, Min Episode Length: 185 steps, Average Reward: -293.83, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 99.60%, Average Episode Length: 302.91 steps, Min Episode Length: 184 steps, Average Reward: -302.91, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 99.50%, Average Episode Length: 301.54 steps, Min Episode Length: 170 steps, Average Reward: -301.54, Failures: 0, T

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -46662.0
Episode: 500, Average Reward: -941.982
Episode: 1000, Average Reward: -445.398
Episode: 1500, Average Reward: -381.738
Episode: 2000, Average Reward: -322.542
Episode: 2500, Average Reward: -324.266
Episode: 3000, Average Reward: -293.858
Episode: 3500, Average Reward: -261.698
Episode: 4000, Average Reward: -244.836
Episode: 4500, Average Reward: -229.676
Episode: 5000, Average Reward: -213.494
Episode: 5500, Average Reward: -189.826
Episode: 6000, Average Reward: -189.418
Episode: 6500, Average Reward: -190.822
Episode: 7000, Average Reward: -190.01
Episode: 7500, Average Reward: -180.728
Episode: 8000, Average Reward: -180.726
Episode: 8500, Average Reward: -171.08
Episode: 9000, Average Reward: -162.618
Episode: 9500, Average Reward: -168.97
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 14 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 139.60 steps, Min Episode Lengt

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -43721.0
Episode: 2500, Average Reward: -484.0112
Episode: 5000, Average Reward: -247.432
Episode: 7500, Average Reward: -185.6116
Episode: 10000, Average Reward: -170.5304
Episode: 12500, Average Reward: -153.616
Episode: 15000, Average Reward: -147.3528
Episode: 17500, Average Reward: -147.2536
Episode: 20000, Average Reward: -154.2596
Episode: 22500, Average Reward: -150.4124
Episode: 25000, Average Reward: -136.8128
Episode: 27500, Average Reward: -141.564
Episode: 30000, Average Reward: -143.5324
Episode: 32500, Average Reward: -141.6132
Episode: 35000, Average Reward: -128.9656
Episode: 37500, Average Reward: -126.3948
Episode: 40000, Average Reward: -128.7612
Episode: 42500, Average Reward: -121.9848
Episode: 45000, Average Reward: -121.3496
Episode: 47500, Average Reward: -130.4928
Training complete!
Success rate with 1000 episodes: 99.70%
Q-table for configuration 15 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Leng

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -28928.0
Episode: 100, Average Reward: -1112.4
Episode: 200, Average Reward: -558.66
Episode: 300, Average Reward: -475.46
Episode: 400, Average Reward: -423.19
Training complete!
Success rate with 1000 episodes: 99.80%
Q-table for configuration 16 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 336.77 steps, Min Episode Length: 240 steps, Average Reward: -336.77, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 99.40%, Average Episode Length: 346.19 steps, Min Episode Length: 235 steps, Average Reward: -346.19, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 99.60%, Average Episode Length: 338.25 steps, Min Episode Length: 196 steps, Average Reward: -338.25, Failures: 0, Total Episodes: 1000
Analysis for 10000 episodes saved.
Success Rate: 99.69%, Average Episode Length: 337.89 steps, Min Episode Length: 196 steps, Average Reward: -337.89, Failures: 0, Tot

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -25215.0
Episode: 100, Average Reward: -1160.08
Episode: 200, Average Reward: -575.11
Episode: 300, Average Reward: -473.96
Episode: 400, Average Reward: -396.63
Episode: 500, Average Reward: -361.89
Episode: 600, Average Reward: -402.64
Episode: 700, Average Reward: -333.29
Episode: 800, Average Reward: -309.09
Episode: 900, Average Reward: -298.13
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 17 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 311.92 steps, Min Episode Length: 238 steps, Average Reward: -311.92, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 100.00%, Average Episode Length: 317.12 steps, Min Episode Length: 238 steps, Average Reward: -317.12, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 100.00%, Average Episode Length: 313.92 steps, Min Episode Length: 237 steps, Average Reward: -313.92, Failure

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -21225.0
Episode: 500, Average Reward: -603.786
Episode: 1000, Average Reward: -330.956
Episode: 1500, Average Reward: -262.548
Episode: 2000, Average Reward: -219.638
Episode: 2500, Average Reward: -210.92
Episode: 3000, Average Reward: -189.32
Episode: 3500, Average Reward: -175.634
Episode: 4000, Average Reward: -165.184
Episode: 4500, Average Reward: -166.214
Episode: 5000, Average Reward: -156.058
Episode: 5500, Average Reward: -161.922
Episode: 6000, Average Reward: -164.68
Episode: 6500, Average Reward: -147.096
Episode: 7000, Average Reward: -164.502
Episode: 7500, Average Reward: -173.014
Episode: 8000, Average Reward: -169.728
Episode: 8500, Average Reward: -155.75
Episode: 9000, Average Reward: -162.722
Episode: 9500, Average Reward: -144.368
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 18 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 131.13 steps, Min Episode Length

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -15368.0
Episode: 500, Average Reward: -526.12
Episode: 1000, Average Reward: -289.31
Episode: 1500, Average Reward: -223.342
Episode: 2000, Average Reward: -203.276
Episode: 2500, Average Reward: -190.096
Episode: 3000, Average Reward: -191.994
Episode: 3500, Average Reward: -171.98
Episode: 4000, Average Reward: -176.7
Episode: 4500, Average Reward: -183.82
Episode: 5000, Average Reward: -183.346
Episode: 5500, Average Reward: -174.852
Episode: 6000, Average Reward: -172.138
Episode: 6500, Average Reward: -141.898
Episode: 7000, Average Reward: -144.594
Episode: 7500, Average Reward: -162.404
Episode: 8000, Average Reward: -173.954
Episode: 8500, Average Reward: -158.708
Episode: 9000, Average Reward: -147.82
Episode: 9500, Average Reward: -142.468
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 19 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 136.93 steps, Min Episode Length: 1

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -28973.0
Episode: 2500, Average Reward: -323.3368
Episode: 5000, Average Reward: -169.6868
Episode: 7500, Average Reward: -161.6872
Episode: 10000, Average Reward: -155.6048
Episode: 12500, Average Reward: -144.9664
Episode: 15000, Average Reward: -131.8468
Episode: 17500, Average Reward: -143.5248
Episode: 20000, Average Reward: -139.4108
Episode: 22500, Average Reward: -137.9748
Episode: 25000, Average Reward: -141.5996
Episode: 27500, Average Reward: -133.81
Episode: 30000, Average Reward: -138.9952
Episode: 32500, Average Reward: -129.6456
Episode: 35000, Average Reward: -125.5092
Episode: 37500, Average Reward: -149.5412
Episode: 40000, Average Reward: -128.768
Episode: 42500, Average Reward: -140.3288
Episode: 45000, Average Reward: -130.9968
Episode: 47500, Average Reward: -137.9192
Training complete!
Success rate with 1000 episodes: 100.00%
Q-table for configuration 20 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Len

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -106860.0
Episode: 100, Average Reward: -1969.71
Episode: 200, Average Reward: -1139.05
Episode: 300, Average Reward: -871.14
Episode: 400, Average Reward: -740.41
Episode: 500, Average Reward: -721.8
Episode: 600, Average Reward: -560.14
Episode: 700, Average Reward: -597.92
Episode: 800, Average Reward: -523.98
Episode: 900, Average Reward: -508.0
Training complete!
Success rate with 1000 episodes: 87.70%
Q-table for configuration 21 saved.
Analysis for 100 episodes saved.
Success Rate: 89.00%, Average Episode Length: 577.37 steps, Min Episode Length: 338 steps, Average Reward: -577.37, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 89.40%, Average Episode Length: 577.38 steps, Min Episode Length: 310 steps, Average Reward: -577.38, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 89.20%, Average Episode Length: 585.30 steps, Min Episode Length: 261 steps, Average Reward: -585.30, Failures: 0

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -105517.0
Episode: 2500, Average Reward: -561.1648
Episode: 5000, Average Reward: -296.7532
Episode: 7500, Average Reward: -236.7592
Episode: 10000, Average Reward: -189.7176
Episode: 12500, Average Reward: -165.3032
Episode: 15000, Average Reward: -151.94
Episode: 17500, Average Reward: -143.6088
Episode: 20000, Average Reward: -143.6184
Episode: 22500, Average Reward: -142.0572
Training complete!
Success rate with 1000 episodes: 99.80%
Q-table for configuration 22 saved.
Analysis for 100 episodes saved.
Success Rate: 100.00%, Average Episode Length: 141.50 steps, Min Episode Length: 86 steps, Average Reward: -141.50, Failures: 0, Total Episodes: 100
Analysis for 500 episodes saved.
Success Rate: 100.00%, Average Episode Length: 140.76 steps, Min Episode Length: 85 steps, Average Reward: -140.76, Failures: 0, Total Episodes: 500
Analysis for 1000 episodes saved.
Success Rate: 99.70%, Average Episode Length: 142.55 steps, Min Episode Length: 85 steps, Averag

In [528]:
# Reevaluate a case for later vizualization

i = 24
config = {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.4, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 150000, 'num_states': [100, 100]}


# Initialize and train the agent
agent = MountainCarAgent(**config)
agent.train()

# Save Q-table plot
agent.plot_and_save_Q_table_analysis(test_id=i)

# Run a simple test with 1000 episodes
success_rate = agent.test_policy(num_episodes=1000)
print(f"Success rate with 1000 episodes: {success_rate * 100:.2f}%")

# Save the Q-table
q_table_filename = f"q_table_config_{i}.npy"
agent.save_Q_table(q_table_filename)
print(f"Q-table for configuration {i} saved.")

# Run analysis tests with different number of episodes
for num_episodes in [100, 500, 1000, 10000]:
    analysis_data = agent.test_policy_analyzed(num_episodes=num_episodes)
    agent.plot_analysis_data(analysis_data, test_id=i)
    print(f"Analysis for {num_episodes} episodes saved.")
    print(f"Success Rate: {analysis_data['success_rate'] * 100:.2f}%, "
          f"Average Episode Length: {np.mean(analysis_data['episode_lengths']):.2f} steps, "
          f"Min Episode Length: {np.min(analysis_data['episode_lengths'])} steps, "
          f"Average Reward: {np.mean(analysis_data['episode_rewards']):.2f}, "
          f"Failures: {len(analysis_data['failures'])}, "
          f"Total Episodes: {analysis_data['episodes']}")

print("----------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Average Reward: -159228.0
Episode: 2500, Average Reward: -846.622
Episode: 5000, Average Reward: -412.426
Episode: 7500, Average Reward: -335.5824
Episode: 10000, Average Reward: -298.9316
Episode: 12500, Average Reward: -272.9368
Episode: 15000, Average Reward: -247.712
Episode: 17500, Average Reward: -224.7736
Episode: 20000, Average Reward: -202.7004
Episode: 22500, Average Reward: -186.3996
Episode: 25000, Average Reward: -171.7868
Episode: 27500, Average Reward: -162.3468
Episode: 30000, Average Reward: -155.0896
Episode: 32500, Average Reward: -152.6452
Episode: 35000, Average Reward: -150.8672
Episode: 37500, Average Reward: -144.6872
Episode: 40000, Average Reward: -141.6596
Episode: 42500, Average Reward: -144.536
Episode: 45000, Average Reward: -141.9188
Episode: 47500, Average Reward: -138.1168
Episode: 50000, Average Reward: -141.4632
Episode: 52500, Average Reward: -142.0348
Episode: 55000, Average Reward: -142.5892
Episode: 57500, Average Reward: -138.2952
Epi

In [None]:
agent.test_policy_with_visualization(num_episodes=3)