In [None]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install ipywidgets matplotlib


In [2]:
# Q-learning parameters
# NOTE(review): none of the cells visible in this notebook read these module-level names;
# MountainCarAgent receives its hyper-parameters through its constructor. Confirm whether
# this cell is still needed before relying on the values below.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 5000  # Total number of episodes
epsilon_decay = 0.995  # Decay rate for epsilon
total_rewards = []  # Starts empty; presumably meant to collect per-episode rewards (TODO confirm)

In [9]:
import os

import gym
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from IPython import display


class MountainCarAgent:
    """
    Initialize the Mountain Car Agent.

    Parameters:
        alpha (float): The learning rate.
        gamma (float): The discount factor for future rewards.
        epsilon (float): The initial probability of taking a random action (exploration).
        epsilon_decay (float): The decay rate of epsilon after each episode.
        min_epsilon (float): The minimum value to which epsilon can decay.
        num_episodes (int): The total number of episodes for training.
        num_states (list): The number of discrete states in each dimension.

    The constructor sets up the environment, initializes the Q-table, and configures the agent's learning and exploration parameters.
    """
    def __init__(self, alpha=0.1, gamma=0.99, epsilon=0.15, epsilon_decay=0.9, min_epsilon=0.01, num_episodes=1001, num_states=[20, 20]):
        self.env = gym.make('MountainCar-v0')
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon
        self.num_episodes = num_episodes
        self.num_states = num_states
        self.state_bounds = list(zip(self.env.observation_space.low, self.env.observation_space.high))
        self.state_grid = [np.linspace(bound[0], bound[1], num_states[i]) for i, bound in enumerate(self.state_bounds)]
        self.Q_table = np.zeros(self.num_states + [self.env.action_space.n])
        self.total_rewards = []
        self.episode_lengths = []

    def discretize_state(self, state):
        """
        Discretize a continuous state into its corresponding grid indices.

        Parameters:
            state (tuple): The continuous state to be discretized.

        Returns:
            tuple: A discretized state represented by indices corresponding to each dimension.

        This method maps continuous state values to a discrete grid, enabling the use of a tabular Q-learning approach.
        """        
        discretized_state = []
        for s, grid in zip(state, self.state_grid):
            index = np.digitize(s, grid) - 1
            index = max(0, min(index, len(grid) - 1))
            discretized_state.append(index)
        return tuple(discretized_state)

    def epsilon_greedy_policy(self, state):
        """
        Determine an action using an epsilon-greedy policy.

        Parameters:
            state (tuple): The current state from which the action needs to be decided.

        Returns:
            int: The action chosen based on the epsilon-greedy policy.

        With a probability of epsilon, a random action is chosen (exploration). Otherwise, the best known action (exploitation) is chosen based on the Q-table.
        """        
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()  # Explore
        else:
            return np.argmax(self.Q_table[state])  # Exploit

    def create_directory(self, dir_path):
        """
        Create a directory if it doesn't exist.

        Parameters:
            dir_path (str): The path of the directory to be created.

        This method checks if a directory exists at the given path, and if not, it creates it.
        """                
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    def save_Q_table(self, filename, folder="mountain_car/q_tables"):
        """
        Save the current Q-table to a file.

        Parameters:
            filename (str): The name of the file to save the Q-table.
            folder (str): The folder where the Q-table should be saved.

        This method saves the Q-table into a specified file for later use, such as resuming training or analysis.
        """        
        self.create_directory(folder)
        np.save(os.path.join(folder, filename), self.Q_table)

    def load_Q_table(self, filename, folder="mountain_car/q_tables"):
        """
        Load a Q-table from a file.

        Parameters:
            filename (str): The name of the file containing the Q-table.
            folder (str): The folder where the Q-table is located.

        This method loads a Q-table from a file, allowing the agent to use a pre-trained policy.
        """            
        self.Q_table = np.load(filename)

    def train(self):
        """
        Train the agent using the Q-learning algorithm.

        During training, the agent interacts with the environment, updates the Q-table based on the received rewards, and gradually improves its policy. Epsilon is decayed after each episode to reduce exploration over time.
        """        
        for episode in range(self.num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            total_reward, steps = 0, 0

            done = False
            while not done:
                action = self.epsilon_greedy_policy(state)
                next_state_raw, reward, done, _, _ = self.env.step(action)
                next_state = self.discretize_state(next_state_raw)

                # Q-table update
                best_next_action = np.argmax(self.Q_table[next_state])
                td_target = reward + self.gamma * self.Q_table[next_state][best_next_action] # Temporal Difference Target
                self.Q_table[state][action] += self.alpha * (td_target - self.Q_table[state][action])

                state = next_state
                total_reward += reward
                steps += 1

            self.total_rewards.append(total_reward)
            self.episode_lengths.append(steps)
            
            # Decay epsilon
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.min_epsilon)

            if self.num_episodes < 1500 and episode % 100 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-100:])}")
            elif self.num_episodes < 10500 and episode % 500 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-500:])}")
            elif self.num_episodes > 10500 and episode % 2500 == 0:
                print(f"Episode: {episode}, Average Reward: {np.mean(self.total_rewards[-2500:])}")

        print("Training complete!")

    def plot_and_save_Q_table_analysis(self, test_id = 0, folder="mountain_car/tests/learning"):
        """
        Generate and save plots analyzing the Q-table.

        Parameters:
            test_id (int): An identifier for the test scenario; used in the output file name.
            folder (str): The folder where the figure is saved (created when missing).

        The figure has ten panels: the learning curve, episode lengths, a heatmap and a 3D surface
        of the Q-values for each of the three actions, a colour-coded policy map and a policy
        scatter plot in physical position/velocity units. It is written to
        Q_table_analysis_test_<test_id>.png inside `folder` and then closed.

        The Q-table is indexed [position, velocity, action]. Matrix plots put rows on the y-axis
        and columns on the x-axis, so every matrix is transposed before plotting to match the
        'Position' (x) / 'Velocity' (y) labels.
        """
        self.create_directory(folder)
        positions, velocities = self.state_grid[0], self.state_grid[1]
        action_colors = {0: 'red', 1: 'green', 2: 'blue'}
        actions = sorted(action_colors)

        q_by_action = {action: self.Q_table[:, :, action].T for action in actions}
        policy = np.argmax(self.Q_table, axis=2).T
        # Index grids with the same (velocity, position) layout as the transposed matrices.
        pos_index, vel_index = np.meshgrid(np.arange(len(positions)), np.arange(len(velocities)))

        fig = plt.figure(figsize=(18, 24))

        # Subplot 1: Learning Curve
        ax = fig.add_subplot(5, 2, 1)
        ax.plot(self.total_rewards)
        ax.set_title('Learning Curve: Total Reward per Episode')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Total Reward')

        # Subplot 2: Episode Length Over Time
        ax = fig.add_subplot(5, 2, 2)
        ax.plot(self.episode_lengths)
        ax.set_title('Episode Length Over Time')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Length of Episode')

        # Subplots 3-8: one heatmap (left) and one 3D surface (right) per action
        for row, action in enumerate(actions, start=1):
            ax = fig.add_subplot(5, 2, 2 * row + 1)
            sns.heatmap(q_by_action[action], ax=ax)
            ax.set_title(f'Heatmap of Q-Values for Action {action}')
            ax.set_xlabel('Position')
            ax.set_ylabel('Velocity')

            ax = fig.add_subplot(5, 2, 2 * row + 2, projection='3d')
            ax.plot_surface(pos_index, vel_index, q_by_action[action], cmap='viridis')
            ax.set_title(f'3D Surface Plot of Q-Values for Action {action}')
            ax.set_xlabel('Position')
            ax.set_ylabel('Velocity')
            ax.set_zlabel('Q-Value')

        # Subplot 9: Policy Map (Color-Coded Actions)
        ax = fig.add_subplot(5, 2, 9)
        policy_cmap = mcolors.ListedColormap([action_colors[action] for action in actions])
        # Fixed limits keep each action on its own colour even when the policy never picks
        # one of the actions.
        sns.heatmap(policy, cmap=policy_cmap, vmin=-0.5, vmax=len(actions) - 0.5, annot=False, ax=ax)
        ax.set_title("Policy Map (Color-Coded Actions)")
        ax.set_xlabel('Position')
        ax.set_ylabel('Velocity')

        # Subplot 10: Position and Velocity Policy Visualization
        ax = fig.add_subplot(5, 2, 10)
        pos_grid, vel_grid = np.meshgrid(positions, velocities)
        point_colors = [action_colors[action] for action in policy.ravel()]
        ax.scatter(pos_grid.ravel(), vel_grid.ravel(), color=point_colors)
        ax.set_title('Policy Visualization for Mountain Car')
        ax.set_xlabel('Position')
        ax.set_ylabel('Velocity')
        patches = [mpatches.Patch(color=color, label=label) for label, color in action_colors.items()]
        ax.legend(handles=patches, title="Actions", title_fontsize='13', loc='best')

        # Save the figure based on the test_id
        fig.tight_layout()
        filename = f"Q_table_analysis_test_{test_id}.png"
        fig.savefig(os.path.join(folder, filename))
        plt.close(fig)
        
    def test_policy(self, num_episodes=1000, max_steps=1000):
        """
        Test the trained policy over a number of episodes.

        Parameters:
            num_episodes (int): The number of episodes to test the policy.
            max_steps (int): The maximum number of steps per episode.

        Returns:
            float: The success rate of the agent.

        This method tests the agent's policy by running it through several episodes and calculates the rate of successful episodes.
        """        
        success_count = 0
        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            step = 0
            while not done and step < max_steps:
                action = np.argmax(self.Q_table[state])
                state_raw, _, done, _, _ = self.env.step(action)
                state = self.discretize_state(state_raw)
                step += 1
                if done and state_raw[0] >= 0.5:  # Check if goal is reached
                    success_count += 1

        success_rate = success_count / num_episodes
        return success_rate
    
    def test_policy_analyzed(self, num_episodes=10000, max_steps=1000):
        """
        Test the policy and gather detailed analysis data.

        Parameters:
            num_episodes (int): The number of episodes for testing.
            max_steps (int): The maximum number of steps per episode.

        Returns:
            dict: A dictionary containing detailed analysis data such as success rates, episode lengths, rewards, state visits, and outcomes.

        This method performs an extensive test of the agent's policy, collecting data for further in-depth analysis.
        """        
        success_count = 0
        episode_lengths = []
        episode_rewards = []
        grid_x, grid_y = self.num_states
        state_visits = np.zeros(grid_x * grid_y)
        episode_end_states = []
        episode_outcomes = []
        failures = []

        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            success = False
            step = 0
            total_reward = 0

            while not done and step < max_steps:
                action = np.argmax(self.Q_table[state])
                state_raw, reward, done, _, _ = self.env.step(action)
                new_state = self.discretize_state(state_raw)
                total_reward += reward
                x, y = new_state
                single_index = y * grid_x + x
                state_visits[single_index] += 1

                if done and state_raw[0] < 0.5:
                    failures.append((episode, state_raw, step))

                state = new_state
                step += 1

                if done and state_raw[0] >= 0.5:
                    success = True
                    success_count += 1

            episode_lengths.append(step)
            episode_rewards.append(total_reward)
            episode_end_states.append(state_raw)
            episode_outcomes.append(success)

        success_rate = success_count / num_episodes
        analysis_data = {
            'success_rate': success_rate,
            'episode_lengths': episode_lengths,
            'episode_rewards': episode_rewards,
            'state_visits': state_visits,
            'episode_end_states': episode_end_states,
            'failures': failures,
            'episode_outcomes': episode_outcomes,
            'episodes' : num_episodes
        }
        return analysis_data
    
    def plot_analysis_data(self, analysis_data, test_id = 0, folder="mountain_car/tests/analysis"):
        """
        Generate and save plots based on analysis data.

        Parameters:
            analysis_data (dict): The analysis data obtained from test_policy_analyzed.
            test_id (int): An identifier for the test scenario; used in the output file name.
            folder (str): The folder where the figure is saved (created when missing).

        The figure has six panels: end states coloured by outcome, a histogram of episode
        lengths, rewards per episode, a heatmap of state visits, the outcome of each episode and
        the length of each episode. It is written to
        analysis_plots_test_<test_id>_<episodes>_episodes.png inside `folder` and then closed.
        """
        self.create_directory(folder)
        fig, axes = plt.subplots(3, 2, figsize=(15, 20))

        # Scatter Plot of End States Color-Coded by Success or Failure
        # reshape(-1, 2) keeps the column indexing valid even when there are no episodes.
        end_states = np.array(analysis_data['episode_end_states']).reshape(-1, 2)
        outcomes = np.array(analysis_data['episode_outcomes'])
        binary_outcomes = np.where(outcomes, 1, 0)
        outcome_cmap = mcolors.ListedColormap(['red', 'green'])
        ax = axes[0, 0]
        # Fixed colour limits: without them a run where every outcome is identical would be
        # normalised onto the first colour, painting all-success runs red.
        ax.scatter(end_states[:, 0], end_states[:, 1], c=binary_outcomes, cmap=outcome_cmap, vmin=0, vmax=1)
        ax.set_title('End States Scatter Plot (Red=Failure, Green=Success)')
        ax.set_xlabel('Position')
        ax.set_ylabel('Velocity')

        # Histogram of Episode Lengths
        ax = axes[0, 1]
        ax.hist(analysis_data['episode_lengths'], bins=30, color='skyblue')
        ax.set_title('Histogram of Episode Lengths')
        ax.set_xlabel('Episode Length')
        ax.set_ylabel('Frequency')

        # Line Plot of Episode Rewards
        ax = axes[1, 0]
        ax.plot(analysis_data['episode_rewards'], color='green')
        ax.set_title('Episode Rewards Over Time')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Total Reward')

        # Heatmap of State Visits
        # Visits are stored flat at index y * grid_x + x, i.e. one row of grid_x position cells
        # per velocity index, so the matrix has grid_y rows and grid_x columns.
        grid_x, grid_y = self.num_states
        state_visits_reshaped = np.reshape(analysis_data['state_visits'], (grid_y, grid_x))
        ax = axes[1, 1]
        sns.heatmap(state_visits_reshaped, cmap='viridis', annot=False, ax=ax)
        ax.set_title('Heatmap of State Visits')
        ax.set_xlabel('Position')
        ax.set_ylabel('Velocity')

        # Simple plot to display success or failure for each episode
        ax = axes[2, 0]
        ax.plot(binary_outcomes, marker='o', linestyle='', color='blue')
        ax.set_title('Success (1) or Failure (0) Per Episode')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Outcome (1=Success, 0=Failure)')
        ax.set_yticks([0, 1])
        ax.set_yticklabels(['Failure', 'Success'])
        ax.grid(True)

        # Line plot for episode lengths
        ax = axes[2, 1]
        ax.plot(analysis_data['episode_lengths'], color='purple')
        ax.set_title('Episode Lengths Over Time')
        ax.set_xlabel('Episode')
        ax.set_ylabel('Length of Episode')
        ax.grid(True)

        fig.tight_layout()
        filename = f"analysis_plots_test_{test_id}_{analysis_data['episodes']}_episodes.png"
        fig.savefig(os.path.join(folder, filename))
        plt.close(fig)
        
    def test_policy_with_visualization(self, num_episodes=5):
        """
        Visualize the agent's policy for a specified number of episodes.

        Parameters:
            num_episodes (int): The number of episodes to visualize.

        This method runs the agent for a few episodes and visually shows the agent's decisions in the Mountain Car environment.
        """        
        for episode in range(num_episodes):
            state_raw, _ = self.env.reset()
            state = self.discretize_state(state_raw)
            done = False
            step = 0

            while not done:
                action = np.argmax(self.Q_table[state])
                action_taken = "Turn Left" if action == 0 else ("Do Nothing" if action == 1 else "Turn Right")
                self.custom_render(step, action_taken)
                state_raw, _, done, _, _ = self.env.step(action)
                state = self.discretize_state(state_raw)
                step += 1
           
    def custom_render(self, step, action_taken):
        """
        Custom function to visualize the state of the environment.

        Parameters:
            step (int): The current step number in the episode.
            action_taken (str): The action taken by the agent at this step.

        Draws the curve sin(3 * position) between the environment's min and max position, marks
        the car's current position on it, replaces the previous cell output with the new figure
        and closes the figure afterwards so that one figure per step does not pile up in memory.
        """
        env = self.env.unwrapped
        fig, ax = plt.subplots(figsize=(8, 4))
        positions = np.linspace(env.min_position, env.max_position, 100)
        mountain_heights = np.sin(3 * positions)
        ax.plot(positions, mountain_heights, color='gray', linewidth=2)
        car_position = env.state[0]
        ax.scatter(car_position, np.sin(3 * car_position), c='blue', marker='o', s=200)
        ax.set_xlabel("Position")
        ax.set_ylabel("Mountain Height")
        ax.set_title(f"MountainCarEnv - Step {step} - Action {action_taken}")
        ax.set_xlim(env.min_position, env.max_position)
        ax.set_ylim(-1.2, 1.2)
        display.clear_output(wait=True)
        display.display(fig)
        # The frame has been sent to the notebook; release the figure.
        plt.close(fig)



In [None]:
# Usage with default values
default_agent = MountainCarAgent()
default_agent.train()

# Run a simple test with 100 episodes
success_rate = default_agent.test_policy(num_episodes=100)
# The message reports the number of episodes that were actually run.
print(f"Success rate with 100 episodes: {success_rate * 100:.2f}%")

In [6]:
# Usage with custom values: train one agent with an explicit configuration and report its success rate

config = {
    'alpha': 0.1,  # learning rate
    'gamma': 0.99,  # discount factor
    'epsilon': 0.3,  # initial exploration probability
    'epsilon_decay': 0.9,  # epsilon is multiplied by this after every episode
    'min_epsilon': 0.01,  # lower bound for epsilon
    'num_episodes': 5001,  # training episodes
    'num_states': [20,20]  # grid points per observation dimension
}

print(f"Testing configuration: {config}")

custom_agent = MountainCarAgent(**config)
custom_agent.train()

# Uncomment this to save the Q-table analysis figure (test_id=100)
#custom_agent.plot_and_save_Q_table_analysis(100)

# Greedy evaluation over 100 episodes
success_rate = custom_agent.test_policy(num_episodes = 100)
print(f"Success rate of the learned policy: {success_rate * 100:.2f}%")

# Uncomment this to run and save the graphics for an analyzed test
#analysis_data = custom_agent.test_policy_analyzed(num_episodes = 100)
#custom_agent.plot_analysis_data(analysis_data)
# print(f"Success Rate: {analysis_data['success_rate'] * 100:.2f}%, "
#               f"Average Episode Length: {np.mean(analysis_data['episode_lengths']):.2f} steps, "
#               f"Min Episode Length: {np.min(analysis_data['episode_lengths'])} steps, "
#               f"Average Reward: {np.mean(analysis_data['episode_rewards']):.2f}, "
#               f"Failures: {len(analysis_data['failures'])}, "
#               f"Total Episodes: {analysis_data['episodes']}")

Testing configuration: {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.9, 'min_epsilon': 0.01, 'num_episodes': 5001, 'num_states': [20, 20]}
Episode: 0, Average Reward: -11371.0
Episode: 500, Average Reward: -503.568
Episode: 1000, Average Reward: -285.328
Episode: 1500, Average Reward: -202.41
Episode: 2000, Average Reward: -204.802
Episode: 2500, Average Reward: -189.836
Episode: 3000, Average Reward: -182.776
Episode: 3500, Average Reward: -182.64
Episode: 4000, Average Reward: -202.424
Episode: 4500, Average Reward: -183.982
Episode: 5000, Average Reward: -183.592
Training complete!
Success rate of the learned policy: 97.00%


In [None]:
# Run with visualization: renders every step of the greedy policy of `custom_agent`
# (trained in the custom-values cell) for the default number of episodes.
custom_agent.test_policy_with_visualization()

In [None]:
# Tests for comparison between different approaches.
# Each entry is a complete set of MountainCarAgent constructor arguments; the groups below vary
# the learning rate, the exploration schedule, the number of training episodes and the grid size.

configs = [
    # 10x10 grid, alpha=0.1
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [10, 10]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [10, 10]},
    
    # 10x10 grid, alpha=0.25
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [10, 10]},  
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [10, 10]},  
    
    # 20x20 grid, alpha=0.1
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [20, 20]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [20, 20]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [20, 20]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 25000, 'num_states': [20, 20]},
    
    # 20x20 grid, alpha=0.3
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [20, 20]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [20, 20]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [20, 20]},  
    
    # 50x50 grid, alpha=0.1, epsilon=0.1
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [50, 50]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [50, 50]},  
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},
    
    # 50x50 grid, higher learning rates with epsilon=0.3
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 500, 'num_states': [50, 50]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [50, 50]},  
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.5, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.9, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [50, 50]},
    {'alpha': 0.3, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},
    
    # 100x100 grid
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 1000, 'num_states': [100, 100]},  
    {'alpha': 0.25, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 25000, 'num_states': [100, 100]},
    
    # Long runs and one low-discount (gamma=0.5) configuration
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.4, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 100000, 'num_states': [50, 50]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.3, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [20, 20]},
    {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.1, 'epsilon_decay': 0.95, 'min_epsilon': 0.01, 'num_episodes': 10000, 'num_states': [10, 10]},
    {'alpha': 0.25, 'gamma': 0.5, 'epsilon': 0.4, 'epsilon_decay': 0.9, 'min_epsilon': 0.01, 'num_episodes': 50000, 'num_states': [50, 50]},

]


# Test for each configuration: train, plot, evaluate and persist one agent per entry of `configs`.
# Output files are numbered with the 1-based position of the configuration in the list.
for i, config in enumerate(configs):
    print(f"Testing configuration {i+1}: {config}")

    # Initialize and train the agent
    agent = MountainCarAgent(**config)
    agent.train()

    # Save Q-table plot
    agent.plot_and_save_Q_table_analysis(test_id=i+1)

    # Run a simple test with 1000 episodes
    success_rate = agent.test_policy(num_episodes=1000)
    print(f"Success rate with 1000 episodes: {success_rate * 100:.2f}%")
    
    # Save the Q-table
    q_table_filename = f"q_table_config_{i+1}.npy"
    agent.save_Q_table(q_table_filename)
    print(f"Q-table for configuration {i+1} saved.")

    # Run analysis tests with different number of episodes
    # (the episode count is part of the plot file name, so each run gets its own figure)
    for num_episodes in [100, 500, 1000, 10000]:
        analysis_data = agent.test_policy_analyzed(num_episodes=num_episodes)
        agent.plot_analysis_data(analysis_data, test_id=i+1)
        print(f"Analysis for {num_episodes} episodes saved.")
        print(f"Success Rate: {analysis_data['success_rate'] * 100:.2f}%, "
              f"Average Episode Length: {np.mean(analysis_data['episode_lengths']):.2f} steps, "
              f"Min Episode Length: {np.min(analysis_data['episode_lengths'])} steps, "
              f"Average Reward: {np.mean(analysis_data['episode_rewards']):.2f}, "
              f"Failures: {len(analysis_data['failures'])}, "
              f"Total Episodes: {analysis_data['episodes']}")
    
    print("----------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

In [None]:
# Reevaluate a case for later visualization
# NOTE(review): i = 24 is used as test_id and in the Q-table file name, so the files written here
# share their names with those the comparison loop writes for its 24th configuration (whose
# settings differ from `config` below) — confirm that overwriting them is intended.

i = 24
config = {'alpha': 0.1, 'gamma': 0.99, 'epsilon': 0.4, 'epsilon_decay': 0.99, 'min_epsilon': 0.01, 'num_episodes': 150000, 'num_states': [100, 100]}

# Initialize and train the agent
agent = MountainCarAgent(**config)
agent.train()

# Save Q-table plot
agent.plot_and_save_Q_table_analysis(test_id=i)

# Run a simple test with 1000 episodes
success_rate = agent.test_policy(num_episodes=1000)
print(f"Success rate with 1000 episodes: {success_rate * 100:.2f}%")

# Save the Q-table
q_table_filename = f"q_table_config_{i}.npy"
agent.save_Q_table(q_table_filename)
print(f"Q-table for configuration {i} saved.")

# Run analysis tests with different number of episodes
for num_episodes in [100, 500, 1000, 10000]:
    analysis_data = agent.test_policy_analyzed(num_episodes=num_episodes)
    agent.plot_analysis_data(analysis_data, test_id=i)
    print(f"Analysis for {num_episodes} episodes saved.")
    print(f"Success Rate: {analysis_data['success_rate'] * 100:.2f}%, "
          f"Average Episode Length: {np.mean(analysis_data['episode_lengths']):.2f} steps, "
          f"Min Episode Length: {np.min(analysis_data['episode_lengths'])} steps, "
          f"Average Reward: {np.mean(analysis_data['episode_rewards']):.2f}, "
          f"Failures: {len(analysis_data['failures'])}, "
          f"Total Episodes: {analysis_data['episodes']}")

print("----------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

In [None]:
# Render three greedy episodes of whichever agent was last bound to `agent` by an earlier cell.
agent.test_policy_with_visualization(num_episodes=3)