In [1]:
!pip install pygame
!pip install gym
!pip install numpy
!pip install matplotlib
!pip install ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import ipywidgets as widgets
from IPython.display import display
import gym
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt

class Training:
    """Tabular, undiscounted SARSA agent with epsilon-greedy exploration on ``Taxi-v3``.

    Hyper-parameters (``num_episodes``, ``learning_rate``, ``epsilon``) are
    plain attributes, so a caller may overwrite them after construction and
    before calling :meth:`run` or :meth:`train`.
    """

    def __init__(self):
        """Create the environment, a zero-initialised Q-table and bookkeeping state."""
        self.env = gym.make('Taxi-v3')

        # Hyper-parameters.
        self.num_episodes = 5500
        self.learning_rate = 0.1
        self.epsilon = 0.1  # probability of taking a random action

        # One Q-table row per discrete state, one column per action.
        self.num_states = self.env.observation_space.n
        self.num_actions = self.env.action_space.n
        self.Q = np.zeros((self.num_states, self.num_actions))

        # Convergence tracking: every 100th episode the Q-table is compared
        # with the snapshot taken at the previous check.
        self.convergence_threshold = 0.1
        self.converged = False
        self.convergence_episodes = []  # not used by this class; kept for compatibility
        self.convergence_episode = 0
        self.Q_prev = np.copy(self.Q)
        self.mse_values = []
        self.mse = 0  # not updated by this class; kept for compatibility

    def take_action(self, row):
        """Pick an action epsilon-greedily from one row of Q-values.

        Args:
            row: 1-D array of Q-values for the current state, one per action.

        Returns:
            A random action sampled from the environment's action space with
            probability ``epsilon``; otherwise the index of the largest
            Q-value, with ties broken uniformly at random.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()

        # Random tie-breaking avoids always preferring the lowest action
        # index while many Q-values are still equal (e.g. all zero).
        best_actions = np.where(row == np.max(row))[0]
        return np.random.choice(best_actions)

    def train(self):
        """Run ``num_episodes`` SARSA episodes, updating ``self.Q`` in place.

        After every 100th episode (including episode 0) the mean squared
        difference between the current Q-table and the previous snapshot is
        appended to ``mse_values``. The first such check after episode 0 whose
        MSE is below ``convergence_threshold`` sets ``converged`` and records
        ``convergence_episode``.
        """
        for episode in range(self.num_episodes):
            state = self.env.reset()[0]  # reset() returns (observation, info)
            action = self.take_action(self.Q[state, :])
            done = False

            while not done:
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                # An episode is over when the task ends (terminated) OR the
                # environment cuts it short, e.g. a step limit (truncated).
                # Looking only at `terminated` lets an episode run unboundedly.
                done = terminated or truncated
                next_action = self.take_action(self.Q[next_state, :])

                # SARSA target with no discounting: r + Q(s', a').
                td_error = (
                    reward + self.Q[next_state, next_action] - self.Q[state, action]
                )
                self.Q[state, action] += self.learning_rate * td_error

                state, action = next_state, next_action

            if episode % 100 == 0:
                mse = np.mean(np.square(self.Q - self.Q_prev))
                self.mse_values.append(mse)
                self.Q_prev = np.copy(self.Q)

                # Episode 0 compares against the all-zero initial table, so it
                # is excluded from the convergence test.
                if episode > 0 and not self.converged and mse < self.convergence_threshold:
                    self.converged = True
                    self.convergence_episode = episode

    def plot_convergence(self):
        """Plot the recorded MSE values against the episode index of each check."""
        mse_history = np.array(self.mse_values)
        # Checks happen at episodes 0, 100, 200, ...; trim to what was recorded.
        checked_episodes = np.arange(0, self.num_episodes, 100)[: len(mse_history)]

        plt.plot(checked_episodes, mse_history)
        plt.xlabel('Episodes')
        plt.ylabel('Mean Squared Error (MSE)')
        plt.title('Convergence: MSE')
        plt.show()

    def run(self):
        """Train, plot the convergence curve, report convergence and close the environment."""
        try:
            self.train()
            self.plot_convergence()
        finally:
            # Release the environment even if training or plotting fails.
            self.env.close()

        if self.converged:
            print("Convergence achieved at episode:", self.convergence_episode)
        else:
            print("No convergence found.")


In [3]:
class Test:
    """Evaluate a learned Q-table on ``Taxi-v3`` with a purely greedy policy."""

    def __init__(self, Q):
        """Create a fresh environment and store the Q-table to evaluate.

        Args:
            Q: 2-D array indexed as ``Q[state, action]``.
        """
        self.env = gym.make('Taxi-v3')
        self.Qtest = Q
        self.success_count = 0
        self.num_evaluations = 100
        self.rewards = []
        self.episode_lengths = []
        self.Save_Q = True  # not read by this class; kept for compatibility

    def evaluate(self):
        """Play ``num_evaluations`` greedy episodes and record their outcomes.

        Fills ``rewards`` (total reward per episode) and ``episode_lengths``
        (steps per episode), and counts in ``success_count`` every step that
        yielded a reward of 20. Metrics are reset at the start of each call so
        repeated evaluations do not inflate the success rate.
        """
        self.success_count = 0
        self.rewards = []
        self.episode_lengths = []

        for _ in range(self.num_evaluations):
            state = self.env.reset()[0]  # reset() returns (observation, info)
            done = False
            episode_reward = 0
            episode_length = 0

            while not done:
                action = np.argmax(self.Qtest[state, :])
                state, reward, terminated, truncated, _ = self.env.step(action)
                # A greedy policy from an imperfect Q-table may never finish
                # the task; honouring `truncated` stops the episode when the
                # environment cuts it short instead of looping forever.
                done = terminated or truncated
                episode_reward += reward
                episode_length += 1
                if reward == 20:
                    self.success_count += 1

            self.rewards.append(episode_reward)
            self.episode_lengths.append(episode_length)

    def calculate_metrics(self):
        """Print success rate, mean episode reward and mean episode length."""
        success_rate = self.success_count / self.num_evaluations * 100
        average_reward = np.mean(self.rewards)
        average_episode_length = np.mean(self.episode_lengths)

        print("Success Rate: {:.2f}%".format(success_rate))
        print("Average Reward: {:.2f}".format(average_reward))
        print("Average Episode Length: {:.2f} time steps".format(average_episode_length))

    def run(self):
        """Evaluate the policy, print the metrics and close the environment."""
        self.evaluate()
        self.calculate_metrics()
        self.env.close()
        
def handle_run_training(mode, num_episodes, learning_rate, epsilon):
    """Train an agent, save its Q-table to ``Q_table.pkl`` and evaluate it.

    Args:
        mode: ``'User Mode'`` applies the three hyper-parameters below; any
            other value (e.g. ``'Time-Limited Mode'``) keeps the defaults set
            by ``Training`` and ignores them.
        num_episodes: Number of training episodes (used in ``'User Mode'`` only).
        learning_rate: Step size of the Q-value update (``'User Mode'`` only).
        epsilon: Exploration probability (``'User Mode'`` only).
    """
    training = Training()
    if mode == 'User Mode':
        training.num_episodes = num_episodes
        training.learning_rate = learning_rate
        training.epsilon = epsilon
    training.run()

    # Persist the learned table in the current working directory. The former
    # `if True: ... else:` wrapper only added an unreachable branch that
    # re-loaded this file, so it has been removed.
    with open('Q_table.pkl', 'wb') as f:
        pickle.dump(training.Q, f)

    test = Test(training.Q)
    test.run()

# Interactive controls. The sliders only take effect in 'User Mode'
# (see handle_run_training).
mode_widget = widgets.RadioButtons(
    description='Mode:',
    options=['User Mode', 'Time-Limited Mode'],
)
num_episodes_widget = widgets.IntSlider(
    description='Number of Episodes:',
    value=8000,
    min=100,
    max=15000,
    step=100,
    layout=widgets.Layout(width='500px'),
    # Wide description column so the label is not truncated.
    style={'description_width': '200px'},
)
learning_rate_widget = widgets.FloatSlider(
    description='Learning Rate:',
    value=0.2,
    min=0.1,
    max=1.0,
    step=0.1,
    layout=widgets.Layout(width='500px'),
    style={'description_width': '200px'},
)
epsilon_widget = widgets.FloatSlider(
    description='Epsilon:',
    value=0.1,
    min=0.05,
    max=0.5,
    step=0.05,
    layout=widgets.Layout(width='500px'),
    style={'description_width': '200px'},
)

# interact_manual adds a button, so training starts only on an explicit click
# rather than on every slider change.
widgets.interact_manual(
    handle_run_training,
    mode=mode_widget,
    num_episodes=num_episodes_widget,
    learning_rate=learning_rate_widget,
    epsilon=epsilon_widget,
)



interactive(children=(RadioButtons(description='Mode:', options=('User Mode', 'Time-Limited Mode'), value='Use…

<function __main__.handle_run_training(mode, num_episodes, learning_rate, epsilon)>