In [16]:
import gym
import numpy as np
import random
from collections import deque
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import matplotlib.animation as animation

In [7]:
# Define the Deep Q Network (DQN) class
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Exploration rate (start high for exploration)
        self.epsilon_decay = 0.995  # Decay rate of exploration rate
        self.epsilon_min = 0.01  # Minimum exploration rate
        self.learning_rate = 0.001
        self.model = self.build_model()

    def build_model(self):
        model = keras.Sequential([
            layers.Dense(24, input_dim=self.state_size, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [17]:
# Training the DQN agent on the CartPole environment
def train_cartpole_dqn(episodes=1000, batch_size=32):
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)
    
    # Variables for capturing frames
    frames = []  # To store the frames from the environment

    for episode in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        done = False
        time = 0

        while not done:
            # Capturing frames
            frames.append(env.render(mode='rgb_array'))
            
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done or time == 499 else -100
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            time += 1

            if done:
                # Display progress based on the score and epsilon
                print(f"Episode {episode}/{episodes}, Score: {time}, Epsilon: {agent.epsilon}")
                break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        # Display frames as animation after every few episodes
        if episode % 50 == 0:
            clear_output(wait=True)
            fig = plt.figure()
            ims = [[plt.imshow(frame, animated=True)] for frame in frames]
            ani = animation.ArtistAnimation(fig, ims, interval=50, blit=True)
            plt.close()  # Close the figure to prevent duplicate display
            display(ani)
            frames = []  # Reset frames after displaying

    env.close()

In [18]:
if __name__ == "__main__":
    train_cartpole_dqn()

<matplotlib.animation.ArtistAnimation at 0x15294a437c0>

Episode 1/1000, Score: 28, Epsilon: 1.0
Episode 2/1000, Score: 10, Epsilon: 0.995
Episode 3/1000, Score: 21, Epsilon: 0.990025
Episode 4/1000, Score: 19, Epsilon: 0.985074875
Episode 5/1000, Score: 12, Epsilon: 0.9801495006250001
Episode 6/1000, Score: 16, Epsilon: 0.9752487531218751
Episode 7/1000, Score: 10, Epsilon: 0.9703725093562657
Episode 8/1000, Score: 18, Epsilon: 0.9655206468094844
Episode 9/1000, Score: 13, Epsilon: 0.960693043575437
Episode 10/1000, Score: 12, Epsilon: 0.9558895783575597
Episode 11/1000, Score: 32, Epsilon: 0.9511101304657719
Episode 12/1000, Score: 16, Epsilon: 0.946354579813443
Episode 13/1000, Score: 34, Epsilon: 0.9416228069143757
Episode 14/1000, Score: 24, Epsilon: 0.9369146928798039


: 

: 

In [1]:
import gym
import numpy as np
import random
from collections import deque
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Define the Deep Q Network (DQN) class
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.9  # Lower discount factor for short-sighted agent
        self.epsilon = 0.5  # Higher exploration rate for more random actions
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001  # Higher learning rate for faster but unstable learning
        self.model = self.build_model()

    def build_model(self):
        model = keras.Sequential([
            layers.Dense(24, input_dim=self.state_size, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Training the DQN agent on the CartPole environment
def train_cartpole_dqn(episodes=1000, batch_size=32):
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    scores = []
    for episode in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        done = False
        time = 0

        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done or time == 499 else -100
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            time += 1

            if done:
                scores.append(time)
                print(f"Episode {episode}/{episodes}, Score: {time}, Epsilon: {agent.epsilon}")
                break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    env.close()

    # Plotting the scores
    plt.plot(scores)
    plt.xlabel('Episode')
    plt.ylabel('Score')
    plt.title('DQN Agent Performance')
    plt.show()

if __name__ == "__main__":
    train_cartpole_dqn()




Episode 0/1000, Score: 19, Epsilon: 0.5
Episode 1/1000, Score: 14, Epsilon: 0.5
Episode 2/1000, Score: 12, Epsilon: 0.4975
Episode 3/1000, Score: 12, Epsilon: 0.4950125
Episode 4/1000, Score: 23, Epsilon: 0.4925374375
Episode 5/1000, Score: 50, Epsilon: 0.49007475031250003
Episode 6/1000, Score: 119, Epsilon: 0.48762437656093754
Episode 7/1000, Score: 46, Epsilon: 0.48518625467813287
Episode 8/1000, Score: 32, Epsilon: 0.4827603234047422
Episode 9/1000, Score: 40, Epsilon: 0.4803465217877185
Episode 10/1000, Score: 22, Epsilon: 0.47794478917877986
Episode 11/1000, Score: 32, Epsilon: 0.47555506523288593


KeyboardInterrupt: 