In [1]:
import numpy as np
import gym
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole-v1 environment and read the problem dimensions off it:
# the length of an observation vector and the number of discrete actions.
env = gym.make('CartPole-v1')
state_size, action_size = env.observation_space.shape[0], env.action_space.n


In [2]:
class DQNAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and experience replay.

    Attributes:
        state_size: Number of input features fed to the Q-network.
        action_size: Number of discrete actions (outputs of the Q-network).
        memory: Bounded buffer of (state, action, reward, next_state, done) tuples.
        gamma: Discount factor applied to the bootstrapped future value.
        epsilon: Current probability of choosing a random action.
        epsilon_min: Lower bound that epsilon is never decayed below.
        epsilon_decay: Multiplicative factor applied to epsilon after each
            training call to `replay`.
        learning_rate: Learning rate handed to the Adam optimizer.
        model: The compiled Keras Q-network.
    """

    def __init__(self, state_size, action_size):
        """Stores the hyper-parameters and builds the Q-network.

        Args:
            state_size: Length of one observation vector.
            action_size: Number of discrete actions available.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries are evicted first
        self.gamma = 0.95  # discount factor for future rewards
        self.epsilon = 1.0  # exploration rate: probability of acting randomly
        self.epsilon_min = 0.01  # minimum exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay applied once per replay() call
        self.learning_rate = 0.001  # learning rate for the neural network
        self.model = self._build_model()

    def _build_model(self):
        """Builds a simple neural network with two hidden layers.

        Returns:
            A compiled Sequential model mapping a state to one Q-value per
            action, trained with mean-squared error and Adam.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated in TF2 Keras and rejected by newer Keras releases.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Stores one transition in the replay buffer.

        Args:
            state: Observation the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Observation that followed the action.
            done: True when `next_state` is terminal (no bootstrapping).
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Chooses an action with the epsilon-greedy policy.

        Args:
            state: Batch of one observation, as accepted by `model.predict`.

        Returns:
            The chosen action index as a plain Python int.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type.
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Trains the Q-network on a random minibatch of stored experiences.

        Does nothing (no training, no epsilon decay) when `batch_size` is not
        positive or the buffer holds fewer than `batch_size` experiences.

        Args:
            batch_size: Number of experiences to sample from the buffer.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states = np.vstack([experience[0] for experience in minibatch])
        actions = np.array([experience[1] for experience in minibatch], dtype=int)
        rewards = np.array([experience[2] for experience in minibatch], dtype=float)
        next_states = np.vstack([experience[3] for experience in minibatch])
        dones = np.array([experience[4] for experience in minibatch], dtype=bool)

        # Two batched forward passes instead of two per sampled experience.
        next_q = np.asarray(self.model.predict(next_states, verbose=0))
        targets = np.array(self.model.predict(states, verbose=0))

        # Bellman target: the bare reward for terminal transitions, otherwise
        # reward plus the discounted best next-state value.
        bootstrapped = rewards + self.gamma * np.max(next_q, axis=1)
        targets[np.arange(batch_size), actions] = np.where(dones, rewards, bootstrapped)

        # batch_size=1 keeps one gradient step per sampled experience while
        # needing only a single fit() call.
        self.model.fit(states, targets, batch_size=1, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay cannot undershoot the configured minimum.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


In [6]:
import numpy as np
import gym

# CartPole-v1 is the assumed environment; build it and record how many
# discrete actions it offers and how long each observation vector is.
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Define the state extraction and reshaping process
def process_state(state):
    """Return `state` as an array of shape (1, state_size).

    When `state` is a tuple, only its first element is kept; any other value
    is reshaped as given. Reads the module-level `state_size`.
    """
    observation = state[0] if isinstance(state, tuple) else state
    return np.reshape(observation, (1, state_size))

# Build the agent (DQNAgent must already be defined by an earlier cell)
agent = DQNAgent(state_size, action_size)

episodes = 10  # number of training episodes
max_steps = 500  # upper bound on steps taken in a single episode
batch_size = 32  # experiences sampled per replay() call
failure_penalty = -10  # reward substituted when the pole actually falls

for e in range(episodes):
    initial_state = env.reset()
    state = process_state(initial_state)  # reset() may return (obs, info); keep only obs

    for time in range(max_steps):
        action = agent.act(state)
        step_results = env.step(action)
        next_state_raw = step_results[0]
        reward = step_results[1]

        if len(step_results) >= 5:
            # Newer step API: (obs, reward, terminated, truncated, info)
            terminated = bool(step_results[2])
            truncated = bool(step_results[3])
        else:
            # Older step API: (obs, reward, done, info); the TimeLimit wrapper
            # marks a time-limit cut-off in info['TimeLimit.truncated'].
            info = step_results[3] if len(step_results) > 3 else {}
            truncated = bool(info.get('TimeLimit.truncated', False))
            terminated = bool(step_results[2]) and not truncated
        done = terminated or truncated  # the episode is over either way

        next_state = process_state(next_state_raw)

        # Penalize only a real failure; reaching the time limit is a success.
        reward = reward if not terminated else failure_penalty
        # Store `terminated` as the terminal flag so a time-limit cut-off still
        # bootstraps from the next state during replay.
        agent.remember(state, action, reward, next_state, terminated)
        state = next_state

        if done:
            # Score is the zero-based index of the episode's final step.
            print(f"Episode: {e+1}/{episodes}, Score: {time}, Epsilon: {agent.epsilon:.2}")
            break

    # Train once per episode, as soon as the buffer holds more than one batch.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

# Optionally save the model
agent.model.save('dqn_model.h5')
print("Model saved successfully!")



Episode: 1/10, Score: 36, Epsilon: 1.0
Episode: 2/10, Score: 11, Epsilon: 0.99
Episode: 3/10, Score: 24, Epsilon: 0.99
Episode: 4/10, Score: 15, Epsilon: 0.99
Episode: 5/10, Score: 18, Epsilon: 0.98
Episode: 6/10, Score: 14, Epsilon: 0.98
Episode: 7/10, Score: 20, Epsilon: 0.97
Episode: 8/10, Score: 17, Epsilon: 0.97
Episode: 9/10, Score: 20, Epsilon: 0.96
Episode: 10/10, Score: 25, Epsilon: 0.96
Model saved successfully!
