In [1]:
!pip install gym==0.17.3
!pip install matplotlib
!pip install tensorflow==2.10



In [2]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam




In [3]:

# DQN needs a finite action set: sample NUM_ACTIONS evenly spaced values on
# [-2, 2]. An agent's action index is looked up in this table before the value
# is handed to the environment.
NUM_ACTIONS = 3
action_bins = np.linspace(start=-2, stop=2, num=NUM_ACTIONS)
print(action_bins)

[-2.  0.  2.]


In [None]:

# TODO: Are the hyperparameters (gamma, epsilon, epsilon_min, epsilon_decay, learning_rate, alpha, etc.) all set correctly?

In [5]:
class DQNAgent:
    """Epsilon-greedy DQN agent with a replay buffer over a discrete action set.

    A single Keras network maps a state vector to one Q-value per action. The
    same network is used both to pick actions and to compute the bootstrap
    targets in ``replay`` (there is no separate target network).
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Length of the 1-D state vector fed to the network.
            action_size: Number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100000)  # replay buffer; oldest entries are evicted first
        self.gamma = 0.99           # Discount factor (TODO: confirm value -- ASK CHER)
        self.epsilon = 1.0          # Exploration rate
        self.epsilon_min = 0.01     # Floor for epsilon
        self.epsilon_decay = 0.995  # Multiplicative decay applied once per replay() call
        self.learning_rate = 0.001

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: two 12-unit ReLU layers, linear head."""
        model = Sequential()
        model.add(Dense(12, input_dim=self.state_size, activation='relu'))
        model.add(Dense(12, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))  # Q-values for each discrete action
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        ``action`` is the discrete action index (it is used to index the
        Q-value vector in ``replay``), not the value sent to the environment.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action index epsilon-greedily for a single 1-D ``state``.

        Returns:
            A plain Python ``int`` in ``[0, action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state[np.newaxis, :], verbose=0)
        # int() so both branches return the same type (argmax yields np.int64).
        return int(np.argmax(q_values[0]))

    def replay(self, batch_size):
        """Train on a random minibatch of stored transitions, then decay epsilon.

        Does nothing when the buffer holds fewer than ``batch_size`` transitions.

        Args:
            batch_size: Number of transitions to sample (must be >= 1).

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if len(self.memory) < batch_size:
            # Not enough experience collected yet; skip instead of letting
            # random.sample raise.
            return

        minibatch = random.sample(self.memory, batch_size)

        states = np.vstack([t[0] for t in minibatch])
        actions = np.array([t[1] for t in minibatch], dtype=int)
        rewards = np.asarray([t[2] for t in minibatch], dtype=np.float64).reshape(-1)
        next_states = np.vstack([t[3] for t in minibatch])
        dones = np.array([t[4] for t in minibatch], dtype=bool)

        # Two batched forward passes instead of two predict() calls per sample.
        # The network is not updated between them, so the values match what a
        # per-sample loop would compute.
        next_q = np.asarray(self.model.predict(next_states, verbose=0))
        targets = np.array(self.model.predict(states, verbose=0), dtype=np.float64)

        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a')
        # otherwise. Only the taken action's Q-value is overwritten, so the other
        # outputs contribute zero error.
        bootstrap = self.gamma * next_q.max(axis=1) * (~dones)
        targets[np.arange(batch_size), actions] = rewards + bootstrap

        self.model.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:
# Seed every random source used below (Python's `random` for replay sampling and
# random actions, numpy for the epsilon draw, TensorFlow for weight init and
# training, the env for its initial state) so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make('Pendulum-v0')  # gym 0.17.3
env.seed(SEED)
state_size = env.observation_space.shape[0]
action_size = NUM_ACTIONS  # number of discrete actions

agent = DQNAgent(state_size, action_size)
episodes = 100000   # upper bound on training episodes; the loop may stop earlier
batch_size = 64     # transitions sampled per replay() call

max_reward = -5     # an episode total at or above this stops training
reward_history = []  # per-episode total reward, appended by the training loop



In [None]:
# Main training loop: roll out one episode, store its transitions, train once,
# and stop early if the episode's total reward reaches max_reward.
for e in range(episodes):
    state = env.reset()
    total_reward = 0

    # At most 200 environment steps per episode.
    for _ in range(200):
        action_idx = agent.act(state)
        action = [action_bins[action_idx]]  # env expects an array

        next_state, reward, done, info = env.step(action)
        # gym's TimeLimit wrapper returns done=True when the step cap is hit and
        # marks that case with info['TimeLimit.truncated']. A time-limit cut-off
        # is not a real terminal state, so it must not be stored as one:
        # otherwise replay() drops the bootstrap term for that transition.
        terminal = done and not info.get('TimeLimit.truncated', False)
        agent.remember(state, action_idx, reward, next_state, terminal)
        state = next_state
        total_reward += reward

        if done:
            break

    # ✅ Replay only once per episode (massively faster)
    # NOTE(review): this means one fit() call and one epsilon decay per episode
    # of up to 200 steps; consider replaying more often if learning stalls.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    print(f"Episode {e+1}: Reward = {total_reward}")
    reward_history.append(total_reward)

    if total_reward >= max_reward: 
        print(f"✅ Solved! Pendulum upright and still in episode {e+1} with reward {total_reward}")
        break

Episode 1: Reward = -998.7016510373827
Episode 2: Reward = -1579.2324119851214
Episode 3: Reward = -1351.459454464147
Episode 4: Reward = -1695.7104671279446
Episode 5: Reward = -1340.428114136991
Episode 6: Reward = -900.9022290683072
Episode 7: Reward = -885.0912354876874
Episode 8: Reward = -1172.3869422969674
Episode 9: Reward = -944.0884249203689
Episode 10: Reward = -988.7033492161648
Episode 11: Reward = -1434.2484688468562
Episode 12: Reward = -1530.1505526327214
Episode 13: Reward = -903.751706995634
Episode 14: Reward = -1084.880638089237
Episode 15: Reward = -747.3900425651003
Episode 16: Reward = -1305.491606355525
Episode 17: Reward = -898.365321111625
Episode 18: Reward = -909.0033210615195
Episode 19: Reward = -1429.857217915887
Episode 20: Reward = -1204.7999842257716
Episode 21: Reward = -936.2859230806695
Episode 22: Reward = -971.1713527952638
Episode 23: Reward = -1313.2294477219966
Episode 24: Reward = -1425.6190878585542
Episode 25: Reward = -1377.5492508650777
Ep

In [None]:
# Do I need to plot the learning rate and epsilon, given that I literally define how much it decays per episode?

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-episode total reward, with the early-stop threshold as a reference line.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(reward_history, label='Episode Reward')
# Read the threshold from max_reward so the line always matches the value the
# training loop actually stops on, instead of repeating the literal.
ax.axhline(y=max_reward, color='r', linestyle='--', label=f'Target Reward ({max_reward})')
ax.set_xlabel('Episode')
ax.set_ylabel('Total Reward')
ax.set_title('Pendulum DQN - Reward per Episode')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
def moving_average(data, window=10):
    """Return the simple moving average of ``data`` over ``window`` points.

    Only full windows are averaged, so the result has
    ``len(data) - window + 1`` entries.

    Args:
        data: 1-D sequence of numbers.
        window: Number of consecutive points per average (must be >= 1).

    Returns:
        A 1-D float array; empty when ``data`` has fewer than ``window`` points.

    Raises:
        ValueError: If ``window`` is less than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    values = np.asarray(data, dtype=float)
    # np.convolve swaps its operands when the kernel is the longer one, which
    # would return window - len(data) + 1 meaningless values; it also raises on
    # empty input. There is no full window in either case, so return nothing.
    if values.size < window:
        return np.empty(0, dtype=float)
    return np.convolve(values, np.ones(window) / window, mode='valid')

# Smooth the per-episode rewards with the default window before plotting.
smoothed_rewards = moving_average(reward_history)
plt.plot(smoothed_rewards, label='Moving Avg (10 episodes)')
