# Implementing Q-Learning in Keras

In [2]:
import os

# Runtime switches that must be in place before TensorFlow is imported further
# down: set the oneDNN-ops flag to '0' and expose no CUDA device ('-1').
os.environ.update({
    'TF_ENABLE_ONEDNN_OPTS': '0',
    'CUDA_VISIBLE_DEVICES': '-1',
})

In [3]:
%pip install gym
!pip install gymnasium
!pip install numpy==1.21.6  
!pip install tensorflow==2.10.0  

Collecting gym
  Using cached gym-0.26.2.tar.gz (721 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cloudpickle>=1.2.0 (from gym)
  Using cached cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting gym_notices>=0.0.4 (from gym)
  Using cached gym_notices-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Using cached cloudpickle-3.1.2-py3-none-any.whl (22 kB)
Using cached gym_notices-0.1.0-py3-none-any.whl (3.3 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml): started
  Building wheel for gym (pyproject.toml): finished with status 'done'
  Created wheel for gym: filename=gym-0.26.2-py3-none-any.whl size=827739 sha256=47269530fb267c7964215bcddcf85167c32

ERROR: Ignored the following yanked versions: 2.4.0
ERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11; 1.26.0 Requires-Python >=3.9,<3.13; 1.26.1 Requires-Python >=3.9,<3.13
ERROR: Could not find a version that satisfies the requirement numpy==1.21.6 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.3, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 

In [4]:
import random
import sys

import gymnasium as gym
import numpy as np

# Set the interpreter's recursion limit to 1500.
sys.setrecursionlimit(1500)

# Create the environment
env = gym.make('CartPole-v1')

# Set random seeds for reproducibility.
# The stdlib `random` module is seeded too, because later cells draw from it
# (random.sample for replay minibatches, random.randrange for exploration);
# seeding NumPy alone leaves those draws different on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
env.action_space.seed(SEED)
env.observation_space.seed(SEED)

42

## Define the Q-Learning Model

In [5]:
# Silence warnings for a cleaner notebook or console experience
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a stand-in for warnings.warn."""
    return None


# Install a blanket "ignore" filter, then replace the module-level warn
# function with the no-op above so direct calls to it are dropped as well.
warnings.simplefilter('ignore')
warnings.warn = warn

# Import necessary libraries for the Q-Learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input  # Import Input layer
from tensorflow.keras.optimizers import Adam

In [7]:
def build_model(state_size, action_size, hidden_units=(24, 24), learning_rate=0.001):
    """Build and compile the fully connected Q-network.

    The network takes a state vector and outputs one linear value per action.

    Args:
        state_size: Length of the input state vector.
        action_size: Number of output units (one per discrete action).
        hidden_units: Width of each hidden ReLU layer, in order. The default
            ``(24, 24)`` reproduces the original two-layer architecture.
        learning_rate: Learning rate passed to the Adam optimizer
            (default 0.001, the value previously hard-coded).

    Returns:
        A compiled ``Sequential`` model using mean-squared-error loss.
    """
    model = Sequential()
    model.add(Input(shape=(state_size,)))  # Use Input layer for defining input shape
    for units in hidden_units:
        model.add(Dense(units, activation='relu'))
    # Linear output: the values are regression targets, not probabilities.
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow in one call so the network's initial
# weights are the same on every run; without this the layers are initialised
# from an unseeded TensorFlow generator.
set_random_seed(42)

env = gym.make('CartPole-v1')       # create the environment (rebinds `env`)
state_size = int(env.observation_space.shape[0])  # get size of state space
action_size = int(env.action_space.n)    # get size of action space
model = build_model(state_size, action_size)  # build the Q-learning model
print(f"State size: {state_size}, Action size: {action_size}")
model.summary()

State size: 4, Action size: 2


- env = gym.make('CartPole-v1'): Membuat environment CartPole-v1 dari Gymnasium (penerus OpenAI Gym, diimpor sebagai `gym`).
- state_size = env.observation_space.shape[0]: Mendapatkan dimensi vektor state (jumlah fitur observasi), mis. 4 untuk CartPole (posisi cart, kecepatan cart, sudut tiang, kecepatan sudut tiang).
- action_size = env.action_space.n: Mendapatkan jumlah aksi diskrit yang tersedia (2 untuk CartPole: kiri atau kanan).
- model = build_model(state_size, action_size): Membangun model jaringan saraf (Q-network) dengan input berukuran state_size dan output berukuran action_size, untuk memetakan state → Q-value tiap aksi.

## Implement the Q-Learning Algorithm

In [10]:
import random
import numpy as np
from collections import deque
import tensorflow as tf

# Exploration settings: starting epsilon, its lower bound, and the
# multiplicative decay factor
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.99

# Replay memory: bounded buffer that keeps at most 2000 entries
memory = deque([], 2000)

def remember(state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) tuple to the replay memory."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)
    
def replay(batch_size, gamma=0.95):
    """Fit the Q-network on one random minibatch from the replay memory.

    Returns immediately while the memory holds fewer than ``batch_size``
    experiences. Otherwise it samples a minibatch, builds the targets
    ``reward + gamma * max_a Q(next_state, a)`` (just ``reward`` for entries
    flagged done), overwrites only the taken action's Q-value with that
    target, fits the model for one epoch, and decays the global ``epsilon``.

    Args:
        batch_size: Number of experiences to sample.
        gamma: Discount factor applied to the best next-state Q-value
            (default 0.95, the value previously hard-coded).
    """
    global epsilon
    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)

    # Unzip the sampled experiences into one array per field. States are
    # stored with a leading batch axis of 1, so flatten them to
    # (batch_size, state_size).
    states, actions, rewards, next_states, dones = zip(*minibatch)
    states = np.array(states).reshape(batch_size, state_size)
    next_states = np.array(next_states).reshape(batch_size, state_size)
    actions = np.array(actions)
    rewards = np.array(rewards)
    dones = np.array(dones, dtype=bool)

    # Predict Q-values for the next states and the current states in batch
    q_next = model.predict(next_states, verbose=0)
    q_target = model.predict(states, verbose=0)

    # Vectorized target update: bootstrap from the best next-state value
    # unless the experience is flagged done, then write the target into the
    # column of the action that was actually taken.
    best_next = np.amax(q_next, axis=1)
    targets = np.where(dones, rewards, rewards + gamma * best_next)
    q_target[np.arange(batch_size), actions] = targets

    # Train the model with the updated targets in batch
    model.fit(states, q_target, epochs=1, verbose=0)

    # Decay exploration, clamped so epsilon never drops below epsilon_min.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)


def act(state):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: Current observation, shaped as a single-row batch
            (the caller reshapes it to ``[1, state_size]``).

    Returns:
        A plain ``int`` action index: a uniformly random action with
        probability ``epsilon``, otherwise the action with the highest
        predicted Q-value.
    """
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)  # Explore: select a random action
    # Exploit. Call the model directly instead of model.predict(): for a single
    # small input this avoids predict()'s per-call overhead, which adds up
    # when it runs once per environment step.
    q_values = np.asarray(model(state, training=False))
    return int(np.argmax(q_values[0]))  # Action with the highest Q-value
    

# Define the number of episodes you want to train the model for
episodes = 10
train_frequency = 5  # Train the model every 5 steps
max_steps = 200      # Limit to 200 time steps per episode
batch_size = 64      # Minibatch size used for each replay() call

for e in range(episodes):
    # Gymnasium seeds an environment through reset(); seed only the first
    # reset so the whole sequence of episodes is reproducible.
    state, _ = env.reset(seed=42 if e == 0 else None)
    state = np.reshape(state, [1, state_size])
    for time in range(max_steps):
        action = act(state)         # Choose action
        next_state, reward, terminated, truncated, _ = env.step(action)  # Take action
        done = terminated or truncated
        # Penalize only a real failure. A truncated episode hit the time limit
        # without failing and must not be punished.
        reward = -10 if terminated else reward
        next_state = np.reshape(next_state, [1, state_size])  # Reshape next state
        # Store `terminated` rather than `done`: the stored flag decides whether
        # replay() bootstraps from next_state, which should be skipped only for
        # true terminal states.
        remember(state, action, reward, next_state, terminated)  # Store experience
        state = next_state  # Transition to the next state

        if done:
            print(f"Episode: {e+1}/{episodes}, score: {time}, e: {epsilon:.2}")
            break

        # Train the model every 'train_frequency' steps
        if time % train_frequency == 0:
            replay(batch_size=batch_size)
    else:
        # The step cap was reached without the episode ending; report it too
        # so successful episodes are not silently missing from the log.
        print(f"Episode: {e+1}/{episodes}, score: {time}, e: {epsilon:.2}")
env.close()

Episode: 1/10, score: 14, e: 1.0
Episode: 2/10, score: 31, e: 1.0
Episode: 3/10, score: 27, e: 0.98
Episode: 4/10, score: 24, e: 0.93
Episode: 5/10, score: 61, e: 0.82
Episode: 6/10, score: 38, e: 0.75
Episode: 7/10, score: 9, e: 0.74
Episode: 8/10, score: 9, e: 0.72
Episode: 9/10, score: 11, e: 0.7
Episode: 10/10, score: 49, e: 0.64
