<a href="https://colab.research.google.com/github/AdarshKammar/Python/blob/main/Gym's_CartPole_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Build the CartPole-v1 environment and read the sizes the network needs:
# the length of one observation vector and the number of discrete actions.
env = gym.make('CartPole-v1')
num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.n

# Q-network: two ReLU hidden layers followed by one linear output per action,
# so each output can be read as an unbounded Q-value estimate.
model = keras.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(num_inputs,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(num_actions, activation='linear'))

# Mean squared error between predicted and target Q-values, minimised by Adam.
loss_fn = keras.losses.MeanSquaredError()
optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Training loop: online one-step Q-learning with an epsilon-greedy policy.
num_episodes = 500  # Adjust as needed
gamma = 0.99       # Discount factor
epsilon_start = 1.0 # Initial exploration rate
epsilon_end = 0.01  # Final exploration rate
# NOTE: epsilon decays per *episode*, so with num_episodes = 500 this rate only
# brings epsilon down to about 0.61 by the last episode; raise it (or train for
# more episodes) if the agent should end up mostly exploiting.
epsilon_decay = 0.001 # Decay rate for epsilon


def _reset_env(environment):
    """Reset *environment* and return only the initial observation.

    Older Gym releases return the observation directly, while Gym >= 0.26 and
    Gymnasium return an ``(observation, info)`` tuple; both are accepted.
    """
    result = environment.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(environment, action):
    """Advance *environment* by one step.

    Returns ``(observation, reward, terminated, truncated)`` regardless of
    whether ``step`` yields the legacy 4-tuple ``(obs, reward, done, info)``
    or the newer 5-tuple ``(obs, reward, terminated, truncated, info)``.
    """
    result = environment.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
    else:
        observation, reward, done, info = result
        # The legacy TimeLimit wrapper reports a pure time-limit cutoff via
        # this info key; when it is absent, `done` is treated as terminal.
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done) and not truncated
    return observation, reward, bool(terminated), bool(truncated)


try:
    for episode in range(num_episodes):
        state = np.reshape(_reset_env(env), [1, num_inputs])
        total_reward = 0
        done = False

        # Exponentially anneal the exploration rate from epsilon_start
        # towards epsilon_end as episodes progress.
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-epsilon_decay * episode)  # Epsilon decay

        while not done:
            # Choose action (using epsilon-greedy for exploration)
            if np.random.rand() < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                q_values = model(state)
                action = int(np.argmax(q_values))   # Exploit

            # Take action and observe new state and reward
            next_state, reward, terminated, truncated = _step_env(env, action)
            next_state = np.reshape(next_state, [1, num_inputs])
            done = terminated or truncated

            total_reward += reward

            # Q-learning target. Bootstrapping from the next state is skipped
            # only on a true terminal state; a time-limit truncation still
            # has future value, so it keeps the bootstrap term.
            if terminated:
                target = reward
            else:
                q_values_next = model(next_state)
                max_q_next = np.max(q_values_next)
                target = reward + gamma * max_q_next

            # Shape (1,) so it lines up with the single gathered Q-value below.
            target = np.array([target], dtype=np.float32)

            # Train the model: only the Q-value of the action actually taken
            # is regressed towards the target. The target was computed
            # outside the tape, so no gradient flows through it.
            with tf.GradientTape() as tape:
                q_values = model(state)
                loss = loss_fn(target, tf.gather(q_values[0], [action]))

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            state = next_state

        print(f"Episode: {episode+1}, Total Reward: {total_reward}, Epsilon: {epsilon}")

    # Save the trained model (optional)
    model.save("cartpole_model.h5")
finally:
    # Release the environment even if training or saving raised.
    env.close()  # Close the environment

Episode: 1, Total Reward: 10.0, Epsilon: 1.0
Episode: 2, Total Reward: 20.0, Epsilon: 0.9990104948350412
Episode: 3, Total Reward: 13.0, Epsilon: 0.9980219786806598
Episode: 4, Total Reward: 20.0, Epsilon: 0.9970344505483393
Episode: 5, Total Reward: 36.0, Epsilon: 0.9960479094505515
Episode: 6, Total Reward: 23.0, Epsilon: 0.9950623544007555
Episode: 7, Total Reward: 10.0, Epsilon: 0.9940777844133959
Episode: 8, Total Reward: 19.0, Epsilon: 0.9930941985039028
Episode: 9, Total Reward: 25.0, Epsilon: 0.99211159568869
Episode: 10, Total Reward: 15.0, Epsilon: 0.9911299749851548
Episode: 11, Total Reward: 11.0, Epsilon: 0.9901493354116764
Episode: 12, Total Reward: 12.0, Epsilon: 0.9891696759876151
Episode: 13, Total Reward: 11.0, Epsilon: 0.9881909957333113
Episode: 14, Total Reward: 18.0, Epsilon: 0.9872132936700847
Episode: 15, Total Reward: 48.0, Epsilon: 0.9862365688202333
Episode: 16, Total Reward: 16.0, Epsilon: 0.985260820207032
Episode: 17, Total Reward: 15.0, Epsilon: 0.9842860



Episode: 500, Total Reward: 169.0, Epsilon: 0.6110661188014018
