<a href="https://colab.research.google.com/github/AqueeqAzam/reinforcement-learning-for-robotic/blob/main/reinforcement_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# `Simple Project`

In [4]:
import gym
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Hyperparameters
learning_rate = 0.01     # Adam step size
discount_factor = 0.95   # weight applied to the bootstrapped next-state value in train()
episodes = 1000          # number of episodes run by the training loop

# Create the environment
env = gym.make('CartPole-v1')

# Define the agent's network: one hidden layer followed by one *linear* output
# per discrete action.
#
# Why linear and not softmax:
#   * train() regresses the selected action's output toward a Bellman target
#     (reward + discount_factor * max over next-state outputs). With a reward
#     accumulated every step, that target is not confined to [0, 1], so a
#     softmax head cannot represent it.
#   * choose_action() hands the outputs to tf.random.categorical, which
#     interprets its input as unnormalized log-probabilities (logits), not as
#     probabilities.
model = Sequential([
  Dense(16, activation='relu', input_shape=env.observation_space.shape),
  Dense(env.action_space.n)  # one unbounded value per action
])

# Define the loss function (mean squared error between target and prediction).
# train() calls loss_fn(target_value, q_action) on a single scalar per sample.
# A categorical cross-entropy is the wrong tool there: softmax over a single
# element is always 1, so the loss is identically 0 and no gradient ever
# reaches the network.
loss_fn = tf.keras.losses.MeanSquaredError()

# Define the optimizer (Adam)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

def choose_action(state):
  """Sample one action for `state` from the module-level `model`.

  Args:
    state: a single observation; it is given a leading batch axis before being
      passed to `model` (presumably an array shaped like
      env.observation_space.shape -- confirm against the caller).

  Returns:
    The sampled action index as a NumPy scalar.
  """
  # The model is called on a batch, so wrap the lone observation in a batch of one.
  batched_state = tf.expand_dims(state, 0)
  model_output = model(batched_state)

  # NOTE(review): tf.random.categorical treats its first argument as
  # unnormalized log-probabilities. If the network ends in a softmax, the
  # distribution sampled here is softmax(probabilities), not the
  # probabilities themselves.
  samples = tf.random.categorical(model_output, num_samples=1)

  # `samples` holds one draw for the one batch row; unwrap it to a scalar.
  return samples.numpy()[0][0]

def train(state, action, reward, next_state, done):
  """Apply one gradient step to `model` from a single transition.

  The target is `reward` on a terminal step, otherwise
  `reward + discount_factor * max(model(next_state))`. The prediction is the
  model output selected by `action`. Their discrepancy is measured with the
  module-level `loss_fn` and minimized with the module-level `optimizer`.

  Args:
    state: observation the action was taken in.
    action: index of the action taken (one-hot encoded over env.action_space.n).
    reward: scalar reward received for the transition.
    next_state: observation that followed the action.
    done: truthy when the transition ended the episode (no bootstrapping).

  Returns:
    None. `model`'s trainable variables are updated in place.
  """
  # Mask that keeps only the output belonging to the action actually taken.
  action_mask = tf.one_hot(action, env.action_space.n)

  # The target is built outside the tape so no gradient flows through it.
  if not done:
    next_outputs = model(tf.expand_dims(next_state, axis=0))
    best_next = tf.reduce_max(next_outputs, axis=1)
    target_value = reward + discount_factor * best_next
  else:
    # Terminal step: the target is just the reward, as a float32 tensor of shape (1,).
    target_value = tf.constant([reward], dtype=tf.float32)

  weights = model.trainable_variables
  with tf.GradientTape() as tape:
    current_outputs = model(tf.expand_dims(state, axis=0))
    # Multiply by the mask and sum to pick out the taken action's output.
    predicted = tf.reduce_sum(current_outputs * action_mask, axis=1)
    loss = loss_fn(target_value, predicted)

  # Backpropagate and let the optimizer update the network weights.
  gradients = tape.gradient(loss, weights)
  optimizer.apply_gradients(zip(gradients, weights))

# Training loop
# Runs `episodes` episodes, training on every transition as it is observed and
# printing the undiscounted episode return. The try/finally guarantees the
# environment is closed even when the run is interrupted (e.g. Ctrl-C or a
# notebook "stop" raising KeyboardInterrupt).
try:
  for episode in range(episodes):
    # Older gym returns just the observation from reset(); gym >= 0.26 returns
    # an (observation, info) tuple. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    score = 0

    while not done:
      action = choose_action(state)

      # Older gym: (obs, reward, done, info).
      # gym >= 0.26: (obs, reward, terminated, truncated, info).
      step_result = env.step(action)
      if len(step_result) == 5:
        next_state, reward, terminated, truncated, _ = step_result
        # The old single `done` flag was also set on time-limit truncation,
        # so combining the two flags keeps the same episode-end behavior.
        done = terminated or truncated
      else:
        next_state, reward, done, _ = step_result
      score += reward

      train(state, action, reward, next_state, done)
      state = next_state

    print(f"Episode: {episode+1}, Score: {score}")
finally:
  env.close()




Episode: 1, Score: 9.0
Episode: 2, Score: 25.0
Episode: 3, Score: 14.0
Episode: 4, Score: 24.0
Episode: 5, Score: 54.0
Episode: 6, Score: 33.0
Episode: 7, Score: 15.0
Episode: 8, Score: 16.0
Episode: 9, Score: 11.0
Episode: 10, Score: 15.0
Episode: 11, Score: 11.0
Episode: 12, Score: 22.0
Episode: 13, Score: 9.0
Episode: 14, Score: 10.0
Episode: 15, Score: 10.0
Episode: 16, Score: 15.0
Episode: 17, Score: 9.0
Episode: 18, Score: 30.0
Episode: 19, Score: 11.0
Episode: 20, Score: 15.0
Episode: 21, Score: 18.0
Episode: 22, Score: 9.0
Episode: 23, Score: 12.0
Episode: 24, Score: 16.0
Episode: 25, Score: 25.0
Episode: 26, Score: 10.0
Episode: 27, Score: 12.0
Episode: 28, Score: 12.0
Episode: 29, Score: 12.0
Episode: 30, Score: 18.0
Episode: 31, Score: 10.0
Episode: 32, Score: 13.0
Episode: 33, Score: 9.0
Episode: 34, Score: 10.0
Episode: 35, Score: 18.0
Episode: 36, Score: 47.0
Episode: 37, Score: 8.0
Episode: 38, Score: 16.0
Episode: 39, Score: 16.0
Episode: 40, Score: 9.0
Episode: 41, Sco

KeyboardInterrupt: 