<a href="https://colab.research.google.com/github/AlfredoMijares/DeepLearning/blob/main/Ejercicio11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
from keras import models, layers
from keras.optimizers import Adam
from collections import deque
import random
import numpy as np
import matplotlib.pyplot as plt

class MountainCarTrain:
    """Deep Q-Network (DQN) trainer for a Gym environment with discrete actions.

    Two identically shaped Keras networks are kept: ``trainNetwork`` is
    fitted on mini-batches sampled from a replay buffer, while
    ``targetNetwork`` supplies the bootstrap Q-values and is synchronised
    with ``trainNetwork`` at the end of every episode.

    Network input/output sizes, the random-action range and the batch
    reshape are all derived from ``env.observation_space`` and
    ``env.action_space``. The shaped bonus reward assumes the first
    observation component is the car position, as in ``MountainCar-v0``.
    """

    # Position at or beyond which the shaped bonus reward is granted.
    GOAL_POSITION = 0.5
    # Bonus added to the environment reward once GOAL_POSITION is reached.
    GOAL_BONUS = 10

    def __init__(self, env):
        """Set hyper-parameters and build the online and target networks.

        Args:
            env: Gym-style environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()`` and ``step(action)``.
        """
        self.env = env
        self.gamma = 0.99  # discount factor for future rewards

        # Epsilon-greedy exploration: linear decay per episode, floored.
        self.epsilon = 1
        self.epsilon_decay = 0.05
        self.epsilon_min = 0.01

        self.learningRate = 0.001
        self.replayBuffer = deque(maxlen=20000)
        self.trainNetwork = self.createNetwork()

        self.episodeNum = 400
        self.iterationNum = 201  # max is 200
        self.numPickFromBuffer = 32  # mini-batch size

        self.targetNetwork = self.createNetwork()
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())

        # Total (shaped) reward of every finished episode, for plotting.
        self.rewards = []

    def createNetwork(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping an observation to
            one linear Q-value per action (hidden layers: 24 and 48 ReLU
            units, MSE loss, Adam optimizer).
        """
        model = models.Sequential()
        state_shape = self.env.observation_space.shape

        model.add(layers.Input(shape=state_shape))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(48, activation='relu'))
        model.add(layers.Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learningRate))
        return model

    def _reset_env(self):
        """Reset the environment and return the observation as a (1, d) array.

        Accepts both reset conventions: a bare observation, or an
        ``(observation, info)`` tuple.
        """
        result = self.env.reset()
        if isinstance(result, tuple):
            result = result[0]
        return np.asarray(result).reshape(1, -1)

    def _step_env(self, action):
        """Advance the environment one step.

        Accepts both step conventions: the 4-tuple
        ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)``.

        Returns:
            ``(new_state, reward, done)`` with ``new_state`` shaped (1, d).
        """
        result = self.env.step(action)
        if len(result) == 5:
            observation, reward, terminated, truncated, _ = result
            done = terminated or truncated
        else:
            observation, reward, done, _ = result
        return np.asarray(observation).reshape(1, -1), reward, bool(done)

    def getBestAction(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation shaped (1, d), as fed to the network.

        Returns:
            int: A uniformly random action with probability ``epsilon``,
            otherwise the argmax of the online network's Q-values.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if np.random.rand() < self.epsilon:
            # Sample over the environment's real action count, not a literal.
            return int(np.random.randint(0, self.env.action_space.n))

        # verbose=0: a per-step progress bar floods the output and is slow.
        q_values = self.trainNetwork.predict(state, verbose=0)[0]
        return int(np.argmax(q_values))

    def trainFromBuffer(self):
        """Fit the online network on one mini-batch from the replay buffer.

        Does nothing until the buffer holds at least ``numPickFromBuffer``
        transitions. For each sampled transition only the Q-value of the
        action taken is replaced: by ``reward`` when the transition ended
        the episode, otherwise by ``reward + gamma * max_a Q_target(s', a)``.
        """
        batch_size = self.numPickFromBuffer
        if len(self.replayBuffer) < batch_size:
            return

        samples = random.sample(self.replayBuffer, batch_size)

        # Stored states are (1, d); flatten each batch to (batch_size, d).
        states = np.array([s[0] for s in samples]).reshape(batch_size, -1)
        new_states = np.array([s[3] for s in samples]).reshape(batch_size, -1)
        actions = np.array([s[1] for s in samples], dtype=int)
        rewards = np.array([s[2] for s in samples], dtype=float)
        dones = np.array([s[4] for s in samples], dtype=bool)

        # Copy so the targets array is writable and owned by this method.
        targets = np.array(self.trainNetwork.predict(states, verbose=0))
        next_q = np.asarray(self.targetNetwork.predict(new_states, verbose=0))
        best_next_q = next_q.max(axis=1)

        # Overwrite only the taken action's Q-value in each row.
        targets[np.arange(batch_size), actions] = np.where(
            dones, rewards, rewards + self.gamma * best_next_q
        )

        self.trainNetwork.fit(states, targets, epochs=1, verbose=0)

    def orginalTry(self, currentState, eps):
        """Run one training episode.

        Each step stores the transition in the replay buffer and trains on a
        mini-batch. When the episode ends, its total shaped reward is
        appended to ``self.rewards``, the target network is synchronised and
        epsilon is decayed (never below ``epsilon_min``).

        Args:
            currentState: Initial observation shaped (1, d).
            eps: Episode index; unused, kept for interface compatibility.
        """
        rewardSum = 0

        for _ in range(self.iterationNum):
            bestAction = self.getBestAction(currentState)
            new_state, reward, done = self._step_env(bestAction)

            # Reward shaping: bonus for reaching the goal position.
            if new_state[0][0] >= self.GOAL_POSITION:
                reward += self.GOAL_BONUS

            self.replayBuffer.append([currentState, bestAction, reward, new_state, done])
            self.trainFromBuffer()

            rewardSum += reward
            currentState = new_state

            if done:
                break

        self.rewards.append(rewardSum)  # Store the total reward for the episode

        # Sync the target network with the freshly trained online network.
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())
        # Clamp here so epsilon never holds an out-of-range value.
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def start(self):
        """Train for ``episodeNum`` episodes, then plot the reward curve."""
        for eps in range(self.episodeNum):
            self.orginalTry(self._reset_env(), eps)

        # After training, plot the rewards
        self.plot_rewards()

    def plot_rewards(self):
        """Plot the total reward of each episode with matplotlib."""
        plt.plot(self.rewards)
        plt.xlabel('Episodes')
        plt.ylabel('Total Reward')
        plt.title('Total Reward per Episode')
        plt.grid()
        plt.show()


# Build the MountainCar environment, wrap it in the DQN trainer and run the
# full training loop (which ends by plotting the per-episode rewards).
env = gym.make('MountainCar-v0')
dqn = MountainCarTrain(env)
dqn.start()

  from jax import xla_computation as _xla_computation
  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

KeyboardInterrupt: 