In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam



from collections import deque
import numpy as np
import random

import gym
from gym import wrappers

from tqdm import tqdm


In [14]:
class QNetwork():
    def __init__(self, input_size, hidden_size, output_size):
        
        self.input_size = input_size

        # Create NN
        self.model = Sequential()  # Create the model, a *sequential* NN
        self.model.add(Dense(hidden_size, activation='relu', name="hidden")) # Create input layer, Dense indicating a fully connected layer
        self.model.add(Dense(output_size , activation='relu', name="output")) # Create output layer
        self.model.compile(loss="mse", optimizer=Adam(), jit_compile=True)
        self.model.build((1, input_size))
        self.model.summary()

        # Memory buffer
        self.replay_buffer = deque(maxlen=1000)
        
        # Hyperparameters
        self.epsilon = 0.6
        self.gamma = 0.8
        self.decay = 0.95

    def forward(self, input):
        return self.model.predict(input.reshape(1,4), verbose=0)

    def ep_greedy(self, Q_values):
        if random.random() < self.epsilon:
            return random.randint(0, 1)
        else: 
            return np.argmax(Q_values)
        
        self.epsilon *= self.decay

    def store_experience(self, state, action, reward, state_new, done):
        experience = (state, action, reward, state_new, done)
        self.replay_buffer.append(experience)
    
    def train(self, batch_size,e):
        if len(self.replay_buffer) > batch_size:
            experience_sample = random.sample(self.replay_buffer, batch_size)
            x = np.array([e[0] for e in experience_sample])

            # Construct target
            y = self.model.predict(x,verbose=0)
            x2 = np.array([e[3] for e in experience_sample])
            Q2 = self.gamma * np.max(self.model.predict(x2,verbose=0), axis=1)
            
            for i,(s,a,r,s2,d) in enumerate(experience_sample):
                y[i][a] = r
                if not d:
                    y[i][a] += Q2[i]

            # Update
            self.model.fit(x, y, batch_size=batch_size, epochs=e, verbose=0)



In [15]:
env = gym.make('CartPole-v1', render_mode="human")

s_size = env.observation_space.shape[0]
a_size = env.action_space.n
hidden_size = 8

net = QNetwork(s_size, hidden_size, a_size)

gamma = 0.85  

rewards = []

for g in tqdm(range(500)):
    
    game_reward = 0 
    
    done = False
    state = env.reset()[0]
    
    while not done:
        
        # Forward prop
        Q_values = net.forward(state)
        
        # Policy Decision
        Q_current = np.max(Q_values)
        action = net.ep_greedy(Q_values)
    
        # Next step
        state_next, reward, done, info, _ = env.step(action)
        
        net.store_experience(state, action, reward, state_next, done)

        #Reward count
        game_reward += reward
        
        
        state = state_next

    if done:
        rewards.append(game_reward)
        if g % 10 == 0:
            try:
                print(rewards[-10:])
            except:
                continue
            net.train(10, 5)
env.close()



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden (Dense)              (1, 8)                    40        
                                                                 
 output (Dense)              (1, 2)                    18        
                                                                 
Total params: 58
Trainable params: 58
Non-trainable params: 0
_________________________________________________________________


  0%|          | 0/500 [00:00<?, ?it/s]

[]


  1%|          | 5/500 [00:06<10:50,  1.31s/it]


KeyboardInterrupt: 