In [10]:
import gym

In [11]:
env = gym.make('CartPole-v0')

In [7]:
for episode in range(20):
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,info = env.step(action)
        if done:
            print("Game Over!!!  Episode :  {} Score : {}".format(episode,t))
            break
env.close()
print("All 20 episodes completed!!!!!!!!!")

Game Over!!!  Episode :  0 Score : 15
Game Over!!!  Episode :  1 Score : 27
Game Over!!!  Episode :  2 Score : 12
Game Over!!!  Episode :  3 Score : 11
Game Over!!!  Episode :  4 Score : 12
Game Over!!!  Episode :  6 Score : 19
Game Over!!!  Episode :  7 Score : 12
Game Over!!!  Episode :  8 Score : 39
Game Over!!!  Episode :  9 Score : 10
Game Over!!!  Episode :  10 Score : 11
Game Over!!!  Episode :  11 Score : 22
Game Over!!!  Episode :  12 Score : 17
Game Over!!!  Episode :  13 Score : 30
Game Over!!!  Episode :  14 Score : 16
Game Over!!!  Episode :  15 Score : 14
Game Over!!!  Episode :  16 Score : 12
Game Over!!!  Episode :  17 Score : 14
Game Over!!!  Episode :  18 Score : 28
Game Over!!!  Episode :  19 Score : 13
All 20 episodes completed!!!!!!!!!


In [12]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random

In [13]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 #discount factor
        self.epsilon = 1.0 #max exploration
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.0001
        self.model = self._create_model()
    
    def _create_model(self):
        #MLP classifier
        model = Sequential()
        model.add(Dense(24,input_dim = self.state_size , activation = "relu")) #hidden layer1
        model.add(Dense(24,activation="relu")) #hiddden layer2
        model.add(Dense(self.action_size,activation="linear"))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self,state,action,reward,next_state,done):
        #append in deque the eperience
        self.memory.append((state,action,reward,next_state,done))
        
    def act(self,state):
        #Epsilon Greedy Method
        if np.random.rand()<=self.epsilon: #random number b/w 0 and 1
            #We will Explore now that is take a random action
            return random.randrange(self.action_size) #random number in range(action_size)
        else :
            #we will exploit what we have learnt
            rewards_action = self.model.predict(state) # it gives 2D list batch_size*action_size here batch_size=1
            return np.argmax(rewards_action[0])  #max of left and right
        
    def train(self,batch_size=32):
        #Training using a replay buffer
        minibatch = random.sample(self.memory,batch_size) #select a random minibatch from memory
        for experience in minibatch:
            state,action,reward,next_state,done = experience
            if done:
                target = reward
            else :
                #Bellman Equation
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0]) #amax gives max in multidim arrayy
            target_f = self.model.predict(state)
            target_f[0][action] = target
            #For Neaural N/W X=state && Y=target_f , loss = (taget_f-target)^2
            self.model.fit(state,target_f,epochs=1,verbose=0) 
            # Epsilon decay
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
    
    def load(self,name):
       self.model.load_weights(name)
    
    def save(self,name):
        self.model.save_weights(name)
        

####Training the Deep Q Learner

In [14]:
no_of_episode = 1000
output_dir = "cartpole/"
state_size = 4
action_size = 2
batch_size = 32
#os.mkdir(output_dir)

In [15]:
agent = Agent(4,2)
done =False

In [None]:
for episode in range(no_of_episode):
    state = env.reset()
    state = np.reshape(state,[1,state_size])
    for time in range(500):
        env.render()
        action = agent.act(state) #select a action 0 or 1
        next_state,reward,done,other_info = env.step(action) #perform the action
        if done :
            reward = -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        if done :
            print("Game Over!!!  Episode :  {} Score : {} Explorate rate : {:.2}".format(episode,time,agent.epsilon))
            break      
    
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
    
    if episode%50 == 0 :
       agent.save(output_dir+"weights_"+'{:04d}'.format(episode)+".hdf5")
env.close()
        

Game Over!!!  Episode :  0 Score : 11 Explorate rate : 1.0
Game Over!!!  Episode :  1 Score : 15 Explorate rate : 1.0
Game Over!!!  Episode :  2 Score : 58 Explorate rate : 1.0
Game Over!!!  Episode :  3 Score : 9 Explorate rate : 0.01
Game Over!!!  Episode :  4 Score : 7 Explorate rate : 0.01
Game Over!!!  Episode :  5 Score : 7 Explorate rate : 0.01
Game Over!!!  Episode :  6 Score : 9 Explorate rate : 0.01
Game Over!!!  Episode :  7 Score : 7 Explorate rate : 0.01
Game Over!!!  Episode :  8 Score : 9 Explorate rate : 0.01
Game Over!!!  Episode :  9 Score : 8 Explorate rate : 0.01
Game Over!!!  Episode :  10 Score : 11 Explorate rate : 0.01
Game Over!!!  Episode :  11 Score : 10 Explorate rate : 0.01
Game Over!!!  Episode :  12 Score : 8 Explorate rate : 0.01
Game Over!!!  Episode :  13 Score : 8 Explorate rate : 0.01
Game Over!!!  Episode :  14 Score : 8 Explorate rate : 0.01
Game Over!!!  Episode :  15 Score : 8 Explorate rate : 0.01
Game Over!!!  Episode :  16 Score : 9 Explorate 

In [9]:
while 1:
    print("Welcome to cartpole-game")
    print("\nEnter your choice")
    print("\n1. Random Episode")
    print("\n2. Play with AI agent")
    print("\n3. Quit")

    inp = int(input())

    if inp == 1:
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation,reward,done,info = env.step(action)
            if done:
                print("Game Over!!! Score : {}".format(t))
                break
        env.close()
    elif inp == 2:
        agent.load(output_dir+"weights_"+'{:04d}'.format(250)+".hdf5")
        state = env.reset()
        state = np.reshape(state,[1,state_size])
        for time in range(500):
            env.render()
            action = agent.act(state) #select a action 0 or 1
            next_state,reward,done,other_info = env.step(action) #perform the action
            if done :
                reward = -10
            next_state = np.reshape(next_state,[1,state_size])
            agent.remember(state,action,reward,next_state,done)
            if done :
                print("Game Over!!!   Score : {} Explorate rate : {:.2}".format(time,agent.epsilon))
                break  
        env.close()
    else :
         break
    
    

Welcome to cartpole-game

Enter your choice

1. Random Episode

2. Play with AI agent

3. Quit
1
Game Over!!! Score : 32
Welcome to cartpole-game

Enter your choice

1. Random Episode

2. Play with AI agent

3. Quit
2


NameError: name 'agent' is not defined