In [1]:
import os
import random
from collections import deque

import gym
import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

In [2]:
class Agent:
    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    A small fully connected Keras network maps a state to one Q-value per
    action. Transitions are kept in a bounded replay memory and random
    mini-batches of them are used to update the network.
    """

    def __init__(self,obs_size,action_size):
        """Set the hyper-parameters and create (or load) the Q-network.

        Args:
            obs_size: number of values that describe one state of the game.
            action_size: number of possible actions.
        """
        self.obs_size=obs_size                          #size of tuple used to define any state in the game
        self.action_size=action_size                    #no. of possible actions
        self.gamma=0.95                                 #discount factor
        #stores past experiences; once the memory is full the oldest
        #experiences are discarded so that only recent ones are replayed
        self.memory=deque(maxlen=2000)

        self.epsilon=1                                  #exploration factor, initially we do 100% exploration
        self.epsilon_decay=0.995                        #factor applied to epsilon after every call to fit()
        self.epsilon_min=0.01                           #minimum value of epsilon
        self.lr=0.001                                   #learning rate of the optimizer
        self.save_path="model.h5"
        self.model=self._create_model()

    def _create_model(self):
        """Return the Q-network: the saved one if present, otherwise a new one.

        If a file exists at ``self.save_path`` it is loaded so that training
        resumes from it (or so the trained model can be played). Otherwise a
        fresh network is built and compiled; delete the saved file to train
        from scratch.
        """
        if os.path.isfile(self.save_path):
            return self.load()

        model=Sequential()
        model.add(Dense(24,activation="relu",input_shape=(self.obs_size,)))
        model.add(Dense(24,activation="relu"))
        #linear output: one unbounded Q-value per action
        model.add(Dense(self.action_size,activation="linear"))
        model.compile(optimizer=Adam(learning_rate=self.lr),loss="mse")

        return model

    def remember(self,cur_state,action,reward,next_state,done):
        """Append one transition to the replay memory."""
        self.memory.append((cur_state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is
        returned (exploration); otherwise the action with the largest
        predicted Q-value is returned (exploitation).

        Args:
            state: array accepted by ``self.model.predict``; the training
                loop passes shape ``(1, obs_size)``.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand()<=self.epsilon:                                #exploration
            return random.randrange(self.action_size)

        #verbose=0 keeps Keras from printing a progress bar on every step
        return np.argmax(self.model.predict(state,verbose=0)[0])          #exploitation

    def fit(self,batch_size=32):
        """Update the network on a random mini-batch from the replay memory.

        For every sampled transition the Q-value of the taken action is
        moved towards ``reward`` (terminal transition) or
        ``reward + gamma * max Q(next_state)`` (otherwise); the Q-values of
        the other actions are left at their current predictions. After the
        batch, epsilon is decayed once but never below ``epsilon_min``.

        Does nothing if fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory)<batch_size:
            #random.sample would raise ValueError on a too-small population
            return

        batch=random.sample(self.memory,batch_size)

        for cur_state,action,reward,next_state,done in batch:

            if done:
                y_true=reward

            else:
                y_true=reward+(self.gamma*np.amax(self.model.predict(next_state,verbose=0)[0]))

            #only the entry of the taken action differs from the current
            #prediction, so the other outputs contribute no loss
            target=self.model.predict(cur_state,verbose=0)
            target[0][action]=y_true

            self.model.fit(cur_state,target,epochs=1,verbose=0)

        if self.epsilon>self.epsilon_min:
            #clamp so that the last decay step cannot undershoot the minimum
            self.epsilon=max(self.epsilon_min,self.epsilon*self.epsilon_decay)

    def save(self):
        """Write the current network to ``self.save_path``."""
        self.model.save(self.save_path)

    def load(self):
        """Load and return the network stored at ``self.save_path``."""
        return load_model(self.save_path)

In [3]:
#settings shared by the training and evaluation cells below
batch_size=32
done=False

#create the CartPole game environment and read the problem dimensions from it
env=gym.make("CartPole-v0")
#env.max_episode_steps=1000                              #to increase the no. of timesteps for which the game is played
action_size=env.action_space.n
observation_size=env.observation_space.shape[0]

#the agent creates its Q-network inside its constructor
agent=Agent(obs_size=observation_size,action_size=action_size)
agent

<__main__.Agent at 0x1c2e3da0940>

###### To train the model, run the code below:

In [4]:
#Train for up to 3000 episodes of at most 200 steps each. The try/finally
#guarantees that the model is saved and the render window is closed even
#when training is interrupted (e.g. Kernel -> Interrupt) or fails part-way.
try:
    for episode in range(3000):

        state=env.reset()                                 #initial state for each episode
        state=np.reshape(state,(1,observation_size))      #add the batch axis expected by the network

        for t in range(200):

            env.render()

            action=agent.act(state)
            next_state,reward,done,other_info=env.step(action)
            next_state=np.reshape(next_state,(1,observation_size))

            agent.remember(cur_state=state,action=action,reward=reward,next_state=next_state,done=done)
            state=next_state

            if done:
                print("Episode: ",episode," Score: ",t," Exploration factor: ",agent.epsilon)
                break

        #one replay update per episode, once enough experience has been collected
        if len(agent.memory)>=batch_size:
            agent.fit(batch_size)

finally:
    try:
        agent.save()
    finally:
        env.close()

Episode:  0  Score:  -199  Exploration factor:  1
Episode:  1  Score:  -199  Exploration factor:  0.995
Episode:  2  Score:  -199  Exploration factor:  0.990025
Episode:  3  Score:  -199  Exploration factor:  0.985074875
Episode:  4  Score:  -199  Exploration factor:  0.9801495006250001


KeyboardInterrupt: 

###### To run the game with the trained model, run the code below:

In [9]:
#Play one episode greedily (no exploration): always take the action with the
#highest predicted Q-value. The try/finally closes the render window even if
#the run is interrupted or fails.
try:
    for episode in range(1):

        state=env.reset()                                 #initial state for each episode
        state=np.reshape(state,(1,observation_size))      #add the batch axis expected by the network

        for t in range(5000):

            env.render()

            #verbose=0 keeps Keras from printing a progress bar on every step
            next_state,reward,done,other_info=env.step(np.argmax(agent.model.predict(state,verbose=0)[0]))
            next_state=np.reshape(next_state,(1,observation_size))

            state=next_state

            if done:
                print("Episode: ",episode," Score: ",t)
                break

finally:
    env.close()

Episode:  0  Score:  156
