In [1]:
import gym
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam, SGD
from keras.models import Sequential

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Let's build a DQN agent 

In [2]:
class DQNAgent:
        def __init__(self,state_size,action_size,path=None):
            self.render=False

            self.path=path
            self.state_size=state_size
            self.action_size=action_size

            self.discount_factor=0.95
            self.learning_rate=0.001
            self.epsilon=1  #0.41#1
            self.epsilon_decay=0.99999#0.99999
            self.epsilon_min=0.01
            self.batch_size=256
            self.start_train=1000
            self.memory=deque(maxlen=25000)
            self.model=self.build_model()
            self.frozen_target_model=self.build_model()

            self.update_target_model()

            self.update_counter=0

            if (self.path!=None):
                self.model.load_weights(self.path)
                self.frozen_target_model.load_weights(self.path)
        
        def build_model(self):
            model=Sequential()
            model.add(Dense(50,input_shape=(self.state_size,),activation="relu"))
            #model.add(Dense(24,activation="relu"))
            model.add(Dense(self.action_size,activation="linear"))
            model.summary()
            model.compile(loss='mean_squared_error',optimizer=Adam(lr=self.learning_rate))
            return model
        def update_target_model(self):
            self.frozen_target_model.set_weights(self.model.get_weights())
        
        def eps_greedy_policy(self,state):
            if (self.epsilon<self.epsilon_min):
                self.epsilon=self.epsilon_min
            if (np.random.rand()<=self.epsilon):
                return (random.randint(0,2))#((((1-(-1))*(random.random()-0))/(1-0))+(-1))
            else :
                state=np.reshape(state,(1,self.state_size))
                q_value=self.model.predict(state)
                return (np.argmax(q_value[0]))
        def greedy_policy(self,state):
            state=np.reshape(state,(1,self.state_size))
            q_value=self.model.predict(state)
            return (np.argmax(q_value[0]))
        
        def memorize_sample(self,state,action,reward,next_state,done):
                self.memory.append((state,action,reward,next_state,done))
                if len(self.memory)>10000:
                    self.memory.popleft()
                if (self.epsilon> self.epsilon_min):
                    self.epsilon*=self.epsilon_decay
        
        def train_model(self):
            if len(self.memory)< self.start_train:
                return
            mini_batch=random.sample(self.memory,self.batch_size)
            first_state=np.zeros((self.batch_size,self.state_size))
            next_state=np.zeros((self.batch_size,self.state_size))
            actions,reward,done=[],[],[]
            for i in range (self.batch_size):
                first_state[i]=mini_batch[i][0]
                actions.append(mini_batch[i][1])
                reward.append(mini_batch[i][2])
                next_state[i]=mini_batch[i][3]
                done.append(mini_batch[i][4])
            q_value=self.model.predict(first_state)
            q_value_next=self.frozen_target_model.predict(next_state)
            
            for i in range(self.batch_size):
                """
                if (next_state[i][0]):
                    q_value[i][action[i]]= reward[i]
                    self.update_counter+=1
                else:
                """
                q_value[i][actions[i]]= reward[i]+self.discount_factor*(np.amax(q_value_next[i]))
                
                
            self.model.fit(first_state,q_value,batch_size=self.batch_size,epochs=1,verbose=0)
            """
            if (self.update_counter%500==0):
                self.update_target_model()
            """
        """
        def load_weights(self):
            if (self.path!=None):
                self.model.load_weights(self.path)
        """

In [30]:
if __name__=="__main__":
    episodes_n=1000
    weights_path="D:/RL_CartPole_agent_weights/MountainCarContinuous_dqn2.h5"
    env=gym.make("MountainCar-v0")
    state_size=env.observation_space.shape[0]
    action_size=env.action_space.n
    
    agent=DQNAgent(state_size,action_size)#,path="D:/RL_CartPole_agent_weights/MountainCarContinuous_dqn.h5")
    #agent.load_weights()
    max_pos=-0.4
    success=[]
    steps=200
    
    
    for e in range(episodes_n):
        current_reward=0
        state=env.reset()
        if (e%10==0):
            agent.update_target_model()
        #done=False
        #while(not done):
        for step in range(steps):
            if (agent.render):
                env.render()
            
            action=agent.eps_greedy_policy(np.reshape(state,(1,state_size)))
            next_state,reward,done,_=env.step(action)
            # Adjust reward based on car position
            if (next_state[0]>max_pos):
                max_pos=next_state[0]
                reward+=10
            else:
                reward=reward
            if (next_state[0]>=0.5):
                reward+=100
                success.append(e)
            current_reward+=reward
       
            agent.memorize_sample(state,action,reward,next_state,done)
            agent.train_model()
            state=next_state
        print("episode {}, mean reward={},epsilon={}".format(e,current_reward/200,agent.epsilon))
        if e % 10 == 0:
            agent.model.save_weights(weights_path)
    
        
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 50)                150       
_________________________________________________________________
dense_34 (Dense)             (None, 3)                 153       
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 50)                150       
_________________________________________________________________
dense_36 (Dense)             (None, 3)                 153       
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________
episode 0, mean reward=-0.2,epsilon=0.9980019886872552
episode 1, mean reward=-1

episode 126, mean reward=-1.0,epsilon=0.7756908169128434
episode 127, mean reward=-1.0,epsilon=0.7741409778854593
episode 128, mean reward=-1.0,epsilon=0.7725942354539854
episode 129, mean reward=-1.0,epsilon=0.771050583431387
episode 130, mean reward=-1.0,epsilon=0.7695100156429925
episode 131, mean reward=-1.0,epsilon=0.7679725259264679
episode 132, mean reward=-1.0,epsilon=0.7664381081317896
episode 133, mean reward=-1.0,epsilon=0.7649067561212238
episode 134, mean reward=-1.0,epsilon=0.7633784637692994
episode 135, mean reward=-1.0,epsilon=0.7618532249627826
episode 136, mean reward=-1.0,epsilon=0.7603310336006563
episode 137, mean reward=-1.0,epsilon=0.7588118835940915
episode 138, mean reward=-1.0,epsilon=0.7572957688664254
episode 139, mean reward=-1.0,epsilon=0.7557826833531376
episode 140, mean reward=-1.0,epsilon=0.7542726210018217
episode 141, mean reward=-1.0,epsilon=0.7527655757721672
episode 142, mean reward=-1.0,epsilon=0.7512615416359297
episode 143, mean reward=-1.0,ep

episode 271, mean reward=-1.0,epsilon=0.5804203363861905
episode 272, mean reward=-1.0,epsilon=0.5792606499879438
episode 273, mean reward=-1.0,epsilon=0.5781032806562408
episode 274, mean reward=-1.0,epsilon=0.5769482237615552
episode 275, mean reward=-1.0,epsilon=0.5757954746836118
episode 276, mean reward=-1.0,epsilon=0.5746450288113678
episode 277, mean reward=-1.0,epsilon=0.5734968815429914
episode 278, mean reward=-1.0,epsilon=0.5723510282858457
episode 279, mean reward=-1.0,epsilon=0.5712074644564694
episode 280, mean reward=-1.0,epsilon=0.5700661854805614
episode 281, mean reward=-1.0,epsilon=0.5689271867929586
episode 282, mean reward=-1.0,epsilon=0.5677904638376176
episode 283, mean reward=-1.0,epsilon=0.5666560120676015
episode 284, mean reward=-1.0,epsilon=0.5655238269450559
episode 285, mean reward=-1.0,epsilon=0.5643939039411923
episode 286, mean reward=-1.0,epsilon=0.5632662385362731
episode 287, mean reward=-1.0,epsilon=0.5621408262195919
episode 288, mean reward=-1.0,e

episode 415, mean reward=-1.0,epsilon=0.43517624891897544
episode 416, mean reward=-1.0,epsilon=0.43430676185059747
episode 417, mean reward=-1.0,epsilon=0.43343901202721846
episode 418, mean reward=-1.0,epsilon=0.43257299597780324
episode 419, mean reward=-1.0,epsilon=0.43170871023825186
episode 420, mean reward=-1.0,epsilon=0.4308461513513858
episode 421, mean reward=-1.0,epsilon=0.42998531586693334
episode 422, mean reward=-1.0,epsilon=0.4291262003415175
episode 423, mean reward=-1.0,epsilon=0.4282688013386398
episode 424, mean reward=-1.0,epsilon=0.4274131154286699
episode 425, mean reward=-1.0,epsilon=0.4265591391888281
episode 426, mean reward=-1.0,epsilon=0.4257068692031748
episode 427, mean reward=-1.0,epsilon=0.42485630206259395
episode 428, mean reward=-1.0,epsilon=0.4240074343647822
episode 429, mean reward=-0.85,epsilon=0.4231602627142337
episode 430, mean reward=-1.0,epsilon=0.4223147837222271
episode 431, mean reward=-1.0,epsilon=0.4214709940068111
episode 432, mean rewar

episode 558, mean reward=-1.0,epsilon=0.326931179646213
episode 559, mean reward=-1.0,epsilon=0.3262779674507909
episode 560, mean reward=-1.0,epsilon=0.3256260603807251
episode 561, mean reward=-1.0,epsilon=0.3249754558283598
episode 562, mean reward=-1.0,epsilon=0.32432615119125024
episode 563, mean reward=-1.0,epsilon=0.32367814387215094
episode 564, mean reward=-1.0,epsilon=0.3230314312790066
episode 565, mean reward=-1.0,epsilon=0.3223860108249391
episode 566, mean reward=-1.0,epsilon=0.32174187992824016
episode 567, mean reward=-1.0,epsilon=0.32109903601235984
episode 568, mean reward=-1.0,epsilon=0.3204574765058953
episode 569, mean reward=-1.0,epsilon=0.31981719884258314
episode 570, mean reward=-1.0,epsilon=0.3191782004612854
episode 571, mean reward=-1.0,epsilon=0.3185404788059825
episode 572, mean reward=-1.0,epsilon=0.3179040313257608
episode 573, mean reward=-1.0,epsilon=0.31726885547480493
episode 574, mean reward=-1.0,epsilon=0.31663494871238457
episode 575, mean reward=

episode 701, mean reward=-1.0,epsilon=0.24561082202986478
episode 702, mean reward=-1.0,epsilon=0.24512008882891667
episode 703, mean reward=-1.0,epsilon=0.2446303361184556
episode 704, mean reward=-1.0,epsilon=0.2441415619394505
episode 705, mean reward=-1.0,epsilon=0.2436537643367844
episode 706, mean reward=-1.0,epsilon=0.24316694135924677
episode 707, mean reward=-1.0,epsilon=0.24268109105952562
episode 708, mean reward=-1.0,epsilon=0.24219621149419973
episode 709, mean reward=-1.0,epsilon=0.2417123007237305
episode 710, mean reward=-1.0,epsilon=0.24122935681245508
episode 711, mean reward=-1.0,epsilon=0.24074737782857783
episode 712, mean reward=-1.0,epsilon=0.24026636184416275
episode 713, mean reward=-1.0,epsilon=0.23978630693512618
episode 714, mean reward=-1.0,epsilon=0.2393072111812284
episode 715, mean reward=-1.0,epsilon=0.23882907266606693
episode 716, mean reward=-1.0,epsilon=0.23835188947706784
episode 717, mean reward=-1.0,epsilon=0.23787565970547858
episode 718, mean r

episode 843, mean reward=-1.0,epsilon=0.18488737186118503
episode 844, mean reward=-1.0,epsilon=0.18451796480062288
episode 845, mean reward=-1.0,epsilon=0.18414929581954664
episode 846, mean reward=-1.0,epsilon=0.18378136344326548
episode 847, mean reward=-1.0,epsilon=0.18341416620003442
episode 848, mean reward=-1.0,epsilon=0.18304770262104916
episode 849, mean reward=-1.0,epsilon=0.18268197124044036
episode 850, mean reward=1.5,epsilon=0.18231697059526755
episode 851, mean reward=-1.0,epsilon=0.18195269922551285
episode 852, mean reward=18.5,epsilon=0.18158915567407588
episode 853, mean reward=22.5,epsilon=0.18122633848676756
episode 854, mean reward=-1.0,epsilon=0.1808642462123039
episode 855, mean reward=-1.0,epsilon=0.18050287740230078
episode 856, mean reward=-1.0,epsilon=0.1801422306112683
episode 857, mean reward=-1.0,epsilon=0.179782304396604
episode 858, mean reward=23.0,epsilon=0.17942309731858838
episode 859, mean reward=-1.0,epsilon=0.17906460794037798
episode 860, mean r

episode 986, mean reward=20.0,epsilon=0.1388987720134489
episode 987, mean reward=21.0,epsilon=0.13862125069563977
episode 988, mean reward=15.5,epsilon=0.1383442838685631
episode 989, mean reward=22.5,epsilon=0.13806787042434024
episode 990, mean reward=17.5,epsilon=0.1377920092573061
episode 991, mean reward=17.0,epsilon=0.13751669926400426
episode 992, mean reward=20.5,epsilon=0.13724193934318354
episode 993, mean reward=21.5,epsilon=0.1369677283957929
episode 994, mean reward=20.0,epsilon=0.1366940653249772
episode 995, mean reward=21.0,epsilon=0.1364209490360728
episode 996, mean reward=19.0,epsilon=0.1361483784366037
episode 997, mean reward=20.0,epsilon=0.13587635243627538
episode 998, mean reward=-1.0,epsilon=0.1356048699469731
episode 999, mean reward=-1.0,epsilon=0.13533392988275578


In [3]:
env=gym.make("MountainCar-v0")
state_size=env.observation_space.shape[0]
action_size=env.action_space.n
agent=DQNAgent(state_size,action_size,path="D:/RL_CartPole_agent_weights/MountainCarDiscrete_dqn.h5")



    


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                150       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 153       
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 50)                150       
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 153       
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________


In [7]:
state=env.reset()
state=np.reshape(state,(1,agent.state_size))
while(state[0][0]<0.5):
        env.render()
        action=agent.greedy_policy(state)
        next_state,reward,done,_=env.step(np.array(action))
        next_state=np.reshape(next_state,(1,state_size))
        state=next_state
env.close()