### This is my attempt to implement REINFORCE algorithm, a gradient based algorithm.

In [177]:
import gym
import random
import numpy as np
from keras.layers import Dense,Input
from keras.optimizers import Adam
from keras.models import Sequential,Model
from keras import backend as K
from keras import losses
from keras import utils 

In [311]:
class Reinforce_agent:
    def __init__(self,state_size,action_size,path=None):
        self.path=path
        self.state_size=state_size
        self.action_size=action_size
        self.model=self.build_model()
        self.discount_factor=0.95
        self.optimizer=Adam(lr=0.0006)
      

        if (self.path!=None):
            self.model.load_weights(self.path)
    """  
    def build_model(self):
        x= Input(shape=(self.state_size,))
        y1=Dense(50,activation="relu")(x)
        y2=Dense(self.action_size,activation="softmax")(y1)
        return(Model(inputs=[x],outputs=[y2]))
     """       
    def build_model(self):
        model=Sequential()
        model.add(Dense(50,input_shape=(self.state_size,),activation="relu"))
        model.add(Dense(self.action_size,activation="softmax"))
        return(model)
    
    def compute_discounted_R(self,R,discount_rate=None):
        
        if discount_rate==None:
            discount_rate=self.discount_factor
        discounted_r = np.zeros_like(R, dtype=np.float32)
        running_add = 0
        for t in reversed(range(len(R))):

            running_add = running_add * discount_rate + R[t]
            discounted_r[t] = running_add

        discounted_r = (discounted_r-discounted_r.mean()) / discounted_r.std()
        
        return discounted_r

    def generate_episode(self,env,steps=400):
        #we are going to limit the length of an episode to 200 steps
        #this function returns the discounted rewards
        current_reward=0
        state=env.reset()
        max_pos=-0.4
        actions=[]
        visited_states=[]
        rewards=[]
        for step in range(steps):
            prediction=self.model.predict(np.reshape(state,(1,self.state_size)))
            #action=np.argmax(self.model.predict(np.stack([np.reshape(state,(1,self.state_size))],axis=0)),axis=2)
            action=np.random.choice([0,1,2],p=prediction.reshape((3,)))
 
            next_state,reward,done,_=env.step(action)
            # Adjust reward based on car position
            if (next_state[0]>max_pos):
                max_pos=next_state[0]
                reward+=0.1
            else:
                reward=reward
            if (next_state[0]>=0.5):
                reward+=1
            actions.append(action)
            rewards.append(reward)
            visited_states.append(state)
            state=next_state
            
        return (visited_states,actions,rewards)


     
        
    def update_weights_fn(self):
        action_prob_placeholder=self.model.output
        action_one_hot_placeholder=K.placeholder(shape=(None,self.model.output_shape[1]))
        discounted_r_placeholder=K.placeholder(shape=(None,))
        chosen_actions_prob=K.sum(action_prob_placeholder*action_one_hot_placeholder,axis=1)
        log_loss=-1*K.log(chosen_actions_prob)*discounted_r_placeholder
        log_loss=K.mean(log_loss)
        updates = self.optimizer.get_updates(params=self.model.trainable_weights,
                                   loss=log_loss)
        self.update_fn=K.function(inputs=[self.model.input,
                                           action_one_hot_placeholder,
                                           discounted_r_placeholder],
                                   outputs=[],
                                   updates=updates)
        return
    
    def update_weights(self,states,actions,rewards):
        action_one_hot_placeholder=utils.to_categorical(actions, num_classes=self.model.output_shape[1])
        discounted_reward = self.compute_discounted_R(rewards)
        self.update_fn([states, action_one_hot_placeholder, discounted_reward])
        
        
    

In [None]:
if __name__=="__main__":
    episodes_n=10000
    weights_path="D:/RL_CartPole_agent_weights/MountainCarContinuous_Reinforce.h5"
    env=gym.make("MountainCar-v0")
    state_size=env.observation_space.shape[0]
    action_size=env.action_space.n
    agent=Reinforce_agent(state_size,action_size)
    agent.update_weights_fn()
    for episode in range(episodes_n):
        visited_states,actions,rewards=agent.generate_episode(env)
        #visited_states=[x.reshape(1,x.shape[0]) for x in visited_states]
        #visited_states=np.stack(visited_states,axis=0)
        agent.update_weights(visited_states,actions,rewards)
        
        #update_weights=agent.update_weights_fn(200,actions,discounted_r)(visited_states)
        #action_prob,highest_action_prob=update_weights(visited_states)
        #update_weights(visited_states)
        
        print(print("episode {}/{}".format(episode+1,episodes_n)))
        if episode % 10 == 0:
            agent.model.save_weights(weights_path)
        

episode 1/10000
None
episode 2/10000
None
episode 3/10000
None
episode 4/10000
None
episode 5/10000
None
episode 6/10000
None
episode 7/10000
None
episode 8/10000
None
episode 9/10000
None
episode 10/10000
None
episode 11/10000
None
episode 12/10000
None
episode 13/10000
None
episode 14/10000
None
episode 15/10000
None
episode 16/10000
None
episode 17/10000
None
episode 18/10000
None
episode 19/10000
None
episode 20/10000
None
episode 21/10000
None
episode 22/10000
None
episode 23/10000
None
episode 24/10000
None
episode 25/10000
None
episode 26/10000
None
episode 27/10000
None
episode 28/10000
None
episode 29/10000
None
episode 30/10000
None
episode 31/10000
None
episode 32/10000
None
episode 33/10000
None
episode 34/10000
None
episode 35/10000
None
episode 36/10000
None
episode 37/10000
None
episode 38/10000
None
episode 39/10000
None
episode 40/10000
None
episode 41/10000
None
episode 42/10000
None
episode 43/10000
None
episode 44/10000
None
episode 45/10000
None
episode 46/10000
No

episode 362/10000
None
episode 363/10000
None
episode 364/10000
None
episode 365/10000
None
episode 366/10000
None
episode 367/10000
None
episode 368/10000
None
episode 369/10000
None
episode 370/10000
None
episode 371/10000
None
episode 372/10000
None
episode 373/10000
None
episode 374/10000
None
episode 375/10000
None
episode 376/10000
None
episode 377/10000
None
episode 378/10000
None
episode 379/10000
None
episode 380/10000
None
episode 381/10000
None
episode 382/10000
None
episode 383/10000
None
episode 384/10000
None
episode 385/10000
None
episode 386/10000
None
episode 387/10000
None
episode 388/10000
None
episode 389/10000
None
episode 390/10000
None
episode 391/10000
None
episode 392/10000
None
episode 393/10000
None
episode 394/10000
None
episode 395/10000
None
episode 396/10000
None
episode 397/10000
None
episode 398/10000
None
episode 399/10000
None
episode 400/10000
None
episode 401/10000
None
episode 402/10000
None
episode 403/10000
None
episode 404/10000
None
episode 405

episode 719/10000
None
episode 720/10000
None
episode 721/10000
None
episode 722/10000
None
episode 723/10000
None
episode 724/10000
None
episode 725/10000
None
episode 726/10000
None
episode 727/10000
None
episode 728/10000
None
episode 729/10000
None
episode 730/10000
None
episode 731/10000
None
episode 732/10000
None
episode 733/10000
None
episode 734/10000
None
episode 735/10000
None
episode 736/10000
None
episode 737/10000
None
episode 738/10000
None
episode 739/10000
None
episode 740/10000
None
episode 741/10000
None
episode 742/10000
None
episode 743/10000
None
episode 744/10000
None
episode 745/10000
None
episode 746/10000
None
episode 747/10000
None
episode 748/10000
None
episode 749/10000
None
episode 750/10000
None
episode 751/10000
None
episode 752/10000
None
episode 753/10000
None
episode 754/10000
None
episode 755/10000
None
episode 756/10000
None
episode 757/10000
None
episode 758/10000
None
episode 759/10000
None
episode 760/10000
None
episode 761/10000
None
episode 762

episode 1073/10000
None
episode 1074/10000
None
episode 1075/10000
None
episode 1076/10000
None
episode 1077/10000
None
episode 1078/10000
None
episode 1079/10000
None
episode 1080/10000
None
episode 1081/10000
None
episode 1082/10000
None
episode 1083/10000
None
episode 1084/10000
None
episode 1085/10000
None
episode 1086/10000
None
episode 1087/10000
None
episode 1088/10000
None
episode 1089/10000
None
episode 1090/10000
None
episode 1091/10000
None
episode 1092/10000
None
episode 1093/10000
None
episode 1094/10000
None
episode 1095/10000
None
episode 1096/10000
None
episode 1097/10000
None
episode 1098/10000
None
episode 1099/10000
None
episode 1100/10000
None
episode 1101/10000
None
episode 1102/10000
None
episode 1103/10000
None
episode 1104/10000
None
episode 1105/10000
None
episode 1106/10000
None
episode 1107/10000
None
episode 1108/10000
None
episode 1109/10000
None
episode 1110/10000
None
episode 1111/10000
None
episode 1112/10000
None
episode 1113/10000
None
episode 1114/100

episode 1415/10000
None
episode 1416/10000
None
episode 1417/10000
None
episode 1418/10000
None
episode 1419/10000
None
episode 1420/10000
None
episode 1421/10000
None
episode 1422/10000
None
episode 1423/10000
None
episode 1424/10000
None
episode 1425/10000
None
episode 1426/10000
None
episode 1427/10000
None
episode 1428/10000
None
episode 1429/10000
None
episode 1430/10000
None
episode 1431/10000
None
episode 1432/10000
None
episode 1433/10000
None
episode 1434/10000
None
episode 1435/10000
None
episode 1436/10000
None
episode 1437/10000
None
episode 1438/10000
None
episode 1439/10000
None
episode 1440/10000
None
episode 1441/10000
None
episode 1442/10000
None
episode 1443/10000
None
episode 1444/10000
None
episode 1445/10000
None
episode 1446/10000
None
episode 1447/10000
None
episode 1448/10000
None
episode 1449/10000
None
episode 1450/10000
None
episode 1451/10000
None
episode 1452/10000
None
episode 1453/10000
None
episode 1454/10000
None
episode 1455/10000
None
episode 1456/100

In [310]:
state=env.reset()
state=np.reshape(state,(1,agent.state_size))

for step in range(3000):
    env.render()
    #action=np.argmax(agent.model.predict(state),axis=1)
    prediction=agent.model.predict(state)
    action=np.random.choice([0,1,2],p=prediction.reshape((3,)))
    next_state,reward,done,_=env.step(action)
    next_state=np.reshape(next_state,(1,state_size))
    state=next_state
env.close()