In [1]:
import gym
import numpy as np
import torch
from torch import nn

In [286]:
# Hyperparameters read by MCCAgent.train.
# Step size passed to both Adam optimizers (actor and critic).
LEARNING_RATE = 1e-2
# Per-step probability of replacing the actor's action with a random one while training.
EXPLORATION_PROBABILITY = 0.10
# A greedy evaluation episode is run and its step count printed every EVAL_PERIOD training episodes.
EVAL_PERIOD = 10
# Discount factor used in the TD target and in the shaping term.
GAMMA = 0.95
# Multiplier on the speed-based shaping term (GAMMA * |v'| - |v|) added to the env reward.
REWARD_SHAPING_WEIGHT = 100

class Actor(nn.Module):
    """Policy network: maps a 2-feature state to a single action value.

    The network is a three-layer MLP (2 -> 16 -> 16 -> 1) with ReLU after
    the first two linear layers. The output layer has no activation, so the
    returned action is unbounded.
    """

    def __init__(self):
        super().__init__()
        hidden_width = 16
        layers = [
            nn.Linear(2, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        ]
        # Kept under the attribute name ``fc`` so state_dict keys stay "fc.<idx>.*".
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action for each row of ``x`` (last dimension must be 2)."""
        action = self.fc(x)
        return action
    
    
class Critic(nn.Module):
    """Q-value network: maps a 3-feature input to a single scalar.

    Callers in this file feed it a 2-feature state concatenated with a
    1-feature action. The network is a three-layer MLP (3 -> 16 -> 16 -> 1)
    with ReLU after the first two linear layers and a linear output.
    """

    def __init__(self):
        super().__init__()
        hidden_width = 16
        layers = [
            nn.Linear(3, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
        ]
        # Kept under the attribute name ``fc`` so state_dict keys stay "fc.<idx>.*".
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return one value per row of ``x`` (last dimension must be 3)."""
        value = self.fc(x)
        return value

class MCCAgent():
    """Actor-critic agent for gym's ``MountainCarContinuous-v0`` environment.

    The actor maps a normalized state to an action; the critic scores
    (normalized state, action) pairs. Each training iteration collects one
    episode with epsilon-random exploration, then performs a single critic
    update (one-step TD target) and a single actor update (maximize the
    critic's value of the actor's own actions) on that episode's transitions.
    """

    def __init__(self):
        self.env = gym.make('MountainCarContinuous-v0')
        self.state_space_low = self.env.observation_space.low
        self.state_space_high = self.env.observation_space.high
        # Centre and half-width of the observation box; used to rescale each
        # coordinate so that low -> -1 and high -> +1.
        self.state_space_mid = (self.state_space_high + self.state_space_low) / 2
        self.state_space_rad = (self.state_space_high - self.state_space_low) / 2

        self.actor = Actor()
        self.critic = Critic()

    def _normalize_coords(self, coords):
        """Rescale raw observation(s) so the observation box maps to [-1, 1]."""
        return (coords - self.state_space_mid) / self.state_space_rad

    def _choose_action(self, state, exploration_probability):
        """Pick an action for a RAW (un-normalized) environment state.

        With probability ``exploration_probability`` a uniform random action
        in [-1, 1) is returned; otherwise the actor's output is used.

        Returns:
            A float32 numpy array of shape (1,).
        """
        explore = np.random.binomial(1, exploration_probability)
        if explore:
            return (np.random.rand(1) * 2 - 1).astype(np.float32)
        normalized = torch.as_tensor(self._normalize_coords(state)).float().unsqueeze(0)
        # Inference only: no autograd graph is needed for acting.
        with torch.no_grad():
            actors_choice = self.actor(normalized)
        return actors_choice.numpy()[0].astype(np.float32)

    @staticmethod
    def _td_targets(rewards, next_qs, gamma, terminal):
        """Build one-step TD targets ``r + gamma * Q(s', a')`` as a (T, 1) column.

        Args:
            rewards: tensor with T elements (any shape that reshapes to (T, 1)).
            next_qs: tensor with T elements holding Q(s', a') for each step.
            gamma: discount factor.
            terminal: True when the episode's last transition reached a true
                terminal state; its bootstrap term is then dropped.

        Returns:
            A (T, 1) tensor that does not require grad.
        """
        with torch.no_grad():
            bootstrap = next_qs.detach().reshape(-1, 1).clone()
            if terminal and bootstrap.numel() > 0:
                bootstrap[-1] = 0.0
            # Both operands are explicit columns: adding a (T,) reward vector
            # to a (T, 1) value column would silently broadcast to (T, T).
            return rewards.reshape(-1, 1) + gamma * bootstrap

    def train(self, iterations):
        """Train for ``iterations`` episodes, one critic and one actor step each.

        Every ``EVAL_PERIOD`` episodes a greedy evaluation episode is run and
        its length is printed.
        """
        critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=LEARNING_RATE)
        actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=LEARNING_RATE)
        loss_function = nn.MSELoss()
        for iteration in range(iterations):
            state = self.env.reset()
            done = False
            terminal = False
            states, next_states, rewards, actions = [], [], [], []
            while not done:
                action = self._choose_action(state, EXPLORATION_PROBABILITY)
                next_state, reward, done, info = self.env.step(action)
                # Shaping term of the form gamma * phi(s') - phi(s) with
                # phi = REWARD_SHAPING_WEIGHT * |velocity|.
                shaped_reward = reward + (abs(next_state[1]) * GAMMA - abs(state[1])) * REWARD_SHAPING_WEIGHT

                states.append(self._normalize_coords(state))
                next_states.append(self._normalize_coords(next_state))
                rewards.append(shaped_reward)
                actions.append(action)
                # A time-limit cut-off is not a real terminal state, so the
                # last transition should still bootstrap in that case.
                terminal = bool(done) and not info.get('TimeLimit.truncated', False)
                state = next_state

            # Stack via numpy first: building tensors from lists of ndarrays is slow.
            states = torch.as_tensor(np.array(states)).float()
            next_states = torch.as_tensor(np.array(next_states)).float()
            rewards = torch.as_tensor(np.array(rewards)).float()
            actions = torch.as_tensor(np.array(actions)).float().reshape(-1, 1)

            # --- Critic update -------------------------------------------
            # next_states are already normalized, so the actor is queried
            # directly instead of through _choose_action (which would
            # normalize a second time). The target is a constant: no
            # gradient flows through the bootstrap term.
            with torch.no_grad():
                next_actions = self.actor(next_states)
                next_qs = self.critic(torch.cat((next_states, next_actions), dim=1))
            targets = self._td_targets(rewards, next_qs, GAMMA, terminal)

            # Also clears critic gradients left over from the previous
            # iteration's actor backward pass.
            critic_optimizer.zero_grad()
            pred_qs = self.critic(torch.cat((states, actions), dim=1))
            critic_loss = loss_function(pred_qs, targets)
            critic_loss.backward()
            critic_optimizer.step()

            # --- Actor update --------------------------------------------
            # Ascend the critic's estimate of the actor's own actions. Only
            # the actor optimizer steps here.
            actor_optimizer.zero_grad()
            chosen_actions = self.actor(states)
            expected_qs = self.critic(torch.cat([states, chosen_actions], dim=1))
            (-expected_qs).mean().backward()
            actor_optimizer.step()

            if (iteration + 1) % EVAL_PERIOD == 0:
                print(self.run())

    def run(self, show=False):
        """Play one greedy episode in a fresh environment.

        Args:
            show: when True, render the environment after reset and each step.

        Returns:
            The number of steps taken before the environment reported done.
        """
        env = gym.make('MountainCarContinuous-v0')
        steps = 0
        try:
            state = env.reset()
            if show:
                env.render()
            done = False
            while not done:
                action = self._choose_action(state, 0)
                state, _, done, _ = env.step(action)
                steps += 1
                if show:
                    env.render()
        finally:
            # Release the environment (and any render window) even on error.
            env.close()

        return steps

In [287]:
# Build a fresh agent and train it for 200 episodes. Every EVAL_PERIOD-th
# episode, train() prints the step count of one evaluation episode.
model = MCCAgent()
model.train(200)

999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999
999


In [288]:
# Play one episode without exploration, rendering each step; the cell's value is the episode length in steps.
model.run(True)

999