## Learning Inverted Pendulum Using Reinforcement Learning 

![SegmentLocal](pendulum.gif "segment")


In this task, we are going to train a neural network policy for inverted pendulum using reinforcement learning. 
The inverted pendulum swingup problem is based on the classic problem in control theory. The system consists of a pendulum attached at one end to a fixed point, and the other end being free. The pendulum starts in a random position and the goal is to apply torque on the free end to swing it into an upright position, with its center of gravity right above the fixed point. The diagram below specifies the coordinate system used for the implementation of the pendulum's dynamic equations.


### Action Space
The action is a `ndarray` with shape `(1,)` representing the torque applied to free end of the pendulum.

| Num | Action | Min  | Max |
|-----|--------|------|-----|
| 0   | Torque | -2.0 | 2.0 |

### Observation Space
The observation is a `ndarray` with shape `(3,)` representing the x-y coordinates of the pendulum's free
end and its angular velocity.

| Num | Observation      | Min  | Max |
|-----|------------------|------|-----|
| 0   | x = cos(theta)   | -1.0 | 1.0 |
| 1   | y = sin(theta)   | -1.0 | 1.0 |
| 2   | Angular Velocity | -8.0 | 8.0 |

### Rewards

The reward function is defined as:
$$r = -(theta^2 + 0.1 * \hat{theta}^2 + 0.001 * torque^2)$$

where $\theta$ is the pendulum's angle normalized between $[-pi, pi]$ (with 0 being in the upright position).

### Starting State

The starting state is a random angle in *[-pi, pi]* and a random angular velocity in *[-1,1]*.

### Episode Truncation
The episode truncates at 200 time steps.

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np

In [2]:
#Hyperparameters
learning_rate  = 0.0003
gamma           = 0.9
lmbda           = 0.9
eps_clip        = 0.2
K_epoch         = 10
rollout_len    = 3
buffer_size    = 30
minibatch_size = 32

In [4]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []
        
        self.fc1   = nn.Linear(3,128)
        self.fc_mu = nn.Linear(128,1)
        self.fc_std  = nn.Linear(128,1)
        self.fc_v = nn.Linear(128,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.optimization_step = 0

    def pi(self, x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        mu = 2.0*torch.tanh(self.fc_mu(x))
        std = F.softplus(self.fc_std(x))
        return mu, std
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
      
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_batch, a_batch, r_batch, s_prime_batch, prob_a_batch, done_batch = [], [], [], [], [], []
        data = []

        for j in range(buffer_size):
            for i in range(minibatch_size):
                rollout = self.data.pop()
                s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []

                for transition in rollout:
                    s, a, r, s_prime, prob_a, done = transition
                    
                    s_lst.append(s)
                    a_lst.append([a])
                    r_lst.append([r])
                    s_prime_lst.append(s_prime)
                    prob_a_lst.append([prob_a])
                    done_mask = 0 if done else 1
                    done_lst.append([done_mask])

                s_batch.append(s_lst)
                a_batch.append(a_lst)
                r_batch.append(r_lst)
                s_prime_batch.append(s_prime_lst)
                prob_a_batch.append(prob_a_lst)
                done_batch.append(done_lst)
                    
            mini_batch = torch.tensor(s_batch, dtype=torch.float), torch.tensor(a_batch, dtype=torch.float), \
                          torch.tensor(r_batch, dtype=torch.float), torch.tensor(s_prime_batch, dtype=torch.float), \
                          torch.tensor(done_batch, dtype=torch.float), torch.tensor(prob_a_batch, dtype=torch.float)
            data.append(mini_batch)

        return data

    def calc_advantage(self, data):
        data_with_adv = []
        for mini_batch in data:
            s, a, r, s_prime, done_mask, old_log_prob = mini_batch
            with torch.no_grad():
                td_target = r + gamma * self.v(s_prime) * done_mask
                delta = td_target - self.v(s)
            delta = delta.numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)
            data_with_adv.append((s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage))

        return data_with_adv

        
    def train_net(self):
        if len(self.data) == minibatch_size * buffer_size:
            data = self.make_batch()
            data = self.calc_advantage(data)

            for i in range(K_epoch):
                for mini_batch in data:
                    s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage = mini_batch
                    
                    # Please implement the policy update here using policy gradient. 
                    # 
                    # TODOs: 

In [5]:
env = gym.make('Pendulum-v0')
model = PPO()
score = 0.0
print_interval = 20
rollout = []

for n_epi in range(10000):
    s = env.reset()
    done = False
    while not done:
        for t in range(rollout_len):
            mu, std = model.pi(torch.from_numpy(s).float())
            dist = Normal(mu, std)
            a = dist.sample()
            log_prob = dist.log_prob(a)
            s_prime, r, done, info = env.step([a.item()])

            rollout.append((s, a, r/10.0, s_prime, log_prob.item(), done))
            if len(rollout) == rollout_len:
                model.put_data(rollout)
                rollout = []

            s = s_prime
            score += r
            if done:
                break

        model.train_net()

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode :{}, avg score : {:.1f}, opt step: {}".format(n_epi, score/print_interval, model.optimization_step))
        score = 0.0

env.close()

  mini_batch = torch.tensor(s_batch, dtype=torch.float), torch.tensor(a_batch, dtype=torch.float), \


# of episode :20, avg score : -1651.4, opt step: 300
# of episode :40, avg score : -1592.2, opt step: 600
# of episode :60, avg score : -1613.3, opt step: 1200
# of episode :80, avg score : -1591.4, opt step: 1500
# of episode :100, avg score : -1518.4, opt step: 2100
# of episode :120, avg score : -1474.6, opt step: 2400
# of episode :140, avg score : -1505.4, opt step: 2700
# of episode :160, avg score : -1581.2, opt step: 3300
# of episode :180, avg score : -1428.2, opt step: 3600
# of episode :200, avg score : -1596.7, opt step: 3900
# of episode :220, avg score : -1450.9, opt step: 4500
# of episode :240, avg score : -1483.9, opt step: 4800
# of episode :260, avg score : -1432.6, opt step: 5400
# of episode :280, avg score : -1493.6, opt step: 5700
# of episode :300, avg score : -1537.7, opt step: 6000
# of episode :320, avg score : -1402.1, opt step: 6600
# of episode :340, avg score : -1255.5, opt step: 6900
# of episode :360, avg score : -1214.2, opt step: 7500
# of episode :38

# of episode :2960, avg score : -766.2, opt step: 61500
# of episode :2980, avg score : -716.9, opt step: 62100
# of episode :3000, avg score : -705.3, opt step: 62400
# of episode :3020, avg score : -763.8, opt step: 62700
# of episode :3040, avg score : -750.4, opt step: 63300
# of episode :3060, avg score : -727.3, opt step: 63600
# of episode :3080, avg score : -707.6, opt step: 63900
# of episode :3100, avg score : -742.8, opt step: 64500
# of episode :3120, avg score : -718.4, opt step: 64800
# of episode :3140, avg score : -683.7, opt step: 65400
# of episode :3160, avg score : -693.7, opt step: 65700
# of episode :3180, avg score : -780.9, opt step: 66000
# of episode :3200, avg score : -740.4, opt step: 66600
# of episode :3220, avg score : -724.7, opt step: 66900
# of episode :3240, avg score : -766.3, opt step: 67500
# of episode :3260, avg score : -689.8, opt step: 67800
# of episode :3280, avg score : -809.8, opt step: 68100
# of episode :3300, avg score : -738.8, opt step

# of episode :5880, avg score : -691.4, opt step: 122400
# of episode :5900, avg score : -683.3, opt step: 122700
# of episode :5920, avg score : -683.9, opt step: 123300
# of episode :5940, avg score : -707.6, opt step: 123600
# of episode :5960, avg score : -700.8, opt step: 123900
# of episode :5980, avg score : -690.0, opt step: 124500
# of episode :6000, avg score : -723.4, opt step: 124800
# of episode :6020, avg score : -721.3, opt step: 125400
# of episode :6040, avg score : -753.3, opt step: 125700
# of episode :6060, avg score : -728.5, opt step: 126000
# of episode :6080, avg score : -698.3, opt step: 126600
# of episode :6100, avg score : -688.1, opt step: 126900
# of episode :6120, avg score : -714.5, opt step: 127500
# of episode :6140, avg score : -686.9, opt step: 127800
# of episode :6160, avg score : -697.0, opt step: 128100
# of episode :6180, avg score : -770.1, opt step: 128700
# of episode :6200, avg score : -719.1, opt step: 129000
# of episode :6220, avg score :

# of episode :8760, avg score : -715.1, opt step: 182400
# of episode :8780, avg score : -590.7, opt step: 182700
# of episode :8800, avg score : -777.2, opt step: 183300
# of episode :8820, avg score : -723.0, opt step: 183600
# of episode :8840, avg score : -612.9, opt step: 183900
# of episode :8860, avg score : -681.4, opt step: 184500
# of episode :8880, avg score : -723.2, opt step: 184800
# of episode :8900, avg score : -736.6, opt step: 185400
# of episode :8920, avg score : -718.4, opt step: 185700
# of episode :8940, avg score : -759.1, opt step: 186000
# of episode :8960, avg score : -702.9, opt step: 186600
# of episode :8980, avg score : -803.1, opt step: 186900
# of episode :9000, avg score : -693.0, opt step: 187500
# of episode :9020, avg score : -674.2, opt step: 187800
# of episode :9040, avg score : -775.7, opt step: 188100
# of episode :9060, avg score : -696.6, opt step: 188700
# of episode :9080, avg score : -694.3, opt step: 189000
# of episode :9100, avg score :