In [1]:
import gym
import utils
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from collections import namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import torch.autograd as autograd
import torch.nn as nn
from enum import Enum

import copy

In [2]:
class Vanilla_reinforce_agent():

  def __init__(self, input_size, output_size):
    self.w = np.random.rand(input_size[0], output_size)
    self.actions = range(output_size)


  def train(self, env, num_episodes, max_steps, gamma = 0.9, lr = 0.000025):
    episode_rewards = []
    for e in range(num_episodes):
      state = env.reset()
      episode_reward = 0
      rewards = []
      grads = []
      for step in range(max_steps):
        probs = self.policy(state)
        action = np.random.choice(self.actions, p=probs)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10

        rewards.append(reward)
        grad = self.p_gradient(state, probs, action)
        grads.append(grad)
        episode_reward += reward

        state = next_state

        if done or step == max_steps-1:
          episode_rewards.append(episode_reward)
          print("Episode " + str(e) + ": " + str(episode_reward))
          break


      for i in range(len(grads)):
        grad = grads[i] * lr
        future_discounted_rewards = sum([ r * (gamma ** r) for t,r in enumerate(rewards[i:])])
        self.w += grad*future_discounted_rewards

  def policy(self, state):
    # forward propagate state trough the Neural network
    h = state.dot(self.w) # dot prodcut state and w
    o = np.exp(h)  # softmax activation function
    o = o/np.sum(o)
    return o

  # Vectorized softmax Jacobian
  def softmax_grad(self, softmax):
    s = softmax.reshape(-1,1)
    return np.diagflat(s) - np.dot(s, s.T)

  def p_gradient(self, state, probs, action):
    # derivative of activation function 
    dlog = self.softmax_grad(probs)[action,:]
    grad = state.reshape(-1,1).dot(dlog.reshape(1,-1))  # geet gradient dot product of dlog and input
    grad = grad/ probs[action]
    return grad

In [3]:
env = gym.make('CartPole-v1') # observation [position of cart, velocity of cart, angle of pole, rotation rate of pole]
np.random.seed(1)
p_agent = Vanilla_reinforce_agent(env.observation_space.shape, env.action_space.n)

p_agent.train(env, 5000, 500)

Episode 0: 26.0
Episode 1: 1.0
Episode 2: 20.0
Episode 3: 5.0
Episode 4: 2.0
Episode 5: 14.0
Episode 6: 3.0
Episode 7: 7.0
Episode 8: 17.0
Episode 9: 15.0
Episode 10: 4.0
Episode 11: 11.0
Episode 12: 2.0
Episode 13: 4.0
Episode 14: 3.0
Episode 15: 9.0
Episode 16: 3.0
Episode 17: 1.0
Episode 18: 10.0
Episode 19: 17.0
Episode 20: 3.0
Episode 21: 8.0
Episode 22: 12.0
Episode 23: 1.0
Episode 24: 5.0
Episode 25: 10.0
Episode 26: 47.0
Episode 27: 3.0
Episode 28: 10.0
Episode 29: 19.0
Episode 30: 11.0
Episode 31: 8.0
Episode 32: 5.0
Episode 33: 9.0
Episode 34: 4.0
Episode 35: 3.0
Episode 36: 13.0
Episode 37: 23.0
Episode 38: 18.0
Episode 39: 3.0
Episode 40: -2.0
Episode 41: 8.0
Episode 42: 14.0
Episode 43: 0.0
Episode 44: 7.0
Episode 45: 5.0
Episode 46: 7.0
Episode 47: 3.0
Episode 48: 35.0
Episode 49: 4.0
Episode 50: 5.0
Episode 51: 2.0
Episode 52: 10.0
Episode 53: 12.0
Episode 54: 2.0
Episode 55: 7.0
Episode 56: 8.0
Episode 57: 16.0
Episode 58: 2.0
Episode 59: 12.0
Episode 60: 9.0
Episode 61

Episode 484: 40.0
Episode 485: 23.0
Episode 486: 8.0
Episode 487: 7.0
Episode 488: 11.0
Episode 489: 23.0
Episode 490: 5.0
Episode 491: 8.0
Episode 492: 5.0
Episode 493: 16.0
Episode 494: 11.0
Episode 495: 9.0
Episode 496: 7.0
Episode 497: 41.0
Episode 498: 13.0
Episode 499: 19.0
Episode 500: 24.0
Episode 501: 0.0
Episode 502: 24.0
Episode 503: 5.0
Episode 504: 40.0
Episode 505: 4.0
Episode 506: 5.0
Episode 507: 8.0
Episode 508: 15.0
Episode 509: 6.0
Episode 510: 0.0
Episode 511: 34.0
Episode 512: 6.0
Episode 513: 7.0
Episode 514: 11.0
Episode 515: 5.0
Episode 516: 29.0
Episode 517: 9.0
Episode 518: 12.0
Episode 519: 18.0
Episode 520: 23.0
Episode 521: 38.0
Episode 522: 22.0
Episode 523: 15.0
Episode 524: 12.0
Episode 525: 47.0
Episode 526: 10.0
Episode 527: 8.0
Episode 528: 2.0
Episode 529: 8.0
Episode 530: 19.0
Episode 531: 28.0
Episode 532: 60.0
Episode 533: 6.0
Episode 534: 0.0
Episode 535: 86.0
Episode 536: 7.0
Episode 537: 20.0
Episode 538: 138.0
Episode 539: 9.0
Episode 540: 4.0

Episode 978: 7.0
Episode 979: 42.0
Episode 980: 18.0
Episode 981: 58.0
Episode 982: 91.0
Episode 983: 30.0
Episode 984: 12.0
Episode 985: 29.0
Episode 986: 98.0
Episode 987: 7.0
Episode 988: 22.0
Episode 989: 20.0
Episode 990: 4.0
Episode 991: 10.0
Episode 992: 26.0
Episode 993: 0.0
Episode 994: 17.0
Episode 995: 15.0
Episode 996: 51.0
Episode 997: 17.0
Episode 998: 10.0
Episode 999: 81.0
Episode 1000: 22.0
Episode 1001: 26.0
Episode 1002: 21.0
Episode 1003: 35.0
Episode 1004: 7.0
Episode 1005: 12.0
Episode 1006: 10.0
Episode 1007: 6.0
Episode 1008: 34.0
Episode 1009: 41.0
Episode 1010: 61.0
Episode 1011: 14.0
Episode 1012: 21.0
Episode 1013: 7.0
Episode 1014: 3.0
Episode 1015: 4.0
Episode 1016: 8.0
Episode 1017: 43.0
Episode 1018: 11.0
Episode 1019: 36.0
Episode 1020: 34.0
Episode 1021: 28.0
Episode 1022: 9.0
Episode 1023: 7.0
Episode 1024: 9.0
Episode 1025: 77.0
Episode 1026: 4.0
Episode 1027: 53.0
Episode 1028: 72.0
Episode 1029: 58.0
Episode 1030: 58.0
Episode 1031: 16.0
Episode 10

Episode 1418: 10.0
Episode 1419: 45.0
Episode 1420: 156.0
Episode 1421: 52.0
Episode 1422: 113.0
Episode 1423: 9.0
Episode 1424: 38.0
Episode 1425: 33.0
Episode 1426: 16.0
Episode 1427: 15.0
Episode 1428: 22.0
Episode 1429: 23.0
Episode 1430: 44.0
Episode 1431: 59.0
Episode 1432: 8.0
Episode 1433: 52.0
Episode 1434: 151.0
Episode 1435: 65.0
Episode 1436: 33.0
Episode 1437: 47.0
Episode 1438: 47.0
Episode 1439: 69.0
Episode 1440: 31.0
Episode 1441: 70.0
Episode 1442: 36.0
Episode 1443: 49.0
Episode 1444: 104.0
Episode 1445: 54.0
Episode 1446: 38.0
Episode 1447: 29.0
Episode 1448: 23.0
Episode 1449: 18.0
Episode 1450: 73.0
Episode 1451: 35.0
Episode 1452: 5.0
Episode 1453: 16.0
Episode 1454: 34.0
Episode 1455: 41.0
Episode 1456: 37.0
Episode 1457: 70.0
Episode 1458: 46.0
Episode 1459: 10.0
Episode 1460: 21.0
Episode 1461: 16.0
Episode 1462: 81.0
Episode 1463: 57.0
Episode 1464: 143.0
Episode 1465: 19.0
Episode 1466: 166.0
Episode 1467: 48.0
Episode 1468: 40.0
Episode 1469: 35.0
Episode 1

Episode 1861: 21.0
Episode 1862: 28.0
Episode 1863: 64.0
Episode 1864: 134.0
Episode 1865: 49.0
Episode 1866: 149.0
Episode 1867: 116.0
Episode 1868: 20.0
Episode 1869: 38.0
Episode 1870: 59.0
Episode 1871: 97.0
Episode 1872: 118.0
Episode 1873: 39.0
Episode 1874: 46.0
Episode 1875: 63.0
Episode 1876: 172.0
Episode 1877: 47.0
Episode 1878: 8.0
Episode 1879: 156.0
Episode 1880: 45.0
Episode 1881: 55.0
Episode 1882: 33.0
Episode 1883: 42.0
Episode 1884: 11.0
Episode 1885: 115.0
Episode 1886: 63.0
Episode 1887: 49.0
Episode 1888: 122.0
Episode 1889: 135.0
Episode 1890: 23.0
Episode 1891: 85.0
Episode 1892: 37.0
Episode 1893: 6.0
Episode 1894: 107.0
Episode 1895: 35.0
Episode 1896: 43.0
Episode 1897: 15.0
Episode 1898: 41.0
Episode 1899: 134.0
Episode 1900: 27.0
Episode 1901: 54.0
Episode 1902: 8.0
Episode 1903: 46.0
Episode 1904: 111.0
Episode 1905: 61.0
Episode 1906: 40.0
Episode 1907: 132.0
Episode 1908: 62.0
Episode 1909: 30.0
Episode 1910: 33.0
Episode 1911: 74.0
Episode 1912: 82.0
Ep

Episode 2293: 258.0
Episode 2294: 138.0
Episode 2295: 13.0
Episode 2296: 253.0
Episode 2297: 96.0
Episode 2298: 168.0
Episode 2299: 199.0
Episode 2300: 58.0
Episode 2301: 171.0
Episode 2302: 84.0
Episode 2303: 104.0
Episode 2304: 204.0
Episode 2305: 224.0
Episode 2306: 28.0
Episode 2307: 81.0
Episode 2308: 157.0
Episode 2309: 179.0
Episode 2310: 100.0
Episode 2311: 159.0
Episode 2312: 168.0
Episode 2313: 83.0
Episode 2314: 125.0
Episode 2315: 253.0
Episode 2316: 214.0
Episode 2317: 195.0
Episode 2318: 116.0
Episode 2319: 180.0
Episode 2320: 21.0
Episode 2321: 154.0
Episode 2322: 266.0
Episode 2323: 124.0
Episode 2324: 76.0
Episode 2325: 275.0
Episode 2326: 126.0
Episode 2327: 230.0
Episode 2328: 243.0
Episode 2329: 166.0
Episode 2330: 168.0
Episode 2331: 89.0
Episode 2332: 38.0
Episode 2333: 158.0
Episode 2334: 48.0
Episode 2335: 222.0
Episode 2336: 197.0
Episode 2337: 97.0
Episode 2338: 90.0
Episode 2339: 96.0
Episode 2340: 109.0
Episode 2341: 215.0
Episode 2342: 43.0
Episode 2343: 13

Episode 2711: 172.0
Episode 2712: 129.0
Episode 2713: 87.0
Episode 2714: 484.0
Episode 2715: 58.0
Episode 2716: 133.0
Episode 2717: 408.0
Episode 2718: 276.0
Episode 2719: 206.0
Episode 2720: 184.0
Episode 2721: 120.0
Episode 2722: 270.0
Episode 2723: 427.0
Episode 2724: 460.0
Episode 2725: 438.0
Episode 2726: 63.0
Episode 2727: 138.0
Episode 2728: 219.0
Episode 2729: 402.0
Episode 2730: 164.0
Episode 2731: 409.0
Episode 2732: 282.0
Episode 2733: 489.0
Episode 2734: 489.0
Episode 2735: 136.0
Episode 2736: 207.0
Episode 2737: 102.0
Episode 2738: 218.0
Episode 2739: 265.0
Episode 2740: 90.0
Episode 2741: 275.0
Episode 2742: 147.0
Episode 2743: 51.0
Episode 2744: 194.0
Episode 2745: 320.0
Episode 2746: 13.0
Episode 2747: 198.0
Episode 2748: 233.0
Episode 2749: 253.0
Episode 2750: 476.0
Episode 2751: 323.0
Episode 2752: 223.0
Episode 2753: 124.0
Episode 2754: 272.0
Episode 2755: 311.0
Episode 2756: 191.0
Episode 2757: 272.0
Episode 2758: 330.0
Episode 2759: 245.0
Episode 2760: 142.0
Episod

Episode 3128: 217.0
Episode 3129: 131.0
Episode 3130: 262.0
Episode 3131: 171.0
Episode 3132: 334.0
Episode 3133: 415.0
Episode 3134: 386.0
Episode 3135: 298.0
Episode 3136: 306.0
Episode 3137: 223.0
Episode 3138: 199.0
Episode 3139: 217.0
Episode 3140: 186.0
Episode 3141: 150.0
Episode 3142: 381.0
Episode 3143: 187.0
Episode 3144: 361.0
Episode 3145: 199.0
Episode 3146: 300.0
Episode 3147: 196.0
Episode 3148: 452.0
Episode 3149: 395.0
Episode 3150: 232.0
Episode 3151: 394.0
Episode 3152: 224.0
Episode 3153: 237.0
Episode 3154: 101.0
Episode 3155: 385.0
Episode 3156: 177.0
Episode 3157: 389.0
Episode 3158: 489.0
Episode 3159: 464.0
Episode 3160: 262.0
Episode 3161: 157.0
Episode 3162: 373.0
Episode 3163: 234.0
Episode 3164: 204.0
Episode 3165: 159.0
Episode 3166: 271.0
Episode 3167: 75.0
Episode 3168: 199.0
Episode 3169: 250.0
Episode 3170: 260.0
Episode 3171: 389.0
Episode 3172: 489.0
Episode 3173: 444.0
Episode 3174: 357.0
Episode 3175: 245.0
Episode 3176: 385.0
Episode 3177: 262.0
E

Episode 3541: 462.0
Episode 3542: 489.0
Episode 3543: 179.0
Episode 3544: 489.0
Episode 3545: 489.0
Episode 3546: 489.0
Episode 3547: 489.0
Episode 3548: 489.0
Episode 3549: 489.0
Episode 3550: 280.0
Episode 3551: 252.0
Episode 3552: 322.0
Episode 3553: 432.0
Episode 3554: 330.0
Episode 3555: 489.0
Episode 3556: 170.0
Episode 3557: 489.0
Episode 3558: 98.0
Episode 3559: 153.0
Episode 3560: 489.0
Episode 3561: 309.0
Episode 3562: 489.0
Episode 3563: 489.0
Episode 3564: 489.0
Episode 3565: 434.0
Episode 3566: 419.0
Episode 3567: 489.0
Episode 3568: 489.0
Episode 3569: 332.0
Episode 3570: 425.0
Episode 3571: 345.0
Episode 3572: 489.0
Episode 3573: 189.0
Episode 3574: 489.0
Episode 3575: 278.0
Episode 3576: 441.0
Episode 3577: 447.0
Episode 3578: 334.0
Episode 3579: 489.0
Episode 3580: 489.0
Episode 3581: 489.0
Episode 3582: 489.0
Episode 3583: 318.0
Episode 3584: 489.0
Episode 3585: 489.0
Episode 3586: 221.0
Episode 3587: 489.0
Episode 3588: 489.0
Episode 3589: 236.0
Episode 3590: 489.0
E

Episode 3951: 489.0
Episode 3952: 489.0
Episode 3953: 312.0
Episode 3954: 435.0
Episode 3955: 489.0
Episode 3956: 489.0
Episode 3957: 404.0
Episode 3958: 455.0
Episode 3959: 298.0
Episode 3960: 99.0
Episode 3961: 80.0
Episode 3962: 327.0
Episode 3963: 405.0
Episode 3964: 472.0
Episode 3965: 489.0
Episode 3966: 489.0
Episode 3967: 382.0
Episode 3968: 489.0
Episode 3969: 489.0
Episode 3970: 185.0
Episode 3971: 406.0
Episode 3972: 489.0
Episode 3973: 489.0
Episode 3974: 489.0
Episode 3975: 272.0
Episode 3976: 376.0
Episode 3977: 489.0
Episode 3978: 489.0
Episode 3979: 489.0
Episode 3980: 489.0
Episode 3981: 223.0
Episode 3982: 489.0
Episode 3983: 489.0
Episode 3984: 489.0
Episode 3985: 415.0
Episode 3986: 489.0
Episode 3987: 489.0
Episode 3988: 489.0
Episode 3989: 203.0
Episode 3990: 489.0
Episode 3991: 229.0
Episode 3992: 489.0
Episode 3993: 489.0
Episode 3994: 368.0
Episode 3995: 489.0
Episode 3996: 489.0
Episode 3997: 489.0
Episode 3998: 489.0
Episode 3999: 138.0
Episode 4000: 489.0
Ep

Episode 4364: 489.0
Episode 4365: 489.0
Episode 4366: 489.0
Episode 4367: 489.0
Episode 4368: 489.0
Episode 4369: 476.0
Episode 4370: 489.0
Episode 4371: 489.0
Episode 4372: 446.0
Episode 4373: 489.0
Episode 4374: 489.0
Episode 4375: 489.0
Episode 4376: 396.0
Episode 4377: 487.0
Episode 4378: 357.0
Episode 4379: 489.0
Episode 4380: 163.0
Episode 4381: 489.0
Episode 4382: 489.0
Episode 4383: 489.0
Episode 4384: 489.0
Episode 4385: 489.0
Episode 4386: 489.0
Episode 4387: 489.0
Episode 4388: 489.0
Episode 4389: 489.0
Episode 4390: 489.0
Episode 4391: 489.0
Episode 4392: 489.0
Episode 4393: 489.0
Episode 4394: 322.0
Episode 4395: 489.0
Episode 4396: 489.0
Episode 4397: 256.0
Episode 4398: 489.0
Episode 4399: 489.0
Episode 4400: 285.0
Episode 4401: 412.0
Episode 4402: 312.0
Episode 4403: 489.0
Episode 4404: 251.0
Episode 4405: 305.0
Episode 4406: 295.0
Episode 4407: 375.0
Episode 4408: 489.0
Episode 4409: 489.0
Episode 4410: 290.0
Episode 4411: 489.0
Episode 4412: 396.0
Episode 4413: 489.0


Episode 4775: 213.0
Episode 4776: 391.0
Episode 4777: 489.0
Episode 4778: 489.0
Episode 4779: 489.0
Episode 4780: 489.0
Episode 4781: 489.0
Episode 4782: 338.0
Episode 4783: 489.0
Episode 4784: 489.0
Episode 4785: 293.0
Episode 4786: 114.0
Episode 4787: 399.0
Episode 4788: 489.0
Episode 4789: 396.0
Episode 4790: 489.0
Episode 4791: 489.0
Episode 4792: 242.0
Episode 4793: 489.0
Episode 4794: 489.0
Episode 4795: 489.0
Episode 4796: 489.0
Episode 4797: 489.0
Episode 4798: 489.0
Episode 4799: 192.0
Episode 4800: 479.0
Episode 4801: 489.0
Episode 4802: 368.0
Episode 4803: 396.0
Episode 4804: 418.0
Episode 4805: 489.0
Episode 4806: 182.0
Episode 4807: 489.0
Episode 4808: 470.0
Episode 4809: 489.0
Episode 4810: 489.0
Episode 4811: 489.0
Episode 4812: 373.0
Episode 4813: 209.0
Episode 4814: 489.0
Episode 4815: 489.0
Episode 4816: 302.0
Episode 4817: 489.0
Episode 4818: 489.0
Episode 4819: 489.0
Episode 4820: 374.0
Episode 4821: 489.0
Episode 4822: 460.0
Episode 4823: 466.0
Episode 4824: 489.0


In [4]:
env = utils.wrap_env(gym.make("CartPole-v1"))
observation = env.reset()
total_raward_rnd = 0
while True:
    env.render()
    policy = p_agent.policy(observation)
    action = np.random.choice(p_agent.actions, p=policy)
    observation, reward, done, info = env.step(action)
    total_raward_rnd += reward
    if done: 
      break;
print(total_raward_rnd)
env.close()
utils.show_video()

295.0


# Deep Deterministic Policy Gradients

In [6]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory(object):
    '''
          A simple memory for storing episodes where each episodes 
          is a names tuple with (state, action, next_state, reward, done)
    '''

    def __init__(self, capacity):
        '''
            Initialize memory of size capacity
            Input: Capacity : int 
                        size of the memory

            output: initialized ReplayMemory object
        '''
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        '''
            input *args : list  *args is list for transition 
            [state, action, next_state, reward, done] and add
            transition to memory.
            Returns : None
        '''
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        '''
            Randomly sample transitions from memory
            Input batch_size : int
                    numer of transition to sample
            Output:  namedtuple
                      Namedtupe with each field contains a list of data points

        '''
        batch = random.sample(self.memory, batch_size)
        return Transition(*zip(*batch))


    def __len__(self):
        '''
              returns current size of memory
        '''
        return len(self.memory)

In [7]:
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py

class RandomProcess(object):
    def reset_states(self):
        pass

class AnnealedGaussianProcess(RandomProcess):
    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        if sigma_min is not None:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min
        else:
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma

    @property
    def current_sigma(self):
        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
        return sigma


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)

In [8]:
class Abstract_DQNN(nn.Module):
    '''
            Abstract class gives skelleton that all DQN algos should implement/inherit

            DQNN are the neural network of the Deep Q learning part. Deep Q learning consits of netowkr and agent.
    '''
    def __init__(self, input_dim, output_dim):

        '''
            input_dim : tuple  shape of enviroment state
            output_dim: int    number of actions  
        '''
        super(Abstract_DQNN, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self._init_fc()
        self._init_o()

    def _init_fc(self):
        '''
              Initialize the feature generation part of the NN
        '''
        pass
    
    def _init_o(self):
        '''
              Initialize the output layer of the NN
        '''
        pass
    
    def _forward_fc(self, state):
        '''
              Pass the state trough the feature layer generating hidden state (i.e. features)
        '''
        pass

    def _forward_o(self, fc):
        '''
              pass features from fc layer trough the output layer
        '''
        pass

    def forward(self, state):
        '''
            Complete forward pass from state to otput
            Input: State: list   state of the enviroment
            Output: Value function for each action in input state

        '''
        features = self._forward_fc(state)
        features = features.view(features.size(0), -1)
        out = self._forward_o(features)
        return out

    def init_weights_for_linear(self, m):
        if type(m) == nn.Linear:
            fanin = m.weight.data.size()
            v = 1. / np.sqrt(fanin)
            m.weight.data =  torch.Tensor(size).uniform_(-v, v)


class Actor(Abstract_DQNN):
    '''
          Fully connected Nural network for Q-Learning
    '''
    def _init_fc(self):
        '''
              Initialize the feature generation part of the NN
              Here it is two dense layer
        '''
        self.fc = nn.Sequential(
            nn.Linear(self.input_dim[0], 4),
            nn.ReLU(),
            nn.Linear(4, 4),
            nn.ReLU(),
        )

    def _init_o(self):
        '''
              pass features from fc layer trough the output layer
              here a single dense layer
        '''
        self.o = nn.Sequential(
            nn.Linear(4, self.output_dim),
            nn.Tanh()
        )

    def _forward_fc(self, state):
        '''
              Pass the state trough the feature layer generating hidden state (i.e. features)
              Input state env.state state of the enviroment
              returns hidden state of fc layers
        '''
        return self.fc(state)

    def _forward_o(self, fcs):
        '''
              pass features from fc layer trough the output layer
              Input output hidden layer of fc layer
              Output value state action function for each action in state
        '''
        return self.o(fcs)


class Critic(Abstract_DQNN):


    '''
        Fully connected Nural network for Q-Learning
    '''
    
    def _init_fc(self):
        '''
              Initialize the feature generation part of the NN
              Here it is two dense layer
        '''
        self.fc1 = nn.Sequential(
            nn.Linear(self.input_dim[0], 4),
            nn.ReLU()
        )
        
         

        self.fc2 = nn.Sequential(
            nn.Linear(4+self.output_dim, 4),
            nn.ReLU(),
            nn.Linear(4, 4),
            nn.ReLU(),
        )

    def _init_o(self):
        '''
              pass features from fc layer trough the output layer
              here a single dense layer
        '''
        self.o = nn.Sequential(
            nn.Linear(4, 1),
        )

    def _forward_fc(self, state):
        '''
              Pass the state trough the feature layer generating hidden state (i.e. features)
              Input state env.state state of the enviroment
              returns hidden state of fc layers
        '''
        (state, a) = state
        h1 = self.fc1(state)
        out =  self.fc2(torch.cat([h1,a],1))
        return out

    def _forward_o(self, fcs):
        '''
              pass features from fc layer trough the output layer
              Input output hidden layer of fc layer
              Output value state action function for each action in state
        '''
        return self.o(fcs)



In [9]:
class DDPG_Agent:

    '''
          Base model for Deep Q-Learning Agents
    '''

    def __init__(self, env, replay_buffer, nb_actions,
                 learning_rate=0.001, 
                 gamma=0.99, 
                 update_target = 100, 
                 eps=1, 
                 eps_decay=0.01,  
                 tau = 0.001, 
                 ou_theta = 0.15,
                 ou_mu = 0.0,
                 ou_sigma = 0.2):
        '''
              Iniitialize Deep Q Agent

              Input:

                        env: openai gym enviroment
                        NN_class: Abstract_DQNN Neural network class for neural network part of Deep Q-Learning
                        replay_buffer: Replay Bufer to store Transitions
                        learning_rate: learning rate to update neural network
                        gamma: future discount rate
                        update_target: step % update_target ==0 is when target network gets updated
                        eps: epsilon for choosing random action
                        eps_decay: rate at which epsilon decays
                        eps_min: minimal epsilon value 
        '''
        self.nb_actions = nb_actions
        self.nb_states  = env.observation_space.shape
        self.env = env
        self.tau = tau
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = replay_buffer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.update_target = update_target
        self.i = 0
        self.eps = eps
        self.eps_decay = eps_decay
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=ou_theta, mu=ou_mu, sigma=ou_sigma)


        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim  = torch.optim.Adam(self.actor.parameters(), lr=learning_rate)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim  = torch.optim.Adam(self.critic.parameters(), lr=learning_rate)

######################################################################
############################  Core Functons ##########################
######################################################################


    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        return action

    def get_action(self, state, is_training=1):
        state = torch.FloatTensor(state).float().unsqueeze(0).to(self.device)
        action = self.actor.forward(state).squeeze(0).data.numpy()
        action += is_training*max(self.eps, 0)*self.random_process.sample()
        action = np.clip(action, -1., 1.)        
        return action

    def train(self, env, max_episodes, max_steps, batch_size, warm_up=100):
        '''
                trains Deep Q Agent for a given enviroment
                Input:
                        env: enviroment to train on
                        max_episodes: number of epsiodes to play
                        max_steps: number of steps to take in an epsiode
                        batch_size: size of batch
        '''
        episode_rewards = []

        for episode in range(max_episodes):
            state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                if warm_up > 0:
                  action = self.random_action()
                  warm_up -= 1
                else:
                  action = self.get_action(state)

                next_state, reward, done, _ = env.step(action)
                self.replay_buffer.push(state, action, next_state, reward, done)
                episode_reward += reward

                if len(self.replay_buffer) > batch_size:
                    self.update(batch_size)   

                if done or step == max_steps-1:
                    episode_rewards.append(episode_reward)
                    self.update_eps()
                    print("Episode " + str(episode) + ": " + str(episode_reward), '\t', self.eps)
                    break

                state = next_state

        return episode_rewards

    def update(self, batch_size):

        '''
              Forward and backward pass for a batch
                Input: 
                        batch_size: number of Transitions to be sampled from memory
        '''
        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = self.__get_tensors_from_batch__(batch)

        actor_policies = self.forward_actor(next_states)
        actor_policies = actor_policies.view(actor_policies.size(0), 1)
        
        next_q_values = self.forward_critic(states, actor_policies)
        next_q_values = next_q_values.view(next_q_values.size(0), 1)

        yis = self._yis(rewards, dones, next_q_values)

        self.update_critic(states, actions, yis)
        self.update_actor(states)


        # Copy the moving NN in the target NN
        if self.i % self.update_target == 0:
          self.soft_update(self.actor_target, self.actor)
          self.soft_update(self.critic_target, self.critic)
        self.i += 1

    def forward_critic(self, next_states, actor_policies):
        return self.critic_target([next_states, actor_policies]) # run forward critic form state, policy -> q-value


    def update_critic(self, states, actions, yis):
        q_batch = self.critic([ states, actions ]) # get q_vals by running critic forward critic
        loss_critic = F.mse_loss(q_batch, yis)     # get  F.mse_loss betwen yis and q-vals
        self.critic.zero_grad()
        loss_critic.backward()
        self.critic_optim.step()

    def forward_actor(self, next_state):
        return self.actor_target(next_state) # run forward actor form state -> policy

    def update_actor(self, states):
        
        policies = self.actor.forward(states)  # forwad actor
        policies = policies.view(policies.size(0), 1)  
        actor_loss = self.critic([  # forward critic
              states,
              policies
          ])

        actor_loss = (-actor_loss).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()


    def _yis(self, rewards, dones, next_Q):
      return rewards + (1 - dones) * self.gamma * next_Q # calculate yis = reward + (1 - done)*gamma*next_Q    

######################################################################
############################  Helper Functons ########################
######################################################################

    def __get_tensors_from_batch__(self, batch):
        '''
              maps list values from abtch to pytorch tensors

              Input: batch: list of Transitions

              output: pytorch tensors of all Transitions values
        '''
        states = torch.FloatTensor(batch.state, device=self.device)
        actions = torch.FloatTensor(batch.action, device=self.device)
        rewards = torch.FloatTensor(batch.reward, device=self.device)
        next_states = torch.FloatTensor(batch.next_state, device=self.device)
        dones = torch.FloatTensor(batch.done, device=self.device)
        
        # resize tensors
        actions = actions.view(actions.size(0), 1)
        dones = dones.view(dones.size(0), 1)
        rewards = rewards.view(rewards.size(0), 1)

        return states, actions, rewards, next_states, dones

    def update_eps(self):
      '''
              perform epsilon decay
      '''
      if self.eps > 0:
          self.eps -= self.eps_decay

    def soft_update(self, target, source):
      for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)


In [11]:
env_id = "Pendulum-v0"
MAX_EPISODES = 50
MAX_STEPS = 500
BATCH_SIZE = 32

env = gym.make(env_id)
buffer = ReplayMemory(10000)

agent = DDPG_Agent(env, buffer, 1)
agent.train(env, 60, 200, 32)

Episode 0: -1545.2300128717802 	 0.99
Episode 1: -1591.3164403319533 	 0.98
Episode 2: -1832.525958066842 	 0.97
Episode 3: -1062.2342312386836 	 0.96
Episode 4: -1804.8807649507835 	 0.95
Episode 5: -1779.3327245195471 	 0.94
Episode 6: -1066.6838535280438 	 0.9299999999999999
Episode 7: -1188.5306255291732 	 0.9199999999999999
Episode 8: -1273.1448828615478 	 0.9099999999999999
Episode 9: -1053.1976282593769 	 0.8999999999999999
Episode 10: -1333.3550468397852 	 0.8899999999999999
Episode 11: -1153.968228739508 	 0.8799999999999999
Episode 12: -1410.6028076227524 	 0.8699999999999999
Episode 13: -1043.7579864403092 	 0.8599999999999999
Episode 14: -1069.2754826796572 	 0.8499999999999999
Episode 15: -967.5634300163796 	 0.8399999999999999
Episode 16: -1324.660900955263 	 0.8299999999999998
Episode 17: -1696.1210524508972 	 0.8199999999999998
Episode 18: -1758.913829228976 	 0.8099999999999998
Episode 19: -1086.328812105399 	 0.7999999999999998
Episode 20: -1185.320030065231 	 0.78999

[-1545.2300128717802,
 -1591.3164403319533,
 -1832.525958066842,
 -1062.2342312386836,
 -1804.8807649507835,
 -1779.3327245195471,
 -1066.6838535280438,
 -1188.5306255291732,
 -1273.1448828615478,
 -1053.1976282593769,
 -1333.3550468397852,
 -1153.968228739508,
 -1410.6028076227524,
 -1043.7579864403092,
 -1069.2754826796572,
 -967.5634300163796,
 -1324.660900955263,
 -1696.1210524508972,
 -1758.913829228976,
 -1086.328812105399,
 -1185.320030065231,
 -1313.7064584858792,
 -1128.9181110860006,
 -1731.9702169615514,
 -1214.6034315290253,
 -1157.6567630803577,
 -1367.173406445887,
 -1189.2090979976472,
 -1348.352660063678,
 -1702.8157803200108,
 -1336.4030650006925,
 -1602.842739054669,
 -1376.1547506130012,
 -1077.5344208309682,
 -1309.7618519483306,
 -1719.1902971041695,
 -1361.5191098749335,
 -1321.802472696683,
 -1345.6053861766845,
 -1741.8716911403465,
 -1594.81199795516,
 -1247.9268713762533,
 -1343.4567552290225,
 -1296.9688638127084,
 -1289.8457488534784,
 -1495.290022526372,
 -

# Build your own open AI enviroment
## Custom enviroments

In [18]:
import gym
from gym import spaces

class CustomEnv(gym.Env):
    """Custom Environment that follows gym interface"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self, arg1, arg2):
        super(CustomEnv, self).__init__()    # Define action and observation space
        # They must be gym.spaces objects    # Example when using discrete actions:
        self.action_space = spaces.Discrete(N_DISCRETE_ACTIONS)    # Example for using image as input:
        self.observation_space = spaces.Box(low=0, high=255, shape=
                    (HEIGHT, WIDTH, N_CHANNELS), dtype=np.uint8)
        
    def _step(self, action):
        """
            Parameters
            ----------
            action :

            Returns
            -------
            ob, reward, episode_over, info : tuple
                ob (object) :
                    an environment-specific object representing your observation of
                    the environment.
                reward (float) :
                    amount of reward achieved by the previous action. The scale
                    varies between environments, but the goal is always to increase
                    your total reward.
                episode_over (bool) :
                    whether it's time to reset the environment again. Most (but not
                    all) tasks are divided up into well-defined episodes, and done
                    being True indicates the episode has terminated. (For example,
                    perhaps the pole tipped too far, or you lost your last life.)
                info (dict) :
                     diagnostic information useful for debugging. It can sometimes
                     be useful for learning (for example, it might contain the raw
                     probabilities behind the environment's last state change).
                     However, official evaluations of your agent are not allowed to
                     use this for learning.
        """
        self._take_action(action)
        self.status = self.env.step()
        reward = self._get_reward()
        ob = self.env.getState()
        episode_over = self.status != hfo_py.IN_GAME
        return ob, reward, episode_over, {}


    def _reset(self):
        pass

    def _render(self, mode='human', close=False):
        pass

    def _take_action(self, action):
        pass

    def _get_reward(self):
        """ Reward is given for XY. """
        if self.status == FOOBAR:
            return 1
        elif self.status == ABC:
            return self.somestate ** 2
        else:
            return 0

In [19]:
import sys
from contextlib import closing

import numpy as np
from six import StringIO, b

from gym import utils
from gym.envs.toy_text import discrete

LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

MAPS = {
    "4x4": [
        "SFFF",
        "FHFH",
        "FFFH",
        "HFFG"
    ],
    "8x8": [
        "SFFFFFFF",
        "FFFFFFFF",
        "FFFHFFFF",
        "FFFFFHFF",
        "FFFHFFFF",
        "FHHFFFHF",
        "FHFFHFHF",
        "FFFHFFFG"
    ],
}


def generate_random_map(size=8, p=0.8):
    """Generates a random valid map (one that has a path from start to goal)
    :param size: size of each side of the grid
    :param p: probability that a tile is frozen
    """
    valid = False

    # DFS to check that it's a valid path.
    def is_valid(res):
        frontier, discovered = [], set()
        frontier.append((0,0))
        while frontier:
            r, c = frontier.pop()
            if not (r,c) in discovered:
                discovered.add((r,c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == 'G':
                        return True
                    if (res[r_new][c_new] not in '#H'):
                        frontier.append((r_new, c_new))
        return False

    while not valid:
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1-p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
    return ["".join(x) for x in res]


class FrozenLakeEnv(discrete.DiscreteEnv):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following
        SFFF
        FHFH
        FFFH
        HFFG
    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located
    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.
    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, desc=None, map_name="4x4",is_slippery=True):
        if desc is None and map_name is None:
            desc = generate_random_map()
        elif desc is None:
            desc = MAPS[map_name]
        self.desc = desc = np.asarray(desc,dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape
        self.reward_range = (0, 1)

        nA = 4
        nS = nrow * ncol

        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        P = {s : {a : [] for a in range(nA)} for s in range(nS)}

        def to_s(row, col):
            return row*ncol + col

        def inc(row, col, a):
            if a == LEFT:
                col = max(col-1,0)
            elif a == DOWN:
                row = min(row+1,nrow-1)
            elif a == RIGHT:
                col = min(col+1,ncol-1)
            elif a == UP:
                row = max(row-1,0)
            return (row, col)

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                for a in range(4):
                    li = P[s][a]
                    letter = desc[row, col]
                    if letter in b'GH':
                        li.append((1.0, s, 0, True))
                    else:
                        if is_slippery:
                            for b in [(a-1)%4, a, (a+1)%4]:
                                newrow, newcol = inc(row, col, b)
                                newstate = to_s(newrow, newcol)
                                newletter = desc[newrow, newcol]
                                done = bytes(newletter) in b'GH'
                                rew = float(newletter == b'G')
                                li.append((1.0/3.0, newstate, rew, done))
                        else:
                            newrow, newcol = inc(row, col, a)
                            newstate = to_s(newrow, newcol)
                            newletter = desc[newrow, newcol]
                            done = bytes(newletter) in b'GH'
                            rew = float(newletter == b'G')
                            li.append((1.0, newstate, rew, done))

        super(FrozenLakeEnv, self).__init__(nS, nA, P, isd)

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout

        row, col = self.s // self.ncol, self.s % self.ncol
        desc = self.desc.tolist()
        desc = [[c.decode('utf-8') for c in line] for line in desc]
        desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
        if self.lastaction is not None:
            outfile.write("  ({})\n".format(["Left","Down","Right","Up"][self.lastaction]))
        else:
            outfile.write("\n")
        outfile.write("\n".join(''.join(line) for line in desc)+"\n")

        if mode != 'human':
            with closing(outfile):
                return outfile.getvalue()

## Wrappers

https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py

In [20]:
class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack k last frames.
        Returns lazy array, which is much more memory efficient.
        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)

    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))

In [21]:
class ClipRewardEnv(gym.RewardWrapper):
    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)

In [22]:
class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip       = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)