In [2]:
import torch
from torch import nn as nn
from torch.utils import data
import numpy as np
import gym

# Environment

In [3]:
env = gym.make('CartPole-v0')

# NN model

In [6]:
class cart_pole(nn.Module):
  def __init__(self):
    super(cart_pole,self).__init__()
    self.fc1 = nn.Linear(4,64)
    self.fc3 = nn.Linear(64,64)
    self.fc2 = nn.Linear(64,2)

  def forward(self,s):
    output = torch.nn.functional.leaky_relu((self.fc1(s)))
    output = torch.nn.functional.leaky_relu((self.fc3(output)))
    output = self.fc2(output)
    return output


# hyperparameters and essential functions

In [8]:
class TD_0_cart_pole():
  def __init__(self):
    self.q = cart_pole()
    self.history = []
    self.gamma = 0.99
    self.w_e = 0.009
    self.max_e = 0.9
    self.loss = nn.MSELoss()
    self.optimizer = torch.optim.SGD(self.q.parameters(),lr=0.01, momentum=0.9)

  def act(self,s):
    a = torch.argmax(self.q(torch.from_numpy(s).float()))
    return a.item()
  
  def e_greedy(self,s,e):
    if np.random.rand() < e:
      a = env.action_space.sample()
    else:
      a = self.act(s)
    return a

  def decay(self,episode):
    e = max(-episode * self.w_e + self.max_e, 0.1/(episode + 1))
    return e

  def update(self,s,a,s1,a1,r):
    next_state_estimation = self.q(torch.from_numpy(s).float())[a]
    next_state_max = self.gamma * self.q(torch.from_numpy(s1).float())[a1] + r
    #print(next_state_estimation,next_state_max)
    l = self.loss(next_state_estimation,next_state_max)
    #print(l)
    self.optimizer.zero_grad()
    l.backward()
    #for param in self.q.parameters():
      #param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
    return l

In [9]:
agent = TD_0_cart_pole()
n_episodes = 200

# Training

In [10]:
for episoda in range(n_episodes):
  obs = env.reset()
  e = agent.decay(episoda)
  #e = 0.1
  s = obs
  a = agent.e_greedy(s,e)
  G = 0
  L = 0
  #for t in range(500):
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = obs
    a1 = agent.e_greedy(s1,e)
    l = agent.update(s,a,s1,a1,r)
    L += l
    if done:
      agent.history.append(G)
      #agent.policy_inprovement()
      print("Episoda: {} | Final return: {} | average loss: {}".format(episoda,G,L.item()/G))
      break
    s,a = s1,a1

Episoda: 0 | Final return: 22.0 | average loss: 1.065208001570268
Episoda: 1 | Final return: 11.0 | average loss: 1.0375029823996804
Episoda: 2 | Final return: 31.0 | average loss: 1.0542174308530745
Episoda: 3 | Final return: 24.0 | average loss: 1.031367301940918
Episoda: 4 | Final return: 20.0 | average loss: 1.0337353706359864
Episoda: 5 | Final return: 13.0 | average loss: 1.0058646568885217
Episoda: 6 | Final return: 43.0 | average loss: 1.0288989044899164
Episoda: 7 | Final return: 14.0 | average loss: 1.0084991455078125
Episoda: 8 | Final return: 16.0 | average loss: 1.070162296295166
Episoda: 9 | Final return: 12.0 | average loss: 0.9792939027150472
Episoda: 10 | Final return: 45.0 | average loss: 1.0336130777994792
Episoda: 11 | Final return: 22.0 | average loss: 1.014268008145419
Episoda: 12 | Final return: 11.0 | average loss: 1.094565044749867
Episoda: 13 | Final return: 20.0 | average loss: 0.9968213081359864
Episoda: 14 | Final return: 26.0 | average loss: 1.021886752201

# results

In [12]:
test = []
for i in range(10):
  obs = env.reset()
  s = obs
  a = agent.act(s)
  G = 0
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = obs
    a = agent.act(s1)
    if done:
      test.append(G)
      break
print("SARSA | average score: {}".format(np.mean(test)))

SARSA | average score: 199.8


# Environment (mountain_car)

In [29]:
env = gym.make('MountainCar-v0')

# NN mountain_car

In [19]:
class mountain_car(nn.Module):
  def __init__(self):
    super(mountain_car,self).__init__()
    self.fc1 = nn.Linear(2,32)
    self.fc3 = nn.Linear(32,32)
    self.fc2 = nn.Linear(32,3)

  def forward(self,s):
    output = nn.functional.leaky_relu(self.fc1(s))
    output = nn.functional.leaky_relu(self.fc3(output))
    output = self.fc2(output)
    return output

# Dataset class

In [20]:
class training(data.Dataset):
  def __init__(self,s,a,s1,a1,r):
    self.s = s
    self.a = a
    self.s1 = s1
    self.a1 = a1
    self.r = r

  def __len__(self):
    return self.s.shape[0]

  def __getitem__(self,idx):
    return self.s[idx],self.a[idx],self.s1[idx],self.a1[idx],self.r[idx]

# hyperparameters and functions mountain_car

In [21]:
#dual network training to handle potential problem from backward() and optimizer.step()
training_q = mountain_car()
target_q = mountain_car()
gamma = 0.9
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(training_q.parameters(),lr=0.01)
def act(s):
  a = torch.argmax(training_q(torch.from_numpy(s).float()))
  return a.item()
def e_greedy(s,e):
  if np.random.rand() < e:
    a = env.action_space.sample()
  else:
    a = act(s)
  return a

# Training

In [23]:
training_s = []
training_a = []
training_s1 = []
training_a1 = []
training_r = []
for episode in range(300):
  e = 0.8 - 0.0077*episode
  for i in range(10):
    s = env.reset()
    a = e_greedy(s,e)
    G = 0
    step = 0
    while True:
      s1, r, done, _ = env.step(a)
      a1 = e_greedy(s1,e)
      G += r
      if s1[0] > -0.2:
        r += 0.2
      elif s1[0] > -0.15:
        r += 0.5
      elif s1[0] > -0.1:
        r += 0.7
      training_s.append(s)
      training_a.append(a)
      training_s1.append(s1)
      training_a1.append(a1)
      training_r.append(r)
      step += 1 
      if done:
        if step < 160:
          print("Episode: {} | Final return: {}".format(episode,G))
          torch.save(training_q.state_dict(), "my" + str(step) + ".pth")
        break
      s, a = s1, a1
  if len(training_s) > 4000:
    training_s = training_s[-4000:]
    training_a = training_a[-4000:]
    training_s1 = training_s1[-4000:]
    training_a1 = training_a1[-4000:]
    training_r = training_r[-4000:]
  MyData = training(torch.FloatTensor(training_s),torch.LongTensor(training_a),torch.FloatTensor(training_s1),torch.LongTensor(training_a1),torch.LongTensor(training_r))
  loader = data.DataLoader(MyData,128,True)
  iter_times = 0
  training_q.train()
  target_q.eval()
  for epoch in range(1,11):
    for s,a,s1,a1,r in loader:
      iter_times += 1
      if iter_times % 100 == 0:
        target_q.load_state_dict(training_q.state_dict())       
      state_estimation = training_q(s)
      state_max = (r + torch.gather(target_q(s1), dim=1, index=a1.unsqueeze(1)).squeeze(1)) * gamma
      loss = criterion(torch.gather(state_estimation, dim=1, index=a.unsqueeze(1)).squeeze(1),state_max)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print("Episode: {} | Loss: {}".format(episode,loss))
  target_q.load_state_dict(training_q.state_dict())  

Episode: 0 | Loss: 0.0154880927875638
Episode: 0 | Loss: 0.015072302892804146
Episode: 0 | Loss: 0.0034881203901022673
Episode: 0 | Loss: 0.003061148803681135
Episode: 0 | Loss: 0.0025990265421569347
Episode: 0 | Loss: 0.0033721080981194973
Episode: 0 | Loss: 0.0657004565000534
Episode: 0 | Loss: 0.010915705934166908
Episode: 0 | Loss: 0.002020889427512884
Episode: 0 | Loss: 0.0008007368305698037
Episode: 1 | Loss: 0.005950151011347771
Episode: 1 | Loss: 0.0003212250885553658
Episode: 1 | Loss: 7.10006061126478e-05
Episode: 1 | Loss: 0.0007185870781540871
Episode: 1 | Loss: 7.530758011853322e-05
Episode: 1 | Loss: 3.257026037317701e-05
Episode: 1 | Loss: 0.020044144243001938
Episode: 1 | Loss: 0.0005060104886069894
Episode: 1 | Loss: 4.98011322633829e-05
Episode: 1 | Loss: 0.02732022851705551
Episode: 2 | Loss: 0.010050633922219276
Episode: 2 | Loss: 0.0014134312514215708
Episode: 2 | Loss: 0.0013854341814294457
Episode: 2 | Loss: 0.006537473760545254
Episode: 2 | Loss: 0.0001605162688

# Results

In [54]:
control_mountain_car = mountain_car()
control_mountain_car.load_state_dict(torch.load("/content/my92.pth"))

<All keys matched successfully>

In [55]:
def act_mountain_car(s):
  a = torch.argmax(control_mountain_car(torch.from_numpy(s).float()))
  return a.item()

In [56]:
test = []
for i in range(10):
  obs = env.reset()
  s = obs
  a = act_mountain_car(s)
  G = 0
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = obs
    a = act_mountain_car(s1)
    if done:
      test.append(G)
      break
print("SARSA | average score: {}".format(np.mean(test)))

SARSA | average score: -123.5
