## Check if installed

In [1]:
import tianshou as ts
# Print the installed Tianshou version to confirm that the package imports.
print(ts.__version__)

0.4.10


## Make an environment

In [2]:
import gym

ENV_ID = "CartPole-v0"  # cart carrying a pole moving on a track


def make_env():
    """Create one fresh instance of the environment named by ENV_ID."""
    return gym.make(ENV_ID)


env = make_env()

# Create 10 environments in train_envs and 100 in test_envs
train_envs = ts.env.DummyVectorEnv([make_env for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([make_env for _ in range(100)])

In [3]:
# EnvPool can be used instead to speed up the environments.
# NOTE: envpool could not be installed in this Anaconda environment, so it stays disabled.
#import envpool
#train_envs = envpool.make_gym("CartPole-v0", num_envs=10)
#test_envs = envpool.make_gym("CartPole-v0", num_envs=100)

## Build the network

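**Old code that did not work: it computed `action_shape` from `env.observation_space` instead of `env.action_space`. The working version in the next cell was copy-pasted from the website instead.**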
import torch, numpy as np
from torch import nn

class Net(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape)),
            )
            
    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)  
        batch = obs.shape[0]
        logits = self.model(obs.view(batch, -1))
        return logits, state  
    
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.observation_space.shape or env.observation_space.n
net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

In [4]:
import torch, numpy as np
from torch import nn

class Net(nn.Module):
    """Multi-layer perceptron mapping a flattened observation to one logit per action.

    The network has three hidden layers of 128 units, each followed by ReLU.

    Args:
        state_shape: Shape (or size) of one observation; its product is the
            input width of the first linear layer.
        action_shape: Shape (or size) of the action space; its product is the
            output width of the last linear layer.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; nn.Linear is given plain Python ints.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Compute logits for a batch of observations.

        Args:
            obs: Batch of observations with the batch on the first axis. Anything
                that is not already a ``torch.Tensor`` is converted to a float tensor.
            state: Passed through unchanged and returned alongside the logits.
            info: Unused; accepted only for call-signature compatibility.

        Returns:
            Tuple ``(logits, state)`` where ``logits`` has one row per batch entry.
        """
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)  # If observation is not a tensor, make it a tensor
        batch = obs.shape[0]
        # reshape (not view) so non-contiguous tensors can be flattened as well.
        logits = self.model(obs.reshape(batch, -1))  # Logits are "raw output of the neural network"
        return logits, state

# Size the network from the environment: observation space in, action space out.
obs_space = env.observation_space
act_space = env.action_space
state_shape = obs_space.shape or obs_space.n
action_shape = act_space.shape or act_space.n

net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

## Setup policy

In [5]:
# Build the DQN policy from the network `net` and the optimizer `optim` defined above.
policy = ts.policy.DQNPolicy(
    net,
    optim,
    discount_factor=0.9,
    estimation_step=3,
    target_update_freq=320,
)

## Setup collector
Collector is a Tianshou concept.

"Allows the policy to interact with different types of environments conveniently".

Number of buffers should be the number of environments.

In [6]:
# The replay buffer needs one sub-buffer per training environment, so derive the
# count from train_envs instead of repeating the literal used to build it.
train_buffer = ts.data.VectorReplayBuffer(20000, len(train_envs))
train_collector = ts.data.Collector(policy, train_envs, train_buffer, exploration_noise=True)
# No buffer is passed for the test collector.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

## Train policy with a Trainer

Tianshou offers several trainers. DQN is an off-policy algorithm, so `offpolicy_trainer()` is used. Training stops as soon as the `stop_fn` condition is met.

In [7]:
from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger
# The TensorBoard writer and logger are created but intentionally not handed to
# the trainer: passing the logger in produced massive warnings.
writer = SummaryWriter("log/dqn")
logger = TensorboardLogger(writer)


def _set_train_eps(epoch, env_step):
    """Use epsilon 0.1 while collecting training data."""
    return policy.set_eps(0.1)


def _set_test_eps(epoch, env_step):
    """Use epsilon 0.05 while evaluating."""
    return policy.set_eps(0.05)


def _reached_threshold(mean_rewards):
    """Report whether the mean reward has reached the environment's reward threshold."""
    return mean_rewards >= env.spec.reward_threshold


result = ts.trainer.offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    train_fn=_set_train_eps,
    test_fn=_set_test_eps,
    stop_fn=_reached_threshold,
)
print(f'Finished training! Use {result["duration"]}')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object: SlowAppendObjectArrayToTensorProto,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool: SlowAppendBoolArrayToTensorProto,
  class IteratorBase(collections.Iterator, trackable.Trackable,
  class DatasetV2(collections.Iterable, tracking_base.Trackable,
  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):
Epoch #1: 10001it [00:15, 635.22it/s, env_step=100

Epoch #1: test_reward: 156.120000 ± 29.993093, best_reward: 156.120000 ± 29.993093 in #1


Epoch #2: 10001it [00:13, 746.04it/s, env_step=20000, len=149, loss=0.388, n/ep=0, n/st=10, rew=149.00]                


Epoch #2: test_reward: 187.220000 ± 17.330655, best_reward: 187.220000 ± 17.330655 in #2


Epoch #3: 10001it [00:13, 748.41it/s, env_step=30000, len=157, loss=0.095, n/ep=0, n/st=10, rew=157.00]                


Epoch #3: test_reward: 149.790000 ± 10.573831, best_reward: 187.220000 ± 17.330655 in #2


Epoch #4: 10001it [00:13, 768.76it/s, env_step=40000, len=161, loss=0.013, n/ep=0, n/st=10, rew=161.00]                


Epoch #4: test_reward: 149.880000 ± 10.943747, best_reward: 187.220000 ± 17.330655 in #2


Epoch #5: 10001it [00:12, 809.78it/s, env_step=50000, len=128, loss=0.013, n/ep=0, n/st=10, rew=128.00]                


Epoch #5: test_reward: 133.410000 ± 13.350727, best_reward: 187.220000 ± 17.330655 in #2


Epoch #6: 10001it [00:12, 819.25it/s, env_step=60000, len=160, loss=0.018, n/ep=0, n/st=10, rew=160.00]                


Epoch #6: test_reward: 152.690000 ± 10.017679, best_reward: 187.220000 ± 17.330655 in #2


Epoch #7: 10001it [00:11, 845.91it/s, env_step=70000, len=143, loss=0.022, n/ep=0, n/st=10, rew=143.00]                


Epoch #7: test_reward: 133.040000 ± 7.956029, best_reward: 187.220000 ± 17.330655 in #2


Epoch #8: 10001it [00:12, 820.66it/s, env_step=80000, len=167, loss=0.024, n/ep=0, n/st=10, rew=167.00]                


Epoch #8: test_reward: 148.350000 ± 12.129613, best_reward: 187.220000 ± 17.330655 in #2


Epoch #9: 10001it [00:12, 791.13it/s, env_step=90000, len=127, loss=0.050, n/ep=0, n/st=10, rew=127.00]                


Epoch #9: test_reward: 200.000000 ± 0.000000, best_reward: 200.000000 ± 0.000000 in #9
Finished training! Use 124.42s


## Save and load policy

In [8]:
# `load_file` chooses between saving freshly trained weights (False) and loading
# stored ones (True). Keep any value set earlier; fall back to False when no cell
# has defined it, so a fresh-kernel run does not raise NameError.
load_file = globals().get("load_file", False)
if not load_file:
    torch.save(policy.state_dict(), "dqn.pth")

In [None]:
# Restore previously saved weights when `load_file` is set. Keep any value set
# earlier; fall back to False when the flag was never defined, so a fresh-kernel
# run does not raise NameError.
load_file = globals().get("load_file", False)
if load_file:
    policy.load_state_dict(torch.load("dqn.pth"))

## Watch performance

In [12]:
# Put the policy in evaluation mode with epsilon 0.05, then roll out a single
# rendered episode on the plain (non-vectorised) environment.
policy.eval()
policy.set_eps(0.05)
collector = ts.data.Collector(policy, env, exploration_noise=True)
seconds_per_frame = 1 / 35  # 35 fps
collector.collect(n_episode=1, render=seconds_per_frame)


{'n/ep': 1,
 'n/st': 200,
 'rews': array([200.]),
 'lens': array([200]),
 'idxs': array([0]),
 'rew': 200.0,
 'len': 200.0,
 'rew_std': 0.0,
 'len_std': 0.0}