In [2]:
import random
import gym
import sys
import numpy as np
from collections import deque,namedtuple
import os
from copy import deepcopy

import torch
import torch.nn as nn
from torch.optim import Adam

In [4]:
gym.__version__

AttributeError: module 'gym' has no attribute '__version__'

In [5]:
env = gym.make("LunarLander-v2")

[2022-11-13 01:00:11,812] Making new env: LunarLander-v2
  result = entry_point.load(False)


ModuleNotFoundError: No module named 'Box2D'

In [None]:

s = env.reset()
for _ in range(1000):
    action = env.action_space.sample()  # Random action
    s_prime, reward,done, _ = env.step(action)
    s = s_prime
    if done:
        s = env.reset()
    env.render()


In [None]:
env.action_space

In [None]:
state_sz = env.observation_space.shape[0]
action_sz = env.action_space.n
print('State space: ',state_sz)
print('Action space: ',action_sz)

In [None]:
env.action_space.sample()

In [None]:
s = env.reset()
print(s)
s_prime, r, done,_ = env.step(0)
print(s_prime, r, done, s)

In [None]:
n_episodes = 250
batch_size = 128
gamma = 0.995
lr = 0.0001
eps = 1.0
decay = 0.99

In [None]:
class DQN(nn.Module):
    def __init__(self,hidden_sz):
        super().__init__()
        self.hidden_sz = hidden_sz
        
        self.fc1 = nn.Linear(state_sz,self.hidden_sz)
        self.fc2 = nn.Linear(self.hidden_sz,self.hidden_sz)
        self.fc3 = nn.Linear(self.hidden_sz,action_sz)
        self.relu = nn.ReLU()
        
    def forward(self,x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        
        return x

In [None]:
replay_buffer = deque(maxlen = 10000)

In [None]:
transition = namedtuple('transition',['s_prime', 'reward', 's', 'action', 'done'])

In [None]:
def store(transition):
    replay_buffer.append(transition)

In [None]:
dq_network = DQN(256)
target_network = deepcopy(dq_network)

In [None]:
dq_network.state_dict()

In [None]:
optimizer = Adam(dq_network.parameters(),lr = lr)
loss_fn = nn.MSELoss()

In [None]:
def update():
    
    if len(replay_buffer)<batch_size:
        return
    
    batch = random.sample(replay_buffer,batch_size)
    
    s = torch.tensor(np.array([t.s for t in batch]))
    r = torch.FloatTensor(np.array([t.reward for t in batch]))
    s_prime = torch.FloatTensor(np.array([t.s_prime for t in batch]))
    a = torch.LongTensor(np.array([t.action for t in batch])).unsqueeze(1)
    done = torch.FloatTensor(np.array([t.done for t in batch]))
    
    target = (r + gamma*target_network(s_prime).max(dim=1)[0]*(1 - done))
    
    prediction = dq_network(s).gather(1, a)
    
    
    optimizer.zero_grad()
    
    loss = loss_fn(target.unsqueeze(1), prediction)
    
    loss.backward()
    
    optimizer.step() 

In [None]:

class Agent():
    def __init__(self,target_update_frequency=100,eps=1):
        
        self.eps = eps
        self.target_update_frequency = target_update_frequency
        self.target_update_counter = 0
        self.rewards = []
        
    def select_action(self,state,eps):
        
        t = np.random.random()
        if t < eps:
            a = np.random.choice(range(action_sz))
        else:
            q = dq_network(torch.FloatTensor(state))
            a = q.argmax().item()   
        return a
            
        
        
    def run_episode(self,render):

        s = env.reset()
        done = False
        total_reward = 0.0
        self.eps = self.eps * decay
        transition_count = 0
        
        while not done:

            self.target_update_counter += 1
            if self.eps > 0.01:
                eps = self.eps
            else:
                eps = 0.01
            
            action = self.select_action(s,eps)
            
            s_prime, reward, done,_= env.step(action)
            
            store(transition(s_prime, reward, s, action, done))
            
           
            
            total_reward += reward
            
            s = s_prime
            
            if render:
                env.render()
                
            update()
            
            done = done

            transition_count += 1
            
        print('Transition Count: ', transition_count)    
        print('Episode Reward: ', total_reward)    
        self.rewards.append(total_reward)
            
    def train(self):
          
        for k in range(n_episodes):
            
            render = False
            
            if k % 100 < 10:
                render = True
            
            print('Episode: ',k)
            self.run_episode(render)
                    
            if self.target_update_counter >= self.target_update_frequency:

                self.target_update_counter = 0
                target_network.load_state_dict(dq_network.state_dict())
               

In [None]:
agent = Agent()

In [None]:
%%tim
agent.train()

In [None]:
s = env.reset()
for _ in range(1000):
    action = dq_network(torch.tensor(s)).argmax().item()
    s_prime, reward,done, _ = env.step(action)
    s = s_prime
    if done:
        s = env.reset()
    env.render()

In [None]:
print(np.mean(agent.rewards))
print(np.std(agent.rewards))

In [None]:
plt.plot([np.mean(agent.rewards[i-50:i]) for i in range(50,250)]);

In [None]:
np.max(agent.rewards)

In [None]:
len(replay_buffer)

In [None]:
    batch = random.sample(replay_buffer,batch_size)
    
    s = torch.FloatTensor(np.array([t.s for t in batch]))
    r = torch.FloatTensor(np.array([t.reward for t in batch]))
    s_prime = torch.FloatTensor(np.array([t.s_prime for t in batch]))
    a = torch.LongTensor(np.array([t.action for t in batch])).unsqueeze(1)
    done = torch.FloatTensor(np.array([t.done for t in batch]))

In [None]:
target_network(s_prime).max(dim=1)[0].shape

In [None]:
r.shape

In [None]:
target = r + target_network(s_prime).max(dim=1)[0]*(1-done)

In [None]:
dq_network(s).gather(1,a)

In [None]:
a

In [None]:
target.unsqueeze(1)

In [None]:
torch.cuda_version