In [1]:
import torch
import torch.nn as nn
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import copy
import gym
import retro
from tqdm import tqdm
import collections

In [None]:
class ImageToPyTorch(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)

class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0
    
env = retro.make(game='AirStriker-Genesis', state="Level1")
env = ImageToPyTorch(env)
env = ScaledFloatFrame(env)

In [5]:
class DQN(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )
        
    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        conv_out = self.conv(x)
        conv_out = conv_out.view(x.size()[0], -1)
        return self.fc(conv_out)

In [6]:
gpu = False

In [7]:
device = torch.device("cuda" if gpu else "cpu")
Q = DQN(env.observation_space.shape, env.action_space.n)

In [8]:
env.observation_space.shape

(3, 224, 320)

In [9]:
Q.to(device)

DQN(
  (conv): Sequential(
    (0): Conv2d(3, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=55296, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=12, bias=True)
  )
)

In [10]:
obs = env.reset()
obs = np.array([env.reset() for _ in range(10)])

In [11]:
Q(torch.FloatTensor(obs).to(device)).shape

torch.Size([10, 12])

In [156]:
def main(nEpisode=100, gamma=0.99, epsilon0=1, epsilonF=0.02, decayingRate=10**(-5), storeQ=1000,
         maxIter=200000, batchSize = 32, replaySize = 10000, replayStartSize=10000, learningRate=1e-4, gpu=False):
    
    device = torch.device("cuda" if gpu else "cpu")
    Q = DQN(env.observation_space.shape, env.action_space.n).to(device)
    QHat = DQN(env.observation_space.shape, env.action_space.n).to(device)
    epsilon = epsilon0
    buffer = collections.deque(maxlen=replaySize)
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(Q.parameters(), lr=learningRate)
    
    for step in tqdm(range(nEpisode)):
        obs = env.reset()
        for yassin in range(maxIter):
            epsilon = max(epsilonF, (1 - decayingRate) * epsilon)
            action = np.zeros(12, dtype = int)
            if np.random.random() < epsilon:
                actionInd = np.random.randint(env.action_space.n)
            else:
                obs = np.array([obs], copy=False)
                obs = torch.tensor(obs).to(device)
                qVals = Q(obs)
                _, actionV = torch.max(qVals, dim=1)
                actionInd = int(actionV.item())
            action[actionInd] = 1
            obsNext, reward, done , _ = env.step(action)
            buffer.append(collections.deque([obs, action, reward, done, obsNext]))
            obs = obsNext
            
            if len(buffer) >= replayStartSize:
                indices = np.random.choice(len(buffer), batch_size, replace=False)
                observations, actions, rewards, dones, observationsNext = zip(*[self.buffer[idx] for idx in indices])
                observations, actions, rewards, dones, observationsNext = np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), np.array(dones, dtype=np.uint8), np.array(next_states) 
                minibatch = random.sample(buffer, min(len(buffer), batchSize))
                observationsV = torch.tensor(observations).to(device)
                observationsNextV = torch.tensor(observationsNext).to(device)
                actionsV = torch.tensor(actions).to(device)
                rewardsV = torch.tensor(rewards).to(device)
                doneMask = torch.ByteTensor(dones).to(device)

                stateActionValues = Q(observationsV).gather(1, actionsV.unsqueeze(-1)).squeeze(-1)
                nextStateValues = QHat(observationsNextV).max(1)[0]
                nextStateValues[doneMask] = 0.0
                nextStateValues = nextStateValues.detach()

                expectedStateActionValues = nextStateValues * gamma + rewardsV
                optimizer.zero_grad()
                loss = loss_fn(stateActionValues, expectedStateActionValues)
                loss.backward()
                optimizer.step()
            
            if step % storeQ == 0:
                QHat = copy.deepcopy(Q)
                
            if done:
                print(yassin)
                break
    return Q

In [157]:
Q = main(20)



  0%|                                                                                                                                                         | 0/20 [00:00<?, ?it/s]

1416




  5%|███████▏                                                                                                                                        | 1/20 [02:51<54:14, 171.29s/it]

1763




 10%|██████████████▍                                                                                                                                 | 2/20 [03:00<36:48, 122.68s/it]

1455




 15%|█████████████████████▊                                                                                                                           | 3/20 [03:14<25:32, 90.16s/it]

MemoryError: 

In [144]:
def transform(Q, s, a):
    return float(Q(torch.FloatTensor([s, a])))

In [145]:
q_table = np.ones((env.observation_space.n, env.action_space.n))

for i in range(env.observation_space.n):
    for j in range(env.action_space.n):
        q_table[i, j] = transform(Q, i, j)

In [146]:
def testPolicy (q_table, nEpisode = 2000):
    success = 0
    for _ in range(nEpisode):
        t = 0
        observation = env.reset()
        done  = False
        actionTable = np.argmax(q_table, axis = 1)
        while not done and t < 200:
            action = actionTable[observation]
            observation, reward, done, info = env.step(action)
            t += 1

        if reward == 1:
            success += 1
    return success / nEpisode

In [147]:
testPolicy(q_table)

0.0

In [148]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])