In [1]:
import doom_env 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple, deque,Counter
import os
import cv2
import glob
from gym.wrappers import Monitor
from gym.core import ObservationWrapper
from gym.spaces.box import Box 
import time

# Checkpoint file used to restore and save the network and optimizer state.
PATH = 'last_brain.pth'

# One recorded environment transition: the state the action was taken from,
# the action itself, the reward it produced and whether it ended the episode.
Step = namedtuple('Step', ['state', 'action', 'reward', 'done'])

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [2]:
class PreprocessImage(ObservationWrapper):
    """Observation wrapper that crops, resizes, optionally grays and rescales frames.

    Args:
        env: the environment to wrap.
        height: target frame height in pixels.
        width: target frame width in pixels.
        grayscale: when True the channel axis is averaged down to one channel.
        crop: callable applied to the raw frame before resizing (identity by default).

    The declared observation space is a Box in [0.0, 1.0] with shape
    [channels, height, width].
    """

    def __init__(self, env, height = 64, width = 64, grayscale = True, crop = lambda img: img):
        super(PreprocessImage, self).__init__(env)
        # Stored as (height, width); see observation() for the order cv2 needs.
        self.img_size = (height, width)
        self.grayscale = grayscale
        self.crop = crop
        n_colors = 1 if self.grayscale else 3
        self.observation_space = Box(0.0, 1.0, [n_colors, height, width])

    def observation(self, img):
        """Return the processed frame as a float32 array laid out channel-first.

        Assumes `img` is an H x W x C array (channel axis last): the mean over
        the last axis and the (2, 0, 1) transpose both rely on that layout.
        """
        img = self.crop(img)
        height, width = self.img_size
        # cv2.resize expects dsize as (width, height). Passing (height, width)
        # would transpose the output for non-square sizes and no longer match
        # the [channels, height, width] observation space declared above.
        img = cv2.resize(img, (width, height), interpolation = cv2.INTER_CUBIC)
        if self.grayscale:
            # Averaging the channels keeps a singleton channel axis.
            img = img.mean(-1, keepdims = True)
        # Channel-last -> channel-first.
        img = np.transpose(img, (2, 0, 1))
        # Scale to [0, 1]; presumably the input holds 8-bit pixel values.
        img = img.astype('float32') / 255.
        return img

In [3]:
class CNN(nn.Module):
    """Convolutional network mapping an image batch to one score per action.

    Three convolutions (each followed by a 3x3 max-pool with stride 2 and a
    ReLU) feed a 40-unit hidden layer and a `num_actions`-wide output layer.

    Args:
        num_actions: number of outputs of the final linear layer.
        input_shape: (channels, height, width) of a single input image.
            Defaults to (1, 64, 64), the shape that was previously hard-coded.
    """

    def __init__(self,num_actions,input_shape=(1,64,64)):
        super(CNN,self).__init__()
        self.conv1 = nn.Conv2d(in_channels=input_shape[0],out_channels = 32,kernel_size = 5)
        self.conv2 = nn.Conv2d(32,32,3)
        self.conv3 = nn.Conv2d(32,64,2)
        # The flattened size depends on the input resolution, so it is measured
        # with a dummy forward pass instead of being computed by hand.
        self.fc1 = nn.Linear(self.count_neurons(input_shape),40)
        self.fc2 = nn.Linear(40,num_actions)

    def _conv_features(self,x):
        """Apply the conv -> max-pool(3, stride 2) -> ReLU stack shared by all passes."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(F.max_pool2d(conv(x),3,2))
        return x

    def count_neurons(self,image_dim):
        """Return the flattened feature count produced for one image of shape `image_dim`."""
        # torch.rand is kept (rather than zeros) so the random stream consumed
        # during construction, and hence seeded weight init, is unchanged.
        with torch.no_grad():
            x = self._conv_features(torch.rand(1,*image_dim))
        return x.view(1,-1).size(1)

    def forward(self,x):
        """Return a (batch, num_actions) tensor of scores for a (batch, C, H, W) input."""
        x = self._conv_features(x)
        x = x.view(x.size(0),-1)
        return self.fc2(F.relu(self.fc1(x)))
        


In [4]:
class SoftMaxBody(nn.Module):
    """Samples an action from a softmax distribution over the network outputs.

    Args:
        temperature: factor the outputs are multiplied by before the softmax;
            larger values make the distribution more peaked.
    """

    def __init__(self,temperature):
        super(SoftMaxBody,self).__init__()
        self.temperature = temperature

    def forward(self,outputs):
        """Return one sampled action index per row, shaped (..., 1)."""
        # An explicit dim avoids the implicit-dimension deprecation warning.
        # For the 1-D / 2-D inputs multinomial accepts, the last axis is the
        # same axis the implicit rule selected.
        probs = F.softmax(outputs*self.temperature, dim=-1)
        action = probs.multinomial(num_samples=1)
        return action



In [5]:
class AI:
    """Combines a scoring network (`brain`) with an action selector (`body`).

    Args:
        brain: callable mapping a float32 input tensor to action scores.
        body: callable mapping those scores (moved to the CPU) to an action tensor.
        useGPU: when True, inputs are moved to the module-level `device`
            before being passed to `brain`.
    """

    def __init__(self,brain,body,useGPU=True):
        self.brain = brain
        self.body = body
        self.useGPU = useGPU

    def __call__(self,inputs_):
        """Return the selected action(s) for `inputs_` as a NumPy array."""
        # Convert the images to a float32 tensor.
        inputs = torch.from_numpy(np.array(inputs_,dtype=np.float32))
        if self.useGPU and not inputs.is_cuda:
            inputs = inputs.to(device)
        # Acting never needs gradients: the result leaves as a NumPy array, so
        # skipping the autograd graph only saves memory and time.
        with torch.no_grad():
            outputs = self.brain(inputs).to('cpu')
            action = self.body(outputs)

        # Return the action as a NumPy array.
        return action.detach().cpu().numpy()


In [6]:
# Making the AI progress on several (n_step) steps
class NStepProgress:
    """Endless iterator over sliding windows of up to `n_step + 1` transitions.

    Args:
        env: environment exposing `reset()` and `step(action)`; `step` must
            return a 4-tuple (next_state, reward, is_done, info).
        ai: callable returning a nested array of actions; element [0][0] is
            the action used for the single state passed in.
        n_step: number of steps spanned by a full window (a full window holds
            `n_step + 1` transitions).
    """

    def __init__(self, env, ai, n_step):
        self.ai = ai
        self.rewards = []
        self.env = env
        self.n_step = n_step

    def __iter__(self):
        """Play episodes forever, yielding tuples of consecutive `Step`s.

        While an episode runs, every full window of `n_step + 1` steps is
        yielded. When it ends, the remaining history is drained: it is yielded
        repeatedly, dropping the oldest step each time, until empty. The
        episode's total reward is then appended to `self.rewards`.

        NOTE(review): if the episode ends on a step where the window is full,
        that window is yielded twice (once as a regular window, once as the
        first drained item). Kept as-is so the produced stream is unchanged;
        confirm whether this duplication is intended.
        """
        window = self.n_step + 1
        state = self.env.reset()
        # maxlen drops the oldest step automatically once the window is full.
        history = deque(maxlen = window)
        reward = 0.0
        while True:
            action = self.ai(np.array([state]))[0][0]
            next_state, r, is_done, _ = self.env.step(action)
            reward += r
            history.append(Step(state = state, action = action, reward = r, done = is_done))
            if len(history) == window:
                yield tuple(history)
            state = next_state
            if is_done:
                # Drain the tail so the last steps of the episode also appear
                # as the first element of some yielded series.
                while history:
                    yield tuple(history)
                    history.popleft()
                self.rewards.append(reward)
                reward = 0.0
                state = self.env.reset()

    def rewards_steps(self):
        """Return the episode rewards collected since the last call and reset the list."""
        rewards_step = self.rewards
        self.rewards = []
        return rewards_step

In [7]:
# Implementing Experience Replay

class ReplayMemory:
    """Bounded FIFO buffer of entries pulled from an iterable of experiences.

    Args:
        n_steps: iterable producing the entries to store.
        capacity: maximum number of entries kept after each `run_steps` call.
    """

    def __init__(self, n_steps, capacity = 10000):
        self.n_steps = n_steps
        self.n_steps_iter = iter(n_steps)
        self.capacity = capacity
        self.buffer = deque()

    def sample_batch(self, batch_size):
        """Yield consecutive full batches from a shuffled copy of the buffer.

        Entries that do not fill a complete batch are not yielded.
        """
        shuffled = list(self.buffer)
        np.random.shuffle(shuffled)
        start = 0
        while start + batch_size <= len(self.buffer):
            yield shuffled[start:start + batch_size]
            start += batch_size

    def run_steps(self, samples):
        """Pull `samples` new entries from the source, then trim to capacity."""
        remaining = samples
        while remaining > 0:
            # Each entry is whatever the source iterator produces next.
            self.buffer.append(next(self.n_steps_iter))
            remaining -= 1

        # Discard the oldest entries once the buffer exceeds its capacity.
        while len(self.buffer) > self.capacity:
            self.buffer.popleft()


### Eligibility Trace

In [8]:
def eligibilityTrace(batch, gamma=0.99, model=None):
    """Build (inputs, targets) for n-step learning from a batch of step series.

    For each series the network scores its first and last state. The return is
    seeded with the best score of the last state (or 0 when the last step is
    terminal) and discounted backwards over every step except the last. The
    target row is the network's own output for the first state, with the
    entry of the action actually taken replaced by that return.

    Args:
        batch: iterable of series; each series is a sequence of steps exposing
            `state`, `action`, `reward` and `done`.
        gamma: discount factor (default 0.99, the value previously hard-coded).
        model: network used to score states; defaults to the module-level `cnn`.

    Returns:
        A pair (inputs, targets): a float32 CPU tensor holding the first state
        of each series, and the stacked target rows (on the model's device).

    NOTE(review): when the last step is terminal its own reward is not added,
    because only `series[:-1]` is accumulated and the seed is 0. This matches
    the original behaviour; confirm it is intended.
    """
    if model is None:
        model = cnn
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    inputs = []
    targets = []

    # Targets are constants for the loss, so no autograd graph is needed here.
    with torch.no_grad():
        for series in batch:
            first, last = series[0], series[-1]
            endpoints = np.array([first.state, last.state], dtype=np.float32)
            output = model(torch.from_numpy(endpoints).to(model_device))

            # Seed the return: nothing follows a terminal step, otherwise use
            # the best score of the last state.
            cummul_reward = 0.0 if last.done else output[1].max()

            for step in reversed(series[:-1]):
                cummul_reward = step.reward + cummul_reward*gamma

            # Start from the current scores of the first state and overwrite
            # only the entry of the action that was taken.
            target = output[0].clone()
            target[first.action] = cummul_reward

            inputs.append(first.state)
            targets.append(target)
    return torch.from_numpy(np.array(inputs,dtype=np.float32)), torch.stack(targets)

### Moving average of the reward over the last 100 episodes


In [9]:
class MA:
    """Moving average over the most recent `size` recorded rewards.

    Args:
        size: maximum number of rewards kept in the window.
    """

    def __init__(self,size=100):
        self.size = size
        self.list_of_rewards = []

    def add(self,rewards):
        """Record a single reward, or every reward of a list, then trim the window."""
        if isinstance(rewards,list):
            self.list_of_rewards += rewards
        else:
            self.list_of_rewards.append(rewards)

        # Drop the oldest entries in one slice deletion instead of popping the
        # head repeatedly (each head pop shifts the whole list).
        excess = len(self.list_of_rewards) - self.size
        if excess > 0:
            del self.list_of_rewards[:excess]

    def average(self):
        """Return the mean of the window, or NaN when nothing has been recorded."""
        # np.mean of an empty list also yields NaN but emits a RuntimeWarning;
        # returning NaN directly keeps the value and avoids the warning.
        if not self.list_of_rewards:
            return float('nan')
        return np.mean(self.list_of_rewards)

In [10]:
### Creating GAME Environment

In [11]:
# Create the Doom environment and wrap it in gym's Monitor, which is pointed
# at the ./videos directory (force = True is passed through unchanged).
env = Monitor(doom_env.VizDoomGym(), "./videos", force = True)

# Size of the discrete action space exposed by the wrapped environment.
num_actions = env.action_space.n



### Building AI
#### Loading the model from the saved checkpoint (`last_brain.pth` must already exist)

In [12]:
# NOTE(security): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one.
checkpoint = torch.load(PATH, map_location=device)

# Size the output layer from the environment's action space instead of a
# hard-coded 7, so the network and the environment cannot silently disagree.
# If the checkpoint was trained with a different count, load_state_dict below
# raises a size-mismatch error.
cnn = CNN(num_actions=num_actions)
cnn.to(device)
cnn.load_state_dict(checkpoint['state_dict'])
cnn.eval()

# The body samples an action from the softmax of the network's scaled outputs.
softmax_body = SoftMaxBody(temperature=50)
ai = AI(brain=cnn,body=softmax_body)

### Experience Replay

In [13]:
# Number of training epochs run by the training cell.
num_epocs = 10

# Experience collection: 10-step series produced by the agent playing the
# environment, stored in a replay memory with its default capacity.
n_steps = NStepProgress(env = env, ai = ai, n_step = 10)
memory = ReplayMemory(n_steps = n_steps)

# Running average over the last 100 recorded episode rewards.
movingAvg = MA(100)

# Training the AI: mean-squared error between predictions and targets,
# optimised with Adam whose state is restored from the checkpoint.
loss_func = nn.MSELoss()
optimizer = optim.Adam(cnn.parameters(), lr = 0.001)
optimizer.load_state_dict(checkpoint['optimizer'])


In [14]:
print('Training...')
for epoch in range(1,num_epocs+1):
    # Collect 200 new series per epoch, then train on shuffled batches of 128.
    memory.run_steps(200)
    for batch in memory.sample_batch(128):
        inputs,targets = eligibilityTrace(batch)
        predictions = cnn(inputs.to(device))
        # Make sure the targets sit on the same device as the predictions.
        loss_error = loss_func(predictions,targets.to(device))

        # Clear previous gradients before back-propagating this batch.
        optimizer.zero_grad()
        loss_error.backward()
        optimizer.step()

    # Average reward over the episodes finished so far (NaN until one ends).
    rewards_steps = n_steps.rewards_steps()
    movingAvg.add(rewards_steps)
    avg_reward = movingAvg.average()

    print("Epoch : %s, Average Reward : %s" % (str(epoch),str(avg_reward)))
print('Training finished...')
print('Saving model...')
# Save to the same path the checkpoint was loaded from, so the two cannot drift.
torch.save({
            'state_dict':cnn.state_dict(),
            'optimizer':optimizer.state_dict()
        },PATH)
print('Done.')

Training...


  probs = F.softmax(outputs*self.temperature)


Epoch : 1, Average Reward : 214.93863747336647
Epoch : 2, Average Reward : 166.6723095703125
Epoch : 3, Average Reward : 156.23417502955385
Epoch : 4, Average Reward : 144.62301232679835
Epoch : 5, Average Reward : 139.58223100142047
Epoch : 6, Average Reward : 139.46278858184814
Epoch : 7, Average Reward : 146.77168323682702
Epoch : 8, Average Reward : 148.13127487182618
Epoch : 9, Average Reward : 146.0398712158203
Epoch : 10, Average Reward : 149.16573364257812
Training finished...
Saving model...
Done.


In [15]:
### AI auto-play: the trained network plays one episode on its own

In [16]:
environment = doom_env.VizDoomGym(gray_scale=False)
img_array = []

# try/finally guarantees the environment is closed even if a step fails.
try:
    observation = environment.reset()
    done = False
    while not done:
        # Keep the raw frame for the video export.
        img_array.append(observation)

        # Assumes the observation is channel-first (C, H, W): the channel axis
        # is moved last before the colour conversion.
        # NOTE(review): BGR channel order and the unscaled pixel range are
        # taken as-is - confirm both match what the network saw in training.
        gray_img = cv2.cvtColor(np.moveaxis(observation,0,-1),cv2.COLOR_BGR2GRAY)
        state = cv2.resize(gray_img,(64,64),interpolation = cv2.INTER_CUBIC)
        state = np.reshape(state,(1,64,64))

        # Greedy action selection; no gradients are needed for inference.
        with torch.no_grad():
            actions = cnn(torch.from_numpy(np.array([state],dtype=np.float32)).to(device))
        action = np.argmax(actions.to('cpu').numpy(), axis=-1)[0]

        new_observation, reward, done, _ = environment.step(action)
        observation = new_observation
        # One-second pause between steps (kept from the original loop).
        time.sleep(1)
        print('Reward :',reward)
finally:
    environment.close()

Reward : 0.0
Reward : 0.78125
Reward : 9.571823120117188
Reward : 17.305709838867188
Reward : -0.9128875732421875
Reward : 9.464981079101562
Reward : 15.615982055664062
Reward : 21.382583618164062
Reward : 25.272293090820312
Reward : 27.89593505859375
Reward : 29.66558837890625
Reward : 31.014480590820312
Reward : -93.66749572753906


In [18]:
# Stack the recorded frames into a single array (frame index first); the
# video-export cell indexes this array frame by frame.
images = np.asarray(img_array)

In [23]:
folder_name = "videos"
video_name = "play.mp4"

os.makedirs(folder_name, exist_ok=True)

# Write each recorded frame as a JPEG and remember its path. The paths are
# reused below instead of globbing the folder: the folder is shared with the
# environment Monitor (and may hold frames from earlier runs), so a glob could
# return non-image or stale files, and ordering by mtime is fragile.
files = []
for i in range(len(img_array)):
    frame_path = "{}/image{}.jpg".format(folder_name, i)
    plt.imsave(frame_path, np.moveaxis(images[i],0,-1))
    files.append(frame_path)

frames_array = []
for filename in files:
    img = cv2.imread(filename)
    # cv2.imread returns None when a file cannot be decoded; skip it rather
    # than crash on img.shape.
    if img is None:
        continue
    frames_array.append(img)

if frames_array:
    # The video size is taken from the first frame as (width, height).
    height, width, layers = frames_array[0].shape
    size = (width,height)
    out = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
    try:
        for frame in frames_array:
            out.write(frame)
    finally:
        # Always release the writer so the file is finalised.
        out.release()
else:
    print('No frames recorded; video not written.')