In [None]:
pip install tensorflow==2.0

In [None]:
pip install atari_py

In [None]:
pip install gym

In [None]:
pip install keras

In [None]:
pip install pyglet==1.4.9

# Atari DQN

In [None]:
import gym
from gym import wrappers
import numpy as np
import random
import keras
from collections import deque
from keras import backend as back
back.image_data_format()
from keras.models import Model, Sequential, load_model
from keras.optimizers import RMSprop
from keras.layers import Dense, merge, Lambda, Input, Add, Conv2D, LeakyReLU, Flatten, Multiply, Reshape




import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Convert a raw RGB game frame into the 84x84 grayscale image used as network input
def processImage(frame):
    """Convert an RGB frame into a cropped grayscale image.

    The first three channels of the last axis are combined with the luma
    weights (0.299, 0.587, 0.114), the result is resized to 84 columns by
    110 rows using area interpolation, and rows 16..99 are kept.

    Args:
        frame: array whose last axis holds at least three colour channels
            (presumably the H x W x 3 observation returned by the
            environment -- confirm against the caller).

    Returns:
        The grayscale image after resizing and cropping; 84 rows by
        84 columns for a 2-D grayscale intermediate.
    """
    # Imported locally because the notebook's import cell never binds `cv`;
    # without this the two `cv.` references below raise NameError.
    import cv2 as cv

    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])
    # OpenCV's dsize is (width, height), so this yields 110 rows x 84 columns.
    resized = cv.resize(gray, dsize=(84, 110), interpolation=cv.INTER_AREA)
    # Keep 84 of the 110 rows so the output is square.
    return resized[16:100]

In [None]:
class AtariDQN:
    """Deep Q-learning agent with an online network and a target network.

    The agent keeps an epsilon-greedy policy (`actions`), a replay memory
    (`memReplay`), and a training step that fits the online network on a
    random minibatch (`repSample`). `updateModel` copies the online weights
    into the target network.
    """

    # NOTE(review): this environment is created when the class body runs and
    # is not used by any method below; it is kept only so that
    # `AtariDQN.cp_env` remains available to existing callers.
    cp_env = gym.make('Breakout-v0')

    def __init__(self, sizeOfState, sizeOfAction):
        """Set hyper-parameters and build the online and target networks.

        Args:
            sizeOfState: accepted for call compatibility; the state shape is
                fixed to (84, 84, 4) and this argument is not consulted.
            sizeOfAction: number of discrete actions (width of the output layer).
        """
        self.sizeOfState = (84, 84, 4)
        self.sizeOfAction = sizeOfAction

        self.gamma = .99          # discount factor used in the Q target
        self.minEpsilon = .1      # floor for the exploration rate
        self.maxEpsilon = 1.0     # current exploration rate (decays over time)
        self.decay = .999         # multiplicative epsilon decay per stored step
        self.batchSize = 32
        self.training = 50000     # memory size that must be exceeded before epsilon decays
        # NOTE(review): every entry holds two full stacked states; at this
        # capacity the buffer can far exceed available RAM if frames are
        # stored as floats -- consider a smaller maxlen or compact dtype.
        self.mem = deque(maxlen=1000000)
        self.saveImage = deque(maxlen=4)   # the most recent frames

        self.model = self.ddqnModel()
        self.target = self.ddqnModel()

    def ddqnModel(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Sequential model mapping a `sizeOfState` input to one
            linear output per action.
        """
        # Three convolutional layers, one hidden dense layer, then a linear
        # layer with one Q-value per action.
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=self.sizeOfState))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.sizeOfAction))
        model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, epsilon=0.01))
        model.summary()
        return model

    def updateModel(self):
        """Copy the online network's weights into the target network."""
        self.target.set_weights(self.model.get_weights())
        return

    def _toBatch(self, state):
        """Return `state` shaped as (batch, height, width, channels).

        A state whose frames are stacked along the first axis is transposed
        to channels-last first; reshaping it directly would interleave
        pixels that belong to different frames.
        """
        state = np.asarray(state)
        height, width, channels = self.sizeOfState
        if state.shape == (channels, height, width):
            state = np.moveaxis(state, 0, -1)
        return np.reshape(state, (-1, height, width, channels))

    def actions(self, s):
        """Choose an action epsilon-greedily.

        Args:
            s: a single stacked state, channels-last or channels-first.

        Returns:
            A random action index with probability `maxEpsilon`, otherwise
            the index of the largest Q-value predicted by the online network.
        """
        s = self._toBatch(s)
        # Exploration vs exploitation
        if np.random.random() <= self.maxEpsilon:
            return random.randrange(self.sizeOfAction)
        return int(np.argmax(self.model.predict(s)))

    def memReplay(self, s, a, r, ns, done):
        """Store one transition and decay epsilon once enough are collected.

        Args:
            s: state before the action.
            a: action index taken.
            r: reward received.
            ns: state after the action.
            done: whether the episode ended on this step.
        """
        self.mem.append((self._toBatch(s), a, r, self._toBatch(ns), done))

        if len(self.mem) > self.training and self.maxEpsilon > self.minEpsilon:
            # Clamp so the exploration rate never drops below its floor.
            self.maxEpsilon = max(self.minEpsilon, self.maxEpsilon * self.decay)

    def repSample(self):
        """Fit the online network on one random minibatch from memory.

        Does nothing while the memory holds fewer than `batchSize`
        transitions, because a minibatch cannot be drawn yet.
        """
        if len(self.mem) < self.batchSize:
            return

        miniBatch = random.sample(self.mem, self.batchSize)

        current = np.zeros((self.batchSize,) + tuple(self.sizeOfState))
        ns = np.zeros_like(current)
        a, r, done = [], [], []

        for x, (state, action, reward, nextState, finished) in enumerate(miniBatch):
            current[x] = state
            a.append(action)
            r.append(reward)
            ns[x] = nextState
            done.append(finished)

        # Current Q-values come from the online network; bootstrap values for
        # the next state come from the target network so that the regression
        # target does not move with every gradient step.
        tar = self.model.predict(current)
        nt = self.target.predict(ns)

        for x in range(self.batchSize):
            if done[x]:
                tar[x][a[x]] = r[x]
            else:
                # Q(s, a) <- r + gamma * max_a' Q_target(s', a')
                tar[x][a[x]] = r[x] + self.gamma * np.amax(nt[x])

        self.model.fit(current, tar, batch_size=self.batchSize, verbose=0)

In [None]:
# Train the agent on Breakout and plot the score obtained in each episode.
bo_env = gym.make('Breakout-v0')
sizeOfState = (84, 84, 4)
sizeOfActions = bo_env.action_space.n

obs = processImage(bo_env.reset())

scores, episodes, eps = [], [], []  # for stats

agent = AtariDQN(sizeOfState, sizeOfActions)  # initialize agent

for episode in range(10000):

    # Fresh start after every episode: fill the frame history with four
    # copies of the first processed frame.
    firstFrame = processImage(bo_env.reset())
    agent.saveImage.extend([firstFrame] * 4)

    done = False
    s = 0  # episode score (sum of raw rewards)

    # keep going until done
    while not done:

        # Stack the four most recent frames along the last axis so the state
        # is already channels-last (84, 84, 4).
        now = np.stack(list(agent.saveImage), axis=-1)
        a = agent.actions(now)
        rawFrame, r, done, info = bo_env.step(a)
        s += r
        r = r if not done else -10

        # Process the raw emulator frame exactly once; appending to the
        # length-4 deque drops the oldest frame, so the next state is the
        # history shifted by one step.
        agent.saveImage.append(processImage(rawFrame))
        ns = np.stack(list(agent.saveImage), axis=-1)

        agent.memReplay(now, a, r, ns, done)  # store in memory

        # Learn from replayed experience once the warm-up memory is filled.
        if len(agent.mem) > agent.training:
            agent.repSample()

        if done:
            # update model
            agent.updateModel()

            # stats
            score = int(s)
            scores.append(score)
            episodes.append(episode)
            eps.append(agent.maxEpsilon)
            print('Episode: ', episode, ' Score: ', score, 'Epsilon: ', agent.maxEpsilon)

# plotting
plt.plot(episodes, scores)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.title('Breakout DQN: score per episode')
plt.show()