In [None]:
from keras.models import Sequential, load_model
from keras.layers import Conv2D, Flatten, Dense, Activation
from keras.optimizers import Adam
from keras import backend as kerasBackend
from ple import PLE

In [None]:
from rlbirdv1 import *

# Grid coordinates: x is [0, 1, 2]; y holds the same values in descending order.
x = list(range(3))
y = list(reversed(range(3)))

# Island coordinates and the bird's starting cell
island = [(0, 0)]
birdStart = (0, 0)

# Screen dimensions derived from the tile size
# (the width uses len(x) + 3 tiles, the height len(y) tiles)
TILESIZE = 40
SCREEN_HEIGHT = len(y) * TILESIZE
SCREEN_WIDTH = (len(x) + 3) * TILESIZE

# Reward values keyed by outcome
reward = dict(lose=-1000, win=1000)

# Actions available to the agent; an action index selects one entry
listAction = [K_LEFT, K_RIGHT, K_DOWN, K_UP, K_s, K_d]

# Handed to the game as nbStep
N = 20

In [None]:
# Exploration schedule, read by epsilon():
#   - below INITIAL_EXPLORATION the agent always acts at random,
#   - epsilon then changes linearly until EXPLORATION_STEPS is reached,
#   - from EXPLORATION_STEPS onwards it stays at FINAL_EPSILON.
INITIAL_EXPLORATION = 200
EXPLORATION_STEPS = 800
INITIAL_EPSILON = 1
FINAL_EPSILON = 0.001

# Training hyper-parameters
GAMMA = 0.99            # discount applied to the next state's best Q-value
NUMBER_GAMES = 2000     # number of games played by the training loop
LEARNING_RATE = 0.01    # learning rate given to the Adam optimizer

def epsilon(step):
    """
    Exploration rate of the epsilon-greedy policy at position `step`.

    The schedule has three phases:
      * step < INITIAL_EXPLORATION: constant INITIAL_EPSILON (warm-up),
      * INITIAL_EXPLORATION <= step < EXPLORATION_STEPS: linear ramp that
        starts at INITIAL_EPSILON and heads towards FINAL_EPSILON,
      * step >= EXPLORATION_STEPS: constant FINAL_EPSILON.

    Parameters
    ----------
    step : int or float
        Position in the schedule.

    Returns
    -------
    int or float
        Probability with which a random action should be taken.
    """
    if step < INITIAL_EXPLORATION:
        # Return the configured constant (not a literal 1) so the warm-up
        # value always matches the value the linear ramp starts from.
        return INITIAL_EPSILON
    if step < EXPLORATION_STEPS:
        # Change in epsilon per unit of `step` during the ramp.
        slope = (FINAL_EPSILON - INITIAL_EPSILON)/(EXPLORATION_STEPS-INITIAL_EXPLORATION)
        return INITIAL_EPSILON + slope * (step-INITIAL_EXPLORATION)
    return FINAL_EPSILON
    
def createDQN():
    """
    Create and compile the deep Q-network.

    The network maps a 4-feature state to 6 outputs, one Q-value estimate
    per action, and is trained with mean squared error using Adam at
    LEARNING_RATE.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    # Width of the hidden layer; a tunable choice.
    hidden_units = 32

    dqn = Sequential()
    dqn.add(Dense(units=hidden_units, input_dim=4, activation='relu'))
    # The output layer must be linear: the regression targets are Q-values
    # built from rewards as large as +/-1000, which a softmax output
    # (values in [0, 1] that sum to 1) cannot represent.
    dqn.add(Dense(units=6, activation='linear'))

    dqn.compile(optimizer=Adam(lr=LEARNING_RATE), loss='mean_squared_error')
    return dqn


def epsilonGreedy(dqn, x, step):
    """
    Choose an action index with an epsilon-greedy rule.

    With probability epsilon(step) a uniformly random index in [0, 6) is
    returned; otherwise the index of the largest value predicted by `dqn`
    for the single observation `x`.
    """
    explore = np.random.rand() < epsilon(step)
    if explore:
        return np.random.randint(6)

    # predict() is given a batch, so wrap the observation in a batch of one.
    q_values = dqn.predict(np.array([x]))
    return np.argmax(q_values)


In [None]:
# Build a new, untrained Q-network (nothing is loaded from disk here)
dqn = createDQN()

# Game instance configured from the notebook-level settings
game = RLBird(
    width=SCREEN_WIDTH,
    height=SCREEN_HEIGHT,
    x=x,
    y=y,
    init_bird_position=birdStart,
    island_position=island,
    energyMax=10,
    catchMax=2,
    costMove=-1,
    costDive=-1,
    gainFish=2,
    factorFishFly=0.75,
    nbStep=N,
    reward=reward,
)

# PLE wrapper through which the agent acts on the game.
# NOTE(review): PLE's documented reward_values keys include 'loss', not
# 'lose' - confirm the 'lose' entry is consumed by RLBird itself.
p = PLE(
    game,
    fps=100,
    frame_skip=1,
    num_steps=1,
    force_fps=False,
    display_screen=True,
    reward_values=reward,
)

In [None]:
# Initialization: start the environment once and keep a copy of the initial
# fish map so that every game can be restarted from the same configuration.
p.init()
p.reset_game()
saveFishMap = game.map.fishMap.copy()

for k in range(NUMBER_GAMES):

    # Restart the game and restore the saved fish map.
    p.init()
    p.reset_game()
    game.updateFishMap(saveFishMap.copy())

    while not p.game_over():
        state = game.getGameState()
        # Batch of one, built before acting so it holds the pre-action state.
        state_batch = np.array([state])

        # The exploration rate is scheduled on the game index k.
        a = epsilonGreedy(dqn, state, k)
        # Named step_reward (not `reward`) so the notebook-level `reward`
        # dict used to build the environment is not overwritten.
        step_reward = p.act(listAction[a])

        state_new = game.getGameState()

        # Current estimates for the visited state, and the estimates for the
        # state reached after the action.
        Q = dqn.predict(state_batch)
        Q_new = dqn.predict(np.array([state_new]))

        # Q-learning target: only the reward when the game has ended,
        # otherwise the reward plus the discounted best next-state value.
        if p.game_over():
            update = step_reward
        else:
            update = step_reward + GAMMA * np.max(Q_new)
        # Only the entry of the action taken changes; the other outputs keep
        # their own predictions as targets.
        Q[0, a] = update

        dqn.train_on_batch(state_batch, Q)

        
        
# Q-learning's final value function and policy


In [None]:
# Scratch cell: draws one random integer from [0, 5); the value is not stored.
np.random.randint(0, 5)