In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import gym
import numpy as np
import keras

In [15]:
# Environment used throughout the notebook.
ENV_ID = "BreakoutDeterministic-v4"
env = gym.make(ENV_ID)

# NOTE(review): AtariController is not imported in any cell visible here, and
# `controller` is not referenced again below -- confirm where it comes from.
controller = AtariController(env)

# Frame size fed to the network (see create_model), without and with the
# trailing channel axis.
frame_shape = (105, 80)
frame_shape_channel = frame_shape + (1,)

# Size of the discrete action space; used as the width of the output layer.
n_actions = env.action_space.n

In [16]:
#network parameters
learning_rate = 0.001
#gradient_momentum = 0.95
#min_sq_gradient = 0.01

In [17]:
def to_grayscale(img):
    """Average the last (axis 2) dimension of ``img`` and truncate to uint8."""
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)


def downsample(img):
    """Keep every second row and every second column of ``img``."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]


def preprocess(img):
    """Halve the resolution of ``img``, then convert it to uint8 grayscale."""
    half_resolution = downsample(img)
    return to_grayscale(half_resolution)

In [18]:
def create_model():
    """Build and compile the convolutional network used to pick actions.

    The input shape comes from the module-level ``frame_shape_channel``.
    Layers: two strided 8x8 ReLU convolutions (16 filters, then 32, stride 4),
    a flatten, and a bias-free softmax ``Dense`` layer with ``n_actions``
    outputs. The model is compiled with RMSprop (module-level
    ``learning_rate``) and an MSE loss.

    Side effect: prints the Keras model summary.

    Returns:
        The compiled ``keras.models.Model``.
    """
    input_layer = keras.layers.Input(frame_shape_channel)

    h_layer_1 = keras.layers.Conv2D(16, (8, 8), activation="relu", strides=(4, 4))(input_layer)
    h_layer_2 = keras.layers.Conv2D(32, (8, 8), activation="relu", strides=(4, 4))(h_layer_1)

    # Use the public ``keras.layers.Flatten`` name rather than the internal
    # ``keras.layers.core`` module path, which is not part of the documented API.
    flattened_layer = keras.layers.Flatten()(h_layer_2)

    softmax_output = keras.layers.Dense(n_actions, activation='softmax', use_bias=False)(flattened_layer)
    model = keras.models.Model(inputs=input_layer, outputs=softmax_output)

    model.summary()

    optimizer = keras.optimizers.RMSprop(lr=learning_rate)
    model.compile(optimizer, loss='mse')

    return model

In [19]:
# Build the network once; create_model() also prints the layer summary.
ddpg = create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 105, 80, 1)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 25, 19, 16)        1040      
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 5, 3, 32)          32800     
_________________________________________________________________
flatten_2 (Flatten)          (None, 480)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 1920      
Total params: 35,760
Trainable params: 35,760
Non-trainable params: 0
_________________________________________________________________


In [93]:
def process_rewards(r_list, reward_decay=0.99):
    """Spread each non-zero reward backwards over the preceding zero-reward steps.

    The list is walked from the last step to the first. A non-zero reward is
    copied unchanged and becomes the new running value; a zero reward receives
    the running value multiplied by ``reward_decay``. Steps after the last
    non-zero reward therefore stay at 0.

    Args:
        r_list: sequence of per-step rewards.
        reward_decay: factor applied to the running value at every zero-reward
            step. Defaults to 0.99, the value that used to be hard-coded.

    Returns:
        ``np.float32`` array with the same shape as ``r_list``.
    """
    rew = np.zeros_like(r_list, dtype=np.float32)
    running = 0
    for i in reversed(range(len(r_list))):
        if r_list[i] == 0:
            running = running * reward_decay
        else:
            # A non-zero reward restarts the running value.
            running = r_list[i]
        rew[i] = running
    # Normalisation was commented out in the original and is kept disabled:
    #  rew -= np.mean(rew) # subtract by average
    #  rew /= np.std(rew) # divide by std
    return rew

In [94]:
def clip_reward(r):
    """Return the sign of ``r`` (-1, 0 or 1), as computed by ``np.sign``."""
    sign = np.sign(r)
    return sign

In [95]:
# Inspection only: show which object's step() method the environment exposes.
env.step

<bound method TimeLimit.step of <TimeLimit<AtariEnv<BreakoutDeterministic-v4>>>>

In [110]:
def generate_episode(ddpg, env):
    """Play one episode with the greedy (argmax) action of ``ddpg`` and record it.

    Action 1 is sent right after the reset and again after every lost life
    (presumably FIRE, which relaunches the ball -- confirm with
    ``env.unwrapped.get_action_meanings()``).

    The environment is NOT closed here: the caller owns ``env`` and reuses it
    for the next episode.

    Args:
        ddpg: model exposing ``predict``; it receives one preprocessed frame
            with a batch axis in front and a channel axis at the end.
        env: Gym environment whose ``info`` dict carries ``'ale.lives'``.

    Returns:
        Tuple ``(states_list, action_list, rewards_list, network_output_list)``
        with one entry per decision step: the preprocessed frame, the chosen
        action index, the reward (minus 1 on steps where a life was lost) and
        the raw network output.
    """
    states_list = []          # preprocessed frames, one per decision step
    action_list = []          # index of the action taken at each step
    rewards_list = []         # env reward, with -1 added when a life is lost
    network_output_list = []  # network output row for each step

    env.reset()
    observation, reward, done, info = env.step(1)
    lives = info['ale.lives']
    done = False
    lost_live = True

    while not done:
        if lost_live:
            # Relaunch step; its reward is not recorded (as in the original).
            observation, reward, done, info = env.step(1)
            if done:
                # Do not keep stepping an environment that already finished.
                break

        processed_image = preprocess(observation)
        states_list.append(processed_image)
        # Add a trailing channel axis, then a leading batch axis.
        reshaped_input = np.expand_dims(processed_image, axis=2)
        reshaped_input = np.expand_dims(reshaped_input, axis=0)

        prediction = ddpg.predict(reshaped_input)
        network_output_list.append(prediction[0])
        # NOTE(review): argmax is fully greedy, so nothing here explores;
        # confirm this is intended for data collection during training.
        action = np.argmax(prediction)
        action_list.append(action)

        observation, reward, done, info = env.step(action)
        lost_live = bool(info['ale.lives'] < lives)
        lives = info['ale.lives']
        rewards_list.append(reward + lost_live*-1)

    return states_list, action_list, rewards_list, network_output_list

In [111]:
# Smoke test: play one episode and keep its states (a), actions (b),
# rewards (c) and network outputs (d) for inspection.
a, b, c, d= generate_episode(ddpg,env)

In [112]:
# Print every non-zero reward recorded in the sampled episode.
for i in c:
    if i == 0.0:
        continue
    print(i)

-1.0
-1.0
-1.0
-1.0
-1.0


In [113]:
# Display the per-step network outputs collected by generate_episode.
d

[array([9.9682570e-01, 2.9860267e-03, 1.4518409e-04, 4.3163185e-05],
       dtype=float32),
 array([9.9951613e-01, 4.2109721e-04, 4.8426868e-05, 1.4469902e-05],
       dtype=float32),
 array([9.9911267e-01, 7.8831369e-04, 8.1557475e-05, 1.7442630e-05],
       dtype=float32),
 array([9.9868423e-01, 1.2146344e-03, 9.1671413e-05, 9.4525694e-06],
       dtype=float32),
 array([9.9927694e-01, 5.8995833e-04, 8.1992905e-05, 5.1059680e-05],
       dtype=float32),
 array([9.9891686e-01, 1.0394815e-03, 3.2427884e-05, 1.1272429e-05],
       dtype=float32),
 array([9.9944502e-01, 5.0437264e-04, 4.1182189e-05, 9.4429697e-06],
       dtype=float32),
 array([9.9901426e-01, 9.2585816e-04, 4.2739102e-05, 1.7127093e-05],
       dtype=float32),
 array([9.9945515e-01, 4.6984784e-04, 6.0232414e-05, 1.4708652e-05],
       dtype=float32),
 array([9.9931979e-01, 6.0340879e-04, 6.9894013e-05, 6.9166176e-06],
       dtype=float32),
 array([9.9908018e-01, 8.4533304e-04, 5.6633839e-05, 1.7797513e-05],
       dtyp

In [227]:
# we define a helper function to create a batch of simulations
# and after the batch simulations, preprocess data and fit the network
def train_ddpg(ddpg, env, n_batches=10):
    batch_state_list=[]
    batch_action_list=[]
    batch_rewards_list=[]
    batch_network_output_list = []
    for i in range(n_batches):
        states_list,action_list,rewards_list, network_output_list = generate_episode(ddpg, env) 
        batch_state_list.extend(states_list)
        batch_action_list.extend(action_list)
        batch_rewards_list.extend(rewards_list)
        batch_network_output_list.extend(network_output_list)
    
    episode_reward=process_rewards(batch_rewards_list)
    print(episode_reward)
    x=np.array(batch_state_list)
    x = np.expand_dims(x, 3)
    bnol = np.asarray(batch_network_output_list)
    #max_indexes = np.zeros_like(bnol)
    er = np.asarray(episode_reward)
    
    y_true = np.multiply(bnol,er[:, None])
    y = bnol + y_true
    print(y)
    #print(y.shape)
    ddpg.fit(x,y)

    return batch_state_list, action_list, batch_rewards_list, network_output_list

In [228]:
train_n_times = 21  # for actual training, about 5000 may be a good start.
for i in range(train_n_times):
    states_list, up_or_down_action_list, rewards_list, network_output_list = train_ddpg(ddpg, env, 10)
    if i % 10 != 0:
        continue

    # Every 10th iteration: report progress, checkpoint the model and append
    # one line to the reward log.
    print("i="+str(i))
    rr = np.array(rewards_list)
    # Number of steps in the batch with a positive reward; log more detail
    # (or more often) if needed.
    win_count = len(rr[rr > 0])
    print('count win='+str(win_count))
    ddpg.save("policy_network_model_simple.h5")
    ddpg.save("policy_network_model_simple"+str(i)+".h5")
    with open('rews_model_simple.txt', 'a') as f_rew:
        f_rew.write("i="+str(i)+'       reward= '+str(win_count)+"\n")

[-0.8016306  -0.80972785 -0.8179069  ... -0.9801     -0.99
 -1.        ]
[[0.0000000e+00 1.9836938e-01 2.8214806e-26 1.6575967e-30]
 [0.0000000e+00 1.9027215e-01 3.5336827e-26 6.5014069e-30]
 [0.0000000e+00 1.8209308e-01 6.2627174e-26 5.3080781e-30]
 ...
 [0.0000000e+00 1.9900024e-02 3.9623867e-27 3.0821650e-31]
 [0.0000000e+00 9.9999905e-03 1.9911466e-27 1.5488238e-31]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
Epoch 1/1
i=0
count win=0
[-0.8016306  -0.80972785 -0.8179069  ... -0.9801     -0.99
 -1.        ]
[[0.0000000e+00 1.9836938e-01 2.8214806e-26 1.6575967e-30]
 [0.0000000e+00 1.9027215e-01 3.5336827e-26 6.5014069e-30]
 [0.0000000e+00 1.8209308e-01 6.2627174e-26 5.3080781e-30]
 ...
 [0.0000000e+00 1.9900024e-02 3.9623867e-27 3.0821650e-31]
 [0.0000000e+00 9.9999905e-03 1.9911466e-27 1.5488238e-31]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
Epoch 1/1
[-0.8016306  -0.80972785 -0.8179069  ... -0.9801     -0.99
 -1.        ]
[[0.0000000e+00 1.9836938

[-0.8016306  -0.80972785 -0.8179069  ... -0.9801     -0.99
 -1.        ]
[[0.0000000e+00 1.9836938e-01 2.8214806e-26 1.6575967e-30]
 [0.0000000e+00 1.9027215e-01 3.5336827e-26 6.5014069e-30]
 [0.0000000e+00 1.8209308e-01 6.2627174e-26 5.3080781e-30]
 ...
 [0.0000000e+00 1.9900024e-02 3.9623867e-27 3.0821650e-31]
 [0.0000000e+00 9.9999905e-03 1.9911466e-27 1.5488238e-31]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
Epoch 1/1


KeyboardInterrupt: 

In [201]:
# train_ddpg returns four lists (states, actions, rewards, network outputs),
# so unpack four names; three would raise ValueError.
a, b, c, d = train_ddpg(ddpg, env)

(1190, 105, 80)
(1190, 105, 80, 1)
(1190, 4)
Epoch 1/1


In [34]:
import time
def play_and_show_episode(policy_network, env):
    """Render one episode played greedily by ``policy_network``.

    Action 1 is sent right after the reset and again after every lost life
    (presumably FIRE, which relaunches the ball -- confirm with
    ``env.unwrapped.get_action_meanings()``). Every non-zero reward is printed.

    Args:
        policy_network: model exposing ``predict``; it receives one
            preprocessed frame with a batch axis in front and a channel axis
            at the end.
        env: Gym environment whose ``info`` dict carries ``'ale.lives'``.
    """
    frame_delay = 1/80  # seconds to pause between rendered frames

    env.reset()
    observation, reward, done, info = env.step(1)
    lives = info['ale.lives']
    done = False
    lost_live = True
    env.render()
    while not done:
        time.sleep(frame_delay)
        if lost_live:
            observation, reward, done, info = env.step(1)
            env.render()
            time.sleep(frame_delay)
            if done:
                # Do not keep stepping an environment that already finished.
                break

        processed_image = preprocess(observation)
        # Add a leading batch axis, then a trailing channel axis.
        reshaped_input = np.expand_dims(processed_image, axis=0)
        reshaped_input = np.expand_dims(reshaped_input, axis=3)

        # Use the model passed in, not the notebook-global `ddpg`.
        prediction = policy_network.predict(reshaped_input)
        action = np.argmax(prediction)

        env.render()

        observation, reward, done, info = env.step(action)
        lost_live = bool(info['ale.lives'] < lives)
        lives = info['ale.lives']
        if reward != 0:
            print(reward)

In [39]:
# Watch the current network play one rendered episode.
play_and_show_episode(ddpg, env)

In [None]:
# Look up what each discrete action index means (e.g. to confirm what action 1 does).
env.unwrapped.get_action_meanings()