In [None]:
# Execute CartPoleQLearning.ipynb inside this kernel so its definitions are available below.
# NOTE(review): this notebook has no import cell; names such as np, plt, json, QLearning,
# ReplayMemory and AddInterceptFeatures presumably come from this run — confirm.
%run CartPoleQLearning.ipynb

In [None]:
# Create the replay memory object used by the experience-replay agents.
# 10000 is the single argument given to ReplayMemory (presumably its capacity — confirm).
memory = ReplayMemory(10_000)

In [None]:
# Global matplotlib typography for every figure in this notebook:
# a larger base font and an even larger axes-title font.
plt.rcParams.update({
    'font.size': 18,
    'axes.titlesize': 22,
})


# Cart Pole with Q-learning with Linear Q-function Approximation

In [None]:
# Baseline agent: linear Q-function on 5 features / 2 actions, trained on the
# default reward (reward_func=None) for 1000 episodes.
ql_basic = QLearning(
    n_features=5,
    n_actions=2,
    feature_transformer=AddInterceptFeatures(),
)
ql_basic.train(
    episodes=1000,
    regul_strength=1e-2,
    learning_rate=7e-6,
    epsilon_decay=0.995,
    reward_func=None,
)

In [None]:
# Trailing 100-episode mean of the per-episode rewards.
# The first 100 entries are zero padding; entry i (i >= 100) is mean(runs[i-100:i]).
avgs = [0] * 100
for i in range(100, len(ql_basic.runs)+1):
    window = ql_basic.runs[i-100:i]
    avgs.append(np.mean(window))


In [None]:
# Raw per-episode reward together with its trailing 100-episode mean
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(ql_basic.runs, label='Reward per episode')
# the smoothed curve is drawn thicker so it stands out against the noisy one
ax.plot(avgs, label='Average reward per 100 episodes', linewidth=3)

ax.set(
    xlabel='Episode',
    ylabel='Reward',
    title='Q-learning with linear function approximation',
)
ax.legend()
plt.show()

In [None]:
# Export the current figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_q_learning.pdf', bbox_inches='tight')


## Grid search

In [None]:
# Grid search over learning rate (20 log-spaced values, 1e-7..1e-2) and
# regularisation strength (8 log-spaced values, 1e-4..10**-0.5).
# For every pair, 10 freshly initialised agents are trained for 5000 episodes and
# each agent's `avgs` attribute is stored under grid_search[(lr, regul_strength)].
learning_rates = 10**np.linspace(-7, -2, 20, dtype=float)
regul_strengths = 10**np.arange(-4,0,0.5, dtype=float)

grid_search = {}
for lr in learning_rates:
    for regul_strength in regul_strengths:
        print(lr, regul_strength)
        # register the list up front so partial results survive an interruption
        seed_results = grid_search[(lr, regul_strength)] = []
        for i in range(10):
            ql_basic = QLearning(
                n_features=5, n_actions=2, feature_transformer=AddInterceptFeatures()
            )
            ql_basic.train(
                episodes=5000, regul_strength=regul_strength, learning_rate=lr, reward_func=None
            )
            seed_results.append(ql_basic.avgs)




### Save to json

In [None]:
# Convert the tuple-keyed results into nested dicts {lr: {regul_strength: curves}},
# because JSON object keys cannot be tuples.
grid_search_json = {}
for (lr, regul_strength), curves in grid_search.items():
    grid_search_json.setdefault(lr, {})[regul_strength] = curves


In [None]:
# Write the nested grid-search results to grid_search.json in the working directory
with open('grid_search.json', mode='w') as fp:
    json.dump(grid_search_json, fp)


### Load from json file

In [None]:

# Read the nested results back from disk and rebuild the tuple-keyed dict.
# JSON stores the numeric keys as strings, hence the float() conversions.
with open('grid_search.json', 'r') as fp:
    grid_search_json = json.load(fp)

grid_search = {
    (float(lr), float(regul_strength)): curves
    for lr, by_regul in grid_search_json.items()
    for regul_strength, curves in by_regul.items()
}

    

In [None]:
# Summarise each (learning rate, regularisation strength) grid cell by the LAST value
# of every stored averages curve (one curve per initialisation):
#   grid_search_maxs          best final average over the initialisations
#   grid_search_prop_success  share of initialisations whose final average exceeds 40
#   grid_search_avgs          mean final average over the initialisations
#
# The per-cell curves are bound to `seed_curves`, not `avgs`: `avgs` holds the
# baseline moving average that the default-vs-augmented comparison plot reads later,
# and must not be overwritten here.
grid_search_maxs = {}
grid_search_prop_success = {}
grid_search_avgs = {}
for lr_regul, seed_curves in grid_search.items():
    final_values = [curve[-1] for curve in seed_curves]
    grid_search_maxs[lr_regul] = max(final_values)
    grid_search_prop_success[lr_regul] = sum(1 for value in final_values if value > 40)/len(final_values)
    grid_search_avgs[lr_regul] = np.mean(final_values)


### Grid search plots

In [None]:

# Three stacked heatmaps of the grid-search summaries (learning rate on x,
# regularisation strength on y). The summary dicts are assumed to be in the order
# the grid was generated (learning rate outer loop, regularisation inner loop),
# which is what the reshape below relies on.
lr_grid = 10**np.linspace(-7, -2, 20, dtype=float)
regul_grid = 10**np.arange(-4,0,0.5, dtype=float)


def _draw_grid_heatmap(axis, values_by_pair, title):
    """Draw one grid-search summary as a heatmap on `axis` and return the image.

    values_by_pair: dict keyed by (lr, regul_strength) in grid-generation order.
    title: axes title.
    """
    matrix = np.array(list(values_by_pair.values())).reshape(len(lr_grid), len(regul_grid)).T
    image = axis.imshow(matrix, cmap='magma', interpolation='nearest')
    axis.set_yticks(np.arange(len(regul_grid)))
    # scientific notation: with a fixed-point format the small strengths all print as 0.00
    axis.set_yticklabels([f'{regul_strength:.2e}' for regul_strength in regul_grid])
    axis.set_xticks(np.arange(len(lr_grid)))
    axis.set_xticklabels([f'{lr:.2e}' for lr in lr_grid])
    plt.setp(axis.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    axis.set_ylabel('Regularisation strength')
    axis.set_xlabel('Learning rate')
    axis.set_title(title)
    axis.figure.colorbar(image, ax=axis)
    return image


fig, ax = plt.subplots(3,1, figsize=(10, 20))
im0 = _draw_grid_heatmap(ax[0], grid_search_maxs,
                         'Maximum final average reward over 10 initializations')
im1 = _draw_grid_heatmap(ax[1], grid_search_prop_success,
                         'Proportion of final averages >40 per 10 initializations')
im2 = _draw_grid_heatmap(ax[2], grid_search_avgs,
                         'Average of final average rewards per 10 initializations')


In [None]:
# Export the three-panel grid-search figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_grid_search.pdf', bbox_inches='tight')

In [None]:
# Stand-alone heatmap of the best final average per grid cell.
# Tick labels show the base-10 exponent of each grid value ("1e-3.5" etc.).
regul_exponents = np.arange(-4,0,0.5, dtype=float)
lr_exponents = np.linspace(-7, -2, 20, dtype=float)
max_matrix = np.array(list(grid_search_maxs.values())).reshape(20,8).T

fig, ax = plt.subplots(1,1, figsize=(15, 6))
im0 = ax.imshow(max_matrix, cmap='magma', interpolation='nearest')
ax.set_yticks(np.arange(8))
ax.set_yticklabels([f'1e{exponent:.1f}' for exponent in regul_exponents])
ax.set_xticks(np.arange(20))
ax.set_xticklabels([f'1e{exponent:.2f}' for exponent in lr_exponents])
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

ax.set(
    ylabel='Regularisation strength',
    xlabel='Learning rate',
    title='Maximum final average reward over 10 initializations',
)
fig.colorbar(im0, ax=ax)

In [None]:
# Export the maximum-reward heatmap to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_grid_search_maxs.pdf', bbox_inches='tight')

In [None]:
# Stand-alone heatmap of the success proportion per grid cell.
# Tick labels show the base-10 exponent of each grid value.
regul_exponents = np.arange(-4,0,0.5, dtype=float)
lr_exponents = np.linspace(-7, -2, 20, dtype=float)
success_matrix = np.array(list(grid_search_prop_success.values())).reshape(20,8).T

fig, ax = plt.subplots(1,1, figsize=(15, 6))
im1 = ax.imshow(success_matrix, cmap='magma', interpolation='nearest')
ax.set_yticks(np.arange(8))
ax.set_yticklabels([f'1e{exponent:.1f}' for exponent in regul_exponents])
ax.set_xticks(np.arange(20))
ax.set_xticklabels([f'1e{exponent:.2f}' for exponent in lr_exponents])
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
ax.set(
    ylabel='Regularisation strength',
    xlabel='Learning rate',
    title=r'Proportion of successful initializations',
)
fig.colorbar(im1, ax=ax)


In [None]:
# Export the success-proportion heatmap to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_grid_search_props.pdf', bbox_inches='tight')

In [None]:
# The five (lr, regul_strength) pairs with the highest value in grid_search_maxs, best first
top5 = sorted(
    grid_search_maxs,
    key=lambda pair: grid_search_maxs[pair],
    reverse=True,
)[:5]


In [None]:
# Display the five best (learning rate, regularisation strength) pairs
top5

In [None]:
# Display the grid_search_maxs values belonging to the top-5 pairs, in the same order
[grid_search_maxs[pair] for pair in top5]

## Q-learning with augmented reward

In [None]:
# Same agent and hyper-parameters as the baseline run, but trained with the
# custom `reward_func` instead of the default reward.
ql_basic_aug = QLearning(
    n_features=5,
    n_actions=2,
    feature_transformer=AddInterceptFeatures(),
)
ql_basic_aug.train(
    episodes=1000,
    regul_strength=1e-2,
    learning_rate=7e-6,
    epsilon_decay=0.995,
    reward_func=reward_func,
)

In [None]:
# Trailing 100-episode mean for the augmented-reward run.
# The first 100 entries are zero padding; entry i (i >= 100) is mean(runs[i-100:i]).
avgs_aug = [0] * 100
for i in range(100, len(ql_basic_aug.runs)+1):
    window = ql_basic_aug.runs[i-100:i]
    avgs_aug.append(np.mean(window))

In [None]:
# Raw per-episode reward and trailing 100-episode mean for the augmented-reward run
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(ql_basic_aug.runs, label='Reward per episode')
# thicker line for the smoothed curve
ax.plot(avgs_aug, label='Average reward per 100 episodes', linewidth=3)

ax.set(
    xlabel='Episode',
    ylabel='Reward',
    title='Q-learning with linear function approximation and augmented rewards',
)
ax.legend()


In [None]:

# Export the augmented-reward figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_q_learning_augmented_reward.pdf', bbox_inches='tight')

In [None]:
# Compare the two smoothed curves: default reward (`avgs`) vs augmented reward (`avgs_aug`)
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(avgs, label='Average reward per 100 episodes (default rewards)')
ax.plot(avgs_aug, label='Average reward per 100 episodes (augmented rewards)', color='purple')
ax.set(
    xlabel='Episode',
    ylabel='Reward',
    title='Impact of augmented rewards on Q-learning with linear function approximation',
)
ax.legend()
plt.show()


In [None]:
# Export the comparison figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_q_learning_augmented_reward_comparison.pdf', bbox_inches='tight')

# Q-learning with Experience Replay

In [None]:
# Start the experience-replay section with a fresh ReplayMemory bound to `memory`
memory = ReplayMemory(10_000)

In [None]:
# Feed the memory: run a freshly created experience-replay agent for 20 episodes
qler = QLearningWithExperienceReplay(
    n_features=5,
    n_actions=2,
    feature_transformer=AddInterceptFeatures(),
)
qler.run(episodes=20)

In [None]:
# Agent used for the experience-replay experiment below
cer = CombinedQ(
    n_features=5,
    n_actions=2,
    feature_transformer=AddInterceptFeatures(),
)

In [None]:
# Train the combined agent for 1000 episodes on the default reward (reward_func=None)
cer.train(
    K=1,
    batch_size=128,
    learning_rate=1e-4,
    reward_func=None,
    episodes=1000,
    epsilon_decay=0.995,
    penalty=0.99,
)

In [None]:
# Trailing 100-episode mean for the experience-replay run (rebinds `avgs`).
# The first 100 entries are zero padding; entry i (i >= 100) is mean(runs[i-100:i]).
avgs = [0] * 100
for i in range(100, len(cer.runs)+1):
    window = cer.runs[i-100:i]
    avgs.append(np.mean(window))


In [None]:
# Raw per-episode reward and trailing 100-episode mean for the experience-replay run
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(cer.runs, label='Reward per episode')
# thicker line for the smoothed curve
ax.plot(avgs, label='Average reward per 100 episodes', linewidth=3)
ax.set(
    xlabel='Episode',
    ylabel='Reward',
    title='Q-learning with experience replay',
)
ax.legend()
plt.show()


In [None]:

# Export the experience-replay figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/combined_q_learning_with_experience_replay.pdf', bbox_inches='tight')


## Grid search

In [None]:
# Grid search for the combined agent over learning rate, K and batch size.
# Results are stored as grid_search_cer[lr][K][batch_size] = cer.runs.
# NOTE(review): every other cell drives CombinedQ through `.train(...)`; this one calls
# `.run(...)` with the same keyword arguments — confirm that this is intended.
grid_search_cer = {}
for lr in 10**np.arange(-7,-1, dtype=float):
    results_for_lr = grid_search_cer[lr] = {}
    for K in [1, 5, 10]:
        results_for_K = results_for_lr[K] = {}
        for batch_size in [8, 32, 128, 256]:
            results_for_K[batch_size] = {}  # placeholder until the run below has finished
            print(lr, K, batch_size)
            cer = CombinedQ(
                n_features=5, n_actions=2, feature_transformer=AddInterceptFeatures()
            )
            cer.run(
                K=K, batch_size=batch_size, learning_rate=lr, reward_func=None,
                episodes=400, epsilon_decay=0.995, penalty=0.99,
            )
            results_for_K[batch_size] = cer.runs


In [None]:
# Effect of the replay-buffer size: for each size, rebuild the global `memory`,
# feed it with a 20-episode run of a fresh experience-replay agent, then train
# 5 independently initialised CombinedQ agents and keep each agent's `runs`.
# Result: grid_search_cer_buffer[buffer_size] = [runs of agent 1, ..., runs of agent 5]
grid_search_cer_buffer = {}

for buffer_size in [1000, 10000, 100000, 1000000, 10000000]:
    print(buffer_size)

    memory = ReplayMemory(buffer_size)

    # feed the memory
    qler = QLearningWithExperienceReplay(n_features=5, n_actions=2, feature_transformer=AddInterceptFeatures())
    qler.run(episodes=20)

    grid_search_cer_buffer[buffer_size] = []

    # a named index is used instead of `_`: assigning `_` in an IPython session
    # shadows the "last output" variable
    for run_idx in range(5):
        print(run_idx)

        cer = CombinedQ(n_features=5, n_actions=2, feature_transformer=AddInterceptFeatures())
        cer.train(K=1, batch_size=32, learning_rate=1e-6, reward_func=None, episodes=1000, epsilon_decay=0.995, penalty=0.99)
        grid_search_cer_buffer[buffer_size].append(cer.runs)



In [None]:
# Write the buffer-size results to grid_search_cer_buffer.json in the working directory
with open('grid_search_cer_buffer.json', mode='w') as fp:
    json.dump(grid_search_cer_buffer, fp)

In [None]:
# For every buffer size, the mean reward over the final 100 episodes of each run
grid_search_cer_buffer_last_avgs = {
    buffer_size: [np.mean(run_rewards[-100:]) for run_rewards in runs_for_size]
    for buffer_size, runs_for_size in grid_search_cer_buffer.items()
}

In [None]:

# Grouped bar chart: one group per buffer size, one bar per run (5 runs).
x = np.arange(5)
width = 0.15

# y_k[j] is the final-100-episode average of run k for the j-th buffer size
y_1, y_2, y_3, y_4, y_5 = [
    [grid_search_cer_buffer_last_avgs[size][run] for size in grid_search_cer_buffer_last_avgs.keys()]
    for run in range(5)
]

bar_specs = [
    (x - 2*width, y_1, 'Run 1', 'cornflowerblue'),
    (x - width, y_2, 'Run 2', 'gold'),
    (x, y_3, 'Run 3', 'seagreen'),
    (x + width, y_4, 'Run 4', 'coral'),
    (x + 2*width, y_5, 'Run 5', 'purple'),
]

fig, ax = plt.subplots(figsize=(15, 8))
for positions, heights, run_label, run_color in bar_specs:
    ax.bar(positions, heights, width, label=run_label, color=run_color)

ax.set_xticks(x)
ax.set_xticklabels(grid_search_cer_buffer_last_avgs.keys())
ax.set(
    xlabel='Buffer size',
    ylabel='Average reward',
    title='Average reward over last 100 episodes for different buffer sizes',
)
ax.legend()




In [None]:
# Export the buffer-size bar chart to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/combined_q_learning_with_experience_replay_buffer_size.pdf', bbox_inches='tight')

# Cart Pole with Deep Q-Network

In [None]:
# Set the global LR, then execute CartPoleDQN.ipynb in this kernel.
# NOTE(review): presumably the DQN notebook reads LR as its learning rate — confirm.
# Later cells read `episode_durations` and `diagnostics`, which are not defined in this
# notebook and are therefore presumably created by this run.
LR = 1e-4
%run CartPoleDQN.ipynb

In [None]:
# Trailing 100-episode mean of the DQN episode durations.
# The first 100 entries are zero padding; entry i (i >= 100) is mean(episode_durations[i-100:i]).
dqn_avgs = [0] * 100
for i in range(100, len(episode_durations)+1):
    window = episode_durations[i-100:i]
    dqn_avgs.append(np.mean(window))

In [None]:
# Raw per-episode reward (episode duration) and trailing 100-episode mean for the DQN
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(episode_durations, label='Reward per episode')
ax.plot(dqn_avgs, label='Average reward per 100 episodes')

# thicker line for the smoothed curve
ax.lines[1].set_linewidth(3)
ax.set_xlabel('Episode')
ax.set_ylabel('Reward')
ax.set_title('Deep Q-Network')
ax.legend()


In [None]:

# Export the DQN training figure to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_dqn.pdf', bbox_inches='tight')


## Grid search

In [None]:
# Learning-rate grid search for the DQN: 7 log-spaced values (1e-8 .. 1e-2),
# 5 executions of CartPoleDQN.ipynb per value.
# Results: gridsearch_DQN[LR]['episode_durations'] and gridsearch_DQN[LR]['averages']
# each hold one entry per execution.
gridsearch_DQN = {}

# The loop variable is the global LR on purpose.
# NOTE(review): presumably the DQN notebook reads LR as its learning rate — confirm.
for LR in 10**np.arange(-8, -1, dtype=float):
    print(LR)
    gridsearch_DQN[LR] = {}
    gridsearch_DQN[LR]['episode_durations'] = []
    gridsearch_DQN[LR]['averages'] = []
    for i in range(5):
        %run CartPoleDQN.ipynb
        # NOTE(review): durations_t and means are not defined in this notebook;
        # presumably they are set by CartPoleDQN.ipynb — confirm.
        gridsearch_DQN[LR]['episode_durations'].append(durations_t)
        gridsearch_DQN[LR]['averages'].append(means)
    

In [None]:
# For every learning rate, the last element of each stored 'averages' sequence,
# converted to a plain Python number with .item()
final_averages = {
    lr: [averages[-1].item() for averages in results['averages']]
    for lr, results in gridsearch_DQN.items()
}

In [None]:
# Best and mean final average per learning rate
final_averages_max = {lr: max(values) for lr, values in final_averages.items()}
final_averages_avg = {lr: np.mean(values) for lr, values in final_averages.items()}

In [None]:
# Actions and rewards of the last entry in `diagnostics`, unwrapped to plain Python
# numbers via .item()
last_actions = diagnostics['actions'][-1]
last_rewards = diagnostics['rewards'][-1]
dqn_actions = [action.item() for action in last_actions]
dqn_rewards = [reward.item() for reward in last_rewards]

In [None]:
# First 100 recorded DQN actions, one marker per step
fig, ax = plt.subplots(figsize=(15,3))
ax.plot(dqn_actions[:100], label='DQN action', color='green', marker='o')

ax.set(xlabel='Step', ylabel='Action', title='DQN actions')
ax.legend()

In [None]:
# Export the DQN action trace to ./Plots as a tightly cropped PDF
fig.savefig(fname='./Plots/cartpole_dqn_actions.pdf', bbox_inches='tight')

# Mountain Car with Normalized Advantage Functions

In [None]:
# Set the global LR, then execute MountainCarDQN.ipynb in this kernel.
# NOTE(review): presumably the notebook reads LR as its learning rate — confirm.
# Later cells read `episode_scores` and `naf_diagnostics`, which are not defined in this
# notebook and are therefore presumably created by this run.
LR = 1e-3
%run MountainCarDQN.ipynb

In [None]:
# Unwrap the episode scores to plain Python numbers
naf_scores = [score.item() for score in episode_scores]

# Trailing 100-episode mean: 100 zeros of padding, then entry i = mean(naf_scores[i-100:i]).
# The range ends at len(naf_scores) inclusive so that the window covering the final
# 100 episodes is computed too, matching the other moving averages in this notebook.
naf_averages = [0 for _ in range(100)]
for i in range(100, len(naf_scores)+1):
    naf_averages.append(sum(naf_scores[i-100:i])/100)

In [None]:
# Raw per-episode reward and trailing 100-episode mean for the NAF agent
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(naf_scores, label='Reward per episode')
# thicker line for the smoothed curve
ax.plot(naf_averages, label='Average reward per 100 episodes', linewidth=3)
ax.set(
    xlabel='Episode',
    ylabel='Reward',
    title='Normalized Advantage Function',
)
ax.legend()


In [None]:

# Export the NAF training figure to ./Plots as a PDF; cropped tightly like the
# other figures saved by this notebook
fig.savefig('./Plots/mountaincar_NAF.pdf', bbox_inches='tight')



In [None]:
# Actions and rewards of the fourth-from-last entry in `naf_diagnostics`, unwrapped
# with .item(); the rewards are also negated for plotting as a penalty
selected_actions = naf_diagnostics['actions'][-4]
selected_rewards = naf_diagnostics['rewards'][-4]
naf_actions = [action.item() for action in selected_actions]
naf_rewards = [reward.item() for reward in selected_rewards]
naf_neg_rewards = [-reward for reward in naf_rewards]


In [None]:
# NAF actions together with the negated reward (penalty); the last penalty value is dropped
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(naf_actions, label='NAF action', color='green')
ax.plot(naf_neg_rewards[:-1], label='NAF penalty', color='red')

ax.set(
    xlabel='Step',
    ylabel='Action/Penalty',
    title='NAF Actions and penalty',
)
ax.legend()


In [None]:

# Export the first NAF action/penalty figure to ./Plots as a PDF; cropped tightly
# like the other figures saved by this notebook
fig.savefig('./Plots/mountaincar_NAF_actions_penalty1.pdf', bbox_inches='tight')



In [None]:
# Actions and rewards of the last entry in `naf_diagnostics`, unwrapped with .item();
# the rewards are also negated for plotting as a penalty
selected_actions = naf_diagnostics['actions'][-1]
selected_rewards = naf_diagnostics['rewards'][-1]
naf_actions = [action.item() for action in selected_actions]
naf_rewards = [reward.item() for reward in selected_rewards]
naf_neg_rewards = [-reward for reward in naf_rewards]

In [None]:
# NAF actions together with the negated reward (penalty); the last penalty value is dropped
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(naf_actions, label='NAF action', color='green')
ax.plot(naf_neg_rewards[:-1], label='NAF penalty', color='red')

ax.set(
    xlabel='Step',
    ylabel='Action/Penalty',
    title='NAF Actions and penalty',
)
ax.legend()


In [None]:
# Export the second NAF action/penalty figure to ./Plots as a PDF; cropped tightly
# like the other figures saved by this notebook
fig.savefig('./Plots/mountaincar_NAF_actions_penalty2.pdf', bbox_inches='tight')

### Grid search for NAF

In [None]:
gridsearch_NAF = {}

for lr in 10**np.arange(-8, -1, dtype=float):
    print(LR)
    gridsearch_NAF[LR] = {}
    gridsearch_NAF[LR]['episode_durations'] = []
    gridsearch_NAF[LR]['averages'] = []
    for i in range(5):
        %run MountainCarDQN.ipynb
        gridsearch_NAF[LR]['episode_durations'].append(durations_t)
        gridsearch_NAF[LR]['averages'].append(means)