In [1]:
import numpy as np
import random
import torch
import numpy as np
from collections import deque
from itertools import product
import pandas as pd
import matplotlib.pyplot as plt
from agents.q_learner import Q_learner
from utils.cartpole import CartPoleEnv
%matplotlib inline


In [10]:
# Hyperparameters handed to the Q-learning agent through its `args` argument.
args = {
    "BUFFER_SIZE": 500,   # replay buffer size
    "BATCH_SIZE": 32,     # minibatch size
    "GAMMA": 0.95,        # discount factor
    "TAU": 1e-3,          # for soft update of target parameters
    "LR": 0.001,          # learning rate
    "UPDATE_EVERY": 4,    # how often to update the network
}

# Environment identifier; stored on the task wrapper as `env_name`.
env_name = 'CartPole-v1'

def my_product(inp):
    """Expand a dict of value lists into every combination of its values.

    Args:
        inp: Mapping from a parameter name to an iterable of candidate values.

    Returns:
        A generator yielding one dict per element of the Cartesian product of
        the value iterables, keyed by the names of `inp`.
    """
    names = inp.keys()
    combinations = product(*inp.values())
    return (dict(zip(names, combination)) for combination in combinations)

In [11]:
class Task_Wrapper():
    """Builds CartPole environments for every combination of a parameter grid.

    Attributes:
        env_name: Name given at construction (stored only; not used to build
            the environments).
        params: List of keyword-argument dicts, one per task, produced by
            `my_product(params)`.
        current_param: Index of the next task `next_task` will return.
        seed: Base seed applied to training environments; evaluation
            environments use `seed + 10`.
    """

    def __init__(self, env_name, params, seed=None):
        """Create the task list.

        Args:
            env_name: Environment name to remember.
            params: Dict mapping a CartPoleEnv keyword to a list of values.
            seed: Base seed for the environments. When omitted, the
                module-level `seed` variable is used if it exists (the
                behaviour earlier versions relied on implicitly), else 0.
        """
        self.env_name = env_name
        self.params = list(my_product(params))
        self.current_param = 0
        if seed is None:
            # Backward compatibility: this class used to read a global `seed`.
            seed = globals().get("seed", 0)
        self.seed = seed

    def _make_env(self, index, seed):
        """Instantiate and seed the environment of the task at `index`."""
        env = CartPoleEnv(**self.params[index])
        env.seed(seed)
        return env

    def next_task(self):
        """Return the environment of the next task and advance the cursor.

        Raises:
            IndexError: If every task has already been handed out.
        """
        if self.current_param >= len(self.params):
            raise IndexError(
                "no task left: {} of {} already used".format(
                    self.current_param, len(self.params)))
        env = self._make_env(self.current_param, self.seed)
        self.current_param += 1
        return env

    def get_env(self, index):
        """Return a fresh environment for task `index`, seeded with `seed + 10`.

        The cursor used by `next_task` is not modified.
        """
        return self._make_env(index, self.seed + 10)

In [15]:
def dqn(env, agent = None, n_episodes=10000, max_t=1000, eps_start=1, eps_end=0.01, eps_decay=0.995,
        solved_score=200.0, checkpoint_path='models/checkpoints/checkpoint.pth'):
    """Train `agent` on `env` until the running average score reaches a target.

    Args:
        env: Environment exposing `reset()` and `step(action)`; `step` must
            return `(next_state, reward, done, info)`.
        agent: Learner exposing `act(state, eps)`, `step(state, action,
            reward, next_state, done)` and `qnetwork_local.state_dict()`.
            Required despite the `None` default.
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Initial value of the `eps` argument passed to `agent.act`
            (presumably the epsilon-greedy exploration rate).
        eps_end: Lower bound for `eps`.
        eps_decay: Multiplicative decay applied to `eps` after every episode.
        solved_score: Training stops once the mean of the last (up to) 100
            episode scores is at least this value.
        checkpoint_path: File the local Q-network weights are saved to when
            the environment is solved.

    Returns:
        List with the total reward of every episode that was run.

    Raises:
        ValueError: If `agent` is None.
    """
    if agent is None:
        raise ValueError("dqn() needs an agent to train, got None")

    scores = []                        # score of every episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # current exploration parameter
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)
        eps = max(eps_end, eps_decay*eps)  # decay, but never below eps_end

        # Computed once per episode instead of once per use.
        average_score = np.mean(scores_window)
        # '\r' + end="" keeps rewriting the same console line between milestones.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
        if average_score >= solved_score:
            # The window may hold fewer than 100 episodes (e.g. an agent that was
            # already trained on a previous task), so clamp the count at zero.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(max(i_episode-100, 0), average_score))
            _save_checkpoint(agent, checkpoint_path)
            break
    return scores


def _save_checkpoint(agent, checkpoint_path):
    """Write the agent's local Q-network weights, creating the folder if needed."""
    import os

    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        # Avoid losing a finished training run to a missing directory.
        os.makedirs(checkpoint_dir, exist_ok=True)
    torch.save(agent.qnetwork_local.state_dict(), checkpoint_path)

def test_dqn(env, agent = None, n_episodes=1000, max_t=1000):
    scores = []                        # list containing scores from each episode
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, 0.0)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            score += reward
            if done:
                break 
        scores.append(score)              # save most recent score
    return scores

In [16]:
def transform_dict_to_tuple(param):
    """Turn a task-parameter dict into a fixed-order tuple.

    Args:
        param: Dict that may contain the keys "seed", "length", "gravity"
            and "force_mag"; any other key is ignored.

    Returns:
        The tuple `(seed, length, gravity, force_mag)`, where a missing key
        is replaced by 0, 0.5, 9.8 and 10.0 respectively.
    """
    fallbacks = (
        ("seed", 0),
        ("length", 0.5),
        ("gravity", 9.8),
        ("force_mag", 10.0),
    )
    return tuple(param.get(name, fallback) for name, fallback in fallbacks)

In [21]:
# Sequential-task experiment: one agent is trained on each task in turn and,
# after every task, evaluated on all tasks trained so far.
seed = 0  # picked up by Task_Wrapper to seed the environments
params = {"length": [1, 10, 100]}
agent = Q_learner(state_size=4, action_size=2, seed=0, hiddens=[24, 24], args=args)

task_wrapper = Task_Wrapper(env_name, params)
scores = {}       # parameter tuple -> training scores returned by dqn
test_scores = []  # one dict per trained task: parameter tuple -> mean test score
for task_id in range(len(task_wrapper.params)):
    env = task_wrapper.next_task()
    # The wrapper was just created, so the task handed out is number `task_id`.
    param = task_wrapper.params[task_id]
    print(param)
    param_tuple = transform_dict_to_tuple(param)
    scores[param_tuple] = dqn(env, agent)

    print("current:{}".format(task_wrapper.current_param))
    test_scores_i = {}
    for i in range(task_wrapper.current_param):
        score_test_i = np.array(test_dqn(task_wrapper.get_env(i), agent))
        score_test_i_mean = np.mean(score_test_i)
        test_scores_i[transform_dict_to_tuple(task_wrapper.params[i])] = score_test_i_mean
    test_scores.append(test_scores_i)

{'length': 1}
Episode 100	Average Score: 27.99
Episode 200	Average Score: 18.36
Episode 300	Average Score: 14.94
Episode 400	Average Score: 14.98
Episode 500	Average Score: 13.44
Episode 600	Average Score: 13.19
Episode 700	Average Score: 13.14
Episode 800	Average Score: 12.96
Episode 900	Average Score: 12.87
Episode 1000	Average Score: 12.84
Episode 1100	Average Score: 12.99
Episode 1200	Average Score: 12.92
Episode 1300	Average Score: 13.16
Episode 1400	Average Score: 12.79
Episode 1500	Average Score: 12.98
Episode 1600	Average Score: 12.75
Episode 1700	Average Score: 43.18
Episode 1800	Average Score: 111.61
Episode 1900	Average Score: 131.59
Episode 2000	Average Score: 55.586
Episode 2100	Average Score: 151.73
Episode 2135	Average Score: 201.13
Environment solved in 2035 episodes!	Average Score: 201.13
current:1
{'length': 10}
Episode 100	Average Score: 117.88
Episode 200	Average Score: 130.53
Episode 300	Average Score: 138.55
Episode 400	Average Score: 164.50
Episode 500	Average Sc

In [18]:
# Dump the raw training scores: a dict keyed by the (seed, length, gravity, force_mag) tuple.
print(scores)

{(0, 1, 9.8, 10.0): [22.0, 47.0, 18.0, 25.0, 47.0, 20.0, 73.0, 23.0, 20.0, 50.0, 25.0, 31.0, 24.0, 46.0, 18.0, 79.0, 54.0, 20.0, 19.0, 23.0, 32.0, 43.0, 125.0, 28.0, 37.0, 17.0, 86.0, 18.0, 21.0, 38.0, 28.0, 28.0, 27.0, 29.0, 38.0, 17.0, 24.0, 28.0, 20.0, 17.0, 18.0, 30.0, 47.0, 19.0, 29.0, 14.0, 23.0, 26.0, 30.0, 24.0, 28.0, 20.0, 24.0, 29.0, 68.0, 17.0, 21.0, 23.0, 19.0, 26.0, 19.0, 27.0, 55.0, 26.0, 16.0, 15.0, 27.0, 49.0, 17.0, 23.0, 17.0, 17.0, 15.0, 23.0, 33.0, 25.0, 14.0, 17.0, 19.0, 19.0, 15.0, 20.0, 16.0, 22.0, 19.0, 18.0, 20.0, 20.0, 23.0, 30.0, 25.0, 22.0, 21.0, 17.0, 21.0, 29.0, 22.0, 16.0, 15.0, 15.0, 27.0, 18.0, 16.0, 17.0, 18.0, 45.0, 14.0, 15.0, 17.0, 21.0, 20.0, 15.0, 22.0, 16.0, 15.0, 14.0, 21.0, 12.0, 20.0, 18.0, 11.0, 12.0, 20.0, 21.0, 18.0, 15.0, 23.0, 16.0, 14.0, 19.0, 19.0, 21.0, 17.0, 27.0, 34.0, 31.0, 14.0, 14.0, 17.0, 15.0, 19.0, 13.0, 15.0, 19.0, 20.0, 18.0, 15.0, 21.0, 25.0, 17.0, 34.0, 18.0, 16.0, 14.0, 15.0, 17.0, 22.0, 22.0, 21.0, 18.0, 14.0, 12.0, 19.0, 

In [19]:
# One dict per trained task, mapping each evaluated task's parameter tuple to its mean test score.
print(test_scores)

[{(0, 1, 9.8, 10.0): 131.728}, {(0, 1, 9.8, 10.0): 170.925, (0, 10, 9.8, 10.0): 238.98}, {(0, 1, 9.8, 10.0): 16.974, (0, 10, 9.8, 10.0): 121.212, (0, 100, 9.8, 10.0): 325.736}]


<h3>Save params</h3>

Training error

In [22]:
# Flatten the training curves into one table with a row per (task, episode).
columns = ["Seed", "Gravity", "Length", "Force_mag", "Episode", "Score"]
# The keys of `scores` come from transform_dict_to_tuple and are ordered
# (seed, length, gravity, force_mag), while the columns list Gravity before
# Length: positions 1 and 2 are swapped so each value lands under its own name.
records = [
    [param[0], param[2], param[1], param[3], episode, score]
    for param, values in scores.items()
    for episode, score in enumerate(values)
]
# Built in one go: no repeated concat, and the index is unique across tasks.
df = pd.DataFrame(data=records, columns=columns)
path= "results/length_v1.csv"
df.to_csv(path)

Test error

In [19]:
# Pack the evaluation results into one array:
#   element 0    -> parameter tuples of every task (taken from the last evaluation)
#   element k>=1 -> mean test scores measured after training the k-th task
score_list = [np.array(list(test_scores[-1].keys()))]
for test_score in test_scores:
    score_list.append(np.array(list(test_score.values())))
# The rows have different lengths, so they are stored as Python objects;
# filling element by element avoids NumPy trying to broadcast them together.
# A separate name is used so the training `scores` dict is not overwritten.
test_score_table = np.empty(len(score_list), dtype=object)
for row_index, row in enumerate(score_list):
    test_score_table[row_index] = row
print(test_score_table)
# NOTE(review): the file name mentions force_mag although the sweep above is
# over length -- confirm before overwriting earlier results.
# An object array is pickled on save: load it with np.load(path, allow_pickle=True).
path= "results/force_mag_v0.npy"
np.save(path, test_score_table)

[array([[ 0. ,  0.5,  9.8,  4. ],
       [ 0. ,  0.5,  9.8, 10. ],
       [ 0. ,  0.5,  9.8, 20. ]])
 array([211.95]) array([299.61, 302.33]) array([ 15.09,  58.11, 129.02])]


In [None]:
# Plot the training curve of every task in its own panel.
# The curves are read from `df`, the per-episode table built in the
# "Training error" cell; grouping on the four parameter columns with
# sort=False returns the tasks in the order they were trained.
param_columns = ["Seed", "Gravity", "Length", "Force_mag"]
task_curves = [task_rows for _key, task_rows in df.groupby(param_columns, sort=False)]

# squeeze=False keeps a 2-D array of axes even when there is a single task.
fig, ax = plt.subplots(1, len(task_curves), figsize=(20,10), squeeze=False)
ax = ax[0]
for task_id, task_rows in enumerate(task_curves):
    episode = task_rows["Episode"].astype(int).to_numpy()
    score = task_rows["Score"].astype(float).to_numpy()
    ax[task_id].plot(episode, score)
    ax[task_id].set_ylim([0,300])
    # Label the panel with the swept parameters of this task, e.g. "Length: 10".
    task_label = ", ".join(
        "{}: {}".format(name.capitalize(), value)
        for name, value in task_wrapper.params[task_id].items()
    )
    ax[task_id].set_title("Task n°{} ({})".format(task_id, task_label))
    ax[task_id].set_xlabel("Episode")
ax[0].set_ylabel("Score")
