In [1]:
import numpy as np
import random
import torch
import numpy as np
from collections import deque
from itertools import product
import pandas as pd
import matplotlib.pyplot as plt
from agents.q_learner import Q_learner
from utils.cartpole import CartPoleEnv
%matplotlib inline


In [2]:
args = dict()
args["BUFFER_SIZE"] = int(500)  # replay buffer size
args["BATCH_SIZE"] = 32  # minibatch size
args["GAMMA"] = 0.95  # discount factor
args["TAU"] = 1e-3  # for soft update of target parameters
args["LR"] = 0.001  # learning rate
args["UPDATE_EVERY"] = 4  # how often to update the network

env_name = 'CartPole-v1'

def my_product(inp):
    return (dict(zip(inp.keys(), values)) for values in product(*inp.values()))

In [3]:
def transform_dict_to_tuple(param):
    param_list = []
    if "seed" not in param.keys():
        param_list += [0]
    else:
        param_list += [param["seed"]]
        
    if "length" not in param.keys():
        param_list += [0.5]
    else:
        param_list += [param["length"]]
        
    if "gravity" not in param.keys():
        param_list += [9.8]
    else:
        param_list += [param["gravity"]]
        
    if "force_mag" not in param.keys():
        param_list += [10.0]
    else:
        param_list += [param["force_mag"]]
    return tuple(param_list)

In [4]:
class Task_Wrapper():
    def __init__(self, env_name, params):
        self.env_name = env_name
        self.params = list(my_product(params))
        self.current_param = 0
        self.seed = seed
        self.envs = []
        
    def next_task(self):
        params = self.params[self.current_param]
        params_tuple = transform_dict_to_tuple(params)
        env = CartPoleEnv(**params)
        env.seed(self.seed)
        self.current_param+=1
        self.envs.append({params_tuple : env})
        return self.envs
    
    def get_env(self, index):
        params = self.params[index]
        env = CartPoleEnv(**params)
        env.seed(self.seed)
        return env 

class Queue():
    def __init__(self, capacity):
        self.capacity = capacity-1
        self.queue = []
        self.nb_elems = -1
        
    def add(self, elem):
        if self.nb_elems == self.capacity:
            self.pop()
            self.add(elem)
        else:
            self.queue.append(elem)
            self.nb_elems+=1
    
    def pop(self):        
        self.nb_elems -=1
        return self.queue.pop(0)

In [12]:
x = [i for i in range(10)]
x[-3:]

[7, 8, 9]

In [13]:
def dqn(envs, agent = None, n_episodes=10000, max_t=1000, eps_start=1, eps_end=0.01, eps_decay=0.995):
    scores_test = [Queue(20) for i in range(len(envs))]
    scores = []                        
    scores_window = deque(maxlen=100)  
    eps = eps_start                    
    env = list(envs[-1].values())[0]
    for i_episode in range(1, n_episodes+1):
        test_dqns(scores_test, envs, agent)

        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 
        scores_window.append(score)       
        scores.append(score)              
        eps = max(eps_end, eps_decay*eps)
        
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if np.mean(scores_window)>=200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'models/checkpoints/checkpoint.pth')
            break
    scores_test_list = [np.array(scores_test[i].queue).mean() for i in range(len(scores_test)) ]
    return scores, scores_test_list


def test_dqns(scores_test, envs, agent, n_episodes = 10, max_t = 1000):
    for i in range(len(envs)):
        env_i = list(envs[i].values())[0]
        scores_test[i].add(test_dqn(env_i, agent))
            
            
def test_dqn(env, agent = None, n_episodes=10, max_t=1000):
    _scores = 0                       
    for i_episode in range(1, n_episodes+1):
        _state = env.reset()
        _score = 0
        for t in range(max_t):
            _action = agent.act(_state, 0.0)
            _next_state, _reward, _done, _ = env.step(_action)
            _state = _next_state
            _score += _reward
            if _done:
                break 
        _scores +=  _score              
    return _scores/n_episodes

In [16]:
params = {"length": [1, 10], 
         "gravity": [9.8, 2]}

print("Params: (Seed, Length, Gravity, Force_mag)")
agent = Q_learner(state_size=4, action_size=2, seed=0, hiddens = [100,100], args = args)
seed = 0

task_wrapper = Task_Wrapper(env_name,params)
scores = dict()
test_scores = dict()
for task_id in range(len(task_wrapper.params)):
    print("------------ Task n°{}/{} ------------".format(task_id+1,len(task_wrapper.params) ))
    envs = task_wrapper.next_task()
    param_tuple = list(envs[-1].keys())[0]
    print("Current param: {}".format(param_tuple))
    scores[param_tuple], test_scores[param_tuple] = dqn(envs, agent)
    print(test_scores[param_tuple])
    #test_scores_i = dict()
    #print("current:{}".format(task_wrapper.current_param))
    #for i in range(task_wrapper.current_param):
    #    score_test_i = np.array(test_dqn(task_wrapper.get_env(i), agent))
    #    score_test_i_mean = score_test_i.mean()
    #    test_scores_i[transform_dict_to_tuple(task_wrapper.params[i])] = score_test_i_mean
    #test_scores.append(test_scores_i)  

Params: (Seed, Length, Gravity, Force_mag)
------------ Task n°1/4 ------------
Current param: (0, 1, 9.8, 10.0)
Episode 100	Average Score: 25.00
Episode 200	Average Score: 20.80
Episode 300	Average Score: 15.74
Episode 400	Average Score: 16.55
Episode 500	Average Score: 15.36
Episode 600	Average Score: 14.03
Episode 700	Average Score: 16.69
Episode 800	Average Score: 19.23
Episode 900	Average Score: 43.70
Episode 1000	Average Score: 167.24
Episode 1006	Average Score: 203.86
Environment solved in 906 episodes!	Average Score: 203.86
[451.08500000000004]
------------ Task n°2/4 ------------
Current param: (0, 1, 2, 10.0)
Episode 100	Average Score: 81.32
Episode 145	Average Score: 202.39
Environment solved in 45 episodes!	Average Score: 202.39
[331.05499999999995, 501.075]
------------ Task n°3/4 ------------
Current param: (0, 10, 9.8, 10.0)
Episode 100	Average Score: 117.17
Episode 200	Average Score: 128.48
Episode 300	Average Score: 155.07
Episode 400	Average Score: 175.21
Episode 500	

In [17]:
print(test_scores)

{(0, 1, 9.8, 10.0): [451.08500000000004], (0, 1, 2, 10.0): [331.05499999999995, 501.075], (0, 10, 9.8, 10.0): [360.80499999999995, 674.15, 262.54], (0, 10, 2, 10.0): [149.14000000000001, 308.095, 201.57500000000005, 372.56000000000006]}


<h3>Save params</h3>

Training error

In [21]:
print(task_wrapper.params)
print(scores.keys())

[{'length': 1, 'gravity': 9.8}, {'length': 1, 'gravity': 2}, {'length': 10, 'gravity': 9.8}, {'length': 10, 'gravity': 2}]
dict_keys([(0, 1, 9.8, 10.0), (0, 1, 2, 10.0), (0, 10, 9.8, 10.0), (0, 10, 2, 10.0)])


In [25]:
columns = ["Task#","Seed", "Gravity", "Length", "Force_mag", "Episode", "Score"]
df = pd.DataFrame(columns = columns)
for j,param in enumerate(list(scores.keys())):
    print(j)
    values = scores[param]
    liste = []

    for i in range(len(values)):
        liste.append([j, param[0], param[1], param[2],param[3], i, values[i]])
    df2 = pd.DataFrame(data = liste, columns = columns)
    df = pd.concat([df,df2])
    df.reset_index()
path= "results/length_gravity_v0.csv"
df.to_csv(path)

0
1
2
3


Test error

In [10]:
score_list = [np.array(list(test_scores[-1].keys()))]
for test_score in test_scores:
    score_list.append(np.array(list(test_score.values())))
scores= np.array(score_list)
print(scores)
path= "results/force_mag_v0.npy"
np.save(path, scores)

KeyError: -1

In [None]:
offset = 0
plt.clf()

fig , ax = plt.subplots(1, len(params), figsize=(20,10))
#concatenated_score = []
#concatenated_index = []
for task_id, score in enumerate(scores):
    #concatenated_score += score
    #concatenated_index += list(np.array([i for i in range(len(score))]) + offset)
    episode = np.array([i for i in range(len(score))])
    offseted_episode = list(episode + offset)
    ax[task_id].plot(episode, score)
    offset += len(score)
    ax[task_id].set_ylim([0,300])
    ax[task_id].set_title("Task n°{} (Length: {})".format(task_id, params[task_id]))
ax[0].set_ylabel("Score")
