# Mountaincar Environment
* Evaluation episodes can start anywhere from the left to the goal state, with velocity 0 (the same holds for training). They need 71 episodes.
* Modify cartpole to only have two actions -> left and right. The magnitude of the actions is much larger in the NFQ paper.
* Hint to goal, which sometimes makes the agent perform worse
* Group: the magnitude of the action
* Made the forces symmetric

In [1]:
import configargparse
import torch
import torch.optim as optim
import sys
sys.path.append('../')

from environments import MountainCarEnv, Continuous_MountainCarEnv
from models.agents import NFQAgent
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
import seaborn as sns
import tqdm
from train_mountaincar import generate_data
import random

# Running experiments

## "Structureless Test"
* The dynamics of the systems are actually the same. Do any of the algorithms learn a difference?

In [None]:
import json
from train_mountaincar import fqi, warm_start, transfer_learning

# Structureless test: repeat the contrastive FQI run several times and compare
# the distribution of the foreground and background scores it returns.
num_iter = 15
perf_foreground = []
perf_background = []
for i in range(num_iter):
    print(i)
    # fqi returns a (background, foreground) pair, in that order.
    perf_bg, perf_fg = fqi(
        epoch=1500,
        gravity=0.0025,
        verbose=True,
        is_contrastive=True,
        structureless=True,
        hint_to_goal=False,
    )
    perf_foreground.append(perf_fg)
    perf_background.append(perf_bg)

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# its axes-level replacement, so both groups still share one set of axes.
for scores, label in (
    (perf_foreground, "Foreground Performance"),
    (perf_background, "Background Performance"),
):
    sns.histplot(scores, kde=True, stat="density", kde_kws={"cut": 3}, label=label)
plt.legend()
plt.xlabel("Average Reward Earned")
plt.title("Dynamics are the same in fg and bg environments")

## "Performance when force left is different"
* We change the gravity on the foreground environments. 

# Group imbalance test

In [None]:
import json
from train_mountaincar import fqi, warm_start, transfer_learning

# Group imbalance test: for each foreground fraction of a fixed sample budget,
# run three training set-ups and checkpoint every score to JSON.
num_iter = 2
GRAVITY = 0.004
total_samples = 400

# Fractions 0.1 .. 0.5, built by multiplication (so they are numpy floats).
fg_sample_fractions = list(0.1 * np.arange(1, 6))

# results[fraction][method][iteration] -> (perf_bg, perf_fg)
results = {
    fraction: {"fg_only": {}, "cfqi": {}, "fqi_joint": {}}
    for fraction in fg_sample_fractions
}

for i in range(num_iter):

    for fg_sample_fraction in fg_sample_fractions:

        n_fg = int(total_samples * fg_sample_fraction)
        n_bg = int(total_samples - n_fg)

        # Keyword arguments for each set-up, listed in the order they are run.
        method_kwargs = {
            # Only train/test on small set of foreground samples
            "fg_only": dict(
                epoch=1500,
                verbose=False,
                is_contrastive=True,
                structureless=False,
                gravity=GRAVITY,
                fg_only=True,
                init_experience_bg=n_fg // 2,
                init_experience_fg=n_fg // 2,
            ),
            # Use contrastive model with larger pool of background samples
            "cfqi": dict(
                epoch=1500,
                is_contrastive=True,
                init_experience_bg=n_bg,
                init_experience_fg=n_fg,
                fg_only=False,
                verbose=False,
                gravity=GRAVITY,
            ),
            # Use non-contrastive model with larger pool of background samples
            "fqi_joint": dict(
                is_contrastive=False,
                init_experience_bg=n_bg,
                init_experience_fg=n_fg,
                fg_only=False,
                gravity=GRAVITY,
                epoch=1500,
                verbose=False,
            ),
        }

        for method, kwargs in method_kwargs.items():
            perf_bg, perf_fg = fqi(**kwargs)
            results[fg_sample_fraction][method][i] = (perf_bg, perf_fg)

        # Rewrite the checkpoint after every fraction so partial runs are kept.
        with open("class_imbalance_cfqi.json", "w") as f:
            json.dump(results, f)


# Testing out some of the other methods
* Allowing it to succeed, maybe after some training
* Adding the successful episodes to the train_rollouts
* TODO: modifying the environment to include stronger actions
* TODO: modifying training regime
* TODO: evaluating appropriate times

In [2]:
def evaluate_car(agent, env, eps=0.9, max_steps=None):
    """Run one epsilon-greedy episode in ``env`` and record every transition.

    Args:
        agent: Object providing ``get_best_action(obs, actions, group)``; it is
            queried on the greedy (non-random) steps.
        env: Environment providing ``reset()``, ``step(action)``,
            ``a_to_oh(action)`` and the attributes ``unique_actions``,
            ``unique_oh_actions`` and ``group``.
        eps: Probability of taking a uniformly random action on each step.
        max_steps: Optional cap on the number of steps. ``None`` (the default)
            applies no cap, so the episode only ends when the environment
            ends it.

    Returns:
        A tuple ``(episode_length, success, episode_cost, rollouts)``.
        ``success`` is True only when the environment reported ``done`` and
        the episode was not cut short by ``max_steps`` or by a ``time_limit``
        flag in ``info``. ``rollouts`` is a list of
        ``(obs, action, cost, next_obs, done, group)`` tuples.
    """
    episode_length = 0
    episode_cost = 0
    rollouts = []
    obs = env.reset()
    done = False
    timed_out = False

    while not done:
        # Without a cap, an agent that never reaches a terminal state would
        # keep this loop running indefinitely.
        if max_steps is not None and episode_length >= max_steps:
            timed_out = True
            break

        if random.random() < eps:
            action = np.random.choice(env.unique_actions)
            action = env.a_to_oh(action)
        else:
            action = agent.get_best_action(obs, env.unique_oh_actions, env.group)

        next_obs, cost, done, info = env.step(action)
        rollouts.append((obs.squeeze(), action, cost, next_obs.squeeze(), done, env.group))
        episode_cost += cost
        obs = next_obs
        episode_length += 1

        # NOTE(review): assumes the env signals truncation via
        # info["time_limit"]; confirm against the environment implementation.
        if (info or {}).get("time_limit", False):
            timed_out = True
            break

    success = bool(done) and not timed_out
    return episode_length, success, episode_cost, rollouts

In [None]:
# Seed the replay data and build a non-contrastive, shallow Q-network agent.
train_rollouts, train_env_bg, train_env_fg = generate_data(
    init_experience_fg=10,
    init_experience_bg=10,
    bg_only=False,
    structureless=True,
    initialize_model=False,
)
nfq_net = ContrastiveNFQNetwork(
    state_dim=train_env_bg.state_dim, is_contrastive=False, deep=False
)
optimizer = optim.Adam(nfq_net.parameters(), lr=1e-2)
nfq_agent = NFQAgent(nfq_net, optimizer)

episodes = 10000
WARMUP_EPISODES = 1000   # no exploration rollouts until after this many steps
ROLLOUT_EVERY = 300      # collect one new episode every this many steps
BUFFER_SIZE = 5000       # keep only the most recent transitions
EPS_DECAY = 0.95

losses = []
eps = 1
for ep in tqdm.trange(episodes):
    # One fitted-Q update on the current rollout buffer.
    state_action_b, target_q_values, groups = nfq_agent.generate_pattern_set(train_rollouts)
    losses.append(nfq_agent.train((state_action_b, target_q_values, groups)))

    if ep <= WARMUP_EPISODES or ep % ROLLOUT_EVERY != 0:
        continue

    # Decay exploration, then add a fresh epsilon-greedy episode to the buffer.
    eps *= EPS_DECAY
    episode_length, success, episode_cost, rollouts = evaluate_car(
        nfq_agent, train_env_bg, eps=eps
    )
    print(f"Eps: {eps} Episode Length: ", episode_length)
    train_rollouts.extend(rollouts)
    if len(train_rollouts) > BUFFER_SIZE:
        train_rollouts = train_rollouts[-BUFFER_SIZE:]
sns.displot(losses)

 12%|█▏        | 1206/10000 [00:15<03:15, 44.94it/s]

Eps: 0.95 Episode Length:  1735


 15%|█▌        | 1505/10000 [00:24<05:02, 28.08it/s]

Eps: 0.9025 Episode Length:  447


 18%|█▊        | 1805/10000 [00:35<10:39, 12.81it/s]

Eps: 0.8573749999999999 Episode Length:  2112


 21%|██        | 2104/10000 [00:54<27:13,  4.83it/s]

Eps: 0.8145062499999999 Episode Length:  4590


 24%|██▍       | 2404/10000 [01:12<09:45, 12.98it/s]

Eps: 0.7737809374999999 Episode Length:  372


 27%|██▋       | 2704/10000 [01:30<07:01, 17.30it/s]

Eps: 0.7350918906249998 Episode Length:  19


 30%|███       | 3004/10000 [01:48<09:19, 12.51it/s]

Eps: 0.6983372960937497 Episode Length:  593


 33%|███▎      | 3304/10000 [02:07<13:55,  8.01it/s]

Eps: 0.6634204312890623 Episode Length:  1182


 36%|███▌      | 3603/10000 [05:56<79:05:15, 44.51s/it] 

Eps: 0.6302494097246091 Episode Length:  404757


 39%|███▉      | 3903/10000 [06:14<10:48,  9.40it/s]   

Eps: 0.5987369392383786 Episode Length:  477


 42%|████▏     | 4202/10000 [08:00<15:00:28,  9.32s/it]

Eps: 0.5688000922764596 Episode Length:  147409


 45%|████▌     | 4503/10000 [12:26<79:16:14, 51.91s/it] 

Eps: 0.5403600876626365 Episode Length:  390441


 48%|████▊     | 4803/10000 [22:54<92:26:57, 64.04s/it] 

Eps: 0.5133420832795047 Episode Length:  919353


 51%|█████     | 5099/10000 [23:33<04:38, 17.62it/s]   

In [None]:

    if hint_to_goal:
        (
            goal_state_action_b_bg,
            goal_target_q_values_bg,
            group_bg,
        ) = train_env_bg.get_goal_pattern_set(group=0)
        (
            goal_state_action_b_fg,
            goal_target_q_values_fg,
            group_fg,
        ) = train_env_fg.get_goal_pattern_set(group=1)

        goal_state_action_b_bg = torch.FloatTensor(goal_state_action_b_bg)
        goal_target_q_values_bg = torch.FloatTensor(goal_target_q_values_bg)
        goal_state_action_b_fg = torch.FloatTensor(goal_state_action_b_fg)
        goal_target_q_values_fg = torch.FloatTensor(goal_target_q_values_fg)

    

    bg_success_queue = [0] * 3
    fg_success_queue = [0] * 3
    evaluations = 5
    losses = []
    for k, ep in enumerate(tqdm.tqdm(range(epoch + 1))):
        
        if hint_to_goal:
            goal_state_action_b = torch.cat(
                [goal_state_action_b_bg, goal_state_action_b_fg], dim=0
            )
            goal_target_q_values = torch.cat(
                [goal_target_q_values_bg, goal_target_q_values_fg], dim=0
            )
            state_action_b = torch.cat([state_action_b, goal_state_action_b], dim=0)
            target_q_values = torch.cat([target_q_values, goal_target_q_values], dim=0)
            goal_groups = torch.cat([group_bg, group_fg], dim=0)
            groups = torch.cat([groups, goal_groups], dim=0)

        loss = nfq_agent.train((state_action_b, target_q_values, groups))
        losses.append(loss)


        (
            eval_episode_length_bg,
            eval_success_bg,
            eval_episode_cost_bg,
        ) = nfq_agent.evaluate_car(eval_env_bg, render=render)
        bg_success_queue = bg_success_queue[1:]
        bg_success_queue.append(1 if eval_success_bg else 0)

        (
            eval_episode_length_fg,
            eval_success_fg,
            eval_episode_cost_fg,
        ) = nfq_agent.evaluate_car(eval_env_fg, render=render)
        fg_success_queue = fg_success_queue[1:]
        fg_success_queue.append(1 if eval_success_fg else 0)

        if (sum(bg_success_queue) == 3 and not nfq_net.freeze_shared == True) or ep == int(epoch*0.75):
            nfq_net.freeze_shared = True
            if verbose:
                print("FREEZING SHARED")
            if is_contrastive:
                for param in nfq_net.layers_shared.parameters():
                    param.requires_grad = False
                for param in nfq_net.layers_last_shared.parameters():
                    param.requires_grad = False
                for param in nfq_net.layers_fg.parameters():
                    param.requires_grad = True
                for param in nfq_net.layers_last_fg.parameters():
                    param.requires_grad = True
            else:
                for param in nfq_net.layers_fg.parameters():
                    param.requires_grad = False
                for param in nfq_net.layers_last_fg.parameters():
                    param.requires_grad = False

            optimizer = optim.Adam(
                itertools.chain(
                    nfq_net.layers_fg.parameters(),
                    nfq_net.layers_last_fg.parameters(),
                ),
                lr=1e-1,
            )
            nfq_agent._optimizer = optimizer
        if sum(fg_success_queue) == 3:
            if verbose:
                print("FG Trained")
            break

        if ep % 600 == 0:
            perf_bg = []
            perf_fg = []
            for it in range(evaluations):
                (
                    eval_episode_length_bg,
                    eval_success_bg,
                    eval_episode_cost_bg,
                ) = nfq_agent.evaluate_car(eval_env_bg, render=render)
                (
                    eval_episode_length_fg,
                    eval_success_fg,
                    eval_episode_cost_fg,
                ) = nfq_agent.evaluate_car(eval_env_fg, render=render)
                perf_bg.append(eval_episode_cost_bg)
                perf_fg.append(eval_episode_cost_fg)
                train_env_bg.close()
                train_env_fg.close()
                eval_env_bg.close()
                eval_env_fg.close()
            if verbose:
                print(
                    "Evaluation bg: " + str(perf_bg) + " Evaluation fg: " + str(perf_fg)
                )
    perf_bg = []
    perf_fg = []
    for it in range(evaluations * 10):
        (
            eval_episode_length_bg,
            eval_success_bg,
            eval_episode_cost_bg,
        ) = nfq_agent.evaluate_car(eval_env_bg, render=render)
        (
            eval_episode_length_fg,
            eval_success_fg,
            eval_episode_cost_fg,
        ) = nfq_agent.evaluate_car(eval_env_fg, render=render)
        perf_bg.append(eval_episode_cost_bg)
        perf_fg.append(eval_episode_cost_fg)
        eval_env_bg.close()
        eval_env_fg.close()
    if verbose:
        print(
            "Evaluation bg: "
            + str(sum(perf_bg) / len(perf_bg))
            + " Evaluation fg: "
            + str(sum(perf_fg) / len(perf_fg))
        )
    sns.distplot(losses)
    return sum(perf_bg) / len(perf_bg), sum(perf_fg) / len(perf_fg)