# Mountaincar Environment
* Evaluation (and also training) can start anywhere from the left edge to the goal state, with velocity 0. They need 71 episodes.
* Modify cartpole to have only two actions: left and right. The magnitude of the actions is much larger in the NFQ paper.
* Hint to goal, which sometimes makes the agent perform worse
* Group: the magnitude of the action
* Made the forces symmetric

In [1]:
import configargparse
import torch
import torch.optim as optim
import sys
sys.path.append('../')

from environments import MountainCarEnv, Continuous_MountainCarEnv
from models.agents import NFQAgent
from models.networks import NFQNetwork, ContrastiveNFQNetwork
from util import get_logger, close_logger, load_models, make_reproducible, save_models
import matplotlib.pyplot as plt
import numpy as np
import itertools
import seaborn as sns
import tqdm

In [2]:
def generate_data(init_experience=100, bg_only=False, continuous=False, agent=None, dataset='train'):
    """Build a background/foreground environment pair and collect rollouts from them.

    Args:
        init_experience: Number of rollouts to generate per environment.
            Values that are not greater than zero produce no rollouts.
        bg_only: When truthy, only the background environment (group 0) is
            rolled out; the foreground environment is still constructed and
            returned.
        continuous: When truthy, ``Continuous_MountainCarEnv`` is used instead
            of ``MountainCarEnv``.
        agent: Passed through unchanged as the first argument of
            ``generate_rollout``.
        dataset: Passed through unchanged as the ``dataset`` keyword of
            ``generate_rollout``.

    Returns:
        A tuple ``(all_rollouts, env_bg, env_fg)`` where ``all_rollouts`` is a
        new list holding every background rollout item followed by every
        foreground rollout item.
    """
    env_cls = Continuous_MountainCarEnv if continuous else MountainCarEnv
    env_bg = env_cls(group=0)
    env_fg = env_cls(group=1)

    n_episodes = init_experience if init_experience > 0 else 0
    background, foreground = [], []
    for _ in range(n_episodes):
        # Background first, then foreground, within every iteration.
        bg_transitions, _cost = env_bg.generate_rollout(
            agent, render=False, group=0, dataset=dataset
        )
        background.extend(bg_transitions)
        if bg_only:
            continue
        fg_transitions, _cost = env_fg.generate_rollout(
            agent, render=False, group=1, dataset=dataset
        )
        foreground.extend(fg_transitions)

    # Background items come first, foreground items are appended after them.
    all_rollouts = background + foreground
    return all_rollouts, env_bg, env_fg

# Continuous Mountaincar
* There are quite a few actions. This makes it hard for FQI to learn to succeed on this task. 

# Regular Mountaincar
* Has three actions, easier for FQI to understand what's going on. 

## Generating rollouts with random actions results in the agent never succeeding

## Use a model to generate actions that are better than random
* This doesn't do well either without any adjustments. The model can't really learn how to make it past position -0.4.
* Even if we change the reward to be continuous, FQI can't learn it.
* However, if we make the reset speed 0.3, we sometimes learn it.
* When we train a model, the reset has a speed of 0. 

## Train a new network with the result of better rollouts

## Using foreground and background samples
* May need to reparameterize the fg specific layers. 
* If we reverse, fg is learned much more easily; we need to force the model to learn bg first (or, more generally, change the training regime), or use better convergence criteria.

In [None]:
# Train a ContrastiveNFQNetwork on background (group 0) + foreground (group 1)
# rollouts. Once the background evaluation has succeeded `success_window`
# times in a row, the shared layers are frozen and only the foreground
# layers keep training (after a pause of `fg_warmup_iters` iterations).

# Rollouts are generated with generate_data's default agent=None.
train_rollouts, train_env_bg, train_env_fg = generate_data(init_experience=200, bg_only=False, continuous=False)
# NOTE(review): test_rollouts is not read anywhere in this cell; this call is
# only used here for the two evaluation environments it returns.
test_rollouts, eval_env_bg, eval_env_fg = generate_data(init_experience=200, bg_only=False, continuous=False)

is_contrastive = True
epoch = 3000
hint_to_goal = True
learning_rate = 1e-1   # used for both the initial and the fg-only optimizer
success_window = 3     # consecutive successful evaluations that are tracked
fg_warmup_iters = 50   # iterations without training right after the freeze
evaluations = 5

if hint_to_goal:
    goal_state_action_b_bg, goal_target_q_values_bg, group_bg = train_env_bg.get_goal_pattern_set(group=0)
    goal_state_action_b_fg, goal_target_q_values_fg, group_fg = train_env_fg.get_goal_pattern_set(group=1)

    goal_state_action_b_bg = torch.FloatTensor(goal_state_action_b_bg)
    goal_target_q_values_bg = torch.FloatTensor(goal_target_q_values_bg)
    goal_state_action_b_fg = torch.FloatTensor(goal_state_action_b_fg)
    goal_target_q_values_fg = torch.FloatTensor(goal_target_q_values_fg)

    # The goal patterns never change, so concatenate them once here instead
    # of rebuilding the same tensors on every epoch.
    goal_state_action_b = torch.cat([goal_state_action_b_bg, goal_state_action_b_fg], dim=0)
    goal_target_q_values = torch.cat([goal_target_q_values_bg, goal_target_q_values_fg], dim=0)
    goal_groups = torch.cat([group_bg, group_fg], dim=0)

nfq_net = ContrastiveNFQNetwork(state_dim=train_env_bg.state_dim, is_contrastive=is_contrastive, deep=False)
optimizer = optim.Adam(nfq_net.parameters(), lr=learning_rate)

nfq_agent = NFQAgent(nfq_net, optimizer)

# Sliding windows holding the outcome (1/0) of the most recent evaluations.
bg_success_queue = [0] * success_window
fg_success_queue = [0] * success_window
# Counts the iterations that have run since the shared layers were frozen.
eval_fg = 0
for ep in tqdm.tqdm(range(epoch + 1)):
    state_action_b, target_q_values, groups = nfq_agent.generate_pattern_set(train_rollouts)
    if hint_to_goal:
        # Append the goal patterns to this epoch's pattern set.
        state_action_b = torch.cat([state_action_b, goal_state_action_b], dim=0)
        target_q_values = torch.cat([target_q_values, goal_target_q_values], dim=0)
        groups = torch.cat([groups, goal_groups], dim=0)

    # Before the freeze: train on every iteration. After the freeze: skip
    # training for the first `fg_warmup_iters` iterations, then resume.
    if nfq_net.freeze_shared:
        eval_fg += 1
        should_train = eval_fg > fg_warmup_iters
    else:
        should_train = True
    if should_train:
        loss = nfq_agent.train((state_action_b, target_q_values, groups))

    (eval_episode_length_bg, eval_success_bg, eval_episode_cost_bg) = nfq_agent.evaluate_car(eval_env_bg, render=False)
    bg_success_queue = bg_success_queue[1:]
    bg_success_queue.append(1 if eval_success_bg else 0)

    (eval_episode_length_fg, eval_success_fg, eval_episode_cost_fg) = nfq_agent.evaluate_car(eval_env_fg, render=False)
    fg_success_queue = fg_success_queue[1:]
    fg_success_queue.append(1 if eval_success_fg else 0)

    if sum(bg_success_queue) == success_window and not nfq_net.freeze_shared:
        nfq_net.freeze_shared = True
        print("FREEZING SHARED")
        if is_contrastive:
            # Stop gradients for the shared layers, enable them for the fg layers.
            for param in nfq_net.layers_shared.parameters():
                param.requires_grad = False
            for param in nfq_net.layers_last_shared.parameters():
                param.requires_grad = False
            for param in nfq_net.layers_fg.parameters():
                param.requires_grad = True
            for param in nfq_net.layers_last_fg.parameters():
                param.requires_grad = True
        else:
            for param in nfq_net.layers_fg.parameters():
                param.requires_grad = False
            for param in nfq_net.layers_last_fg.parameters():
                param.requires_grad = False

        # NOTE(review): the replacement optimizer only holds the fg parameters,
        # also in the non-contrastive branch above where exactly those
        # parameters were just set to requires_grad=False -- confirm intended.
        optimizer = optim.Adam(
            itertools.chain(
                nfq_net.layers_fg.parameters(),
                nfq_net.layers_last_fg.parameters(),
            ),
            lr=learning_rate,
        )
        nfq_agent._optimizer = optimizer
    if sum(fg_success_queue) == success_window:
        print("FG Trained")

    if ep % 300 == 0:
        # Periodic report: episode costs of a few extra evaluation runs.
        perf_bg = []
        perf_fg = []
        for it in range(evaluations):
            (eval_episode_length_bg, eval_success_bg, eval_episode_cost_bg) = nfq_agent.evaluate_car(eval_env_bg, render=False)
            (eval_episode_length_fg, eval_success_fg, eval_episode_cost_fg) = nfq_agent.evaluate_car(eval_env_fg, render=False)
            perf_bg.append(eval_episode_cost_bg)
            perf_fg.append(eval_episode_cost_fg)
            # NOTE(review): all four envs are closed after every evaluation run
            # and reused afterwards; kept as in the original -- presumably
            # close() only releases rendering resources, confirm.
            train_env_bg.close()
            train_env_fg.close()
            eval_env_bg.close()
            eval_env_fg.close()
        print("Evaluation bg: " + str(perf_bg) + " Evaluation fg: " + str(perf_fg))

# Final report: mean episode cost over evaluations * 10 runs per environment.
perf_bg = []
perf_fg = []
for it in range(evaluations * 10):
    (eval_episode_length_bg, eval_success_bg, eval_episode_cost_bg) = nfq_agent.evaluate_car(eval_env_bg, render=False)
    (eval_episode_length_fg, eval_success_fg, eval_episode_cost_fg) = nfq_agent.evaluate_car(eval_env_fg, render=False)
    perf_bg.append(eval_episode_cost_bg)
    perf_fg.append(eval_episode_cost_fg)
    eval_env_bg.close()
    eval_env_fg.close()
print("Evaluation bg: " + str(sum(perf_bg) / len(perf_bg)) + " Evaluation fg: " + str(sum(perf_fg) / len(perf_fg)))

  0%|          | 1/3001 [00:00<20:10,  2.48it/s]

Evaluation bg: [100, 100, 0, 100, 0] Evaluation fg: [100, 100, 100, 100, 100]


  1%|          | 16/3001 [00:02<08:11,  6.07it/s]

FG Trained
FG Trained
FG Trained


  1%|          | 19/3001 [00:03<06:44,  7.37it/s]

FG Trained
FG Trained
FG Trained


  1%|          | 22/3001 [00:03<05:58,  8.30it/s]

FREEZING SHARED
FG Trained
FG Trained
FG Trained


  1%|          | 23/3001 [00:03<05:47,  8.58it/s]

FG Trained
FG Trained


  1%|          | 26/3001 [00:04<05:55,  8.37it/s]

FG Trained
FG Trained
FG Trained


  1%|          | 28/3001 [00:04<05:30,  9.00it/s]

FG Trained
FG Trained


  1%|          | 31/3001 [00:04<05:37,  8.79it/s]

FG Trained
FG Trained
FG Trained


  1%|          | 34/3001 [00:04<06:15,  7.89it/s]

FG Trained
FG Trained


  1%|          | 36/3001 [00:05<06:01,  8.20it/s]

FG Trained
FG Trained


  1%|▏         | 39/3001 [00:05<06:23,  7.73it/s]

FG Trained
FG Trained
FG Trained


  1%|▏         | 42/3001 [00:05<05:42,  8.64it/s]

FG Trained
FG Trained
FG Trained


  1%|▏         | 44/3001 [00:06<05:43,  8.60it/s]

FG Trained
FG Trained


  2%|▏         | 47/3001 [00:06<05:31,  8.90it/s]

FG Trained
FG Trained
FG Trained
FG Trained


  2%|▏         | 49/3001 [00:06<05:48,  8.46it/s]

FG Trained
FG Trained


  2%|▏         | 54/3001 [00:07<05:26,  9.04it/s]

FG Trained
FG Trained
FG Trained
FG Trained


  2%|▏         | 56/3001 [00:07<05:31,  8.88it/s]

FG Trained
FG Trained


  2%|▏         | 59/3001 [00:07<05:41,  8.61it/s]

FG Trained
FG Trained
FG Trained


  2%|▏         | 61/3001 [00:08<05:37,  8.70it/s]

FG Trained
FG Trained


  2%|▏         | 63/3001 [00:08<06:02,  8.11it/s]

FG Trained
FG Trained


  2%|▏         | 65/3001 [00:08<05:42,  8.58it/s]

FG Trained
FG Trained


  2%|▏         | 67/3001 [00:08<05:28,  8.93it/s]

FG Trained
FG Trained


  2%|▏         | 69/3001 [00:08<05:58,  8.17it/s]

FG Trained
FG Trained


  2%|▏         | 71/3001 [00:09<05:56,  8.22it/s]

FG Trained
FG Trained


  2%|▏         | 74/3001 [00:09<06:09,  7.93it/s]

FG Trained
FG Trained
FG Trained


  3%|▎         | 76/3001 [00:09<05:55,  8.24it/s]

FG Trained
FG Trained


  3%|▎         | 77/3001 [00:10<06:45,  7.22it/s]

FG Trained
FG Trained


 10%|█         | 302/3001 [00:49<12:49,  3.51it/s]

Evaluation bg: [0, 0, 0, 100, 100] Evaluation fg: [0, 0, 0, 0, 0]


 20%|██        | 602/3001 [01:43<12:17,  3.25it/s]

Evaluation bg: [0, 0, 0, 0, 0] Evaluation fg: [0, 0, 0, 0, 0]


 27%|██▋       | 816/3001 [02:22<05:10,  7.03it/s]

FG Trained
FG Trained


 27%|██▋       | 818/3001 [02:22<04:56,  7.37it/s]

FG Trained
FG Trained


 27%|██▋       | 819/3001 [02:22<04:48,  7.56it/s]

FG Trained
FG Trained


 28%|██▊       | 826/3001 [02:23<05:03,  7.17it/s]

FG Trained


 28%|██▊       | 835/3001 [02:24<04:34,  7.89it/s]

FG Trained
FG Trained
FG Trained


 30%|███       | 902/3001 [02:37<10:06,  3.46it/s]

Evaluation bg: [0, 0, 100, 0, 100] Evaluation fg: [0, 0, 0, 0, 0]


 40%|████      | 1202/3001 [03:32<08:37,  3.48it/s]

Evaluation bg: [0, 0, 0, 100, 100] Evaluation fg: [0, 0, 0, 0, 0]


 44%|████▍     | 1313/3001 [03:52<05:12,  5.39it/s]