In [1]:
import os
import json
import numpy as np
import shutil
from collections import defaultdict

In [2]:
# Root directory holding the newly sampled trajectories.
data = '/root/data_alfred/demo_generated/'

# Outputs go under ~/ (i.e. /root) for now, because of permission problems
# when writing to /data_alfred.
dout = '/root/data_alfred/json_data_augmentation_20200820/'
dout_split = '/root/data_alfred/splits/'

# Sub-directories of `data` that are searched for each task name.
sampling_directory_names = [
    'new_trajectories_T20200814_164117_470008',
    'new_trajectories_T20200816_210636_008457',
    'new_trajectories_T20200820_065140_149815',
]

# Task names are read from a text file, one name per line.
with open('/root/data/home/hoyeung/alfred/gen/scripts/task_names.txt') as f:
    task_name_list = f.read().splitlines()
len(task_name_list)

13918

## Filter down to sampled trajectories with 1) at least one successful subgoal and 2) for each task name, the best trial, i.e. the one with the maximum number of completed subgoals

In [3]:
# Filter down to sampled trajectories with 1) at least one successfully
# completed subgoal and 2) for each task name, the best trial, i.e. the one
# with the maximum number of completed subgoals.
#
# task_to_traj maps task name -> {
#     'best_trial_dir':         '<sampling dir>/<task name>/<trial dir>', relative to `data`
#     'max_collected_subgoals': number of subgoals kept for that trial
#     'full_traj_success':      True when the trial directory contains video.mp4
# }
# missing collects the task names that have no directory in any sampling directory.

task_to_traj = {}
missing = []

for task_name in task_name_list:

    task_info = {'best_trial_dir': None, 'max_collected_subgoals': 0, 'full_traj_success': False}

    found_task_dir = False
    for sampl_dir in sampling_directory_names:
        if task_info['full_traj_success']:
            # A fully successful trajectory was already found. Stop here so a
            # partial trial from a later sampling directory cannot replace it.
            break

        task_dir = os.path.join(data, sampl_dir, task_name)
        if not os.path.exists(task_dir):
            continue
        found_task_dir = True

        for trial_dir in os.listdir(task_dir):
            traj_data_p = os.path.join(task_dir, trial_dir, 'traj_data.json')
            if not os.path.exists(traj_data_p):
                continue
            with open(traj_data_p, 'r') as f:
                traj_data = json.load(f)
            collected_num_subgoals = len(traj_data['plan']['high_pddl'])
            trial_rel_dir = os.path.join(sampl_dir, task_name, trial_dir)

            if os.path.exists(os.path.join(task_dir, trial_dir, 'video.mp4')):
                # save successful full trajectory
                task_info['best_trial_dir'] = trial_rel_dir
                task_info['max_collected_subgoals'] = collected_num_subgoals
                task_info['full_traj_success'] = True
                task_to_traj[task_name] = task_info
                break
            elif collected_num_subgoals > task_info['max_collected_subgoals'] and collected_num_subgoals > 1:
                # Save the longest version, but only if at least one subgoal has
                # been completed. The same 'max_collected_subgoals' key is used
                # here as in the initial dict and the success branch.
                task_info['max_collected_subgoals'] = collected_num_subgoals - 1  # the last subgoal failed, remove it
                task_info['best_trial_dir'] = trial_rel_dir
                task_info['full_traj_success'] = False
                task_to_traj[task_name] = task_info

    if not found_task_dir:
        missing.append(task_name)


In [22]:
# spot-check the entry selected for one task
task_to_traj['pick_and_place_with_movable_recep-Apple-Plate-DiningTable-11']

{'best_trial_dir': 'new_trajectories_T20200814_164117_470008/pick_and_place_with_movable_recep-Apple-Plate-DiningTable-11/trial_T20200814_220339_693684',
 'max_collected_subgoals': 2,
 'full_traj_success': False}

In [4]:
# number of tasks for which a trial was selected
len(task_to_traj)

8851

In [5]:
# first task name in insertion order
list(task_to_traj)[0]

'look_at_obj_in_light-BaseballBat-None-DeskLamp-301'

In [52]:
# spot-check the entry selected for another task
task_to_traj['pick_two_obj_and_place-Cloth-None-SinkBasin-422']

{'best_trial_dir': 'new_trajectories_T20200816_210636_008457/pick_two_obj_and_place-Cloth-None-SinkBasin-422/trial_T20200818_005722_610055',
 'max_collected_subgoals': 6,
 'full_traj_success': False}

## look at subgoal distribution

In [7]:
# Histogram: number of kept subgoals -> number of tasks with that many.
subgoal_counts = defaultdict(int)

for traj_info in task_to_traj.values():
    subgoal_counts[traj_info['max_collected_subgoals']] += 1

In [8]:
# (num_subgoals, task count) pairs, most frequent first
sorted(subgoal_counts.items(), key=lambda pair: pair[1], reverse=True)

[(3, 3291),
 (6, 1194),
 (1, 1171),
 (2, 1150),
 (5, 733),
 (4, 365),
 (7, 314),
 (9, 276),
 (8, 143),
 (12, 94),
 (11, 43),
 (10, 36),
 (13, 32),
 (14, 8),
 (16, 1)]

In [16]:
# print every task whose selected trial kept exactly 16 subgoals
for name, traj_info in task_to_traj.items():
    if traj_info['max_collected_subgoals'] == 16:
        print(name)

pick_two_obj_and_place-TomatoSliced-None-SinkBasin-22


## clean up the collected trajectory to remove the failed last step

In [22]:
# Clean up each selected trajectory by removing the failed last step (plan,
# low-level actions, language template, image frames), save the cleaned
# traj_data.json under `dout`, and record which raw images must be copied.
#
# Produces:
#   split_entries  - one entry per task in task_to_traj, dumped to the raw split file
#   images_to_move - {task_name: [(src, dest), ...]} for trajectories cleaned in this run

split_entries = []
images_to_move = {}  # {task_name: [(src, dest), ...]}
for task_name in task_to_traj.keys():
    best_trial = task_to_traj[task_name]

    # e.g. pick_two_obj_and_place-Bread-None-Microwave-20/trial_T20200817_133657_955225
    task_name_trial_name = '/'.join(best_trial['best_trial_dir'].split('/')[-2:])
    # e.g. <dout>/pick_two_obj_and_place-Bread-None-Microwave-20/trial_T20200817_133657_955225
    traj_data_out_dir = os.path.join(dout, task_name_trial_name)
    traj_data_out_p = os.path.join(traj_data_out_dir, 'traj_data.json')

    # only process trajectories whose cleaned traj_data.json is not saved yet
    if not os.path.exists(traj_data_out_p):
        os.makedirs(traj_data_out_dir, exist_ok=True)

        num_complete_subgoals = best_trial['max_collected_subgoals']
        traj_data_p = os.path.join(data, best_trial['best_trial_dir'], 'traj_data.json')
        with open(traj_data_p, 'r') as f:
            traj_data = json.load(f)

        # a fully successful trajectory is saved unchanged
        if not best_trial['full_traj_success']:
            # remove the failed last subgoal from plan
            traj_data['plan']['high_pddl'].pop()
            # e.g. 5 complete, high idx to keep 0, 1, 2, 3, 4, all low actions with high idx 5 should be removed
            low_actions = traj_data['plan']['low_actions']
            # the emptiness check avoids an IndexError if every entry gets removed
            while low_actions and low_actions[-1]['high_idx'] >= num_complete_subgoals:
                low_actions.pop()

            # remove the failed last subgoal from language template
            traj_data['template']['high_descs'].pop()

            # remove the failed last subgoal from image frames
            images = traj_data['images']
            while images and images[-1]['high_idx'] >= num_complete_subgoals:
                images.pop()

        # prepare to copy only the images that are still referenced
        raw_images_src_dir = os.path.join(data, best_trial['best_trial_dir'], 'raw_images')
        raw_images_dest_dir = os.path.join(traj_data_out_dir, 'raw_images')
        images_to_move[task_name] = [
            (os.path.join(raw_images_src_dir, image_entry['image_name']),
             os.path.join(raw_images_dest_dir, image_entry['image_name']))
            for image_entry in traj_data['images']]

        # save the cleaned trajectory under dout
        with open(traj_data_out_p, 'w') as f:
            json.dump(traj_data, f)

    # Save to raw split. task_to_traj stores the count under
    # 'max_collected_subgoals'; the split entry exposes it as 'collected_subgoals'.
    split_entries.append({
        'task': task_name_trial_name,
        'repeat_idx': 0,
        'full_traj_success': best_trial['full_traj_success'],
        'collected_subgoals': best_trial['max_collected_subgoals']
    })

# Save to raw split
split_path = os.path.join(dout_split, 'sample_failed_20200820_raw.json')
with open(split_path, 'w') as f:
    json.dump({'augmentation': split_entries}, f)


In [14]:
# first 2 rounds: number of tasks with images queued for copying
len(images_to_move)

5779

In [26]:
# inspect the first raw split entry
split_entries[0]

{'task': 'look_at_obj_in_light-BaseballBat-None-DeskLamp-301/trial_T20200814_164125_595727',
 'repeat_idx': 0,
 'full_traj_success': False,
 'collected_subgoals': 2}

In [23]:
# third round: number of tasks with images queued for copying
len(images_to_move)

3072

In [24]:
# sanity check: first 2 rounds + third round should add up to len(task_to_traj)
5779 + 3072

8851

In [30]:
# inspect the first (src, dest) image pair queued for one task
images_to_move['pick_and_place_with_movable_recep-Lettuce-Bowl-DiningTable-16'][0]

('/root/data_alfred/demo_generated/new_trajectories_T20200820_065140_149815/pick_and_place_with_movable_recep-Lettuce-Bowl-DiningTable-16/trial_T20200820_065158_967281/raw_images/000000000.png',
 '/root/data_alfred/json_data_augmentation_20200820/pick_and_place_with_movable_recep-Lettuce-Bowl-DiningTable-16/trial_T20200820_065158_967281/raw_images/000000000.png')

## Copy images over

In [31]:
# Copy over only the frames that the cleaned trajectories still reference,
# creating each destination directory on demand.
dest_paths = []
for src_dest_pairs in images_to_move.values():
    for src_fpath, dest_fpath in src_dest_pairs:
        dest_parent = os.path.dirname(dest_fpath)
        os.makedirs(dest_parent, exist_ok=True)
        copied_to = shutil.copy(src_fpath, dest_fpath)
        dest_paths.append(copied_to)

In [16]:
# first 2 rounds: number of image files copied
len(dest_paths)

917884

In [32]:
# third round: number of image files copied
len(dest_paths)

496918

## Ignore below

In [2]:
# Scratch cell: load an existing split file for reference.
# NOTE(review): this path is under /data_alfred, not /root/data_alfred like dout_split - confirm it is intended.
import json
with open('/data_alfred/splits/may17.json', 'r') as f:
    splits = json.load(f) 

In [3]:
# look at one entry of the 'train' split to see its format
splits['train'][0]

{'repeat_idx': 0,
 'task': 'pick_cool_then_place_in_recep-LettuceSliced-None-DiningTable-17/trial_T20190909_070538_437648'}

In [9]:
# Load the raw traj_data.json of one selected trial and list its top-level keys.
soapbar_trial_dir = task_to_traj['pick_clean_then_place_in_recep-SoapBar-None-CounterTop-421']['best_trial_dir']
soapbar_traj_data_p = os.path.join(data, soapbar_trial_dir, 'traj_data.json')
with open(soapbar_traj_data_p, 'r') as f:
    traj_data = json.load(f)

traj_data.keys()

dict_keys(['images', 'plan', 'scene', 'template', 'pddl_state', 'dataset_params', 'pddl_params', 'task_type', 'task_id'])

In [11]:
# file names of all image frames referenced by the loaded trajectory
[frame['image_name'] for frame in traj_data['images']]

['000000000.png',
 '000000001.png',
 '000000002.png',
 '000000003.png',
 '000000004.png',
 '000000005.png',
 '000000006.png',
 '000000007.png',
 '000000008.png',
 '000000009.png',
 '000000010.png',
 '000000011.png',
 '000000012.png',
 '000000013.png',
 '000000014.png',
 '000000015.png',
 '000000016.png',
 '000000017.png',
 '000000018.png',
 '000000019.png',
 '000000020.png',
 '000000021.png',
 '000000022.png',
 '000000023.png',
 '000000024.png',
 '000000025.png',
 '000000026.png',
 '000000027.png',
 '000000028.png',
 '000000029.png',
 '000000030.png',
 '000000031.png',
 '000000032.png',
 '000000033.png',
 '000000034.png',
 '000000035.png',
 '000000036.png',
 '000000037.png',
 '000000038.png',
 '000000039.png',
 '000000040.png',
 '000000041.png',
 '000000042.png',
 '000000043.png',
 '000000044.png',
 '000000045.png',
 '000000046.png',
 '000000047.png',
 '000000048.png',
 '000000049.png',
 '000000050.png',
 '000000051.png',
 '000000052.png',
 '000000053.png',
 '000000054.png',
 '00000005