# ALFRED language model dataset generation

In [None]:
import json
import string
import numpy as np
import pickle 

%cd ../..

In [2]:
import ALFRED_task_helper as alf # for dataset generation in FILM manner
from alfred_utils.gen.constants import * # ALFRED constants

# Punctuation characters removed from goals and step descriptions in make_dataset
exclude = set(string.punctuation)

### Dataset generation

The `make_dataset` function processes the ALFRED traj_data of the chosen split (train, valid_seen, valid_unseen) and returns, for every task, the NL instructions (goal plus step-by-step descriptions) together with the corresponding list of subtasks in the requested format (film, recept, no_recept).

In [3]:
# Directory containing the split index file (oct21.json)
PATH_TO_SCENE_NAMES = 'alfred_utils/data/splits/'
# Root directory containing the json_2.1.0 trajectory annotations
PATH_TO_JSON = 'alfred_utils/data/'

def make_dataset(split: str, instr_type: str):
    """
    Build the instruction/subtask dataset for one ALFRED split.

    Reads the split index (oct21.json) and, for every entry of ``split``,
    the matching ``ann_<repeat_idx>.json`` trajectory file.

    Args:
        split: key of the split index, e.g. 'train', 'valid_seen' or
            'valid_unseen'.
        instr_type: 'film' (subtasks come from
            ``alf.get_list_of_highlevel_actions``), or 'recept' /
            'no_recept' (subtasks come from ``get_actions``).

    Returns:
        A list with one dict per task holding the keys 'nl' (goal and step
        descriptions joined by ' . '), 'code' (subtasks joined by ' ; '),
        'r_idx', 'task_id' and 'list_of_actions'. ``None`` is returned,
        after printing a message, when ``instr_type`` is not recognised.
    """
    # Reject an unknown type before touching the disk, so the outcome does
    # not depend on whether the split happens to contain any task.
    if instr_type not in ('film', 'no_recept', 'recept'):
        print('Unknown instructions type')
        return None

    with open(PATH_TO_SCENE_NAMES + 'oct21.json') as f:
        scene_names = json.load(f)

    frames = []
    for e in scene_names[split]:
        r_idx = e['repeat_idx']
        task = e['task']
        path_to_json = f'{PATH_TO_JSON}json_2.1.0/{task}/pp/ann_{r_idx}.json'
        with open(path_to_json, 'r') as f:
            traj_data = json.load(f)

        # Extract NL goal and step-by-step high-level instructions
        ann = traj_data['turk_annotations']['anns'][r_idx]
        # NOTE: the goal is trimmed before punctuation removal while the
        # step descriptions are trimmed after it; the order is kept as-is
        # so that the generated text stays unchanged.
        goal = ann['task_desc'].lower().strip().replace('\n', '')
        goal = ''.join(ch for ch in goal if ch not in exclude)
        high_descs = [
            ''.join(ch for ch in desc if ch not in exclude)
            .lower().strip().replace('\n', '')
            for desc in ann['high_descs']
        ]

        # Get list of subtasks with the required instructions type
        if instr_type == 'film':
            # FILM instructions processing procedures
            list_of_actions = alf.get_list_of_highlevel_actions(traj_data)[0]
        else:
            list_of_actions = get_actions(
                traj_data['plan'], high_descs, instr_type)

        frames.append({
            'nl': goal + ' . ' + ' . '.join(high_descs),
            'code': ' ; '.join(
                [' '.join(t) for t in list_of_actions]).strip(),
            'r_idx': r_idx,
            'task_id': traj_data['task_id'],
            'list_of_actions': list_of_actions,
        })

    return frames


def get_actions(plan, high_descs, instr_type: str):
    """
    Custom function for ground-truth trajectories generation.
    For 'no_recept' and 'recept' instruction types.

    Only low-level actions whose 'api_action' carries an 'objectId' are
    turned into subtasks; they are visited step by step, following the
    indices of ``high_descs``.

    Args:
        plan: the 'plan' entry of a trajectory; its 'low_actions' and
            'high_pddl' lists are read.
        high_descs: step-by-step descriptions; only their count is used,
            to select the low-level actions with a matching 'high_idx'.
        instr_type: 'recept' yields (object, receptacle, action) triplets,
            'no_recept' yields (object, action) pairs, except for
            'PutObject' where the pair is (receptacle, action).

    Returns:
        The list of subtask tuples. Any other ``instr_type`` produces an
        empty list.
    """
    low_actions = plan['low_actions']
    high_actions = plan['high_pddl']

    # Objects for which the receptacle may be taken from the planner action
    # (built once instead of once per low-level action).
    carriable = set(NON_RECEPTACLES) | set(MOVABLE_RECEPTACLES)
    # Append Basin to some class names
    basin_names = {'Sink': 'SinkBasin', 'Bathtub': 'BathtubBasin'}

    list_of_actions = []
    for i, _ in enumerate(high_descs):
        step_actions = [act for act in low_actions if act['high_idx'] == i]
        for act in step_actions:
            task = act['discrete_action']['action']
            api_action = act['api_action']
            # Select action that involves object interacting
            if 'objectId' not in api_action:
                continue

            tokens = api_action['objectId'].split('|')
            obj = tokens[0]
            if obj in VAL_ACTION_OBJECTS['Sliceable'] and 'Sliced' in tokens[-1]:
                # Drop the whole '_<index>' suffix; slicing off two
                # characters breaks for indices of 10 and above.
                obj = tokens[-1].rsplit('_', 1)[0]

            if 'receptacleObjectId' in api_action:
                recept = api_action['receptacleObjectId'].split('|')[0]
                recept = basin_names.get(recept, recept)
            else:
                # For some objects the receptacle can be found in
                # 'coordinateReceptacleObjectId' field
                planner_action = high_actions[i]['planner_action']
                if ('coordinateReceptacleObjectId' in planner_action
                        and obj in carriable):
                    recept = planner_action['coordinateReceptacleObjectId'][0]
                else:
                    recept = 'None'

            # Subtasks for 'recept' are triplets, for 'no_recept' -- pairs
            if instr_type == 'recept':
                list_of_actions.append((obj, recept, task))
            elif instr_type == 'no_recept':
                if task != 'PutObject':
                    list_of_actions.append((obj, task))
                else:
                    list_of_actions.append((recept, task))

    return list_of_actions

In [4]:
# Split to process: 'valid_seen', 'train' or 'valid_unseen'
split = 'valid_seen' # can also take the value of 'train', 'valid_unseen'
# Subtask format: 'no_recept', 'film' or 'recept'
instr_type = 'no_recept' # can also take the value of 'film', 'recept'
frames = make_dataset(split, instr_type)

In [5]:
# Show the subtask list generated for the first task of the split
frames[0]['list_of_actions']

[('ButterKnife', 'PickupObject'),
 ('Fridge', 'OpenObject'),
 ('Potato', 'SliceObject'),
 ('Fridge', 'CloseObject'),
 ('Microwave', 'OpenObject'),
 ('Microwave', 'PutObject'),
 ('Microwave', 'CloseObject'),
 ('Fridge', 'OpenObject'),
 ('PotatoSliced', 'PickupObject'),
 ('Fridge', 'CloseObject'),
 ('Microwave', 'OpenObject'),
 ('Microwave', 'PutObject'),
 ('Microwave', 'CloseObject'),
 ('Microwave', 'ToggleObjectOn'),
 ('Microwave', 'ToggleObjectOff'),
 ('Microwave', 'OpenObject'),
 ('PotatoSliced', 'PickupObject'),
 ('Microwave', 'CloseObject'),
 ('SinkBasin', 'PutObject')]

### Create training sets for [CodeT5 training](https://github.com/salesforce/codet5)

The function below is used to create json files for CodeT5 training with special names (train.json, dev.json, test.json).

In [6]:
# Directory receiving the CodeT5 json files and the GT trajectory pickles
OUTPUT_PATH = 'fiqa/language_processing/processed_instructions/'

def make_codet5_training_set(frames: list, split: str):
    """
    Write the frames of one split as a CodeT5 data file.

    The frames are written in random order, one JSON object per line, each
    holding only the 'code' and 'nl' entries. The file is created inside
    OUTPUT_PATH and named after the split: 'train' -> train.json,
    'valid_seen' -> dev.json, 'valid_unseen' -> test.json.

    Args:
        frames: sequence of frame dicts as returned by ``make_dataset``.
        split: 'train', 'valid_seen' or 'valid_unseen'.

    Raises:
        ValueError: if ``split`` is none of the three known splits.
    """
    # Create a file with the proper name for CodeT5 training,
    # validation and testing
    file_names = {'train': 'train', 'valid_seen': 'dev', 'valid_unseen': 'test'}
    if split not in file_names:
        # Fail with a clear message before any file is created, rather than
        # with an unbound local variable further down.
        raise ValueError(
            f'Unknown split {split!r}; expected one of {sorted(file_names)}')
    file_name = file_names[split]

    # Shuffle the tasks
    shuffler = np.random.permutation(len(frames))

    with open(OUTPUT_PATH + f'{file_name}.json', 'w') as f:
        # We need only natural language text and the sequence of subtasks (code)
        for idx in shuffler:
            frame = frames[idx]
            new_frame = {'code': frame['code'], 'nl': frame['nl']}
            f.write(json.dumps(new_frame) + '\n')



Run this multiple times with different splits to create train, dev and test sets for CodeT5 training.

In [7]:
# Write the CodeT5 file for the split selected above
make_codet5_training_set(frames, split)

### Create files with GT trajectories for FIQA oracle agent

In [8]:
def make_gt_trajectories(frames: list, split: str, instr_type: str):
    """
    Pickle the ground-truth subtask lists of ``frames``.

    The pickled object is a dict mapping (task_id, r_idx) to the frame's
    'list_of_actions'. It is written to
    OUTPUT_PATH + '<split>_<instr_type>_gt_alfred.p'.

    Args:
        frames: sequence of frame dicts as returned by ``make_dataset``.
        split: split name, used only to build the file name.
        instr_type: instruction type, used only to build the file name.
    """
    new_frames = {
        (frame['task_id'], frame['r_idx']): frame['list_of_actions']
        for frame in frames
    }
    with open(OUTPUT_PATH + f'{split}_{instr_type}_gt_alfred.p', 'wb') as f:
        pickle.dump(new_frames, f)

Run this multiple times with different splits to create GT trajectories for train, valid_seen and valid_unseen. These files have to be processed by the lp_outputs.py script inside FIQA to obtain the GT instructions with the navigation inserted.

In [9]:
# Pickle the GT trajectories for the selected split and instruction type
make_gt_trajectories(frames, split, instr_type)