In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from pathlib import Path

class LLaMACommandGenerator:
    role_text: str = ""
    action_text: str = "You can use the following actions:" \
    " 1. **go to [object]**: Move to the specified object. " \
    " 2. **open [object]**: Open the specified object. " \
    " 3. **close [object]**: Close the specified object. " \
    " 4. **look**: Look around your current location. " \
    " 5. **take [object] from [container]**: Pick up an object from a receptacle. " \
    " 6. **move [object] to [location]**: Move an object to a target location. " \
    " 7. **put [object] in/on [container]**: Place an object into or onto another object. " \
    " 8. **examine [object]**: Examine an object or container to get more information. " \
    " 9. **use [object]**: Interact with a toggleable object (e.g., turn on a lamp). " \
    "10. **heat [object] with [appliance]**: Heat an object using something like a microwave. " \
    "11. **clean [object] with [appliance]**: Clean an object using a sink or basin. " \
    "12. **cool [object] with [appliance]**: Cool an object using a fridge. " \
    "13. **slice [object] with [tool]**: Slice an object using a knife or similar tool."
    
    def __init__(self, model_name="meta-llama/Meta-Llama-3-8B", device="cuda", save_name="lama-8b"):
        self.device = device if torch.cuda.is_available() and device == "cuda" else "cpu"
        # set gpu id
        if self.device == "cuda":
            torch.cuda.set_device(4)
        print(f"Using device: {self.device}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

        # Fix missing pad token
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=None,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        )
        self.model.to(self.device)
        self.model.eval()

        self.save_name = save_name
    
    @classmethod
    def read_my_file(cls):
        # Read "role.txt" from the same folder as this script
        try:
            folder = Path(__file__).parent  # works only in .py files
        except NameError:
            folder = Path.cwd()             # fallback for notebooks or interactive mode
        file_path = folder / "role.txt"
        text = file_path.read_text()
        # 2. assign to cls.role_prompt
        cls.role_text = text
    
    def generata_prompt(self, obs, task):
        """
        Step 1: Select a role prompt based on task description. -> done
        Step 2: [SEP] refers to previous action or observation, group them to previous actions.
        Step 3: Regulate user output to only one comaand with templates - alfred.twl2.
        """
        
        # Step 1: select role prompt based on task description
        if "put a" in task or "put some" in task:
            # Pick & Place
            role_prompt = self.__class__.role_text.split("**Pick & Place**")[1].split("---")[0].strip()
        elif "look at" in task or "examine" in task:
            # Examine in Light
            role_prompt = self.__class__.role_text.split("**Examine in Light**")[1].split("---")[0].strip()
        elif "clean" in task and "put" in task:
            # Clean & Place
            role_prompt = self.__class__.role_text.split("**Clean & Place**")[1].split("---")[0].strip()
        elif "heat" in task and "put" in task:
            # Heat & Place
            role_prompt = self.__class__.role_text.split("**Heat & Place**")[1].split("---")[0].strip()
        elif "cool" in task and "put" in task:
            # Cool & Place
            role_prompt = self.__class__.role_text.split("**Cool & Place**")[1].split("---")[0].strip()
        elif "put two" in task or "find two" in task:
            # Pick Two & Place
            role_prompt = self.__class__.role_text.split("**Pick Two & Place**")[1].split("---")[0].strip()
        else:
            role_prompt = ""  # fallback or raise an error/log warning

        # Step 2: Group previous actions and observations
        obs_split = obs.split("[SEP]")
        env_prompt = obs_split[0].strip()  # Initial environment description
        observations = [s.strip() for s in obs_split[1:] if s.strip()]  # Filter out empty strings
        
        prompt = f"{role_prompt}\n\n"
        prompt += f"Environment: {env_prompt}\n"
        prompt += f"\nTask: {task}\n"
        prompt += "Previous Actions and Observations:\n"
        for i, observation in enumerate(observations):
            if i % 2 == 0:
                prompt += f"Observation {i//2 + 1}: {observation}\n"
            else:
                prompt += f"Action {i//2 + 1}: {observation}\n"
        
        prompt += self.__class__.action_text + "\n"
        prompt += "Please follow your role to choose one action.\n"
        prompt += "Action: "
        return prompt

    def command_generation_lama(self, observation_strings, task_desc_strings):
        res = []
        read_my_file = self.__class__.read_my_file
        if not hasattr(self.__class__, 'role_text') or not self.__class__.role_text:
            read_my_file()
        if not self.__class__.role_text:
            raise ValueError("Role text not loaded. Please ensure 'role.txt' is present in the same directory as this script.")
        
        for obs, task in zip(observation_strings, task_desc_strings):
            # Construct prompt
            prompt = self.generata_prompt(obs, task)

            # Tokenize prompt (no padding needed for single input)
            input_ids = self.tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1028, return_attention_mask=True).to(self.device)
            attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

            # Generate response (small token limit for speed)
            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=1028,
                    temperature=0.7,
                    top_p=1.0,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Decode and remove the prompt prefix
            generated_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            # response_text = generated_text[len(prompt):].strip()
            
            # response_text = response_text.split("\n")[0].strip()

            res.append(generated_text)
            # res.append(response_text)

        return res, None  # current_dynamics is unused

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
generator = LLaMACommandGenerator(model_name="meta-llama/Meta-Llama-3-8B", device="cuda", save_name="lama-8b")
    
    

Using device: cuda


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.29it/s]


In [3]:
observation_strings = ['-= Welcome to TextWorld, ALFRED! =- You are in the middle of a room. Looking quickly around you, you see a cabinet 1, a cabinet 10, a cabinet 11, a cabinet 12, a cabinet 2, a cabinet 3, a cabinet 4, a cabinet 5, a cabinet 6, a cabinet 7, a cabinet 8, a cabinet 9, a coffeemachine 1, a countertop 1, a countertop 2, a diningtable 1, a drawer 1, a drawer 2, a drawer 3, a fridge 1, a garbagecan 1, a microwave 1, a sinkbasin 1, a stoveburner 1, a stoveburner 2, a stoveburner 3, a stoveburner 4, and a toaster 1. [SEP] You arrive at cabinet 12. The cabinet 12 is closed. [SEP] go to cabinet 12 [SEP] You open the cabinet 12. The cabinet 12 is open. In it, you see a bowl 3. [SEP] open cabinet 12 [SEP] You close the cabinet 12. [SEP] close cabinet 12', '-= Welcome to TextWorld, ALFRED! =- You are in the middle of a room. Looking quickly around you, you see a cabinet 1, a cabinet 10, a cabinet 11, a cabinet 12, a cabinet 13, a cabinet 14, a cabinet 15, a cabinet 16, a cabinet 17, a cabinet 18, a cabinet 19, a cabinet 2, a cabinet 20, a cabinet 21, a cabinet 22, a cabinet 23, a cabinet 24, a cabinet 25, a cabinet 26, a cabinet 3, a cabinet 4, a cabinet 5, a cabinet 6, a cabinet 7, a cabinet 8, a cabinet 9, a coffeemachine 1, a countertop 1, a countertop 2, a countertop 3, a drawer 1, a drawer 10, a drawer 11, a drawer 12, a drawer 2, a drawer 3, a drawer 4, a drawer 5, a drawer 6, a drawer 7, a drawer 8, a drawer 9, a fridge 1, a garbagecan 1, a microwave 1, a sinkbasin 1, a stoveburner 1, a stoveburner 2, a stoveburner 3, a stoveburner 4, and a toaster 1. [SEP] You arrive at drawer 11. The drawer 11 is closed. [SEP] go to drawer 11 [SEP] You open the drawer 11. The drawer 11 is open. In it, you see nothing. [SEP] open drawer 11 [SEP] You close the drawer 11. [SEP] close drawer 11', '-= Welcome to TextWorld, ALFRED! =- You are in the middle of a room. Looking quickly around you, you see a cabinet 1, a cabinet 2, a cabinet 3, a cabinet 4, a cabinet 5, a cabinet 6, a cabinet 7, a cabinet 8, a cabinet 9, a coffeemachine 1, a countertop 1, a countertop 2, a drawer 1, a drawer 10, a drawer 11, a drawer 12, a drawer 13, a drawer 2, a drawer 3, a drawer 4, a drawer 5, a drawer 6, a drawer 7, a drawer 8, a drawer 9, a fridge 1, a garbagecan 1, a microwave 1, a sinkbasin 1, a stoveburner 1, a stoveburner 2, a stoveburner 3, a stoveburner 4, a stoveburner 5, a stoveburner 6, and a toaster 1. [SEP] You close the drawer 12. [SEP] close drawer 12 [SEP] You arrive at drawer 10. On the drawer 10, you see nothing. [SEP] go to drawer 10 [SEP] You arrive at drawer 12. The drawer 12 is closed. [SEP] go to drawer 12']
task_desc_strings = ['put a cool plate in cabinet.', 'put two spatula in drawer.', 'put a clean butterknife in drawer.']
Actions = ['go to cabinet 12', 'go to countertop 3', 'open drawer 12']


commands, _ = generator.command_generation_lama(observation_strings, task_desc_strings)

for cmd, baseline in zip(commands, Actions):
    print(f"Generated Command: {cmd} ")
    print(f"Baseline Command: {baseline}")

Generated Command: You are an embodied agent whose job is to execute “Pick & Place” tasks. Given a natural‐language instruction like “put a plate on the coffee table,” you must:
1. Identify an object of the specified type in the scene.
2. Issue a command to navigate to that object and pick it up.
3. Determine the correct destination (e.g., the coffee table) and navigate there.
4. Issue a command to place the object down in the designated location.
   Now, given the current textual observation of the environment, produce exactly one next high‐level action (e.g., “go to cabinet 2,” “pick up plate,” “go to coffee table,” or “put down plate”) that advances the “Pick & Place” objective.

Environment: -= Welcome to TextWorld, ALFRED! =- You are in the middle of a room. Looking quickly around you, you see a cabinet 1, a cabinet 10, a cabinet 11, a cabinet 12, a cabinet 2, a cabinet 3, a cabinet 4, a cabinet 5, a cabinet 6, a cabinet 7, a cabinet 8, a cabinet 9, a coffeemachine 1, a countertop