In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoConfig
from peft import PeftModel
from gpt4tools.data.conversation import SeparatorStyle, get_conv_template

  from .autonotebook import tqdm as notebook_tqdm


[2023-12-16 14:20:11,720] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Checkpoint identifier handed to `from_pretrained` for both tokenizer and model,
# and the directory the LoRA adapter is loaded from.
model_name_or_path = "lmsys/vicuna-7b-v1.5"
lora_path = "/data01/rayyang/GPT4Tools/output_vicuna"

# Cache directory passed as `cache_dir`, and the tokenizer's `model_max_length`.
# NOTE(review): both paths are absolute and machine-specific.
CACHE_DIR = "/data01/rayyang/GPT4Tools/cache"
model_max_length = 2048

In [4]:
# Load the tokenizer for the base checkpoint: non-fast implementation,
# right-side padding, sequence length capped at `model_max_length`.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=False,
    padding_side="right",
    model_max_length=model_max_length,
    cache_dir=CACHE_DIR,
)

# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

In [6]:
# Load the base causal LM with float16 weights; `device_map="auto"` leaves
# device placement of the weights to the loader.
original_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    cache_dir=CACHE_DIR,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.27s/it]


In [7]:
# Attach the LoRA adapter stored at `lora_path` to the base model.
print(f"Loading LoRA weights from {lora_path}")
model = PeftModel.from_pretrained(original_model, lora_path)

# Merge the adapter and replace the PEFT wrapper with the unloaded result.
print("Merging weights")
model = model.merge_and_unload()

# Cast to float16. As the cell's last expression, the value returned by
# `.to(...)` is what the notebook displays for this cell.
print("Convert to FP16...")
model.to(torch.float16)

Loading LoRA weights from /data01/rayyang/GPT4Tools/output_vicuna
Merging weights
Convert to FP16...


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [8]:
# GPT4Tools-style instruction prompt for a single turn.
#
# The "should be one of [...]" list in the Action format must name exactly the
# tools declared under TOOLS. It previously listed two tools that are not
# declared here and omitted "Remove Something From The Photo", which is the
# tool this request needs.
#
# The tool descriptions themselves are left verbatim (including their original
# wording) so the prompt text handed to the model is otherwise unchanged.
prompt = """
GPT4Tools can handle various text and visual tasks, such as answering questions and providing in-depth explanations and discussions. It generates human-like text and uses tools to indirectly understand images. When referring to images, GPT4Tools follows strict file name rules. To complete visual tasks, GPT4Tools uses tools and stays loyal to observation outputs. Users can provide new images to GPT4Tools with a description, but tools must be used for subsequent tasks.
TOOLS:
------

GPT4Tools has access to the following tools:

> Edge Detection On Image: useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. The input to this tool should be a string, representing the image_path
> Generate Image Condition On Canny Image: useful when you want to generate a new real image from both the user description and a canny image. like: generate a real image of a object or something from this canny image, or generate a new real image of a object or something from this edge image. The input to this tool should be a comma separated string of two, representing the image_path and the user description.
> Remove Something From The Photo: useful when you want to remove and object or something from the photo from its description or location. The input to this tool should be a comma separated string of two, representing the image_path and the object need to be removed.

To use a tool, please use the following format:

```
Thought: Do I need to use a tool? Yes
Action: the action to take, should be one of [Edge Detection On Image, Generate Image Condition On Canny Image, Remove Something From The Photo]
Action Input: the input to the action
Observation: the result of the action
```

When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:

```
Thought: Do I need to use a tool? No
AI: [your response here]
```

Follow file name rules and do not fake non-existent file names. Remember to provide the image file name loyally from the last tool observation.

Previous conversation:

H: Provide an image named image/alzkcuai.png. Description: A person is lying on the grass. Understand the image using tools.
AI: Received.

New input: Remove the person in the image. 
GPT4Tools needs to use tools to observe images, not directly imagine them. Thoughts and observations in the conversation are only visible to GPT4Tools. When answering human questions, repeat important information. Let's think step by step.
"""
print(prompt)


GPT4Tools can handle various text and visual tasks, such as answering questions and providing in-depth explanations and discussions. It generates human-like text and uses tools to indirectly understand images. When referring to images, GPT4Tools follows strict file name rules. To complete visual tasks, GPT4Tools uses tools and stays loyal to observation outputs. Users can provide new images to GPT4Tools with a description, but tools must be used for subsequent tasks.
TOOLS:
------

GPT4Tools has access to the following tools:

> Edge Detection On Image: useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. The input to this tool should be a string, representing the image_path
> Generate Image Condition On Canny Image: useful when you want to generate a new real image from both the user description and a canny image. like: generate a real 

In [9]:
# Wrap the raw prompt in the "gpt4tools_vicuna_v1.1" conversation template:
# the prompt becomes the first role's message, and the second role's message
# is appended as None before the final prompt string is rendered.
conv = get_conv_template("gpt4tools_vicuna_v1.1")
user_role, reply_role = conv.roles[0], conv.roles[1]

conv.append_message(user_role, prompt)
conv.append_message(reply_role, None)

my_prompt = conv.get_prompt()
print(my_prompt)

USER: 
GPT4Tools can handle various text and visual tasks, such as answering questions and providing in-depth explanations and discussions. It generates human-like text and uses tools to indirectly understand images. When referring to images, GPT4Tools follows strict file name rules. To complete visual tasks, GPT4Tools uses tools and stays loyal to observation outputs. Users can provide new images to GPT4Tools with a description, but tools must be used for subsequent tasks.
TOOLS:
------

GPT4Tools has access to the following tools:

> Edge Detection On Image: useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. The input to this tool should be a string, representing the image_path
> Generate Image Condition On Canny Image: useful when you want to generate a new real image from both the user description and a canny image. like: generate a

In [10]:
# Tokenize the single prompt and move the tensors to the model's device.
# The model is loaded with device_map="auto", so inputs left on the CPU may
# not be on the same device as the weights.
inputs = tokenizer([my_prompt], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` only takes effect when sampling, so it is not
# passed together with do_sample=False (it was ignored and can trigger a
# configuration warning in recent transformers releases).
output_ids = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=512,
)

# The returned sequence starts with the prompt tokens; keep only what was
# generated after them.
prompt_length = inputs["input_ids"].shape[1]
output_ids = output_ids[0][prompt_length:]

outputs = tokenizer.decode(
    output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
)
print(outputs)



Thought: Do I need to use a tool? Yes
Action: Remove Something From The Photo
Action Input: image/alzkcuai.png, person
Observation:


In [11]:
# Bare last expression: the notebook shows the string's repr, so escapes such
# as "\n" in the rendered prompt are visible (unlike with print()).
my_prompt

"USER: \nGPT4Tools can handle various text and visual tasks, such as answering questions and providing in-depth explanations and discussions. It generates human-like text and uses tools to indirectly understand images. When referring to images, GPT4Tools follows strict file name rules. To complete visual tasks, GPT4Tools uses tools and stays loyal to observation outputs. Users can provide new images to GPT4Tools with a description, but tools must be used for subsequent tasks.\nTOOLS:\n------\n\nGPT4Tools has access to the following tools:\n\n> Edge Detection On Image: useful when you want to detect the edge of the image. like: detect the edges of this image, or canny detection on image, or perform edge detection on this image, or detect the canny image of this image. The input to this tool should be a string, representing the image_path\n> Generate Image Condition On Canny Image: useful when you want to generate a new real image from both the user description and a canny image. like: g