In [1]:
import os
import ast
import json
import wandb
import torch
from datetime import datetime
from functools import partial
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from torch.utils.tensorboard import SummaryWriter
from transformers import BitsAndBytesConfig, AutoProcessor, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from model.utils import find_target_linear_names
from main.trainer import train
from main.eval_aitw import validate_aitw
from main.eval_mind2web import validate_mind2web
from main.eval_screenspot import validate_screenspot
from main.evaluator import validate as validate_default
from data.dataset import HybridDataset, collate_fn
from utils.utils import save_args_to_json, create_log_dir
from merge_weight import load_sharded_weights

In [2]:
# Run configuration for training, merging and inference.
# It is a plain dict here; a later cell converts it to an attribute-style namespace.
args = {
    # --- Weights & Biases ---
    # SECURITY: never commit an API key to a notebook. The key is read from the
    # WANDB_API_KEY environment variable; export it before starting the kernel.
    # It is None when the variable is not set.
    "wandb_key": os.environ.get("WANDB_API_KEY"),
    "local_rank": 0, # local rank; 0 means single-machine, single-GPU training

    # --- Paths you need to edit ---
    "model_path": "D:/Project/showui-2b", # path to your base model
    "train_dataset": "showui-desktop", # your training dataset
    "train_json": "metadata", # annotation file name of the training dataset
    "val_dataset": "showui-desktop", # your validation dataset
    "val_json": "metadata", # annotation file name of the validation dataset
    "dataset_dir": "D:/Project/my_dataset", # root directory of your datasets
    "exp_dir": "D:/Project/logs/debug/2025-06-22_19-28-54", # directory where your LoRA weights were saved

    # --- Model ---
    "model_id": "local_ShowUI-2B", # model ID
    "version": "showlab/ShowUI-2B", # model version path
    "min_visual_tokens": 256, # minimum number of visual tokens
    "max_visual_tokens": 1344, # maximum number of visual tokens
    "model_max_length": 8192, # maximum model sequence length (8192 allows long text inputs)
    "max_new_tokens": 128, # maximum number of newly generated tokens

    # --- UI graph ---
    "uigraph_train": True, # enable the UI graph during training
    "uigraph_test": False, # enable the UI graph during testing
    "uigraph_diff": 1, # UI-graph difference threshold (original note: 1 keeps only patches that differ)
    "uigraph_rand": False, # pick patches randomly during training; False means uniform selection
    "uimask_pre": True, # whether to pre-process the UI mask
    "uimask_ratio": 0.5, # UI mask ratio (original note: 0.5 masks 50%)
    "uimask_rand": False, # mask randomly; False means uniform masking

    "precision": "bf16", # training precision: "fp16", "bf16" or "fp32"

    # --- Language / vision layer skipping ---
    # NOTE(review): the layer specs are strings consumed by model.utils.parse_layer_type;
    # confirm their exact meaning against that helper before changing them.
    "lm_skip_ratio": 0.5, # language-layer skip ratio
    "lm_skip_layer": '[1,28,0]', # language-layer skip spec
    "vis_skip_ratio": 0.5, # vision-layer skip ratio
    "vis_skip_layer": '[1,32,0]', # vision-layer skip spec
    "attn_imple": "sdpa", # attention implementation: "flash_attention_2", "sdpa" or "eager"

    # --- LoRA fine-tuning ---
    "use_qlora": False, # whether to train with QLoRA
    "lora_r": 32, # LoRA rank
    "lora_alpha": 64, # LoRA scaling factor
    "lora_dropout": 0.05, # LoRA dropout rate
    "lora_target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"], # LoRA target modules; supports "qkv_proj", "q_proj", "v_proj", "k_proj", "o_proj"
    "tune_visual_encoder": False, # whether to fine-tune the visual encoder
    "freeze_lm_embed": False, # whether to freeze the language-model embedding layer
    "tune_visual_encoder_projector": False, # whether to fine-tune the visual-encoder projector

    # --- Gradient checkpointing ---
    "gradient_checkpointing": True, # whether to enable gradient checkpointing

    # --- Datasets ---
    "train_ratio": "1.0", # training data ratio (a float between 0 and 1, given as a string)
    "val_ratio": "1.0", # validation data ratio (a float between 0 and 1, given as a string)
    "uniform_sample": False, # sample training data uniformly
    "random_sample": False, # sample training data randomly
    "record_sample": False, # record the sampled data

    # --- Training ---
    "log_base_dir": "D:/Project/logs", # base directory for logs
    "exp_id": "debug", # experiment ID, used to tell experiments apart
    "lr": 1e-5, # learning rate
    "beta1": 0.9, # optimizer beta1
    "beta2": 0.999, # optimizer beta2
    "epochs": 10, # number of training epochs
    "steps_per_epoch": 100, # training steps per epoch
    "warmup_steps": 30, # warm-up steps
    "batch_size": 1, # training batch size
    "grad_accumulation_steps": 2, # gradient accumulation steps
    "val_batch_size": 1, # validation batch size
    "workers": 0, # number of data-loader workers

    # --- Grounding setting ---
    "num_turn": 100, # number of interaction turns
    "shuffle_image_token": False, # shuffle the image token order
    "uniform_prompt": True, # use a uniform prompt
    "text2point": 1.0, # sampling ratio of the text-to-point task
    "text2bbox": 0.0, # sampling ratio of the text-to-bbox task
    "point2text": 0.0, # sampling ratio of the point-to-text task
    "bbox2text": 0.0, # sampling ratio of the bbox-to-text task
    "crop_min": 0.5, # (undocumented in the original)
    "crop_max": 1.5, # (undocumented in the original)
    "xy_int": False, # whether to convert coordinates to integers

    # --- Navigation setting ---
    "num_history": 4, # number of history turns
    "interleaved_history": 'tttt', # vision-action interleaving; options: 'tttt', 'vvvv', 'vtvt', 'tvtv', 'vvtt', 'ttvv'
    "skip_readme_train": False, # skip README data during training
    "skip_readme_test": False, # skip README data during testing

    # --- Checkpointing and evaluation ---
    "eval_only": False, # only evaluate, do not train
    "start_epoch": 0, # epoch to start training from
    "no_eval": False, # skip evaluation
    "debug": False, # debug mode; True enables it and no model logs are saved
    "print_freq": 1, # log every N steps
}

In [3]:
from types import SimpleNamespace

# Expose the configuration entries as attributes (args.lr, args.epochs, ...).
# Anything that is not a dict -- e.g. a namespace left over from a previous run
# of this cell -- is kept unchanged.
args = SimpleNamespace(**args) if isinstance(args, dict) else args

In [4]:
# 训练模型主函数
def ShowUItrain(args):
    """Fine-tune ShowUI (LoRA or full-parameter) or run evaluation only, in one process.

    The function logs in to Weights & Biases, builds the processor and the model,
    optionally wraps the model with LoRA adapters, builds the training and
    validation datasets, then calls ``train`` and ``validate`` once per epoch.
    A checkpoint is written whenever the validation score improves, or after
    every epoch when evaluation is disabled.

    Args:
        args: Attribute-style configuration object (e.g. ``SimpleNamespace``).
            It is mutated in place: ``global_rank``, ``local_rank``,
            ``world_size``, ``distributed``, ``log_dir``, ``tmp_dir`` and
            ``samples_per_epoch`` are assigned; ``steps_per_epoch`` is
            recomputed unless ``random_sample`` is set; ``lora_r`` is forced
            to 0 in eval-only mode.

    Returns:
        None. In eval-only mode the function returns right after validation.

    Raises:
        AssertionError: if ``args.wandb_key`` is None.
        ValueError: if ``args.lora_r`` is negative while not in eval-only mode.
    """
    args.global_rank = int(os.environ.get("RANK", 0))
    args.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    args.world_size = int(os.environ.get("WORLD_SIZE", 1))

    if args.attn_imple in ["eager", "sdpa"]:
        # suggested by https://github.com/Lightning-AI/litgpt/issues/327
        torch.backends.cuda.enable_mem_efficient_sdp(False)
        torch.backends.cuda.enable_flash_sdp(False)

    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    args.distributed = args.world_size > 1

    args.log_dir = os.path.join(args.log_base_dir, args.exp_id, timestamp)
    args.tmp_dir = os.path.join(args.log_dir, "tmp")

    # A wandb key must be provided.
    assert args.wandb_key is not None, "args.wandb_key is None; provide a wandb API key"
    wandb.login(key=args.wandb_key)

    writer = None  # TensorBoard writer; only created when not in debug mode
    os.makedirs(args.log_dir, exist_ok=True)
    os.makedirs(args.tmp_dir, exist_ok=True)
    save_args_to_json(args, os.path.join(args.log_dir, "args.json"))  # persist the arguments
    if not args.debug:
        # TensorBoard log directory
        writer = SummaryWriter(os.path.join(args.log_dir, "tensorboard"))
        # wandb run
        wandb.init(
            project="ShowUI",
            group=args.exp_id,
            name=f'{args.exp_id}_{timestamp}',
            config=args,
            dir=args.log_dir,
        )
    print(f"Start Job: {args.exp_id}")

    def _close_loggers():
        # Finish the wandb run and close the TensorBoard writer (rank 0, non-debug only).
        if args.global_rank == 0 and not args.debug:
            wandb.finish()
            writer.close()

    # ---- Processor ----
    from model.showui.processing_showui import ShowUIProcessor

    processor = ShowUIProcessor.from_pretrained(
        args.model_path,
        min_pixels=args.min_visual_tokens * 28 * 28,
        max_pixels=args.max_visual_tokens * 28 * 28,
        model_max_length=args.model_max_length,
        uigraph_train=args.uigraph_train, uigraph_test=args.uigraph_test,
        uigraph_diff=args.uigraph_diff, uigraph_rand=args.uigraph_rand,
        uimask_pre=args.uimask_pre, uimask_ratio=args.uimask_ratio, uimask_rand=args.uimask_rand,
        size={"shortest_edge": 3136, "longest_edge": 1003520},
    )

    CHAT_TEMPLATE = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
    processor.chat_template = CHAT_TEMPLATE
    processor.tokenizer.chat_template = CHAT_TEMPLATE

    # ---- Model ----
    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    model_path = args.model_path

    # Only built when QLoRA is requested.
    # NOTE(review): the config is currently not passed to from_pretrained
    # (quantization_config is commented out below), so it has no effect.
    bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    llm_int8_skip_modules=["img_projection"],
                ) if args.use_qlora else None

    from model.utils import parse_layer_type
    from model.showui.modeling_showui import ShowUIForConditionalGeneration

    lm_qwen_layer = 28
    vis_qwen_layer = 32
    lm_skip_layer = parse_layer_type(args.lm_skip_layer, lm_qwen_layer)
    # NOTE(review): parsed but never passed to the model in this function.
    vis_skip_layer = parse_layer_type(args.vis_skip_layer, vis_qwen_layer)

    model = ShowUIForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch_dtype,  # model precision
        low_cpu_mem_usage=True,  # low CPU-memory loading mode
        _attn_implementation=args.attn_imple,  # attention implementation
        # quantization_config=bnb_config,  # quantization config
        device_map="cuda",  # place the model on the GPU
        lm_skip_layer=lm_skip_layer,  # language layers to skip
        lm_skip_ratio=args.lm_skip_ratio,  # language-layer skip ratio
        tie_word_embeddings=False,  # do not share the word embeddings
    )

    # Loading a model checkpoint (disabled)
    # if args.version != args.model_id:
    #     state_dict = torch.load(args.version, map_location="cpu")
    #     model.load_state_dict(state_dict, strict=False)

    model.config.use_cache = False  # disable the KV cache to save memory

    # LoRA is not needed in evaluation mode.
    if args.eval_only:
        print("evaluation mode, thus set the `lora_r' as zero.")
        args.lora_r = 0
    if not args.eval_only and args.use_qlora:
        model = prepare_model_for_kbit_training(model)

    # ---- LoRA ----
    lora_r = args.lora_r
    if lora_r > 0:
        lora_alpha = args.lora_alpha
        lora_dropout = args.lora_dropout
        exclude_module = ["visual"] if not args.tune_visual_encoder else []
        # Fix: the original `exclude_module += [...] if ... else exclude_module`
        # appended the list to itself whenever freeze_lm_embed was False.
        if args.freeze_lm_embed:
            exclude_module.append("lm_head")
        lora_target_modules = find_target_linear_names(model, lora_namespan_exclude=exclude_module)

        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)
        # model.print_trainable_parameters()

        # With LoRA the original model is wrapped twice: once by peft's
        # get_peft_model and once by ShowUIForConditionalGeneration.
        # (Original note: base_model reportedly cannot be used here.)
        model_child = model.model.model
    else:
        # Without LoRA the model is only wrapped by ShowUIForConditionalGeneration.
        model_child = model.model

    # Gradient checkpointing lowers GPU memory usage.
    if args.gradient_checkpointing:
        model.enable_input_require_grads()
        model.gradient_checkpointing_enable()

    if not args.tune_visual_encoder:
        # Freeze the visual encoder.
        if args.lora_r > 0:
            for p in model.base_model.model.visual.parameters():
                p.requires_grad = False
        elif args.lora_r == 0:
            for p in model.visual.parameters():
                p.requires_grad = False

    if args.tune_visual_encoder_projector:
        for k, p in model.named_parameters():
            if 'visual.merger' in k:
                p.requires_grad = True

    # Both original branches (lora_r > 0 and lora_r == 0) froze the same parameters.
    if args.freeze_lm_embed and args.lora_r >= 0:
        for p in model_child.embed_tokens.parameters():
            p.requires_grad = False

    # Collect the trainable parameters.
    list_of_params_to_optimize = [
        p for _, p in model.named_parameters() if p.requires_grad
    ]

    # ---- Datasets ----
    args.samples_per_epoch = args.batch_size    \
                    * args.grad_accumulation_steps  \
                    * args.steps_per_epoch

    train_dataset = HybridDataset(
        processor,
        inference=False,  # training split
        args=args,
    )

    val_dataset = HybridDataset(
        processor,
        inference=True,  # validation split
        args=args,
    )

    if args.val_dataset == "mind2web":
        validate = validate_mind2web
    elif args.val_dataset == "screenspot":
        validate = validate_screenspot
    elif args.val_dataset == "aitw":
        validate = validate_aitw
    else:
        validate = validate_default

    if not args.random_sample:
        args.steps_per_epoch = len(train_dataset) // (args.batch_size * args.world_size)
    print("step for epoch: ", args.steps_per_epoch)
    # DeepSpeed arguments (TODO).
    # For DeepSpeed see https://github.com/showlab/ShowUI/blob/main/train.py

    # ---- Optimizer / scheduler / train loader ----
    # LoRA and full-parameter training share the same setup; the two original
    # branches differed only in the log message and the target device.
    if lora_r > 0 or (lora_r == 0 and not args.eval_only):
        if lora_r > 0:
            print("LoRA training, r: {}, alpha: {}, dropout: {}".format(
                lora_r, lora_alpha, lora_dropout))
            target_device = "cuda"
        else:
            print("No LoRA, using full model training")
            target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        optimizer = torch.optim.AdamW(
            list_of_params_to_optimize,
            lr=args.lr,
            betas=(args.beta1, args.beta2),
            weight_decay=0.0,
            )

        # DeepSpeed uses WarmupDecayLR; PyTorch has no built-in equivalent,
        # so a similar linear warm-up/decay scheduler is used instead.
        total_steps = args.epochs * args.steps_per_epoch
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=total_steps,
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            collate_fn=partial(collate_fn, processor=processor),
            num_workers=args.workers,  # tune to the number of CPU cores
        )

        # Model engine
        model_engine = model
        model_engine = model_engine.to(target_device)

    # Evaluation only
    elif args.eval_only:
        print("Evaluation mode, no training")
        for param in model.parameters():
            param.requires_grad = False
        model_engine = model
    else:
        raise ValueError("Invalid setting")

    # Resuming from a checkpoint (TODO)

    # ---- Validation loader ----
    if val_dataset is not None:
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.val_batch_size,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=False,
            sampler=None,  # for distributed training see https://github.com/showlab/ShowUI/blob/main/train.py
            collate_fn=partial(collate_fn, processor=processor)
        )
    else:
        val_loader = None

    if args.eval_only:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_engine = model_engine.to(device)
        validate(val_loader, model_engine, processor, 0, 0, writer, args)
        # Fix: the original called exit() here, which shuts down the notebook
        # kernel. Close the loggers and return to the caller instead.
        _close_loggers()
        return

    train_iter = iter(train_loader)
    best_score = 0.0
    # args.start_epoch exists to support resuming training from a checkpoint.
    print("开始训练")
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_iter, global_step = train(
            train_loader,
            model_engine,
            optimizer,
            epoch,
            scheduler,
            writer,
            train_iter,
            args,
        )

        if args.no_eval == False and val_loader is not None:
            score = validate(
                val_loader,
                model_engine,
                processor,
                epoch,
                global_step,
                writer,
                args,
            )
            is_best = score > best_score
            best_score = max(score, best_score)
        else:
            is_best = True
            score = 0.0

        if args.no_eval or is_best:
            save_dir = os.path.join(args.log_dir, "ckpt_model")

            os.makedirs(save_dir, exist_ok=True)
            # Small marker file recording the epoch of this checkpoint.
            torch.save(
                {"epoch": epoch},
                os.path.join(
                    save_dir,
                    "meta_log_epo{:.0f}_score{:.2f}.pth".format(
                            epoch, best_score
                        ),
                ),
            )
            # if args.distributed:
            #     # make sure every process has finished saving
            #     torch.distributed.barrier()
            try:
                torch.save(
                    model_engine.state_dict(),
                    os.path.join(
                        save_dir,
                        "model_epo{:.0f}_score{:.2f}.pth".format(
                            epoch, best_score
                        ),
                    ),
                )
            except Exception as e:
                # Best effort: a failed save is reported but does not abort training.
                print("Failed to save checkpoint (): ", e)

    _close_loggers()

# 合并LoRA权重和原始模型
def ShowUImerge(args):
    """Merge trained LoRA weights into the base ShowUI model and save the result.

    The training arguments stored in ``<exp_dir>/args.json`` are loaded onto
    ``args``; the base model is rebuilt and wrapped with the same LoRA
    configuration; the adapter weights are loaded; the adapters are merged
    and the merged model plus processor are written to
    ``<exp_dir>/ckpt_model/merged_model``.

    Args:
        args: Attribute-style configuration object with at least ``exp_dir``.
            It is mutated in place: every key of ``args.json`` except
            ``exp_dir`` is set as an attribute, and ``save_path`` /
            ``weight_url`` are assigned.

    Returns:
        None. Nothing is merged or saved when ``args.lora_r`` is not positive.
    """
    json_url = os.path.join(args.exp_dir, "args.json")
    with open(json_url, 'r') as f:
        json_args = json.load(f)
    for key, value in json_args.items():
        # Fix: keep the caller's exp_dir. The value stored in args.json is
        # whatever exp_dir was configured when training ran and may point to
        # a different run, which would redirect the paths derived below.
        if key == "exp_dir":
            continue
        setattr(args, key, value)

    args.save_path = args.exp_dir + "/ckpt_model/merged_model"
    # NOTE(review): ShowUItrain in this notebook saves `model_epo*_score*.pth`
    # files, not `adapter_model.safetensors`; confirm the expected file exists.
    args.weight_url = args.exp_dir + "/ckpt_model/adapter_model.safetensors"

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    from model.showui.processing_showui import ShowUIProcessor

    processor = ShowUIProcessor.from_pretrained(
        args.model_path,
        min_pixels=args.min_visual_tokens * 28 * 28,
        max_pixels=args.max_visual_tokens * 28 * 28,
        model_max_length=args.model_max_length,
        uigraph_train=args.uigraph_train, uigraph_test=args.uigraph_test,
        uigraph_diff=args.uigraph_diff, uigraph_rand=args.uigraph_rand,
        uimask_pre=args.uimask_pre, uimask_ratio=args.uimask_ratio, uimask_rand=args.uimask_rand,
        size={"shortest_edge": 3136, "longest_edge": 1003520},
    )

    from model.utils import parse_layer_type
    from model.showui.modeling_showui import ShowUIForConditionalGeneration

    lm_qwen_layer = 28
    vis_qwen_layer = 32
    lm_skip_layer = parse_layer_type(args.lm_skip_layer, lm_qwen_layer)
    # NOTE(review): parsed but never passed to the model in this function.
    vis_skip_layer = parse_layer_type(args.vis_skip_layer, vis_qwen_layer)

    model = ShowUIForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch_dtype,  # model precision
        low_cpu_mem_usage=True,  # low CPU-memory loading mode
        _attn_implementation=args.attn_imple,  # attention implementation
        # quantization_config=bnb_config,  # quantization config
        device_map="cuda",  # place the model on the GPU
        lm_skip_layer=lm_skip_layer,  # language layers to skip
        lm_skip_ratio=args.lm_skip_ratio,  # language-layer skip ratio
    )

    model.config.use_cache = False
    model.config.tokenizer_model_max_length = processor.tokenizer.model_max_length

    lora_r = args.lora_r
    if lora_r > 0:
        lora_alpha = args.lora_alpha
        lora_dropout = args.lora_dropout
        lora_target_modules = find_target_linear_names(model, lora_namespan_exclude=["visual"])
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)

        model.print_trainable_parameters()

        print("Loading LoRA weights from {}".format(args.weight_url))
        state_dict = load_sharded_weights(args.weight_url, device_map="cuda")
        # strict=False: keys that do not match are reported below, not raised.
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        # Fix: the f prefix was missing, so the placeholders were printed literally.
        print(f"Loaded weights with {len(missing_keys)} missing keys and {len(unexpected_keys)} unexpected keys")

        # Merge the LoRA weights into the base model.
        print("Merging LoRA weights...")
        model = model.merge_and_unload()

        # Save the merged model to the target path.
        # Fix: the f prefix was missing here as well.
        print(f"Saving merged model to {args.save_path}")
        model.save_pretrained(
            args.save_path,
            max_shard_size="10GB",  # shard size
            safe_serialization=True,  # save as safetensors
        )
        processor.save_pretrained(args.save_path)
        

In [None]:
# Train the ShowUI model
ShowUItrain(args)

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\AMATEUR\_netrc
wandb: Currently logged in as: aa1687159592 (aa1687159592-) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Start Job: debug




Found 197 lora modules: ['model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.mlp.down_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.mlp.down_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.mlp.down_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.mlp.down_proj', 'model.layers.4.self_attn.q_proj', 'mod



Current lr: 0.0
Epoch: [0][  1/100]	Batch time (s)  1.243 ( 1.243)	Loss 0.3138 (0.3106)	Iter time (s)  0.621 ( 0.621)	Epoch time (h)  0.017 ( 0.017)	Remain time (h)  0.017 ( 0.017)	Seq Len 543.000 (543.000)	Ctx Len 147.000 (147.000)	Vis Len 396.000 (396.000)


INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Current lr: 3.3333333333333335e-07
Epoch: [0][  2/100]	Batch time (s)  1.216 ( 1.216)	Loss 0.2515 (0.2051)	Iter time (s)  0.608 ( 0.608)	Epoch time (h)  0.017 ( 0.017)	Remain time (h)  0.017 ( 0.017)	Seq Len 515.000 (515.000)	Ctx Len 147.000 (147.000)	Vis Len 368.000 (368.000)




Current lr: 6.666666666666667e-07
Epoch: [0][  3/100]	Batch time (s)  1.247 ( 1.247)	Loss 0.2672 (0.2969)	Iter time (s)  0.624 ( 0.624)	Epoch time (h)  0.017 ( 0.017)	Remain time (h)  0.017 ( 0.017)	Seq Len 675.000 (675.000)	Ctx Len 147.000 (147.000)	Vis Len 528.000 (528.000)




Current lr: 1.0000000000000002e-06
Epoch: [0][  4/100]	Batch time (s)  0.934 ( 0.934)	Loss 0.3339 (0.2737)	Iter time (s)  0.467 ( 0.467)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.012 ( 0.012)	Seq Len 511.000 (511.000)	Ctx Len 143.000 (143.000)	Vis Len 368.000 (368.000)




Current lr: 1.3333333333333334e-06
Epoch: [0][  5/100]	Batch time (s)  1.841 ( 1.841)	Loss 0.3121 (0.3448)	Iter time (s)  0.920 ( 0.920)	Epoch time (h)  0.026 ( 0.026)	Remain time (h)  0.024 ( 0.024)	Seq Len 829.000 (829.000)	Ctx Len 147.000 (147.000)	Vis Len 682.000 (682.000)




Current lr: 1.6666666666666667e-06
Epoch: [0][  6/100]	Batch time (s)  0.758 ( 0.758)	Loss 0.4290 (0.4520)	Iter time (s)  0.379 ( 0.379)	Epoch time (h)  0.011 ( 0.011)	Remain time (h)  0.010 ( 0.010)	Seq Len 392.000 (392.000)	Ctx Len 122.000 (122.000)	Vis Len 270.000 (270.000)




Current lr: 2.0000000000000003e-06
Epoch: [0][  7/100]	Batch time (s)  0.951 ( 0.951)	Loss 0.4161 (0.2993)	Iter time (s)  0.476 ( 0.476)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.012 ( 0.012)	Seq Len 425.000 (425.000)	Ctx Len 145.000 (145.000)	Vis Len 280.000 (280.000)




Current lr: 2.3333333333333336e-06
Epoch: [0][  8/100]	Batch time (s)  1.012 ( 1.012)	Loss 0.3551 (0.4237)	Iter time (s)  0.506 ( 0.506)	Epoch time (h)  0.014 ( 0.014)	Remain time (h)  0.013 ( 0.013)	Seq Len 511.000 (511.000)	Ctx Len 143.000 (143.000)	Vis Len 368.000 (368.000)




Current lr: 2.666666666666667e-06
Epoch: [0][  9/100]	Batch time (s)  0.715 ( 0.715)	Loss 0.4931 (0.4692)	Iter time (s)  0.357 ( 0.357)	Epoch time (h)  0.010 ( 0.010)	Remain time (h)  0.009 ( 0.009)	Seq Len 346.000 (346.000)	Ctx Len 76.000 (76.000)	Vis Len 270.000 (270.000)




Current lr: 3e-06
Epoch: [0][ 10/100]	Batch time (s)  1.215 ( 1.215)	Loss 0.2326 (0.3081)	Iter time (s)  0.607 ( 0.607)	Epoch time (h)  0.017 ( 0.017)	Remain time (h)  0.015 ( 0.015)	Seq Len 515.000 (515.000)	Ctx Len 147.000 (147.000)	Vis Len 368.000 (368.000)




Current lr: 3.3333333333333333e-06
Epoch: [0][ 11/100]	Batch time (s)  1.400 ( 1.400)	Loss 0.2133 (0.3347)	Iter time (s)  0.700 ( 0.700)	Epoch time (h)  0.019 ( 0.019)	Remain time (h)  0.017 ( 0.017)	Seq Len 817.000 (817.000)	Ctx Len 145.000 (145.000)	Vis Len 672.000 (672.000)




Current lr: 3.6666666666666666e-06
Epoch: [0][ 12/100]	Batch time (s)  0.999 ( 0.999)	Loss 0.4306 (0.4047)	Iter time (s)  0.499 ( 0.499)	Epoch time (h)  0.014 ( 0.014)	Remain time (h)  0.012 ( 0.012)	Seq Len 346.000 (346.000)	Ctx Len 76.000 (76.000)	Vis Len 270.000 (270.000)




Current lr: 4.000000000000001e-06
Epoch: [0][ 13/100]	Batch time (s)  0.842 ( 0.842)	Loss 0.3984 (0.4514)	Iter time (s)  0.421 ( 0.421)	Epoch time (h)  0.012 ( 0.012)	Remain time (h)  0.010 ( 0.010)	Seq Len 515.000 (515.000)	Ctx Len 147.000 (147.000)	Vis Len 368.000 (368.000)




Current lr: 4.333333333333334e-06
Epoch: [0][ 14/100]	Batch time (s)  1.580 ( 1.580)	Loss 0.2695 (0.4491)	Iter time (s)  0.790 ( 0.790)	Epoch time (h)  0.022 ( 0.022)	Remain time (h)  0.019 ( 0.019)	Seq Len 867.000 (867.000)	Ctx Len 147.000 (147.000)	Vis Len 720.000 (720.000)




Current lr: 4.666666666666667e-06
Epoch: [0][ 15/100]	Batch time (s)  1.358 ( 1.358)	Loss 0.4078 (0.3257)	Iter time (s)  0.679 ( 0.679)	Epoch time (h)  0.019 ( 0.019)	Remain time (h)  0.016 ( 0.016)	Seq Len 514.000 (514.000)	Ctx Len 146.000 (146.000)	Vis Len 368.000 (368.000)




Current lr: 5e-06
Epoch: [0][ 16/100]	Batch time (s)  0.971 ( 0.971)	Loss 0.5224 (0.4137)	Iter time (s)  0.486 ( 0.486)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.011 ( 0.011)	Seq Len 416.000 (416.000)	Ctx Len 141.000 (141.000)	Vis Len 275.000 (275.000)




Current lr: 5.333333333333334e-06
Epoch: [0][ 17/100]	Batch time (s)  1.262 ( 1.262)	Loss 0.2610 (0.3243)	Iter time (s)  0.631 ( 0.631)	Epoch time (h)  0.018 ( 0.018)	Remain time (h)  0.015 ( 0.015)	Seq Len 767.000 (767.000)	Ctx Len 146.000 (146.000)	Vis Len 621.000 (621.000)




Current lr: 5.666666666666667e-06
Epoch: [0][ 18/100]	Batch time (s)  0.752 ( 0.752)	Loss 0.3976 (0.3862)	Iter time (s)  0.376 ( 0.376)	Epoch time (h)  0.010 ( 0.010)	Remain time (h)  0.009 ( 0.009)	Seq Len 424.000 (424.000)	Ctx Len 148.000 (148.000)	Vis Len 276.000 (276.000)




Current lr: 6e-06
Epoch: [0][ 19/100]	Batch time (s)  1.258 ( 1.258)	Loss 0.4031 (0.3754)	Iter time (s)  0.629 ( 0.629)	Epoch time (h)  0.017 ( 0.017)	Remain time (h)  0.014 ( 0.014)	Seq Len 522.000 (522.000)	Ctx Len 147.000 (147.000)	Vis Len 375.000 (375.000)




Current lr: 6.333333333333333e-06
Epoch: [0][ 20/100]	Batch time (s)  1.172 ( 1.172)	Loss 0.1934 (0.2874)	Iter time (s)  0.586 ( 0.586)	Epoch time (h)  0.016 ( 0.016)	Remain time (h)  0.013 ( 0.013)	Seq Len 369.000 (369.000)	Ctx Len 97.000 (97.000)	Vis Len 272.000 (272.000)




Current lr: 6.666666666666667e-06
Epoch: [0][ 21/100]	Batch time (s)  1.290 ( 1.290)	Loss 0.2861 (0.2160)	Iter time (s)  0.645 ( 0.645)	Epoch time (h)  0.018 ( 0.018)	Remain time (h)  0.014 ( 0.014)	Seq Len 683.000 (683.000)	Ctx Len 143.000 (143.000)	Vis Len 540.000 (540.000)




Current lr: 7e-06
Epoch: [0][ 22/100]	Batch time (s)  0.920 ( 0.920)	Loss 0.3792 (0.4042)	Iter time (s)  0.460 ( 0.460)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.010 ( 0.010)	Seq Len 457.000 (457.000)	Ctx Len 145.000 (145.000)	Vis Len 312.000 (312.000)




Current lr: 7.333333333333333e-06
Epoch: [0][ 23/100]	Batch time (s)  1.024 ( 1.024)	Loss 0.1436 (0.1939)	Iter time (s)  0.512 ( 0.512)	Epoch time (h)  0.014 ( 0.014)	Remain time (h)  0.011 ( 0.011)	Seq Len 515.000 (515.000)	Ctx Len 147.000 (147.000)	Vis Len 368.000 (368.000)




Current lr: 7.666666666666667e-06
Epoch: [0][ 24/100]	Batch time (s)  1.048 ( 1.048)	Loss 0.2720 (0.2723)	Iter time (s)  0.524 ( 0.524)	Epoch time (h)  0.015 ( 0.015)	Remain time (h)  0.011 ( 0.011)	Seq Len 506.000 (506.000)	Ctx Len 122.000 (122.000)	Vis Len 384.000 (384.000)




Current lr: 8.000000000000001e-06
Epoch: [0][ 25/100]	Batch time (s)  1.778 ( 1.778)	Loss 0.2982 (0.3039)	Iter time (s)  0.889 ( 0.889)	Epoch time (h)  0.025 ( 0.025)	Remain time (h)  0.019 ( 0.019)	Seq Len 673.000 (673.000)	Ctx Len 145.000 (145.000)	Vis Len 528.000 (528.000)




Current lr: 8.333333333333334e-06
Epoch: [0][ 26/100]	Batch time (s)  1.156 ( 1.156)	Loss 0.2696 (0.2165)	Iter time (s)  0.578 ( 0.578)	Epoch time (h)  0.016 ( 0.016)	Remain time (h)  0.012 ( 0.012)	Seq Len 369.000 (369.000)	Ctx Len 99.000 (99.000)	Vis Len 270.000 (270.000)




Current lr: 8.666666666666668e-06
Epoch: [0][ 27/100]	Batch time (s)  1.048 ( 1.048)	Loss 0.3517 (0.2960)	Iter time (s)  0.524 ( 0.524)	Epoch time (h)  0.015 ( 0.015)	Remain time (h)  0.011 ( 0.011)	Seq Len 510.000 (510.000)	Ctx Len 142.000 (142.000)	Vis Len 368.000 (368.000)




Current lr: 9e-06
Epoch: [0][ 28/100]	Batch time (s)  0.927 ( 0.927)	Loss 0.1725 (0.2251)	Iter time (s)  0.463 ( 0.463)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.009 ( 0.009)	Seq Len 546.000 (546.000)	Ctx Len 143.000 (143.000)	Vis Len 403.000 (403.000)




Current lr: 9.333333333333334e-06
Epoch: [0][ 29/100]	Batch time (s)  0.719 ( 0.719)	Loss 0.3371 (0.3176)	Iter time (s)  0.360 ( 0.360)	Epoch time (h)  0.010 ( 0.010)	Remain time (h)  0.007 ( 0.007)	Seq Len 342.000 (342.000)	Ctx Len 76.000 (76.000)	Vis Len 266.000 (266.000)




Current lr: 9.666666666666667e-06
Epoch: [0][ 30/100]	Batch time (s)  1.469 ( 1.469)	Loss 0.1920 (0.2110)	Iter time (s)  0.735 ( 0.735)	Epoch time (h)  0.020 ( 0.020)	Remain time (h)  0.014 ( 0.014)	Seq Len 685.000 (685.000)	Ctx Len 145.000 (145.000)	Vis Len 540.000 (540.000)




Current lr: 1e-05
Epoch: [0][ 31/100]	Batch time (s)  0.774 ( 0.774)	Loss 0.3021 (0.2677)	Iter time (s)  0.387 ( 0.387)	Epoch time (h)  0.011 ( 0.011)	Remain time (h)  0.007 ( 0.007)	Seq Len 416.000 (416.000)	Ctx Len 143.000 (143.000)	Vis Len 273.000 (273.000)




Current lr: 9.989690721649485e-06
Epoch: [0][ 32/100]	Batch time (s)  0.960 ( 0.960)	Loss 0.3887 (0.2860)	Iter time (s)  0.480 ( 0.480)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.009 ( 0.009)	Seq Len 575.000 (575.000)	Ctx Len 146.000 (146.000)	Vis Len 429.000 (429.000)




Current lr: 9.97938144329897e-06
Epoch: [0][ 33/100]	Batch time (s)  0.843 ( 0.843)	Loss 0.2780 (0.2959)	Iter time (s)  0.421 ( 0.421)	Epoch time (h)  0.012 ( 0.012)	Remain time (h)  0.008 ( 0.008)	Seq Len 515.000 (515.000)	Ctx Len 147.000 (147.000)	Vis Len 368.000 (368.000)




Current lr: 9.969072164948454e-06
Epoch: [0][ 34/100]	Batch time (s)  0.807 ( 0.807)	Loss 0.3059 (0.2264)	Iter time (s)  0.403 ( 0.403)	Epoch time (h)  0.011 ( 0.011)	Remain time (h)  0.007 ( 0.007)	Seq Len 388.000 (388.000)	Ctx Len 122.000 (122.000)	Vis Len 266.000 (266.000)




Current lr: 9.958762886597939e-06
Epoch: [0][ 35/100]	Batch time (s)  1.170 ( 1.170)	Loss 0.2659 (0.2324)	Iter time (s)  0.585 ( 0.585)	Epoch time (h)  0.016 ( 0.016)	Remain time (h)  0.011 ( 0.011)	Seq Len 512.000 (512.000)	Ctx Len 144.000 (144.000)	Vis Len 368.000 (368.000)




Current lr: 9.948453608247423e-06
Epoch: [0][ 36/100]	Batch time (s)  0.933 ( 0.933)	Loss 0.3540 (0.2753)	Iter time (s)  0.467 ( 0.467)	Epoch time (h)  0.013 ( 0.013)	Remain time (h)  0.008 ( 0.008)	Seq Len 364.000 (364.000)	Ctx Len 76.000 (76.000)	Vis Len 288.000 (288.000)




Current lr: 9.938144329896908e-06
Epoch: [0][ 37/100]	Batch time (s)  0.730 ( 0.730)	Loss 0.3552 (0.3900)	Iter time (s)  0.365 ( 0.365)	Epoch time (h)  0.010 ( 0.010)	Remain time (h)  0.006 ( 0.006)	Seq Len 384.000 (384.000)	Ctx Len 99.000 (99.000)	Vis Len 285.000 (285.000)




Current lr: 9.927835051546392e-06
Epoch: [0][ 38/100]	Batch time (s)  1.388 ( 1.388)	Loss 0.2233 (0.2195)	Iter time (s)  0.694 ( 0.694)	Epoch time (h)  0.019 ( 0.019)	Remain time (h)  0.012 ( 0.012)	Seq Len 645.000 (645.000)	Ctx Len 145.000 (145.000)	Vis Len 500.000 (500.000)




Current lr: 9.917525773195877e-06
Epoch: [0][ 39/100]	Batch time (s)  0.810 ( 0.810)	Loss 0.3088 (0.3066)	Iter time (s)  0.405 ( 0.405)	Epoch time (h)  0.011 ( 0.011)	Remain time (h)  0.007 ( 0.007)	Seq Len 513.000 (513.000)	Ctx Len 145.000 (145.000)	Vis Len 368.000 (368.000)




Current lr: 9.907216494845361e-06
Epoch: [0][ 40/100]	Batch time (s)  1.261 ( 1.261)	Loss 0.3367 (0.2477)	Iter time (s)  0.630 ( 0.630)	Epoch time (h)  0.018 ( 0.018)	Remain time (h)  0.011 ( 0.011)	Seq Len 430.000 (430.000)	Ctx Len 100.000 (100.000)	Vis Len 330.000 (330.000)




Current lr: 9.896907216494846e-06
Epoch: [0][ 41/100]	Batch time (s)  1.036 ( 1.036)	Loss 0.2537 (0.2705)	Iter time (s)  0.518 ( 0.518)	Epoch time (h)  0.014 ( 0.014)	Remain time (h)  0.008 ( 0.008)	Seq Len 537.000 (537.000)	Ctx Len 146.000 (146.000)	Vis Len 391.000 (391.000)




Current lr: 9.88659793814433e-06
Epoch: [0][ 42/100]	Batch time (s)  0.820 ( 0.820)	Loss 0.3351 (0.3186)	Iter time (s)  0.410 ( 0.410)	Epoch time (h)  0.011 ( 0.011)	Remain time (h)  0.007 ( 0.007)	Seq Len 431.000 (431.000)	Ctx Len 146.000 (146.000)	Vis Len 285.000 (285.000)




In [None]:
# Merge the LoRA weights into the original model
ShowUImerge(args)

In [None]:
# ShowUI模型推理

class ShowUI:
    """Inference wrapper that loads a ShowUI checkpoint and grounds a text query on a screenshot.

    Typical use: construct with the checkpoint directory and the run ``args``,
    call :meth:`load_model` once, then call :meth:`invoke` per screenshot/query
    pair and optionally :meth:`draw_point` to visualise the predicted click.

    Attributes:
        model_path: Directory the model weights are loaded from.
        args: Attribute-style configuration object (``precision``, ``model_path``,
            ``min_visual_tokens``, ``max_visual_tokens``, ``attn_imple``, ...).
        model: The loaded model, or ``None`` until :meth:`load_model` has run.
        processor: The loaded processor, or ``None`` until :meth:`load_model` has run.
    """

    # Generation budget used when ``args`` does not define ``max_new_tokens``.
    _DEFAULT_MAX_NEW_TOKENS = 128

    def __init__(self, model_path: str, args):
        """Store the checkpoint location and configuration; nothing is loaded yet."""
        self.model_path = model_path
        self.model = None
        self.processor = None
        self.args = args

    def load_model(self):
        """Load the processor and the model weights into ``self.processor`` / ``self.model``.

        The model is read from ``self.model_path`` (the directory given to the
        constructor) so that the checkpoint announced in the log message is the
        one actually used for inference.
        """
        # Map the configured precision name to a torch dtype; anything else means fp32.
        precision_to_dtype = {"bf16": torch.bfloat16, "fp16": torch.half}
        torch_dtype = precision_to_dtype.get(self.args.precision, torch.float32)

        print("Loading processor...")

        from model.showui.processing_showui import ShowUIProcessor

        # Pixel budgets are expressed as <token count> * 28 * 28.
        # NOTE(review): the processor is still read from args.model_path, as before;
        # confirm whether the merged checkpoint directory also contains processor files.
        self.processor = ShowUIProcessor.from_pretrained(
            self.args.model_path,
            min_pixels=self.args.min_visual_tokens * 28 * 28,
            max_pixels=self.args.max_visual_tokens * 28 * 28,
            model_max_length=self.args.model_max_length,
            uigraph_train=self.args.uigraph_train,
            uigraph_test=self.args.uigraph_test,
            uigraph_diff=self.args.uigraph_diff,
            uigraph_rand=self.args.uigraph_rand,
            uimask_pre=self.args.uimask_pre,
            uimask_ratio=self.args.uimask_ratio,
            uimask_rand=self.args.uimask_rand,
            size={"shortest_edge": 3136, "longest_edge": 1003520},
        )

        print("Processor loaded successfully.")

        print(f"Loading model from {self.model_path}...")

        from model.utils import parse_layer_type
        from model.showui.modeling_showui import ShowUIForConditionalGeneration

        # Number of language-model layers handed to parse_layer_type.
        lm_qwen_layer = 28
        lm_skip_layer = parse_layer_type(self.args.lm_skip_layer, lm_qwen_layer)

        # Quantization stays disabled here, as in the previous version of this cell
        # (the quantization_config argument was commented out).
        self.model = ShowUIForConditionalGeneration.from_pretrained(
            self.model_path,  # checkpoint passed to the constructor (e.g. the merged model)
            torch_dtype=torch_dtype,  # model precision
            low_cpu_mem_usage=True,  # low CPU memory usage while loading
            _attn_implementation=self.args.attn_imple,  # attention implementation
            device_map="cuda",  # place the whole model on the CUDA device
            lm_skip_layer=lm_skip_layer,  # language layers to skip
            lm_skip_ratio=self.args.lm_skip_ratio,  # skip ratio for those layers
        )

        print("Model loaded successfully.")

    @staticmethod
    def _parse_click_point(output_text):
        """Parse the model's textual answer into an ``(x, y)`` pair.

        Args:
            output_text: Decoded model output, expected to be a Python-literal
                list or tuple whose first two items are numbers, e.g. ``"[0.48, 0.12]"``.

        Returns:
            Tuple ``(x, y)`` holding the first two items of the parsed sequence.

        Raises:
            ValueError: If the text is not a literal, is not a list/tuple, has
                fewer than two items, or its first two items are not numbers.
        """
        try:
            # strip() avoids an indentation error on leading whitespace in older Pythons.
            point = ast.literal_eval(output_text.strip())
        except (ValueError, SyntaxError) as exc:
            raise ValueError(
                f"Model output is not a valid [x, y] coordinate: {output_text!r}"
            ) from exc

        is_sequence = isinstance(point, (list, tuple)) and len(point) >= 2
        if not is_sequence or not all(
            isinstance(value, (int, float)) and not isinstance(value, bool)
            for value in point[:2]
        ):
            raise ValueError(
                f"Model output is not a valid [x, y] coordinate: {output_text!r}"
            )
        return point[0], point[1]

    def invoke(self, img_url: str, query: str, args):
        """Run one grounding query and return the predicted click position in pixels.

        Args:
            img_url: Path of the screenshot to open.
            query: Text description of the element to locate.
            args: Unused; kept so existing callers keep working. The configuration
                given to the constructor (``self.args``) is what is read.

        Returns:
            Tuple ``(x, y, image)``: the predicted coordinates scaled by the image
            width/height, and the opened PIL image.

        Raises:
            RuntimeError: If :meth:`load_model` has not been called.
            ValueError: If the generated text cannot be parsed as a coordinate.
        """
        if self.model is None or self.processor is None:
            raise RuntimeError("Model is not loaded; call load_model() before invoke().")

        image = Image.open(img_url)

        print(f"Image loaded from {img_url}, size: {image.size}")
        print(f"Query: {query}")

        print("Processing messages for model input...")

        _SYSTEM = (
            "Based on the screenshot of the page, I give a text description and you give its corresponding location. "
            "The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _SYSTEM},
                    {
                        "type": "image",
                        "image": img_url,
                        "min_pixels": self.args.min_visual_tokens * 28 * 28,
                        "max_pixels": self.args.max_visual_tokens * 28 * 28,
                    },
                    {"type": "text", "text": query},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the device the model was actually placed on instead of assuming "cuda".
        inputs = inputs.to(self.model.device)
        print("Inputs prepared for model generation.")

        # Honour the configured generation budget; fall back to the former constant.
        max_new_tokens = getattr(self.args, "max_new_tokens", self._DEFAULT_MAX_NEW_TOKENS)
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        print("Model generation completed.")
        print("Decoding generated IDs to text...")
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        print(f"Output text: {output_text}")
        print("Decoding completed.")

        # The prompt asks for relative coordinates, so scale them by the image size.
        rel_x, rel_y = self._parse_click_point(output_text)
        x, y = rel_x * image.width, rel_y * image.height

        return x, y, image

    def draw_point(self, image, x, y, radius=2):
        """Draw a filled red dot centred on ``(x, y)`` onto ``image`` and call ``image.show()``.

        The image is modified in place.

        Args:
            image: PIL image to draw on.
            x: Horizontal pixel coordinate of the dot centre.
            y: Vertical pixel coordinate of the dot centre.
            radius: Dot radius in pixels.
        """
        print(f"Drawing point at ({x}, {y}) with radius {radius} on the image.")
        draw = ImageDraw.Draw(image)
        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill='red', outline='red')
        image.show()

In [None]:
# Create the ShowUI instance and load the model
model_path = f"{args.exp_dir}/ckpt_model/merged_model"
showui = ShowUI(model_path=model_path, args=args)
showui.load_model()

In [None]:
# Screenshot to query and the element description to locate on it
img_url = "D:/Project/my_dataset/unlabel_images/image0.png"
query = "请输入密码"

# Run inference, report the predicted click position, then mark it on the image
x, y, image = showui.invoke(img_url=img_url, query=query, args=args)
print(f"Click coordinates: ({x}, {y})")
showui.draw_point(image=image, x=x, y=y)