In [1]:
# config env
pkgs_path = "/bohr/pkgs-7x29/v5/pkgs"
llava_lib_path = "/bohr/libb-bg5b/v3/llava"
tsr_model_path = "microsoft/table-structure-recognition-v1.1-all"
model_path = "lmms-lab/llava-onevision-qwen2-7b-si"
cache_path = "/bohr/cach-rxl3/v3/cache"

# pkgs_path = "/personal/pkgs"
# llava_lib_path = "/personal/llava"
# model_path = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
# cache_path = "/personal/cache"

!pip install {pkgs_path}/*
!cp {llava_lib_path} . -r

import os

# # 提交时可能不能联网，设置成离线模式防止联网失败报错
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_DATASETS_OFFLINE'] = '1'
os.environ['HF_HUB_OFFLINE'] = '1'
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_path
os.environ["HF_HOME"] = cache_path
device = "cuda"

In [2]:
from llava.conversation import Conversation, SeparatorStyle
from llava.utils import disable_torch_init
import json
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
import torch

from PIL import Image

import warnings

warnings.filterwarnings("ignore")

In [3]:
# Inference settings, exposed as attributes of a one-off ``Args`` instance.
_arg_fields = dict(
    model_path=model_path,
    model_base=None,
    model_name=get_model_name_from_path(model_path),
    conv_mode=None,
    sep=",",
    temperature=0,
    top_p=1,
    num_beams=1,
    max_new_tokens=8,
)
args = type('Args', (), _arg_fields)()

# Extra keyword arguments forwarded to the model loader. The image aspect-ratio
# mode is overridden here; "pad" was the alternative tried previously.
overwrite_config = {"image_aspect_ratio": "anyres_max_9"}
llava_model_args = {"multimodal": True, "overwrite_config": overwrite_config}

torch.cuda.empty_cache()
disable_torch_init()

tokenizer, model, image_processor, context_len = load_pretrained_model(
    args.model_path,
    args.model_base,
    args.model_name,
    device_map="auto",
    attn_implementation='eager',
    # load_8bit=True,
    # load_4bit=False,
    **llava_model_args,
)

# Disabled table-structure-recognition experiment. It uses names
# (AutoImageProcessor, TableTransformerForObjectDetection) that the visible
# import cell does not provide.
# tsr_img_processor = AutoImageProcessor.from_pretrained(tsr_model_path)
# tsr_img_processor.size['shortest_edge'] = image_processor.size[0]
# tsr_model = TableTransformerForObjectDetection.from_pretrained(tsr_model_path)
# label2id = tsr_model.config.label2id
# label_row = label2id['table row']
# label_col = label2id['table column']

In [4]:
# On submission, DATA_PATH_B selects the hidden test dataset (leaderboards A+B);
# it has the same format as the leaderboard-A data but a different size (5360 images).
# Otherwise fall back to the example path: the leaderboard-A test set, which is
# mounted only during development for debugging.
base_dir = os.environ.get('DATA_PATH_B') or '/bohr/form-recognition-train-b6y2/v4'

In [5]:
# Read the dataset description. The file is opened in binary mode so that
# json.load detects the encoding (UTF-8/16/32, with or without a BOM) itself
# instead of decoding with the platform's locale-dependent default.
with open(os.path.join(base_dir, 'dataset.json'), 'rb') as f:
    data = json.load(f)

In [6]:
from collections import defaultdict
import re

# Option letter -> zero-based index; any other key yields -1.
l2i = defaultdict(lambda: -1)
for i, letter in enumerate('ABCDEFGH'):
    l2i[letter] = i
# Subject names in option order (A..H). The trailing '' is what index -1
# selects, i.e. the category used when no option letter was recognised.
sub_list = ('Physics', 'Mathematics', 'ComputerScience', 'QuantitativeBiology', 'QuantitativeFinance',
            'Statistics', 'ElectricalEngineeringandSystemsScience', 'Economics', '')

# A stand-alone capital letter A-H, e.g. the "B" in "B.", "(B)" or "B. Mathematics".
_OPTION_LETTER_RE = re.compile(r"\b([A-H])\b")


def _option_index(reply):
    """Return the index of the option letter found in ``reply``, or -1 if none."""
    reply = reply.strip()
    # Membership test first: indexing the defaultdict with an unknown reply
    # would insert that reply as a new key.
    if reply in l2i:
        return l2i[reply]
    found = _OPTION_LETTER_RE.search(reply)
    return l2i[found.group(1)] if found else -1


def clean_out(image_path, out_list):
    """Convert the three raw model replies for one image into a submission record.

    Args:
        image_path: Value stored unchanged under the "image_path" key.
        out_list: Replies in question order: ``[shape, subject, answer]``.
            The shape reply is scanned for integers; the other two are
            scanned for an option letter.

    Returns:
        A dict with keys "image_path", "category", "cols", "rows" and "answer".
        "rows"/"cols" are -1 when the shape reply contains no integer, and both
        equal the single integer when only one is present. "category" is ''
        and "answer" is -1 when no option letter is recognised.

    Raises:
        IndexError: If ``out_list`` has fewer than three entries.
    """
    numbers = [int(digits) for digits in re.findall(r"\d+", out_list[0])]
    if len(numbers) >= 2:
        # Use the leading pair; anything after it is treated as trailing noise.
        rows, cols = numbers[0], numbers[1]
    elif len(numbers) == 1:
        rows = cols = numbers[0]
    else:
        rows = cols = -1

    sub_item = {
        "image_path": image_path,
        "category": sub_list[_option_index(out_list[1])],
        "cols": cols,
        "rows": rows,
        "answer": _option_index(out_list[2]),
    }
    return sub_item

In [7]:
def one_image(img_path, caption, qs_list):
    """Ask the model a sequence of questions about one table image.

    The image is loaded once and a single conversation is started with the
    caption. Each question is then appended to that conversation, and each
    decoded reply is written back into it before the next question is asked.

    Args:
        img_path: Path of the image file to open.
        caption: Caption text embedded in the opening user message.
        qs_list: Iterable of question strings, asked in order.

    Returns:
        A list with one decoded, stripped reply string per question, in the
        same order as ``qs_list``.
    """
    # The context manager closes the file once the RGB copy has been made.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert("RGB")
    image_sizes = [image.size]
    image_tensors = [
        process_images([image], image_processor, model.config)[0].to(dtype=torch.float16, device=device)
    ]
    out_list = []
    with torch.inference_mode():
        conv = Conversation(
            # The line break and the 12 spaces before "You" are part of the
            # original prompt text and are reproduced exactly.
            system=(
                "<|im_start|>system\n"
                "            You are a helpful assistant. Provide only an option's letter or an integer for each question, without any additional explanation."
            ),
            roles=["<|im_start|>user", "<|im_start|>assistant"],
            version="qwen",
            messages=[
                ["<|im_start|>user",
                 f'{DEFAULT_IMAGE_TOKEN}\n This is a table image. The caption of the table is "{caption}".'],
                ["<|im_start|>assistant", "I have a general understanding of the information in this table."]
            ],
            offset=0,
            sep_style=SeparatorStyle.CHATML,
            sep="<|im_end|>",
        )
        for qs in qs_list:
            conv.append_message(conv.roles[0], qs)
            # Empty assistant slot; filled in with the reply below.
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()
            input_ids = tokenizer_image_token(
                prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
            ).unsqueeze(0).to(device)  # same device setting as the image tensors

            output_ids = model.generate(
                input_ids,
                images=image_tensors,
                image_sizes=image_sizes,
                do_sample=args.temperature > 0,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                max_new_tokens=args.max_new_tokens,
                use_cache=True,
            )
            # NOTE(review): assumes generate() returns only the newly generated
            # tokens, since the whole output is decoded as the reply — confirm.
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            conv.messages[-1][-1] = outputs
            out_list.append(outputs)
    return out_list

In [9]:
# The first two questions are the same for every item.
_SHAPE_QUESTION = 'Based on the provided table, what is its shape? Answer with two positive integers for rows and columns, separated by a comma:'
_SUBJECT_QUESTION = """Based on the provided table and caption, select the most relevant subject from (A. Physics, B. Mathematics, C. ComputerScience, D. QuantitativeBiology, E. QuantitativeFinance, F. Statistics, G. ElectricalEngineeringandSystemsScience, H. Economics). Answer with the option's letter from the given choices directly."""


def _build_questions(item):
    """Return the three prompts (shape, subject, multiple choice) for one dataset item."""
    question = item["question"]
    opts = item["options"]
    choice_question = f"""Based on the provided table and caption, for the question: "{question}", select the most correct option from (A. {opts[0]}, B. {opts[1]}, C. {opts[2]}, D. {opts[3]}). Answer with the option's letter from the given choices directly."""
    return [_SHAPE_QUESTION, _SUBJECT_QUESTION, choice_question]


# Run every dataset item through the model and collect one record per image.
submission = []
for item in data:
    image_path = os.path.join(base_dir, 'test_images', item["image_path"])
    qs_list = _build_questions(item)
    out_list = one_image(image_path, item["caption"], qs_list)
    # The record stores the dataset-relative image path, not the joined one.
    submission.append(clean_out(item["image_path"], out_list))

# Write all records to the submission file.
with open('submission.json', 'w') as f:
    json.dump(submission, f)