In [None]:
import pandas as pd
from tqdm import tqdm

from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint directory that the tokenizer and the model weights are read from.
model_path = "LLaMA-Factory/qwen2vl_7b_instruct_lora_merged"

# Text/vision pre-processing. Note that the processor is loaded from the
# "Qwen/Qwen2-VL-7B-Instruct" repo id, not from `model_path`.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# dtype and device placement are both left to the library ("auto").
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
)

In [2]:
def _build_categories():
    """Assemble the artifact vocabulary, keyed by class label.

    Returns a dict mapping each class label (plus the special key "major")
    to a list of artifact names. The vehicle-like and animal-like classes
    share a common prefix of artifacts, so those are declared once and
    unpacked into a fresh list per class.
    """
    # Artifacts shared by every vehicle-like class, in prompt order.
    vehicle_core = [
        "Artificial noise patterns in uniform surfaces",
        "Metallic surface artifacts",
        "Impossible mechanical connections",
        "Inconsistent scale of mechanical parts",
        "Physically impossible structural elements",
    ]
    # Trailing artifacts shared by airplane / automobile / truck.
    vehicle_tail = [
        "Misaligned body panels",
        "Impossible mechanical joints",
        "Distorted window reflections",
    ]
    # Artifacts shared by every animal class, in prompt order.
    animal_core = [
        "Unrealistic eye reflections",
        "Misshapen ears or appendages",
        "Anatomically impossible joint configurations",
        "Unnatural pose artifacts",
        "Biological asymmetry errors",
        "Regular grid-like artifacts in textures",
        "Impossible foreshortening in animal bodies",
        "Misaligned bilateral elements in animal faces",
        "Over-smoothing of natural textures",
    ]
    # Animal extras that only some classes carry.
    dental = "Dental anomalies in mammals"
    paws = "Anatomically incorrect paw structures"
    fur = "Improper fur direction flows"

    return {
        "airplane": [
            *vehicle_core,
            "Implausible aerodynamic structures",
            *vehicle_tail,
        ],
        "automobile": [*vehicle_core, "Incorrect wheel geometry", *vehicle_tail],
        "ship": [*vehicle_core, "Misaligned body panels"],
        "truck": [*vehicle_core, "Incorrect wheel geometry", *vehicle_tail],
        "bird": [*animal_core],
        "cat": [*animal_core, paws, fur],
        "deer": [*animal_core, fur],
        "dog": [*animal_core, dental, paws, fur],
        "frog": [*animal_core],
        "horse": [*animal_core, dental],
        # Class-independent artifacts.
        "major": [
            "Discontinuous surfaces",
            "Non-manifold geometries in rigid structures",
            "Asymmetric features in naturally symmetric objects",
            "Texture bleeding between adjacent regions",
            "Excessive sharpness in certain image regions",
            "Artificial smoothness",
            "Movie-poster-like composition of ordinary scenes",
            "Unnatural lighting gradients",
            "Fake depth of field",
            "Abruptly cut-off objects",
            "Color coherence breaks",
            "Spatial relationship errors",
            "Depth perception anomalies",
            "Over-sharpening artifacts",
            "Incorrect reflection mapping",
            "Inconsistent object boundaries",
            "Floating or disconnected components",
            "Texture repetition patterns",
            "Unrealistic specular highlights",
            "Inconsistent material properties",
            "Inconsistent shadow directions",
            "Multiple light source conflicts",
            "Missing ambient occlusion",
            "Incorrect perspective rendering",
            "Scale inconsistencies within single objects",
            "Aliasing along high-contrast edges",
            "Blurred boundaries in fine details",
            "Jagged edges in curved structures",
            "Random noise patterns in detailed areas",
            "Loss of fine detail in complex structures",
            "Artificial enhancement artifacts",
            "Repeated element patterns",
            "Systematic color distribution anomalies",
            "Frequency domain signatures",
            "Unnatural color transitions",
            "Resolution inconsistencies within regions",
            "Glow or light bleed around object boundaries",
            "Ghosting effects: Semi-transparent duplicates of elements",
            "Cinematization effects",
            "Dramatic lighting that defies natural physics",
            "Artificial depth of field in object presentation",
            "Unnaturally glossy surfaces",
            "Synthetic material appearance",
            "Multiple inconsistent shadow sources",
            "Exaggerated characteristic features",
            "Scale inconsistencies within the same object class",
            "Incorrect skin tones",
        ],
    }


categories = _build_categories()


In [3]:
# Instruction text sent with every image pair; the bracketed list of allowed
# artifact names is appended to it when each request is built.
# The string is split into adjacent literals (one sentence / output line each)
# purely for readability; Python concatenates them at compile time.
# NOTE(review): the original had "clarity.\AVOID" (an invalid "\A" escape that
# leaves a literal backslash in the prompt). It is corrected to "\n" here; if
# the checkpoint was fine-tuned on the uncorrected text, confirm this is wanted.
prompt = (
    "Analyze the provided image and its corresponding Grad-CAM output generated by a model trained to detect fake visuals. "
    "Identify and explain artifacts that indicate it is fake. "
    "Focus primarily on the original image to identify and explain distinguishing artifacts that indicate it is fake. "
    "Use the Grad-CAM output for reference only when necessary. "
    "If multiple artifacts share similar meanings, select all relevant artifacts. "
    "Provide clear, concise explanations (maximum 50 words each) for each artifact. "
    "Include positional references like 'top left' or 'bottom right' when relevant. "
    "DO NOT include any other sentences or artifacts in your response.\n"
    "Output Format:\n"
    "\n"
    "Write each artifact and explanation on a separate line, using the format:\n"
    "Artifact Name: Explanation.\n"
    "For example:\n"
    "Unrealistic eye reflections: Unnatural symmetrical light reflections in both eyes, suggesting generated elements.\n"
    "Over-smoothing of natural textures: Fur appears unusually smooth in the top right, lacking natural texture variation.\n"
    "\n"
    "Notes:\n"
    "Positional references like 'top left' or 'bottom right' should be included where applicable to aid specificity.\n"
    "Explanations should remain under 50 words for clarity.\n"
    "AVOID referencing artifacts not listed or including extra commentary.\n"
    "\n"
    "ONLY use the artifacts listed below:\n"
)

In [None]:
data = pd.read_csv("mc_llava_output.csv")

# Query the model once per CSV row and store its reply in the "output" column.
# iterrows() is a generator without a length, so total= is passed explicitly
# to let tqdm render a real progress bar.
for i, row in tqdm(data.iterrows(), total=len(data)):
    # Allowed artifact names: the class-independent "major" list followed by
    # the list for this row's label. The lists are concatenated BEFORE joining
    # so that the last "major" entry and the first label-specific entry are
    # separated by ", " (joining them separately fused the two names).
    allowed_artifacts = categories["major"] + categories[row["label"]]

    messages = [
        {
            "role": "user",
            "content": [
                # Two images share the same relative path: one under
                # final_test/, one under gradcams/.
                {
                    "type": "image",
                    "image": f"final_test/{row['path']}",
                },
                {
                    "type": "image",
                    "image": f"gradcams/{row['path']}",
                },
                {
                    "type": "text",
                    "text": prompt + "[" + ", ".join(allowed_artifacts) + "]",
                },
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device the model was actually placed on (it is loaded with
    # device_map="auto") instead of hard-coding "cuda".
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=768)
    # Drop the leading len(input_ids) tokens of each output sequence so that
    # only the newly generated continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Batch size is 1, so the single decoded string is this row's answer.
    data.loc[i, "output"] = output_text[0]


In [5]:
# Write the frame (including the "output" column) to disk without the index.
data.to_csv(path_or_buf="qwen_output.csv", index=False)