# Config

In [None]:
import os

# API credentials are read from environment variables so that real keys are
# never saved inside the notebook. Export OPENAI_API_KEY and
# HUGGINGFACE_API_KEY before starting the kernel; each falls back to an empty
# string (the previous hardcoded value) when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY", "")

# Prompt

In [22]:
# Prompt template for the role-classification step. It defines the four roles
# the model may answer with (informative, decorative, functional, complex) and
# ends with a single `{message}` placeholder that must be filled through
# `role_identifier_prompt.format(message=...)` with the image's details.
# The example answer list must name only the four roles defined in the prompt:
# any other answer can never match a ground-truth label during scoring.
role_identifier_prompt = """
You are part of a team tasked with generating role-aware and context-aware image alt texts for images on websites. Your role is to identify the role of the given image in the website according to the definitions provided by the WCAG Web Accessibility Initiative (WAI) outlined below.


1. informative: Images that graphically represent concepts and information, typically pictures, photos, and illustrations. The text alternative should be at least a short description conveying the essential information presented by the image.

2. decorative: The only purpose of an image is to add visual decoration to the page, rather than to convey information that is important to understanding the page. This includes images that are considered eye candy or used for visual effect. Classify the image to decorative if having a null alt-text (alt="") will not result in any loss of information.

3. functional: Images used as a link or as a button, which carry a functionality to the page. Examples of such images are a printer icon to represent the print function or a button to submit a form. The alt text should describe the functionality of the link or button rather than the visual image.

4. complex: Images used to convey data or detailed information, such as graphs or charts. Alt texts provide a complete text equivalent of the data or information provided in the image as the text alternative.


As each role needs to be handled differently when generating alt texts, your output will be used to help another team member write the most suitable alt text that is role-aware and contex-aware for the image to help create more accessible websites.

Return only the role of the image from the list above. Return the role as a single word without any enclosing bracket, e.g., informative, decorative, functional, or complex. THIS IS IMPORTANT! RETURN ONLY THE ROLE OF THE IMAGE.

You are given the details of the image found on a website as follows:
{message}
"""

# Load Data

In [None]:
import json
import os
import random

# Directory that holds the scraped JSON files consumed by the experiment.
json_dir = "../../scraper/output"

# os.listdir returns entries in arbitrary, platform-dependent order, so the
# listing is sorted first; otherwise the seeded shuffle below would not give
# the same order on every machine / run.
filenames = sorted(os.listdir(json_dir))

# Shuffle the filenames deterministically (fixed seed).
random.seed(42)
random.shuffle(filenames)

# Llama

In [None]:
from huggingface_hub import InferenceClient

# Hugging Face inference client shared by every Llama request below.
client = InferenceClient(api_key=HUGGINGFACE_API_KEY)

def determine_role_llama(image):
    """Ask Llama 3.2 Vision which role an image plays on its page.

    Args:
        image: dict describing one image. The keys read here are "src",
            "attrs", "a_button_parent", "previous_text" and "next_text".

    Returns:
        The content of the first completion choice, returned as-is (no
        trimming or lower-casing is done here).
    """
    # Textual context for the image; substituted into the prompt template.
    context_block = f"""
        The image's attributes: {json.dumps(image["attrs"])}\n\n
        The image's <a> or <button> parent: {image["a_button_parent"]}\n\n
        The previous text before the image appears: {image["previous_text"]}\n\n
        The next text after the image appears: {image["next_text"]}\n\n
    """

    # A single user turn: the image itself, followed by the filled-in prompt.
    picture_part = {"type": "image_url", "image_url": {"url": image["src"]}}
    prompt_part = {
        "type": "text",
        "text": role_identifier_prompt.format(message=context_block),
    }
    user_turn = {"role": "user", "content": [picture_part, prompt_part]}

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=[user_turn],
        max_tokens=500,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# GPT-4o

In [None]:
from langchain_openai import ChatOpenAI

def determine_role_gpt_4o(image):
    """Ask GPT-4o which role an image plays on its page.

    Args:
        image: dict describing one image. The keys read here are "src",
            "attrs", "a_button_parent", "previous_text" and "next_text".

    Returns:
        The content of the model's reply, returned as-is (no trimming or
        lower-casing is done here).
    """
    image_details = f"""
        The image's attributes: {json.dumps(image["attrs"])}\n\n
        The image's <a> or <button> parent: {image["a_button_parent"]}\n\n
        The previous text before the image appears: {image["previous_text"]}\n\n
        The next text after the image appears: {image["next_text"]}\n\n
    """

    role_identifier_llm = ChatOpenAI(model='gpt-4o', temperature=0.5, api_key=OPENAI_API_KEY)
    predicted_role = role_identifier_llm.invoke(
        [
            (
                "system",
                # The template ends with a `{message}` placeholder. It has to
                # be filled in here: passing the raw template would send the
                # literal text "{message}" to the model.
                role_identifier_prompt.format(message=image_details)
            ),
            (
                "human",
                [
                    # The textual details are already part of the system
                    # prompt above, so the human turn only carries the image.
                    {
                        "type": "image_url", "image_url": {"url": image["src"]}
                    }
                ]
            )
        ]
    )

    return predicted_role.content

# GPT-4o-mini

In [None]:
from langchain_openai import ChatOpenAI

# NOTE: this function is deliberately named `determine_role_gpt_4o_mini`.
# Defining it as `determine_role_gpt_4o` would silently replace the GPT-4o
# function of the same name, and every result reported as "gpt-4o" would
# really come from gpt-4o-mini.
def determine_role_gpt_4o_mini(image):
    """Ask GPT-4o-mini which role an image plays on its page.

    Args:
        image: dict describing one image. The keys read here are "src",
            "attrs", "a_button_parent", "previous_text" and "next_text".

    Returns:
        The content of the model's reply, returned as-is (no trimming or
        lower-casing is done here).
    """
    image_details = f"""
        The image's attributes: {json.dumps(image["attrs"])}\n\n
        The image's <a> or <button> parent: {image["a_button_parent"]}\n\n
        The previous text before the image appears: {image["previous_text"]}\n\n
        The next text after the image appears: {image["next_text"]}\n\n
    """

    role_identifier_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.5, api_key=OPENAI_API_KEY)
    predicted_role = role_identifier_llm.invoke(
        [
            (
                "system",
                # The template ends with a `{message}` placeholder. It has to
                # be filled in here: passing the raw template would send the
                # literal text "{message}" to the model.
                role_identifier_prompt.format(message=image_details)
            ),
            (
                "human",
                [
                    # The textual details are already part of the system
                    # prompt above, so the human turn only carries the image.
                    {
                        "type": "image_url", "image_url": {"url": image["src"]}
                    }
                ]
            )
        ]
    )

    return predicted_role.content

# Experiment

In [38]:
def calculate_scores(results):
    """Turn raw per-role confusion counts into accuracy / precision / recall / F1.

    Args:
        results: dict with
            - "total_images": number of images that were evaluated;
            - "details": {model: {role: {"true_positive": int,
              "false_positive": int, "false_negative": int}}}.

    Returns:
        {model: {"whole_accuracy": number,
                 "precision": {role: number},
                 "recall": {role: number},
                 "f1": {role: number}}}
        The "llama" and "gpt-4o" entries are always present; any other model
        found in results["details"] gets an entry as well. Every ratio whose
        denominator is zero is reported as 0, including whole_accuracy when
        no image has been evaluated yet.
    """
    def _empty_scores():
        # Fresh containers per model so the entries never share state.
        return {"whole_accuracy": 0, "precision": {}, "recall": {}, "f1": {}}

    # These two entries were always part of the return value; keep them even
    # when the corresponding model is missing from results["details"].
    scores = {"llama": _empty_scores(), "gpt-4o": _empty_scores()}

    total_images = results["total_images"]

    for model, per_role in results["details"].items():
        # Accept model names beyond the two defaults instead of raising KeyError.
        model_scores = scores.setdefault(model, _empty_scores())

        # Whole accuracy: correctly classified images over all evaluated images.
        # Guarded so that an empty run does not raise ZeroDivisionError.
        correct = sum(counts["true_positive"] for counts in per_role.values())
        model_scores["whole_accuracy"] = correct / total_images if total_images > 0 else 0

        # Precision, recall and F1 for each role.
        for role, counts in per_role.items():
            true_positive = counts["true_positive"]
            false_positive = counts["false_positive"]
            false_negative = counts["false_negative"]

            precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0
            recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

            model_scores["precision"][role] = precision
            model_scores["recall"][role] = recall
            model_scores["f1"][role] = f1

    return scores

In [39]:
# Roles that are scored; images labelled with any other role are skipped.
VALID_ROLES = ("informative", "decorative", "functional", "complex")

# Limits on how much data one run touches. Both are 1, i.e. a smoke test on a
# single image of a single file; raise them (or set to None) for a full run.
MAX_FILES = 1
MAX_IMAGES_PER_FILE = 1

# Checkpoint file, rewritten after every processed JSON file.
OUTPUT_PATH = "./output/llama-complete.json"

# Key used in `results` -> function that returns the model's raw answer.
role_predictors = {
    "llama": determine_role_llama,
    "gpt-4o": determine_role_gpt_4o,
}
# Key used in `results` -> label used in the progress printout.
display_names = {"llama": "Llama", "gpt-4o": "GPT-4o"}

# Confusion counts per model and per role, all starting at zero.
results = {
    "total_images": 0,
    "details": {
        model: {
            role: {"true_positive": 0, "false_positive": 0, "false_negative": 0}
            for role in VALID_ROLES
        }
        for model in role_predictors
    }
}

# Use the seeded, shuffled `filenames` list from the "Load Data" cell rather
# than a fresh os.listdir() call (whose order is arbitrary), and drop non-JSON
# entries *before* applying the limit so the limit counts usable files.
json_filenames = [name for name in filenames if name.endswith(".json")]

for filename in json_filenames[:MAX_FILES]:
    try:
        # Read the JSON file
        with open(os.path.join(json_dir, filename), "r") as file:
            data = json.load(file)

        print(f"Processing {filename}...")

        for image in data["images"][:MAX_IMAGES_PER_FILE]:
            true_role = image["role"]
            if true_role not in VALID_ROLES:
                continue

            # Query every model before touching the counters: if one request
            # fails, the image is not half-counted.
            # Answers are normalised (trimmed, lower-cased, periods removed)
            # so that e.g. "Functional." matches the label "functional".
            predictions = {
                model: predict(image).strip().lower().replace(".", "")
                for model, predict in role_predictors.items()
            }

            print("Correct role:", true_role)
            for model, predicted_role in predictions.items():
                print(f"Predicted role ({display_names[model]}):", predicted_role)

            results["total_images"] += 1
            for model, predicted_role in predictions.items():
                per_role = results["details"][model]
                if predicted_role == true_role:
                    per_role[true_role]["true_positive"] += 1
                else:
                    per_role[true_role]["false_negative"] += 1
                    # An answer that is not one of the known roles counts as a
                    # miss for the true role but as a false positive for none.
                    if predicted_role in per_role:
                        per_role[predicted_role]["false_positive"] += 1

        # Save the current state to a JSON file. The payload is built before
        # the file is opened so that a scoring error cannot leave behind a
        # truncated file; nothing is written until an image has been scored.
        if results["total_images"] > 0:
            snapshot = {
                "results": results,
                "scores": calculate_scores(results)
            }
            os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
            with open(OUTPUT_PATH, "w") as file:
                json.dump(snapshot, file, indent=4)

    except Exception as e:
        # Best effort: report which file failed and why, then move on.
        print(f"Failed to process {filename}: {type(e).__name__}: {e}")

print("Total images processed:", results["total_images"])
print("Results:", results)
if results["total_images"] > 0:
    print("Scores:", calculate_scores(results))
else:
    print("Scores: not computed (no images were processed)")

Processing hannarubbercompany.com.json...
Correct role: functional
Predicted role (Llama): informative
Predicted role (GPT-4o): functional
Total images processed: 1
Results: {'total_images': 1, 'details': {'llama': {'informative': {'true_positive': 0, 'false_positive': 1, 'false_negative': 0}, 'decorative': {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}, 'functional': {'true_positive': 0, 'false_positive': 0, 'false_negative': 1}, 'complex': {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}}, 'gpt-4o': {'informative': {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}, 'decorative': {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}, 'functional': {'true_positive': 1, 'false_positive': 0, 'false_negative': 0}, 'complex': {'true_positive': 0, 'false_positive': 0, 'false_negative': 0}}}}
Scores: {'llama': {'whole_accuracy': 0.0, 'precision': {'informative': 0.0, 'decorative': 0, 'functional': 0, 'complex': 0}, 'recall': {'informative':