In [None]:
import numpy as np
import os
import cv2
from sklearn.metrics import jaccard_score, f1_score, recall_score, precision_score

def compute_iou(ground_truth_binary, predicted_binary, epsilon=1e-7):
    """Return the intersection-over-union of two binary masks.

    Args:
        ground_truth_binary: Array-like reference mask; truthy entries are foreground.
        predicted_binary: Array-like predicted mask, broadcastable against the reference.
        epsilon: Small constant added to the denominator so that two empty
            masks yield 0.0 instead of a division by zero.

    Returns:
        Overlap count divided by (union count + epsilon).
    """
    overlap_mask = np.logical_and(ground_truth_binary, predicted_binary)
    combined_mask = np.logical_or(ground_truth_binary, predicted_binary)
    return overlap_mask.sum() / (combined_mask.sum() + epsilon)

def compute_f1(ground_truth_binary, predicted_binary, epsilon=1e-7):
    """Compute precision, recall and F1 for a pair of binary masks.

    Args:
        ground_truth_binary: Array-like reference mask; truthy entries are foreground.
        predicted_binary: Array-like predicted mask, broadcastable against the reference.
        epsilon: Small constant added to every denominator to avoid division by zero.

    Returns:
        Tuple ``(precision, recall, f1)``.
    """
    background_truth = np.logical_not(ground_truth_binary)
    background_pred = np.logical_not(predicted_binary)

    # Confusion-matrix counts (true negatives are not needed for these metrics).
    tp = np.logical_and(ground_truth_binary, predicted_binary).sum()
    fp = np.logical_and(background_truth, predicted_binary).sum()
    fn = np.logical_and(ground_truth_binary, background_pred).sum()

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * (precision * recall) / (precision + recall + epsilon)

    return precision, recall, f1

In [None]:
# Evaluation grid. Each dataset directory is paired with the filename prefix of
# its samples (e.g. 'forest' -> 'f1' ... 'f10').
datasets = ['forest', 'fish', 'fire', 'ultrasound', 'radiology', 'water', 'dancing', 'road', 'crack']
data_label_prefixes = ['f', 'fish', 'fire', 'm', 'r', 'w', 'd', 'road', 'c']
# 'gaussian_noise' used to appear twice here while 'gaussian_blur' was missing,
# so blur masks were never scored and the "Blur" category downstream silently
# lost one member. The list now contains each perturbation exactly once.
perturbations = ['raw', 'bright', 'chromatic_aberration', 'compressed', 'contrast', 'defocus_blur', 'motion_blur', 'gaussian_noise', 'salt_pepper_noise', 'elastic_transform', 'fog', 'gaussian_blur', 'radial_distortion', 'saturation', 'shot_noise', 'snow']
prompt_types = ['point', 'box', 'point_box']

# Datasets whose ground-truth masks are stored as PNG; all others use JPG.
png_gt_datasets = ('ultrasound', 'radiology', 'fish', 'dancing', 'road')

# One tuple per (dataset, perturbation, prompt type):
# (dataset, perturbation, prompt_type, mean IoU, mean F1, mean recall, mean precision)
results = []

for dataset, data_label_prefix in zip(datasets, data_label_prefixes):
    gt_extension = 'png' if dataset in png_gt_datasets else 'jpg'
    for perturbation in perturbations:
        for prompt_type in prompt_types:
            iou_sum = 0
            f1_sum = 0
            recall_sum = 0
            precision_sum = 0
            count = 0
            # Samples are numbered 1..10 within every dataset.
            for i in range(1, 11):
                data_label = f'{data_label_prefix}{i}'
                gt_path = os.path.join(str(dataset), f'{data_label}_raw_gt.{gt_extension}')
                pred_path = os.path.join(dataset, f'{data_label}_{perturbation}_{prompt_type}_mask.npy')
                gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
                if gt is None:
                    # cv2.imread signals failure by returning None instead of raising;
                    # without this check `gt == 255` would be a scalar False and the
                    # sample would silently contribute all-zero scores.
                    raise FileNotFoundError(f"Could not read ground-truth mask: {gt_path}")
                pred = np.load(pred_path)

                if data_label_prefix == "r":
                    # Resize the predicted masks to match the size of the ground truth mask
                    pred = cv2.resize(pred[0], (gt.shape[1], gt.shape[0]), interpolation=cv2.INTER_NEAREST)

                # Only pixels that are exactly 255 count as foreground.
                # NOTE(review): some ground truths are JPEG files; confirm they are
                # strictly binary, since near-white compression artefacts would be
                # treated as background here.
                ground_truth_binary = gt == 255
                predicted_binary = pred == 255
                iou = compute_iou(ground_truth_binary, predicted_binary)
                precision, recall, f1_val = compute_f1(ground_truth_binary, predicted_binary)
                iou_sum += iou
                f1_sum += f1_val
                recall_sum += recall
                precision_sum += precision
                count += 1
            if count > 0:
                results.append((dataset, perturbation, prompt_type, iou_sum / count, f1_sum / count, recall_sum / count, precision_sum / count))

# Print the results in a tabular format
print("Dataset\tPerturbation\tPrompt Type\tIoU\tF1\tRecall\tPrecision")
for res in results:
    print(f"{res[0]}\t{res[1]}\t{res[2]}\t{res[3]:.4f}\t{res[4]:.4f}\t{res[5]:.4f}\t{res[6]:.4f}")


In [None]:
# Pool the per-dataset means: every (perturbation, prompt type) pair ends up
# with one cross-dataset average per metric, stored as
# [mean IoU, mean F1, mean recall, mean precision].
metric_totals = {}
for entry in results:
    pair = (entry[1], entry[2])
    # Slots 0-3 accumulate the four metrics, slot 4 counts contributing datasets.
    totals = metric_totals.setdefault(pair, [0, 0, 0, 0, 0])
    for slot, value in enumerate(entry[3:7]):
        totals[slot] += value
    totals[4] += 1

average_results = {
    pair: [total / totals[4] for total in totals[:4]]
    for pair, totals in metric_totals.items()
}

# Tab-separated summary, one row per (perturbation, prompt type) pair.
print("Perturbation\tPrompt Type\tAverage IoU\tAverage F1\tAverage Recall\tAverage Precision")
for (pert_name, prompt_name), (avg_iou, avg_f1, avg_recall, avg_precision) in average_results.items():
    print(f"{pert_name}\t{prompt_name}\t{avg_iou:.4f}\t{avg_f1:.4f}\t{avg_recall:.4f}\t{avg_precision:.4f}")


In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm

# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated
# in Matplotlib 3.7 and later removed.
viridis_cmap = plt.get_cmap('viridis')
# Maps the prompt index (0, 1, 2) into [0, 1) so each prompt gets its own colour.
norm = plt.Normalize(0, len(prompt_types))
plt.rcParams["font.family"] = "Times New Roman"

# Re-key the averages as plot_data[perturbation][prompt_type] -> metric list
# ordered [IoU, F1, Recall, Precision].
plot_data = {}
for key, values in average_results.items():
    if key[0] not in plot_data:
        plot_data[key[0]] = {}
    plot_data[key[0]][key[1]] = values

# Groups of perturbations shown as one bar cluster each. The member names must
# match the perturbation names used as keys in the results; the brightness
# perturbation is stored as "bright" (it was previously listed as "brightness",
# which matched nothing and was silently dropped from the "IC" average).
perturbation_categories = {
    "Raw": ["raw"],
    "Noise": ["gaussian_noise", "shot_noise", "salt_pepper_noise"],
    "Blur": ["gaussian_blur", "motion_blur", "defocus_blur"],
    "OG": ["chromatic_aberration", "elastic_transform", "radial_distortion"],
    "IC": ["bright", "saturation", "contrast"],
    "ENV": ["snow", "fog"],
    "CMP": ["compressed"],
}

# The averaging below skips members without results; make that visible instead
# of letting a naming mismatch shrink a category unnoticed.
missing_perturbations = sorted({p for members in perturbation_categories.values() for p in members if p not in plot_data})
if missing_perturbations:
    warnings.warn(f"No results for perturbations {missing_perturbations}; they are left out of the category averages.")

# Create a 2x2 grid of subplots, one panel per metric
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
metrics = ['IoU', 'F1', 'Recall', 'Precision']

for row in range(2):
    for col in range(2):
        # Panel position doubles as the index into each metric list.
        metric_idx = row * 2 + col
        metric = metrics[metric_idx]
        ax = axes[row, col]
        x = np.arange(len(perturbation_categories))
        width = 0.2

        for idx, prompt_type in enumerate(prompt_types):
            # Mean of the metric over the category's available perturbations.
            y = [np.mean([plot_data[p][prompt_type][metric_idx] for p in perturbation_categories[category] if p in plot_data]) for category in perturbation_categories]
            color = viridis_cmap(norm(idx))
            # (idx - 1) centres the three bars of a cluster on the tick.
            ax.bar(x + (idx - 1) * width, y, width, label=prompt_type, color=color)

        ax.set_xticks(x)
        ax.set_xticklabels(list(perturbation_categories.keys()), rotation=0, fontsize=11)
        ax.set_ylabel(f'Average {metric}', fontsize=12)
        ax.set_title(f'Prompting Effect on {metric}')
# fig.legend already attaches the legend to the figure, so the former
# fig.add_artist(legend) call (which registered it a second time) is dropped.
legend = fig.legend(['Point', 'Box', 'Combination of Point and Box'], loc='upper center', bbox_to_anchor=(0.5, -0.01), ncol=3, fontsize=11)
plt.subplots_adjust(bottom=0.15, hspace=0.4)
plt.tight_layout()
#plt.savefig("prompting_effect_metrics.pdf", bbox_inches='tight')
plt.show()


In [None]:
import pandas as pd

# Column-oriented accumulator for the raw-vs-perturbed comparison; one entry
# per (prompt type, metric) combination is appended to every column.
table_data = {
    "Prompting Technique": [],
    "Metric": [],
    "Raw": [],
    "Perturbed": [],
    "Change (%)": []
}

# All perturbation names outside the "Raw" category, in category order.
perturbed_names = [
    name
    for category, members in perturbation_categories.items()
    if category != "Raw"
    for name in members
]

for prompt_type in prompt_types:
    for metric_idx, metric in enumerate(metrics):
        raw_average = plot_data["raw"][prompt_type][metric_idx]

        # Names without results are skipped rather than raising.
        perturbed_average = [plot_data[name][prompt_type][metric_idx] for name in perturbed_names if name in plot_data]
        perturbed_mean = np.mean(perturbed_average)

        # Relative change of the perturbed mean with respect to the raw score.
        percentage_change = ((perturbed_mean - raw_average) / raw_average) * 100

        # table_data keeps its insertion order, so the row lines up with the columns.
        row = (prompt_type, metric, raw_average, perturbed_mean, percentage_change)
        for column, value in zip(table_data, row):
            table_data[column].append(value)

# Materialise the table and show it as plain text.
performance_table = pd.DataFrame(table_data)
print(performance_table)


In [None]:
# Decimal places kept per numeric column before export.
rounding_spec = {'Raw': 3, 'Perturbed': 3, 'Change (%)': 2}
performance_table_rounded = performance_table.round(rounding_spec)

# Render the rounded table as LaTeX source (row index omitted) and print it.
latex_table = performance_table_rounded.to_latex(index=False)
print(latex_table)


In [None]:
from collections import defaultdict

# Evaluation grid. Each dataset directory is paired with the filename prefix of
# its samples (e.g. 'forest' -> 'f1' ... 'f10').
datasets = ['forest', 'fish', 'fire', 'ultrasound', 'radiology', 'water', 'dancing', 'road', 'crack']
data_label_prefixes = ['f', 'fish', 'fire', 'm', 'r', 'w', 'd', 'road', 'c']
perturbations = ['raw', 'bright', 'chromatic_aberration', 'compressed', 'contrast', 'defocus_blur', 'motion_blur', 'gaussian_noise', 'salt_pepper_noise', 'elastic_transform', 'fog', 'gaussian_blur', 'radial_distortion', 'saturation', 'shot_noise', 'snow']
prompt_types = ['point', 'box', 'point_box']
# Number of best/worst perturbations kept per (dataset, prompt type) downstream.
top_k = 4

# Datasets whose ground-truth masks are stored as PNG; all others use JPG.
png_gt_datasets = ('ultrasound', 'radiology', 'fish', 'dancing', 'road')

# NOTE: this rebinds `results`, which an earlier cell uses for a flat list of
# tuples. From here on it is results[dataset][prompt_type] -> list of
# (perturbation, mean IoU, mean F1, mean recall, mean precision).
results = defaultdict(lambda: defaultdict(list))

for dataset, data_label_prefix in zip(datasets, data_label_prefixes):
    gt_extension = 'png' if dataset in png_gt_datasets else 'jpg'
    for prompt_type in prompt_types:
        for perturbation in perturbations:
            iou_sum = 0
            f1_sum = 0
            recall_sum = 0
            precision_sum = 0
            count = 0
            # Samples are numbered 1..10 within every dataset.
            for i in range(1, 11):
                data_label = f'{data_label_prefix}{i}'
                gt_path = os.path.join(str(dataset), f'{data_label}_raw_gt.{gt_extension}')
                pred_path = os.path.join(dataset, f'{data_label}_{perturbation}_{prompt_type}_mask.npy')
                gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
                if gt is None:
                    # cv2.imread signals failure by returning None instead of raising;
                    # without this check `gt == 255` would be a scalar False and the
                    # sample would silently contribute all-zero scores.
                    raise FileNotFoundError(f"Could not read ground-truth mask: {gt_path}")
                pred = np.load(pred_path)

                if data_label_prefix == "r":
                    # Resize the predicted masks to match the size of the ground truth mask
                    pred = cv2.resize(pred[0], (gt.shape[1], gt.shape[0]), interpolation=cv2.INTER_NEAREST)

                # Only pixels that are exactly 255 count as foreground.
                ground_truth_binary = gt == 255
                predicted_binary = pred == 255
                iou = compute_iou(ground_truth_binary, predicted_binary)
                precision, recall, f1 = compute_f1(ground_truth_binary, predicted_binary)
                iou_sum += iou
                f1_sum += f1
                recall_sum += recall
                precision_sum += precision
                count += 1

            if count > 0:
                mean_iou = iou_sum / count
                mean_f1 = f1_sum / count
                mean_recall = recall_sum / count
                mean_precision = precision_sum / count
                results[dataset][prompt_type].append((perturbation, mean_iou, mean_f1, mean_recall, mean_precision))

In [None]:
# For every (dataset, prompt type) pair, find the perturbation with the lowest
# mean IoU ("easily influenced") and the one with the highest mean IoU
# ("difficult to influence").
# NOTE(review): `perturbations` includes 'raw', so the unperturbed baseline
# takes part in this ranking and can be reported as the winner - confirm that
# this is intended.
easily_influenced_results = []
difficult_influenced_results = []

# Datasets whose ground-truth masks are stored as PNG; all others use JPG.
png_gt_datasets = ('ultrasound', 'radiology', 'fish', 'dancing', 'road')

for dataset, data_label_prefix in zip(datasets, data_label_prefixes):
    gt_extension = 'png' if dataset in png_gt_datasets else 'jpg'
    for prompt_type in prompt_types:
        min_iou = float('inf')
        max_iou = float('-inf')
        easily_influenced = None
        difficult_to_influence = None

        for perturbation in perturbations:
            iou_sum = 0
            count = 0
            # Samples are numbered 1..10 within every dataset.
            for i in range(1, 11):
                data_label = f'{data_label_prefix}{i}'
                gt_path = os.path.join(str(dataset), f'{data_label}_raw_gt.{gt_extension}')
                pred_path = os.path.join(dataset, f'{data_label}_{perturbation}_{prompt_type}_mask.npy')
                gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
                if gt is None:
                    # cv2.imread signals failure by returning None instead of raising;
                    # without this check `gt == 255` would be a scalar False and the
                    # sample would silently contribute an IoU of zero.
                    raise FileNotFoundError(f"Could not read ground-truth mask: {gt_path}")
                pred = np.load(pred_path)

                if data_label_prefix == "r":
                    # Resize the predicted masks to match the size of the ground truth mask
                    pred = cv2.resize(pred[0], (gt.shape[1], gt.shape[0]), interpolation=cv2.INTER_NEAREST)

                # Only pixels that are exactly 255 count as foreground.
                ground_truth_binary = gt == 255
                predicted_binary = pred == 255
                iou = compute_iou(ground_truth_binary, predicted_binary)
                iou_sum += iou
                count += 1

            if count > 0:
                mean_iou = iou_sum / count
                # Strict comparisons keep the first perturbation on ties.
                if mean_iou < min_iou:
                    min_iou = mean_iou
                    easily_influenced = perturbation

                if mean_iou > max_iou:
                    max_iou = mean_iou
                    difficult_to_influence = perturbation

        easily_influenced_results.append((dataset, prompt_type, easily_influenced, min_iou))
        difficult_influenced_results.append((dataset, prompt_type, difficult_to_influence, max_iou))

# Print the easily influenced results
print("Dataset\tPrompt Type\tEasily Influenced\tIoU")
for res in easily_influenced_results:
    print(f"{res[0]}\t{res[1]}\t{res[2]}\t{res[3]:.4f}")

print("\n")

# Print the difficult to influence results
print("Dataset\tPrompt Type\tDifficult to Influence\tIoU")
for res in difficult_influenced_results:
    print(f"{res[0]}\t{res[1]}\t{res[2]}\t{res[3]:.4f}")


In [None]:
# Rank the perturbations of every (dataset, prompt type) pair by mean IoU and
# keep the top_k lowest ("easy" to influence) and the top_k highest
# ("difficult" to influence, best first).
final_results = {}
for dataset in datasets:
    per_prompt = {}
    for prompt_type in prompt_types:
        ranked = sorted(results[dataset][prompt_type], key=lambda entry: entry[1])
        per_prompt[prompt_type] = {
            "easy": ranked[:top_k],
            "difficult": ranked[-top_k:][::-1],
        }
    final_results[dataset] = per_prompt

# Tab-separated overview: perturbation names with their mean IoU in parentheses.
print("Dataset\tPrompt Type\tEasily Influenced (Top k)\tDifficult to Influence (Top k)")
for dataset, per_prompt in final_results.items():
    for prompt_type, ranking in per_prompt.items():
        easy = ", ".join(f"{name_and_scores[0]} ({name_and_scores[1]:.4f})" for name_and_scores in ranking["easy"])
        difficult = ", ".join(f"{name_and_scores[0]} ({name_and_scores[1]:.4f})" for name_and_scores in ranking["difficult"])
        print(f"{dataset}\t{prompt_type}\t{easy}\t{difficult}")

# Full dump of every stored mean, in insertion order.
print("\nDataset\tPrompt Type\tPerturbation\tIoU\tF1\tRecall\tPrecision")
for dataset, per_prompt_results in results.items():
    for prompt_type, entries in per_prompt_results.items():
        for res in entries:
            print(f"{dataset}\t{prompt_type}\t{res[0]}\t{res[1]:.4f}\t{res[2]:.4f}\t{res[3]:.4f}\t{res[4]:.4f}")

In [None]:
# Per dataset, pick the prompt type whose easy + difficult lists have the
# largest summed IoU (tuple index 1) and record that prompt's perturbation names.
influence_results = []
for dataset, per_prompt in final_results.items():
    best_prompt, best_prompt_score = None, float('-inf')
    top_k_easy, top_k_difficult = [], []

    for prompt_type, ranking in per_prompt.items():
        # Both lists are ordered by IoU: ascending for easy, descending for difficult.
        easy_top = sorted(ranking["easy"], key=lambda entry: entry[1])[:4]
        difficult_top = sorted(ranking["difficult"], key=lambda entry: entry[1], reverse=True)[:4]

        total_score = sum(entry[1] for entry in easy_top) + sum(entry[1] for entry in difficult_top)

        # Strict comparison: the first prompt type wins ties.
        if total_score > best_prompt_score:
            best_prompt_score = total_score
            best_prompt = prompt_type
            top_k_easy, top_k_difficult = easy_top, difficult_top

    easy_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_easy)
    difficult_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_difficult)

    influence_results.append([dataset, best_prompt, easy_perturbations, difficult_perturbations])

iou_influence_df = pd.DataFrame(influence_results, columns=["Dataset", "Most Suitable Prompt", "Top-4 Easily Influenced", "Top-4 Difficult to Influence"])

In [None]:
# Per dataset, pick the prompt type whose easy + difficult lists have the
# largest summed F1 (tuple index 2) and record that prompt's perturbation names.
# Note: the lists themselves are still ordered by IoU (tuple index 1).
influence_results = []
for dataset, per_prompt in final_results.items():
    best_prompt, best_prompt_score = None, float('-inf')
    top_k_easy, top_k_difficult = [], []

    for prompt_type, ranking in per_prompt.items():
        easy_top = sorted(ranking["easy"], key=lambda entry: entry[1])[:4]
        difficult_top = sorted(ranking["difficult"], key=lambda entry: entry[1], reverse=True)[:4]

        total_score = sum(entry[2] for entry in easy_top) + sum(entry[2] for entry in difficult_top)

        # Strict comparison: the first prompt type wins ties.
        if total_score > best_prompt_score:
            best_prompt_score = total_score
            best_prompt = prompt_type
            top_k_easy, top_k_difficult = easy_top, difficult_top

    easy_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_easy)
    difficult_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_difficult)

    influence_results.append([dataset, best_prompt, easy_perturbations, difficult_perturbations])

f1_influence_df = pd.DataFrame(influence_results, columns=["Dataset", "Most Suitable Prompt", "Top-4 Easily Influenced", "Top-4 Difficult to Influence"])

In [None]:
# Per dataset, pick the prompt type whose easy + difficult lists have the
# largest summed recall (tuple index 3) and record that prompt's perturbations,
# each followed by its recall in parentheses.
# Note: the lists themselves are still ordered by IoU (tuple index 1).
influence_results = []
for dataset, per_prompt in final_results.items():
    best_prompt, best_prompt_score = None, float('-inf')
    top_k_easy, top_k_difficult = [], []

    for prompt_type, ranking in per_prompt.items():
        easy_top = sorted(ranking["easy"], key=lambda entry: entry[1])[:4]
        difficult_top = sorted(ranking["difficult"], key=lambda entry: entry[1], reverse=True)[:4]

        total_score = sum(entry[3] for entry in easy_top) + sum(entry[3] for entry in difficult_top)

        # Strict comparison: the first prompt type wins ties.
        if total_score > best_prompt_score:
            best_prompt_score = total_score
            best_prompt = prompt_type
            top_k_easy, top_k_difficult = easy_top, difficult_top

    easy_perturbations = ", ".join(f"{entry[0]} ({entry[3]:.3f})" for entry in top_k_easy)
    difficult_perturbations = ", ".join(f"{entry[0]} ({entry[3]:.3f})" for entry in top_k_difficult)

    influence_results.append([dataset, best_prompt, easy_perturbations, difficult_perturbations])

recall_influence_df = pd.DataFrame(influence_results, columns=["Dataset", "Most Suitable Prompt", "Top-4 Easily Influenced", "Top-4 Difficult to Influence"])

In [None]:
# Per dataset, pick the prompt type whose easy + difficult lists have the
# largest summed precision (tuple index 4) and record that prompt's
# perturbation names.
# Note: the lists themselves are still ordered by IoU (tuple index 1).
influence_results = []
for dataset, per_prompt in final_results.items():
    best_prompt, best_prompt_score = None, float('-inf')
    top_k_easy, top_k_difficult = [], []

    for prompt_type, ranking in per_prompt.items():
        easy_top = sorted(ranking["easy"], key=lambda entry: entry[1])[:4]
        difficult_top = sorted(ranking["difficult"], key=lambda entry: entry[1], reverse=True)[:4]

        total_score = sum(entry[4] for entry in easy_top) + sum(entry[4] for entry in difficult_top)

        # Strict comparison: the first prompt type wins ties.
        if total_score > best_prompt_score:
            best_prompt_score = total_score
            best_prompt = prompt_type
            top_k_easy, top_k_difficult = easy_top, difficult_top

    easy_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_easy)
    difficult_perturbations = ", ".join(f"{entry[0]}" for entry in top_k_difficult)

    influence_results.append([dataset, best_prompt, easy_perturbations, difficult_perturbations])

precision_influence_df = pd.DataFrame(influence_results, columns=["Dataset", "Most Suitable Prompt", "Top-4 Easily Influenced", "Top-4 Difficult to Influence"])

In [None]:
import re

def majority_vote(top3_lists, top_n=4):
    """Return the most frequently listed names across several rankings.

    Args:
        top3_lists: Iterable of strings, each a ``", "``-separated list of
            items. Only the leading run of word characters of every item is
            counted, so a suffix such as ``" (0.812)"`` is ignored.
        top_n: Maximum number of names to return (defaults to 4, the
            previously hard-coded cut-off).

    Returns:
        The ``top_n`` most frequent names joined by ``", "``, most frequent
        first; names with equal counts keep their first-seen order. Empty
        input yields an empty string.
    """
    counter = {}
    for top3 in top3_lists:
        for item in top3.split(', '):
            match = re.match(r"([\w_]+)", item)
            if match is None:
                # Splitting an empty ranking produces '' which has no name;
                # skip it instead of failing on `None.group`.
                continue
            perturbation_name = match.group(1)
            counter[perturbation_name] = counter.get(perturbation_name, 0) + 1
    # sorted() is stable, so ties stay in first-seen order.
    sorted_items = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    top_items = [item[0] for item in sorted_items[:top_n]]
    return ", ".join(top_items)

# Get the per-metric lists of top-4 easily influenced and difficult to
# influence perturbations (one string per dataset).
iou_easy = iou_influence_df["Top-4 Easily Influenced"].tolist()
f1_easy = f1_influence_df["Top-4 Easily Influenced"].tolist()
recall_easy = recall_influence_df["Top-4 Easily Influenced"].tolist()
precision_easy = precision_influence_df["Top-4 Easily Influenced"].tolist()

iou_difficult = iou_influence_df["Top-4 Difficult to Influence"].tolist()
f1_difficult = f1_influence_df["Top-4 Difficult to Influence"].tolist()
recall_difficult = recall_influence_df["Top-4 Difficult to Influence"].tolist()
precision_difficult = precision_influence_df["Top-4 Difficult to Influence"].tolist()

# Create the final dataframe using majority voting across the four metrics
final_data = []
for idx, dataset in enumerate(iou_influence_df["Dataset"]):
    prompt_votes = [iou_influence_df.at[idx, "Most Suitable Prompt"],
                    f1_influence_df.at[idx, "Most Suitable Prompt"],
                    recall_influence_df.at[idx, "Most Suitable Prompt"],
                    precision_influence_df.at[idx, "Most Suitable Prompt"]]
    # Exactly one prompt must win. Routing the votes through majority_vote
    # returned up to four names (e.g. "point_box, box") whenever the metrics
    # disagreed. max() returns the first maximal element, so ties go to the
    # earlier metric in the order IoU, F1, recall, precision.
    most_suitable_prompt = max(prompt_votes, key=prompt_votes.count)

    easy = majority_vote([iou_easy[idx], f1_easy[idx], recall_easy[idx], precision_easy[idx]])
    difficult = majority_vote([iou_difficult[idx], f1_difficult[idx], recall_difficult[idx], precision_difficult[idx]])

    final_data.append([dataset, most_suitable_prompt, easy, difficult])

final_df = pd.DataFrame(final_data, columns=["Dataset", "Most Suitable Prompt (Majority Vote)", "Top-4 Easily Influenced (Majority Vote)", "Top-4 Difficult to Influence (Majority Vote)"])