In [None]:
import json
import logging
import os
from collections import defaultdict
from evaluate import load
import sys

# Add the parent directory and the current directory to the module search path.
# NOTE(review): no project-local module is imported in the visible cells —
# confirm these entries are still needed.
sys.path.append("../")
sys.path.append("./")

# Configure logging: emit INFO and above, and create the logger used by the
# functions defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def extract_filename(path):
    """Return the final component of ``path`` (the bare file name)."""
    _directory, filename = os.path.split(path)
    return filename


def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(file_path, encoding="utf-8") as f:
        # Blank lines (for example a trailing empty line) are skipped rather
        # than handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]


def calculate_metrics(source_texts, ground_truths):
    """Score predicted captions against references with BLEU, ROUGE and METEOR.

    Args:
        source_texts: Predicted captions, passed to each metric as
            ``predictions``.
        ground_truths: Reference captions, passed to each metric as
            ``references``.

    Returns:
        dict: ``bleu`` (overall score), ``bleu1`` .. ``bleu4`` (the first four
        entries of the BLEU ``precisions`` list), ``rouge`` (the ``rougeL``
        value) and ``meteor``.
    """
    # Load every metric first, then score them in the same order.
    metric_names = ("bleu", "rouge", "meteor")
    scorers = {name: load(name) for name in metric_names}
    scores = {
        name: scorers[name].compute(
            predictions=source_texts, references=ground_truths
        )
        for name in metric_names
    }

    bleu = scores["bleu"]
    report = {"bleu": bleu["bleu"]}
    # Expose the n-gram precisions individually, in order.
    report.update(
        (key, bleu["precisions"][idx])
        for idx, key in enumerate(("bleu1", "bleu2", "bleu3", "bleu4"))
    )
    report["rouge"] = scores["rouge"]["rougeL"]
    report["meteor"] = scores["meteor"]["meteor"]
    return report


def evaluate_models(gt_path, pred_path):
    """Compute caption metrics for every model found in a predictions file.

    Ground-truth and prediction records are paired on the file names of their
    ``pre_image`` and ``post_image`` paths. Predictions without a matching
    ground-truth entry are left out and reported at DEBUG level.

    Args:
        gt_path: JSONL file whose records provide ``pre_image``,
            ``post_image`` and ``change_caption``.
        pred_path: JSONL file whose records provide ``pre_image``,
            ``post_image``, ``change_caption`` and ``model_id``.

    Returns:
        dict: Maps each ``model_id`` to the result of ``calculate_metrics``
        for that model's matched captions.
    """
    logger.info(f"Loading ground truth from {gt_path}")
    gt_records = load_jsonl(gt_path)
    logger.info(f"Loaded {len(gt_records)} ground truth entries")

    logger.info(f"Loading predictions from {pred_path}")
    pred_records = load_jsonl(pred_path)
    logger.info(f"Loaded {len(pred_records)} prediction entries")

    # Reference captions keyed by the (pre, post) image file names; a later
    # record with the same pair replaces an earlier one.
    references = {}
    for record in gt_records:
        pair = (
            extract_filename(record["pre_image"]),
            extract_filename(record["post_image"]),
        )
        references[pair] = record["change_caption"]
    logger.info(f"Created ground truth mapping with {len(references)} entries")

    # Per-model parallel lists: predicted captions and their references.
    per_model = {}
    n_matched = 0
    for record in pred_records:
        pair = (
            extract_filename(record["pre_image"]),
            extract_filename(record["post_image"]),
        )
        if pair not in references:
            logger.debug(f"No match found for: {pair}")
            continue

        n_matched += 1
        bucket = per_model.setdefault(
            record["model_id"], {"source_texts": [], "ground_truths": []}
        )
        bucket["source_texts"].append(record["change_caption"])
        bucket["ground_truths"].append(references[pair])

    logger.info(f"Matched {n_matched} predictions with ground truth")
    logger.info(f"Found {len(per_model)} models in predictions")

    # Score each model's captions independently.
    results = {}
    for model_id, captions in per_model.items():
        sample_count = len(captions["source_texts"])
        logger.info(
            f"Calculating metrics for model {model_id} with {sample_count} samples"
        )
        results[model_id] = calculate_metrics(
            captions["source_texts"], captions["ground_truths"]
        )
    return results


# Score the baseline predictions against the ground-truth captions and print
# the per-model metric dictionary returned by evaluate_models.
results = evaluate_models("../data/xbd/xbd_gt.jsonl", "../output/xbd_subset_baseline.jsonl")
print(results)


In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model used by the similarity cells below, loaded from a
# local path and placed on the device "cuda:1".
# NOTE(review): the hard-coded device presumably needs a second CUDA GPU to be
# present — confirm before running elsewhere.
# Alternative checkpoint kept for reference:
# model = SentenceTransformer('../models/sentence-transformers/sentence-t5-base')
model = SentenceTransformer('../models/sentence-transformers/sentence-t5-xxl',device="cuda:1")




In [17]:
import numpy as np
# Sanity check: embed two sentences and compare them with the same
# |cos^3| score used by calculate_cosine_similarity.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(
    sentences, show_progress_bar=True, normalize_embeddings=True
)
print(embeddings[0].shape)

# Give each vector a leading axis before the pairwise call; the recorded
# output shows (768,) becoming (1, 768) for this model.
emb1 = embeddings[0][np.newaxis, :]
emb2 = embeddings[1][np.newaxis, :]
print(emb1.shape)

scs = np.abs(np.power(cosine_similarity(emb1, emb2), 3))
print(scs)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)
(1, 768)
[[0.58259344]]


In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import logging
import os
from collections import defaultdict

# Same logging setup as the first cell (INFO level, logger named after the
# module); presumably repeated so this cell can run on its own — TODO confirm.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(file_path, encoding="utf-8") as f:
        # Blank lines (for example a trailing empty line) are skipped rather
        # than handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

def extract_filename(path):
    """Return the final component of ``path`` (the bare file name)."""
    _directory, filename = os.path.split(path)
    return filename
   
def calculate_cosine_similarity(source_texts, ground_truths):
    """Return the mean cubed cosine similarity between paired texts.

    Both lists are embedded with the module-level ``model`` (normalised
    embeddings) and compared position by position. Each cosine similarity is
    cubed and its absolute value taken before averaging.

    Args:
        source_texts: Texts to score.
        ground_truths: Reference texts, aligned by index with
            ``source_texts``.

    Returns:
        The NumPy mean of the per-pair scores.
    """
    predicted_vecs = model.encode(source_texts, normalize_embeddings=True)
    reference_vecs = model.encode(ground_truths, normalize_embeddings=True)

    # Each vector gets a leading axis before the pairwise call, and the
    # single [0][0] entry of the result is read back as the pair's score.
    pair_scores = [
        abs(
            cosine_similarity(
                np.expand_dims(predicted, axis=0),
                np.expand_dims(reference, axis=0),
            )[0][0]
            ** 3
        )
        for predicted, reference in zip(predicted_vecs, reference_vecs)
    ]
    return np.mean(pair_scores)

def evaluate_models(gt_path, pred_path, include_cosine_similarity=False):
    """Compute per-model caption statistics for a predictions file.

    Predictions are paired with ground truth on the file names of their
    ``pre_image`` and ``post_image`` paths. Predictions without a matching
    ground-truth entry are skipped and logged at DEBUG level.

    Args:
        gt_path: JSONL file whose records provide ``pre_image``,
            ``post_image`` and ``change_caption``.
        pred_path: JSONL file whose records provide ``pre_image``,
            ``post_image``, ``change_caption`` and ``model_id``.
        include_cosine_similarity: When true, also report
            ``cosine_similarity`` via ``calculate_cosine_similarity``, which
            encodes every caption with the module-level ``model``. Defaults
            to False, so only ``avg_word_count`` is reported.

    Returns:
        dict: Maps each ``model_id`` to a metrics dict containing
        ``avg_word_count`` and, if requested, ``cosine_similarity``.
    """
    # Load data using the load_jsonl helper
    logger.info(f"Loading ground truth from {gt_path}")
    gt_data = load_jsonl(gt_path)
    logger.info(f"Loaded {len(gt_data)} ground truth entries")

    logger.info(f"Loading predictions from {pred_path}")
    pred_data = load_jsonl(pred_path)
    logger.info(f"Loaded {len(pred_data)} prediction entries")

    # Map (pre, post) image file names to the ground-truth caption; a later
    # record with the same pair replaces an earlier one.
    gt_mapping = {
        (
            extract_filename(item["pre_image"]),
            extract_filename(item["post_image"]),
        ): item["change_caption"]
        for item in gt_data
    }
    logger.info(f"Created ground truth mapping with {len(gt_mapping)} entries")

    # Group predictions by model_id as parallel lists of captions/references
    model_predictions = defaultdict(lambda: {"source_texts": [], "ground_truths": []})
    matched_count = 0

    for pred in pred_data:
        key = (
            extract_filename(pred["pre_image"]),
            extract_filename(pred["post_image"]),
        )
        if key in gt_mapping:
            matched_count += 1
            model_predictions[pred["model_id"]]["source_texts"].append(
                pred["change_caption"]
            )
            model_predictions[pred["model_id"]]["ground_truths"].append(gt_mapping[key])
        else:
            logger.debug(f"No match found for: {key}")

    logger.info(f"Matched {matched_count} predictions with ground truth")
    logger.info(f"Found {len(model_predictions)} models in predictions")

    # Calculate metrics for each model
    results = {}
    for model_id, data in model_predictions.items():
        captions = data["source_texts"]
        logger.info(
            f"Calculating metrics for model {model_id} with {len(captions)} samples"
        )

        metrics = {}
        # The embedding-based metric is opt-in (it was previously disabled by
        # commenting the code out) because it needs the loaded model.
        if include_cosine_similarity:
            metrics["cosine_similarity"] = calculate_cosine_similarity(
                captions, data["ground_truths"]
            )

        # Average caption length in whitespace-separated words. A model only
        # gets an entry once a caption has been appended, so the list is
        # never empty here.
        total_words = sum(len(caption.split()) for caption in captions)
        metrics["avg_word_count"] = total_words / len(captions)

        results[model_id] = metrics

    return results

# Run the evaluation on the ground-truth and baseline prediction files, then
# pretty-print the per-model results; sort_dicts=False keeps the dictionary's
# own insertion order instead of sorting the keys.
results = evaluate_models("../data/xbd/xbd_gt.jsonl", "../output/xbd_subset_baseline.jsonl")
import pprint
pprint.pprint(results, width=120, indent=4, sort_dicts=False)

INFO:__main__:Loading ground truth from ../data/xbd/xbd_gt.jsonl
INFO:__main__:Loaded 44136 ground truth entries
INFO:__main__:Loading predictions from ../output/xbd_subset_baseline.jsonl
INFO:__main__:Loaded 5928 prediction entries
INFO:__main__:Created ground truth mapping with 44136 entries
INFO:__main__:Matched 5928 predictions with ground truth
INFO:__main__:Found 6 models in predictions
INFO:__main__:Calculating metrics for model Qwen/Qwen2-VL-7B-Instruct with 988 samples
INFO:__main__:Calculating metrics for model Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5 with 988 samples
INFO:__main__:Calculating metrics for model OpenGVLab/InternVL2_5-8B with 988 samples
INFO:__main__:Calculating metrics for model llava-hf/llava-interleave-qwen-7b-hf with 988 samples
INFO:__main__:Calculating metrics for model llava-hf/llava-onevision-qwen2-7b-ov-hf with 988 samples
INFO:__main__:Calculating metrics for model mistralai/Pixtral-12B-2409 with 988 samples


{   'Qwen/Qwen2-VL-7B-Instruct': {'avg_word_count': 82.12246963562752},
    'Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5': {'avg_word_count': 61.5},
    'OpenGVLab/InternVL2_5-8B': {'avg_word_count': 83.03542510121457},
    'llava-hf/llava-interleave-qwen-7b-hf': {'avg_word_count': 69.00708502024291},
    'llava-hf/llava-onevision-qwen2-7b-ov-hf': {'avg_word_count': 75.99392712550608},
    'mistralai/Pixtral-12B-2409': {'avg_word_count': 87.87854251012146}}
