In [None]:
import json
import os
from collections import defaultdict, Counter  # Used for easier word counting and grouped dictionaries

# Path to directory with model outputs
directory_path = "/home/jovyan/Evaluation/OCR/Data"


def load_model_outputs(directory, file_names):
    """Load the given JSON files from *directory* into one dictionary.

    Args:
        directory: Directory that contains the files.
        file_names: Names of the JSON files to load, relative to *directory*.

    Returns:
        A dict mapping each file name to its parsed JSON content, in the
        order given by *file_names*.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    model_outputs = {}
    for file_name in file_names:
        # Explicit UTF-8: without it the platform's locale encoding is used,
        # which can mis-decode or reject non-ASCII text.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            model_outputs[file_name] = json.load(f)
    return model_outputs


def build_consensus(all_data, min_models=3):
    """Build the per-image consensus word list from several model outputs.

    A word is kept for an image when at least *min_models* models predicted
    it for that image. It is repeated as many times as the smallest
    per-model count among the models that predicted it.

    Args:
        all_data: Dict of ``{model_name: {image_name: [word, ...]}}``.
        min_models: Minimum number of models that must contain a word for it
            to be part of the consensus. Defaults to 3.

    Returns:
        A dict ``{image_name: [word, ...]}`` with one entry for every image
        seen in any model. Images are inserted in sorted order; words appear
        in first-seen order while walking the models in ``all_data`` order.
    """
    # Collect all unique image names across all models.
    all_images = set()
    for model_data in all_data.values():
        all_images.update(model_data)

    consensus = {}
    # Iterating the set directly would give an order that depends on string
    # hashing, which can differ from run to run; sorting makes the result
    # (and the JSON written from it) reproducible.
    for image in sorted(all_images):
        # word_model_counts[word][model] = occurrences of word in that model
        word_model_counts = defaultdict(dict)
        for model_name, model_data in all_data.items():
            if image in model_data:
                for word, count in Counter(model_data[image]).items():
                    word_model_counts[word][model_name] = count

        consensus_words = []
        for word, model_counts in word_model_counts.items():
            if len(model_counts) >= min_models:
                # Only as many copies as every agreeing model supports.
                consensus_words.extend([word] * min(model_counts.values()))

        consensus[image] = consensus_words
    return consensus


if __name__ == "__main__":
    # Get all JSON files in the directory (assumed to be individual model
    # outputs). os.listdir() returns entries in arbitrary order, so sort them
    # to keep the model order stable across runs and machines.
    files = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".json")
    )

    # all_data['ModelName.json'] = { image_name: [word1, word2, ...] }
    all_data = load_model_outputs(directory_path, files)

    # Final consensus ground truth for each image
    consensus_ground_truth = build_consensus(all_data)

    # Write the consensus ground truth dictionary to a JSON file
    with open("Consensus_Ground_Truth.json", "w", encoding="utf-8") as f:
        json.dump(consensus_ground_truth, f, indent=4)

    print("✅ Consensus Ground Truth generated.")  # Done!
