In [1]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.0.2-py3-none-any.whl (340 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn, sentence-transformers
Successfully installed scikit-learn-1.6.1 sentence-transformers-4.0.2 threadpoolctl-3.6.0


In [None]:
import requests
import base64
import io
import time
from PIL import Image
import re

# --- CONFIG ---
class ConfigLLaVAVision:
    ENDPOINT_ID = "og65lbckc1lf24"
    API_KEY = ""  # ⛔ Put your API key here!
    BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"


def get_structured_caption():
    # Set the prompt to compare text-based captions only
    prompt = """
You are a strict grader, but you should be generous when scoring captions with a high degree of similarity. Only compare the following captions in terms of content similarity, focusing on their overall meaning and structure. Do not include any explanations, code, or extra text. Return only the following six float values as comma-separated numbers, each between 0 and 1:

main_objects_score, main_object_attributes_score, location_score, action_score, surroundings_score, background_score

Rules:
- Compare the general meaning of each category, not just the wording.
- If both captions have the word "none" or same or same meaning word for a category, return a score of 1.0, as this means there is no difference.
- Be generous with the scores if the content in both captions is largely the same, even if the exact wording differs slightly.
- Return a score of 1.0 for categories where the content is identical or very similar.
- Avoid scoring less than 0.5 for categories with strong similarity.
- Ensure that you understand the context in terms of the visual scene being described (e.g., "trees in the forest" is highly similar to "dense foliage, green leaves").

Teacher:
main_objects: trees; main_object_attributes: tall, slender, green leaves; location: dense forest; action: none; surroundings: undergrowth, winding path; background: bright sunlight, blue sky

Student:
main_objects: trees; main_object_attributes: tall, slender trunks, dense foliage, green leaves; location: forest; action: none; surroundings: pathway, scattered ferns; background: bright sky, sunlight through leaves

Scores:
"""
    # Prepare the payload for the API request
    payload = {
        "input": {
            "prompt": prompt.strip()
        }
    }

    # Set up headers for the request
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {ConfigLLaVAVision.API_KEY}"  # Use your API key
    }

    try:
        # Submit the request to the RunPod API
        submit_response = requests.post(
            f"{ConfigLLaVAVision.BASE_URL}/run",
            json=payload,
            headers=headers
        )
        submit_response.raise_for_status()  # Check if request was successful

        job_id = submit_response.json().get("id")

        if not job_id:
            return "[ERROR] No job ID received"

        # Poll job status
        for _ in range(30):  # Retry for 30 iterations (3 seconds wait)
            time.sleep(0.1)
            status_response = requests.get(
                f"{ConfigLLaVAVision.BASE_URL}/status/{job_id}",
                headers=headers
            )
            status_json = status_response.json()
            status = status_json.get("status")

            if status == "COMPLETED":
                return status_json.get("output", {}).get("text", "")
            elif status == "FAILED":
                return "[ERROR] Job failed"

        return "[ERROR] Timed out"  # If timeout exceeded

    except Exception as e:
        return f"[ERROR] {e}"

# --- PARSER ---
def parse_caption_to_dict(caption_str: str) -> dict:
    fields = ["main_objects", "location", "action", "attributes", "background", "time"]
    result = {}

    for field in fields:
        pattern = rf"{field}:\s*(.*?)(?=,\s*\w+:|$)"
        match = re.search(pattern, caption_str, re.IGNORECASE)
        if match:
            value = match.group(1).strip()
            if value.lower() != "none":
                result[field] = [v.strip() for v in value.split(",") if v.strip()]

    return result

def extract_caption_line(raw_caption: str) -> str:
    # Find the part after "assistant" response
    try:
        split_token = "<|start_header_id|>assistant<|end_header_id|>"
        if split_token in raw_caption:
            assistant_part = raw_caption.split(split_token)[-1]
            # Look for the actual "main_objects:" line in that part
            for line in assistant_part.splitlines():
                line = line.strip()
                if line.lower().startswith("main_objects:"):
                    return line
    except Exception as e:
        print(f"[Parse Error] {e}")
    return None

# --- USAGE EXAMPLE ---
if __name__ == "__main__":
    print("🔄 Sending request for text comparison...")
    raw_caption = get_structured_caption()  # Only process text, no image
    print("🧠 Raw caption output:", raw_caption)

    # Extract the model's answer
    caption_line = extract_caption_line(raw_caption)

    if caption_line:
        parsed = parse_caption_to_dict(caption_line)
        print("✅ Parsed Caption Dictionary:")
        print(parsed)
    else:
        print("⚠️ No valid caption found in response.")

🔄 Sending request for text comparison...
🧠 Raw caption output: [ERROR] Job failed
⚠️ No valid caption found in response.


In [None]:
import requests
import base64
import io
import time
import re
import json
from PIL import Image

# --- CONFIG ---
class ConfigLLaVAVision:
    ENDPOINT_ID = "og65lbckc1lf24"
    API_KEY = ""
    BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}/runsync"

# --- UTILITIES ---
def image_to_base64(img):
    buffered = io.BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
def get_structured_caption(image):
    prompt = (
        "You are a vision-language assistant. Describe an image using exactly six categories in a single line:\n\n"
        "main_objects: ... main_object_attributes: ... location: ... action: ... surroundings: ... background: ...\n\n"
        "Format rules:\n"
        "- Each category must start with its name, followed by a colon and a space\n"
        "- Use detailed, specific descriptions\n"
        "- Separate categories with a comma and a space\n"
        "- If a category is unclear, write: [category name]: none\n"
        "- Do NOT include commentary, line breaks, or extra text\n\n"
  )

    image_b64 = image_to_base64(image)
    payload = {
        "input": {
            "prompt": prompt,
            "source": image_b64  # ✅ NO MIME HEADER
        }
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {ConfigLLaVAVision.API_KEY}"
    }

    try:
        response = requests.post(ConfigLLaVAVision.BASE_URL, json=payload, headers=headers)
        response.raise_for_status()
        print("🧪 Full API response:", response.json())  # Log full response
        return response.json().get("output", {}).get("text", "")
    except Exception as e:
        return f"[ERROR] {e}"
# --- PARSING ---
def extract_caption_line(raw_caption: str) -> str:
    try:
        split_token = "<|start_header_id|>assistant<|end_header_id|>"
        if split_token in raw_caption:
            assistant_part = raw_caption.split(split_token)[-1]
            for line in assistant_part.splitlines():
                line = line.strip()
                if line.lower().startswith("main_objects:"):
                    return line
        return raw_caption.strip()
    except Exception as e:
        print(f"[Parse Error] {e}")
        return None

def parse_caption_to_dict(caption_str: str) -> dict:
    fields = ["main_objects", "main_object_attributes", "location", "action", "surroundings", "background"]
    result = {}

    for field in fields:
        pattern = rf"{field}:\s*(.*?)(?=,\s*\w+:|$)"
        match = re.search(pattern, caption_str, re.IGNORECASE)
        if match:
            value = match.group(1).strip()
            if value.lower() != "none":
                result[field] = [v.strip() for v in value.split(";") if v.strip()]

    return result

# --- MAIN ---
if __name__ == "__main__":
    # 📥 Image paths (change to your actual image paths)
    teacher_image_path = "/Users/fatihwolf/Downloads/images/row_11_teacher.png"
    student_image_path = "/Users/fatihwolf/Downloads/images/row_11_student.png"

    # 🖼️ Load and convert images
    teacher_img = Image.open(teacher_image_path).convert("RGB")
    student_img = Image.open(student_image_path).convert("RGB")

    # 🔄 Get captions
    print("🔄 Sending TEACHER image to LLaVA...")
    raw_teacher = get_structured_caption(teacher_img)
    print("🧠 Raw caption (Teacher):", raw_teacher)

    print("🔄 Sending STUDENT image to LLaVA...")
    raw_student = get_structured_caption(student_img)
    print("🧠 Raw caption (Student):", raw_student)

    # 🧠 Parse both captions 
    teacher_caption_line = extract_caption_line(raw_teacher)
    student_caption_line = extract_caption_line(raw_student)

    teacher_caption = parse_caption_to_dict(teacher_caption_line or "")
    student_caption = parse_caption_to_dict(student_caption_line or "")

    # 📝 Final result
    result = {
        "teacher_caption": teacher_caption,
        "student_caption": student_caption
    }

    print("✅ Final Caption JSON:")
    print(json.dumps(result, indent=2, ensure_ascii=False))

    # 💾 Save to file
    with open("captions.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
        print("💾 Saved to captions.json")


🔄 Sending TEACHER image to LLaVA...
🧪 Full API response: {'delayTime': 9138, 'executionTime': 29084, 'id': 'sync-e8d1f236-5dc0-4884-bb86-7071bcb4628c-e1', 'output': {'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>You are a vision-language assistant. Describe an image using exactly six categories in a single line:\n\nmain_objects:... main_object_attributes:... location:... action:... surroundings:... background:...\n\nFormat rules:\n- Each category must start with its name, followed by a colon and a space\n- Use detailed, specific descriptions\n- Separate categories with a comma and a space\n- If a category is unclear, write: [category name]: none\n- Do NOT include commentary, line breaks, or extra text\n\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nmain_objects: giraffe, tree, grass: tall, brown, green, yellow, spotted, long: none, none, savannah: none, eating: none, none: tree, branch, sky: blue, white, clouds.<|eot_id|>'}, 'status': 'COM

In [54]:
teacher {'main_objects': ['A cartoon dog'], 'location': ['A park'], 'action': ['Running'], 'attributes': ['Brown fur', 'blue collar'], 'background': ['Green grass.<|eot_id|>']}
student {'main_objects': ['A dog'], 'location': ['A park'], 'action': ['Running'], 'attributes': ['Happy', 'smiling', 'red collar'], 'background': ['Trees and people.<|eot_id|>']}

SyntaxError: invalid syntax (958795193.py, line 1)

In [None]:
teacher {'main_objects': ['A white cat'], 'location': ['Sitting on a couch'], 'action': ['Sleeping'], 'attributes': ['White fur', 'large size'], 'background': ['A sunny room with a window.']}
student {'main_objects': ['A large orange and white cat'], 'location': ['Lying on a sofa'], 'action': ['Sleeping'], 'attributes': ['Orange and white fur', 'large size'], 'background': ['A sunny living room with a window.']}

In [None]:
{'main_objects': ['A small brown cat'], 'location': ['Sitting on a windowsill'], 'action': ['Looking outside'], 'attributes': ['Brown fur', 'small size'], 'background': ['A cozy sunlit room<|eot_id|>']}

In [105]:
import json
from sentence_transformers import SentenceTransformer, util

# Load pre-trained sentence transformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# ------------------ LOAD JSON ------------------
with open("captions.json", "r", encoding="utf-8") as f:
    captions = json.load(f)

teacher_caption = captions.get("teacher_caption", {})
student_caption = captions.get("student_caption", {})

# ------------------ CATEGORY WEIGHTS ------------------
weights = {
    "main_objects": 0.3,
    "main_object_attributes": 0.1,
    "location": 0.15,
    "action":0.1,
    "surroundings": 0.15,
    "background": 0.1
}

# ------------------ SIMILARITY FUNCTION ------------------
def compute_similarity(list1, list2):
    if not list1 or not list2:
        return 0.0
    text1 = ', '.join(list1)
    text2 = ', '.join(list2)
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    return float(util.cos_sim(embedding1, embedding2).item())

# ------------------ SCORE CALCULATION ------------------
total_score = 0.0
details = {}

for key in weights:
    t_val = teacher_caption.get(key, [])
    s_val = student_caption.get(key, [])
    score = compute_similarity(t_val, s_val)
    weighted_score = score * weights[key]
    total_score += weighted_score
    details[key] = {
        "raw_similarity": round(score, 3),
        "weighted_contribution": round(weighted_score, 3)
    }

total_score = round(total_score, 3)
details["final_similarity_score"] = total_score

# ------------------ PRINT RESULTS ------------------
import pprint
pprint.pprint(details)


{'action': {'raw_similarity': 0.241, 'weighted_contribution': 0.024},
 'background': {'raw_similarity': 0.703, 'weighted_contribution': 0.07},
 'final_similarity_score': 0.761,
 'location': {'raw_similarity': 1.0, 'weighted_contribution': 0.15},
 'main_object_attributes': {'raw_similarity': 0.74,
                            'weighted_contribution': 0.074},
 'main_objects': {'raw_similarity': 1.0, 'weighted_contribution': 0.3},
 'surroundings': {'raw_similarity': 0.95, 'weighted_contribution': 0.143}}


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define structured captions for teacher and student
teacher_caption = {
    "main_objects": ["white cat"],
    "location": ["Sitting on a couch"],
    "action": ["Sleeping"],
    "attributes": ["White fur", "large size"],
    "background": ["A sunny room with a window."]
}

student_caption = {
    "main_objects": ["A large orange and white cat"],
    "location": ["Lying on a sofa"],
    "action": ["Sleeping"],
    "attributes": ["Orange and white fur", "large size"],
    "background": ["A sunny living room with a window."]
}

# Define weights for each category
weights = {
    "main_objects": 0.3,
    "location": 0.2,
    "action": 0.25,
    "attributes": 0.15,
    "background": 0.1
}

# Function to compute semantic similarity
def compute_similarity(list1, list2):
    if not list1 or not list2:
        return 0.0
    text1 = ', '.join(list1)
    text2 = ', '.join(list2)
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)
    return float(util.cos_sim(embedding1, embedding2).item())

# Calculate weighted similarity
total_score = 0.0
details = {}
for key in teacher_caption:
    score = compute_similarity(teacher_caption[key], student_caption[key])
    weighted_score = score * weights[key]
    total_score += weighted_score
    details[key] = {
        "raw_similarity": round(score, 3),
        "weighted_contribution": round(weighted_score, 3)
    }

total_score = round(total_score, 3)
details["final_similarity_score"] = total_score

# Print result
import pprint
pprint.pprint(details)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{'action': {'raw_similarity': 1.0, 'weighted_contribution': 0.25},
 'attributes': {'raw_similarity': 0.917, 'weighted_contribution': 0.138},
 'background': {'raw_similarity': 0.962, 'weighted_contribution': 0.096},
 'final_similarity_score': 0.842,
 'location': {'raw_similarity': 0.69, 'weighted_contribution': 0.138},
 'main_objects': {'raw_similarity': 0.734, 'weighted_contribution': 0.22}}


In [69]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example captions
teacher_caption = "main_objects: trees; main_object_attributes: tall, slender, green leaves; location: dense forest; action: none; surroundings: undergrowth, winding path; background: bright sunlight, blue sky"
student_caption = "main_objects: trees; main_object_attributes: tall, slender trunks, dense foliage, green leaves; location: forest; action: none; surroundings: pathway, scattered ferns; background: bright sky, sunlight through leaves"

# Define weights for each category
weights = {
    "main_objects": 0.3,
    "main_object_attributes": 0.2,
    "location": 0.2,
    "action": 0.1,
    "surroundings": 0.1,
    "background": 0.1
}

# Function to compute cosine similarity for each category
def compute_similarity(text1, text2):
    # Encode the sentences to get embeddings
    embeddings1 = model.encode(text1)
    embeddings2 = model.encode(text2)
    
    # Compute cosine similarity
    similarity = cosine_similarity([embeddings1], [embeddings2])[0][0]
    return similarity

# Split the captions into categories for comparison
categories = ["main_objects", "main_object_attributes", "location", "action", "surroundings", "background"]

# Compute similarity for each category and apply weights
similarities = []
for category in categories:
    # Here we assume the category values are included in the captions for both teacher and student
    teacher_category = f"{category}: {teacher_caption.split(category + ':')[1].split(';')[0].strip()}"
    student_category = f"{category}: {student_caption.split(category + ':')[1].split(';')[0].strip()}"
    
    category_similarity = compute_similarity(teacher_category, student_category)
    
    # Apply weight
    weighted_similarity = category_similarity * weights[category]
    similarities.append(weighted_similarity)

# Final similarity score
final_similarity = np.sum(similarities)
print(f"Overall Similarity: {final_similarity:.2f}")


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Overall Similarity: 0.87


In [95]:
import json
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaModel.from_pretrained("roberta-large")

# ------------------ LOAD JSON ------------------
with open("/Users/fatihwolf/Documents/cap.json", "r", encoding="utf-8") as f:
    captions = json.load(f)

teacher_caption = captions.get("teacher_caption", {})
student_caption = captions.get("student_caption", {})

# ------------------ CATEGORY WEIGHTS ------------------
weights = {
    "main_objects": 0.3,
    "main_object_attributes": 0.1,
    "location": 0.15,
    "action": 0.1,
    "surroundings": 0.15,
    "background": 0.1
}

# ------------------ SIMILARITY FUNCTION ------------------
def compute_similarity(list1, list2):
    if not list1 or not list2:
        return 0.0
    text1 = ', '.join(list1)
    text2 = ', '.join(list2)

    # Tokenize and get embeddings from RoBERTa model
    inputs1 = tokenizer(text1, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs2 = tokenizer(text2, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        output1 = model(**inputs1)
        output2 = model(**inputs2)

    # Take the mean of the token embeddings (i.e., the hidden states of the last layer)
    embedding1 = output1.last_hidden_state.mean(dim=1)
    embedding2 = output2.last_hidden_state.mean(dim=1)

    # Compute cosine similarity
    similarity = cosine_similarity(embedding1.cpu(), embedding2.cpu())[0][0]
    return similarity

# ------------------ SCORE CALCULATION ------------------
total_score = 0.0
details = {}

for key in weights:
    t_val = teacher_caption.get(key, [])
    s_val = student_caption.get(key, [])
    score = compute_similarity(t_val, s_val)
    weighted_score = score * weights[key]
    total_score += weighted_score
    details[key] = {
        "raw_similarity": round(score, 3),
        "weighted_contribution": round(weighted_score, 3)
    }

total_score = round(total_score, 3)
details["final_similarity_score"] = total_score

# ------------------ PRINT RESULTS ------------------
import pprint
pprint.pprint(details)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'action': {'raw_similarity': 1.0, 'weighted_contribution': 0.1},
 'background': {'raw_similarity': 0.998, 'weighted_contribution': 0.1},
 'final_similarity_score': 0.899,
 'location': {'raw_similarity': 1.0, 'weighted_contribution': 0.15},
 'main_object_attributes': {'raw_similarity': 0.997,
                            'weighted_contribution': 0.1},
 'main_objects': {'raw_similarity': 0.998, 'weighted_contribution': 0.299},
 'surroundings': {'raw_similarity': 0.999, 'weighted_contribution': 0.15}}


In [106]:
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import pprint

# Load best pre-trained model for semantic similarity
model = SentenceTransformer("sentence-transformers/all-roberta-large-v1")

# ------------------ LOAD JSON ------------------
with open("captions.json", "r", encoding="utf-8") as f:
    captions = json.load(f)

teacher_caption = captions.get("teacher_caption", {})
student_caption = captions.get("student_caption", {})

# ------------------ CATEGORY WEIGHTS ------------------
weights = {
    "main_objects": 0.3,
    "main_object_attributes": 0.1,
    "location": 0.15,
    "action": 0.1,
    "surroundings": 0.15,
    "background": 0.1
}

# ------------------ SIMILARITY FUNCTION ------------------
def compute_similarity(list1, list2):
    if not list1 or not list2:
        return 0.0
    text1 = ', '.join(list1)
    text2 = ', '.join(list2)

    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)

    similarity = cosine_similarity(
        embedding1.cpu().numpy().reshape(1, -1),
        embedding2.cpu().numpy().reshape(1, -1)
    )[0][0]
    return similarity

# ------------------ SCORE CALCULATION ------------------
total_score = 0.0
details = {}

for key in weights:
    t_val = teacher_caption.get(key, [])
    s_val = student_caption.get(key, [])
    score = compute_similarity(t_val, s_val)
    weighted_score = score * weights[key]
    total_score += weighted_score
    details[key] = {
        "raw_similarity": round(score, 4),
        "weighted_contribution": round(weighted_score, 4)
    }

total_score = round(total_score, 4)
details["final_similarity_score"] = total_score

# ------------------ PRINT RESULTS ------------------
pprint.pprint(details)


{'action': {'raw_similarity': 0.4001, 'weighted_contribution': 0.04},
 'background': {'raw_similarity': 0.8511, 'weighted_contribution': 0.0851},
 'final_similarity_score': 0.7991,
 'location': {'raw_similarity': 1.0, 'weighted_contribution': 0.15},
 'main_object_attributes': {'raw_similarity': 0.7971,
                            'weighted_contribution': 0.0797},
 'main_objects': {'raw_similarity': 1.0, 'weighted_contribution': 0.3},
 'surroundings': {'raw_similarity': 0.9621, 'weighted_contribution': 0.1443}}
