In [1]:
#!pip install open_clip_torch
import open_clip
import torch
from PIL import Image
import numpy as np
# Load the CLIP model together with its matching image preprocessing pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14', device=device, pretrained='datacomp_xl_s13b_b90k'
)
# The open_clip README notes that models are created in train mode by default;
# switch to eval so layers such as dropout / stochastic depth / BatchNorm behave
# deterministically during inference. Assigning the result avoids echoing the
# (very long) module repr as the cell output.
model = model.eval()


cpu


In [2]:
# Four sample frame paths (the same files the `scenes` mapping lists under "refueling").
imgs = [
    "input/000805.jpg",
    "input/000817.jpg",
    "input/000830.jpg",
    "input/000843.jpg",
]

In [3]:
# Scene name -> paths of the frames that represent that scene.
scenes = {
    "refueling": [
        "input/000805.jpg",
        "input/000817.jpg",
        "input/000830.jpg",
        "input/000843.jpg",
    ],
    "standings": [
        "input/000844.jpg",
        "input/000861.jpg",
        "input/000879.jpg",
        "input/000897.jpg",
    ],
    "monk": [
        "input/000898.jpg",
        "input/000910.jpg",
        "input/000922.jpg",
        "input/000935.jpg",
    ],
    "statues": [
        "input/000936.jpg",
        "input/000947.jpg",
        "input/000958.jpg",
        "input/000969.jpg",
    ],
}

In [11]:
def calculate_scene_vectors(scenes):
    """Encode every image of every scene with the CLIP image encoder.

    Relies on the notebook-level ``model``, ``preprocess`` and ``device``.

    Args:
        scenes: Mapping of scene name -> iterable of image file paths.

    Returns:
        dict mapping each scene name to a list of NumPy arrays: one embedding
        per image, in input order (each keeps the leading batch axis of
        size 1 added before encoding), followed by the element-wise mean of
        those embeddings as the LAST entry of the list.

    Raises:
        ValueError: If a scene yields no images, since its mean embedding
            would be undefined.
    """
    scene_vectors = {}

    with torch.no_grad():
        for scene, image_paths in scenes.items():
            vectors = []
            for image_path in image_paths:
                # Image.open keeps the underlying file open until the image is
                # closed; the context manager releases the handle as soon as
                # preprocessing has turned the pixels into a tensor.
                with Image.open(image_path) as raw_image:
                    image = preprocess(raw_image).unsqueeze(0).to(device)
                # Move the embedding to host memory as a NumPy array.
                vectors.append(model.encode_image(image).cpu().numpy())

            if not vectors:
                raise ValueError(f"scene {scene!r} has no images to encode")

            # Average the per-image vectors to get one summary vector.
            avg_scene_vector = sum(vectors) / len(vectors)
            # Build a fresh list instead of aliasing and mutating `vectors`.
            scene_vectors[scene] = [*vectors, avg_scene_vector]

    return scene_vectors

In [12]:
# Encode all scene frames once; the resulting mapping is passed to every
# predict_the_scene call in the query cells.
scene_vectors = calculate_scene_vectors(scenes)

In [6]:
# Sanity check: shape of the first stored embedding of the "refueling" scene.
scene_vectors["refueling"][0].shape

(1, 768)

In [9]:
def predict_the_scene(text, scene_vectors):
    """Score a text query against the stored image embeddings of each scene.

    Relies on the notebook-level ``model``, ``device`` and ``open_clip``.

    Args:
        text: Query string to encode with the CLIP text encoder.
        scene_vectors: Mapping of scene name -> list of embeddings, each with
            a leading axis of size 1 (the layout returned by
            ``calculate_scene_vectors``).

    Returns:
        dict mapping each scene name to a float: the highest cosine similarity
        between the text embedding and any of that scene's stored vectors.
        These are raw cosine similarities, not probabilities.

    Raises:
        ValueError: If a scene has no stored vectors to compare against.
    """
    # Tokenize the text
    text_inputs = open_clip.tokenize([text]).to(device)

    with torch.no_grad():
        # Encode the query once, outside the per-scene loop, and keep it as a
        # CPU tensor so it can be compared directly with the scene tensors.
        text_embedding = model.encode_text(text_inputs).cpu()

        similarities = {}
        for scene, vectors in scene_vectors.items():
            if len(vectors) == 0:
                raise ValueError(f"scene {scene!r} has no vectors to compare against")
            # Stack the per-image arrays and drop the size-1 batch axis so each
            # row is one stored embedding.
            scene_embeddings = torch.tensor(np.array(vectors)).squeeze(1)
            # cosine_similarity broadcasts the single text row against all rows.
            scores = torch.nn.functional.cosine_similarity(
                text_embedding, scene_embeddings, dim=1
            )
            # Keep the best-matching stored vector as the scene's score.
            similarities[scene] = scores.max().item()

    return similarities

In [30]:
# Query aimed at the "monk" scene. The printed values are per-scene maximum
# cosine similarities, not probabilities.
# NOTE(review): in the recorded output "standings" scores slightly higher than
# "monk" for this query.
text = "many monks stands next to each other"
scene_confidences = predict_the_scene(text, scene_vectors)
print(f"{text}: {scene_confidences}")


many monks stands next to each other: {'refueling': 0.05392731726169586, 'standings': 0.17613336443901062, 'monk': 0.16794085502624512, 'statues': 0.15584662556648254}


In [25]:
# Query describing clothing colour; prints the per-scene maximum cosine
# similarity for the text.
text = "many guys with yellow clothes"
scene_confidences = predict_the_scene(text, scene_vectors)
print(f"{text}: {scene_confidences}")


many guys with yellow clothes: {'refueling': 0.10196693241596222, 'standings': 0.16138969361782074, 'monk': 0.09902562201023102, 'statues': 0.1291029453277588}


In [19]:
# Query aimed at the "refueling" scene; prints the per-scene maximum cosine
# similarity for the text.
text = "a man with his car being refueled"
scene_confidences = predict_the_scene(text, scene_vectors)
print(f"{text}: {scene_confidences}")


a man with his car being refueled: {'refueling': 0.19195976853370667, 'standings': -0.012485004030168056, 'monk': -0.004050338640809059, 'statues': -0.04690425097942352}


In [20]:
# Longer descriptive query; prints the per-scene maximum cosine similarity.
# NOTE(review): "puper" looks like a typo for "purple"; it is left unchanged
# because the recorded output was produced with this exact string.
text = "a man with white suit standing next to 2 women with puper and blue clothes"
scene_confidences = predict_the_scene(text, scene_vectors)
print(f"{text}: {scene_confidences}")


a man with white suit standing next to 2 women with puper and blue clothes: {'refueling': 0.09279666095972061, 'standings': 0.20089998841285706, 'monk': 0.11528919637203217, 'statues': 0.07210902869701385}
