In [None]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

In [None]:
# Load the CLIP weights and the matching pre-/post-processor from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

Q1)

The model is a **CLIP** model, consisting of two main components: a **text encoder** and a **vision encoder**.

1. **Text Encoder:**
   - Architecture: **Transformer-based**
   - Purpose: Encodes input text into a fixed-size feature vector.
   - Input: Tokenized text sequence with positional embeddings.
   - Output: 512-dimensional text embeddings.

2. **Vision Encoder:**
   - Architecture: **Vision Transformer (ViT)**.
   - Purpose: Encodes input images into a fixed-size feature vector.
   - Input: Image patches with positional embeddings.
   - Output: 768-dimensional image features (mapped to 512 dimensions by the vision projection head).

3. **Projection Heads:**
   - Both the text and vision embeddings are projected to a common 512-dimensional latent space to compute contrastive similarity.



#### Text Encoder Architecture
1. **Embedding Layer:**
   - **Token Embedding:** Maps 49,408 tokens to 512-dimensional vectors.
     - **Parameters:** \( 49408 * 512 = 25,296,896 \).
   - **Position Embedding:** Embeds the position of up to 77 tokens.
     - **Parameters:** \( 77 * 512 = 39,424 \).

2. **Transformer Encoder:**
   - **Number of Layers:** 12.
   - Each layer contains:
     - **Self-Attention Mechanism (CLIPSdpaAttention):**
       - **K, Q, V Projections:** Linear transformations from 512 to 512 dimensions.
         - **Parameters per projection:** \( 512 * 512 + 512 = 262,656 \) (bias included).
         - Total for K, Q, V, and output projection: \( 4 * 262,656 = 1,050,624 \).
     - **Layer Norm:** Two per layer (one before self-attention, one before the MLP); maintains stability and speeds up training.
         - **Parameters:** \( 2 * (512 * 2) = 2,048 \) (scaling and bias for each).
     - **MLP Block:**
       - **First Linear Layer:** Expands features from 512 to 2048 dimensions.
         - **Parameters:** \( 512 * 2048 + 2048 = 1,050,624 \).
       - **Second Linear Layer:** Reduces features from 2048 to 512 dimensions.
         - **Parameters:** \( 2048 * 512 + 512 = 1,049,088 \).

   - **Total Parameters per Transformer Layer:**
     \( 1,050,624 + 2,048 + 2,099,712 = 3,152,384 \).
   - **Total Transformer Parameters:** \( 3,152,384 * 12 = 37,828,608 \).

3. **Final Layer Norm:**
   - Parameters: \( 512 * 2 = 1,024 \).


#### Vision Encoder Architecture
1. **Embedding Layer:**
   - **Patch Embedding:** Converts input image (split into patches) into feature vectors using a convolutional layer.
     - Input Channels: 3 (RGB).
     - Output Channels: 768.
     - Kernel Size: \( 32 * 32 \).
     - **Parameters:** \( 3 * 32 * 32 * 768 = 2,359,296 \).
   - **Position Embedding:** Embeds 50 positions (49 patches of \( 32 * 32 \) pixels from a \( 224 * 224 \) image, plus one class token).
     - **Parameters:** \( 50 * 768 = 38,400 \).

2. **Transformer Encoder:**
   - **Number of Layers:** 12.
   - Each layer contains:
     - **Self-Attention Mechanism (CLIPSdpaAttention):**
       - **K, Q, V Projections:** Linear transformations from 768 to 768 dimensions.
         - **Parameters per projection:** \( 768 * 768 + 768 = 590,592 \).
         - Total for K, Q, V, and output projection: \( 4 * 590,592 = 2,362,368 \).
     - **Layer Norm:** Two per layer (one before self-attention, one before the MLP). Parameters: \( 2 * (768 * 2) = 3,072 \).
     - **MLP Block:**
       - **First Linear Layer:** Expands features from 768 to 3072 dimensions.
         - **Parameters:** \( 768 * 3072 + 3072 = 2,362,368 \).
       - **Second Linear Layer:** Reduces features from 3072 to 768 dimensions.
         - **Parameters:** \( 3072 * 768 + 768 = 2,360,064 \).

   - **Total Parameters per Transformer Layer:**
     \( 2,362,368 + 3,072 + 4,722,432 = 7,087,872 \).
   - **Total Transformer Parameters:** \( 7,087,872 * 12 = 85,054,464 \).

3. **Final Layer Norm:**
   - Parameters: \( 768 * 2 = 1,536 \).


#### Projection Heads
1. **Text Projection:**
   - Maps 512-dimensional text embeddings to the shared 512-dimensional space.
     - **Parameters:** \( 512 * 512 = 262,144 \).

2. **Vision Projection:**
   - Maps 768-dimensional vision embeddings to the shared 512-dimensional space.
     - **Parameters:** \( 768 * 512 = 393,216 \).


#### Parameter Breakdown
1. **Text Model Total:** ~63.2M parameters.
2. **Vision Model Total:** ~87.5M parameters.
3. **Projection Layers Total:** ~0.66M parameters.
4. **Grand Total:** ~151.3M parameters.



### Summary
This CLIP model uses a **transformer architecture** for both text and vision encoders. It leverages **self-attention mechanisms** to encode textual and visual data into feature vectors, and the projection layers align these features into a shared latent space for contrastive learning. Key architectural components include:
- **Self-Attention Layers (K, Q, V projections):** Capture relationships between tokens or image patches.
- **MLP Layers:** Expand and compress features for non-linear transformations.
- **Embedding Layers:** Encode input text tokens and image patches.

This architecture allows the model to learn a joint representation of text and images for tasks like image-text retrieval or zero-shot classification.


In [None]:
# Print the model's repr so the module structure can be inspected.
print(model)

In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
# List the entries of the current working directory.
os.listdir()

In [None]:
# Shell command: extract the dataset archive into /content/extracted/.
!unrar x /content/datah4.rar /content/extracted/

In [None]:
# Directory the archive was extracted into; list its top-level entries.
extracted_path = '/content/extracted'
print(os.listdir(extracted_path))

In [None]:
# The entries directly under <extracted_path>/datah4 are used as the image categories.
datah4_path = os.path.join(extracted_path, 'datah4')
img_cat = os.listdir(datah4_path)
print("Content:", img_cat)

In [None]:
# Display the category names found under the dataset directory.
img_cat

In [None]:
# Collect the path of every image file, one category folder at a time.
extracted_path = '/content/extracted/datah4'
image_types = ["chair", "hat", "car", "duck", "airplane", "donkey", "dog", "cup"]

image_df = pd.DataFrame()
image_list = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of image_df reproducible across runs and machines.
for cat in sorted(img_cat):
  path = os.path.join(extracted_path, cat)
  # Skip stray files that may sit next to the category folders.
  if not os.path.isdir(path):
    continue
  for fil in sorted(os.listdir(path)):
    full_path = os.path.join(path, fil)
    # Keep regular, non-hidden files only (e.g. ignore ".DS_Store" or nested folders).
    if os.path.isfile(full_path) and not fil.startswith('.'):
      image_list.append(full_path)

print(len(image_list))

In [None]:
# Store the collected file paths in the 'Images' column and preview the frame.
image_df['Images'] = image_list
image_df.head()

In [None]:
# Every path was built as <dataset root>/<category>/<file>, so the parent folder
# name is the category. Reading it directly avoids substring matches against
# unrelated parts of the path and never silently skips a row.
known_categories = set(img_cat)
per_image_cat = []
for im in image_df["Images"]:
  cat = os.path.basename(os.path.dirname(im))
  if cat not in known_categories:
    # Fail with the offending path instead of a length mismatch on assignment.
    raise ValueError(f"Cannot determine image category for {im!r}")
  per_image_cat.append(cat)

image_df['Image_Category'] = per_image_cat

In [None]:
# Derive the object type of each image from its file name.
per_image_type = []
for im in image_df["Images"]:
  # Match against the file name only so directory names cannot produce a hit.
  file_name = os.path.basename(im).lower()
  # First entry of image_types contained in the file name wins (list order matters).
  matched = next((typ for typ in image_types if typ in file_name), None)
  if matched is None:
    # Fail with the offending path instead of a length mismatch on assignment.
    raise ValueError(f"Cannot determine image type for {im!r}")
  per_image_type.append(matched)

image_df['Image_Type'] = per_image_type

In [None]:
# One-hot encode each image's type in the order given by image_types.
image_types = ["chair", "hat", "car", "duck", "airplane", "donkey", "dog", "cup"]
image_labels = [
    [1 if im == image_typ else 0 for image_typ in image_types]
    for im in image_df['Image_Type']
]

image_df['Image_Label'] = image_labels

In [None]:
# Every image is scored against the same set of candidate captions, one per type.
prompts = [f"a photo of {image_typ}" for image_typ in image_types]

# Give each row its own copy of the caption list.
statements = [list(prompts) for _ in range(len(image_df['Image_Type']))]

image_df["Statements"] = statements

In [None]:
# Zero-shot classification: score each image against its candidate captions and
# store the softmax over captions as a nested list (one inner list per image).
prediction_probas = []

# Inference only: disabling autograd avoids building a graph for every forward pass.
with torch.no_grad():
  for im, st in zip(image_df["Images"], image_df["Statements"]):
    # Close the file handle right away; convert() returns an in-memory copy.
    # RGB conversion matches what get_clip_embeddings() feeds the model.
    with Image.open(im) as img_file:
      image = img_file.convert("RGB")

    inputs = processor(text=st, images=image, return_tensors="pt", padding=True)
    outputs = model(**inputs)

    # logits_per_image holds one row per image; softmax over dim=1 normalises across captions.
    probs = outputs.logits_per_image.softmax(dim=1)
    prediction_probas.append(probs.numpy().tolist())

image_df["Prediction_Probabilities"] = prediction_probas

Getting the prediction probabilities for each category, along with whether the prediction was correct

In [None]:
# Display the frame, now including the per-image prediction probabilities.
image_df

In [None]:
def _first_argmax(values):
    """Return the index of the first maximal element of a list."""
    return values.index(max(values))


# 1 when the highest-probability caption matches the one-hot label, else 0.
# Each stored prediction is a nested list, so the scores are its first element.
prediction_accuracy = [
    int(_first_argmax(lbl) == _first_argmax(preds[0]))
    for lbl, preds in zip(image_df['Image_Label'], image_df['Prediction_Probabilities'])
]

image_df['Correct_Prediction'] = prediction_accuracy

In [None]:
# Display the frame, now including the Correct_Prediction flag.
image_df

In [None]:
# Mean of the 0/1 correctness flag per image category, i.e. per-category accuracy.
accuracy_df = image_df.pivot_table(
    index='Image_Category',
    values='Correct_Prediction',
    aggfunc='mean',
    fill_value=0,
)

In [None]:
# Turn the Image_Category index back into a regular column.
accuracy_df = accuracy_df.reset_index()

Q2)

The highest accuracy of 100% was achieved for realistic images, indicating that the model performs exceptionally well when provided with clear and detailed images that closely match real-world scenarios.

For silhouettes, the accuracy was 90%, showing that the model can effectively recognize shapes and outlines despite the absence of fine details.

In the features condition, accuracy reached 76.19%, suggesting that the model can identify objects based on key features with some limitations.

Performance dropped notably for blurred images, where the accuracy was 69.05%, likely due to the loss of critical details in the image.

The lowest accuracy, 48.33%, was observed for the geons condition, highlighting significant difficulty in recognizing objects when presented in simplified geometric forms.



In [None]:
# Display the per-category model accuracy.
accuracy_df

Approximate data from the paper for baby trials: https://osf.io/preprints/psyarxiv/83gae

blurred: 0.55

features: 0.55

geons: 0.6

realistic: 0.85

silhouettes: 0.8

In [None]:
# Approximate infant accuracies per condition, keyed by category name so the
# values stay attached to the right row regardless of the row order in accuracy_df.
baby_accuracy_by_category = {
    "blurred": 0.55,
    "features": 0.55,
    "geons": 0.6,
    "realistic": 0.85,
    "silhouettes": 0.8,
}
# Categories missing from the mapping become NaN instead of being mislabelled.
accuracy_df["Baby_Accuracy"] = (
    accuracy_df["Image_Category"].str.lower().map(baby_accuracy_by_category)
)
accuracy_df = accuracy_df.rename(columns={"Correct_Prediction": "Model_Accuracy"})

Part b)

The model outperforms babies in the blurred and features conditions (69.05% and 76.19% vs. 55%), as well as in realistic and silhouettes conditions (100% and 90% vs. 85% and 80%).

 However, it underperforms in the geons condition (48.33% vs. 60%), where babies excel at recognizing simplified geometric shapes. Overall, the model is generally more accurate but struggles with abstract representations compared to infants.

In [None]:
# Display model and baby accuracy side by side.
accuracy_df

Plot Baby Vs Model

In [None]:
import matplotlib.pyplot as plt

# Grouped bar chart: one pair of bars (model, baby) per image category.
width = 0.35
x = np.arange(len(accuracy_df))

fig, ax = plt.subplots(figsize=(10, 6))

# (horizontal offset, column, legend label, colour) for each series.
bar_series = (
    (-width / 2, 'Model_Accuracy', 'Model Accuracy', 'b'),
    (width / 2, 'Baby_Accuracy', 'Baby Accuracy', 'r'),
)
for offset, column, legend_label, colour in bar_series:
    ax.bar(x + offset, accuracy_df[column], width, label=legend_label, color=colour)

ax.set(
    xlabel='Image Category',
    ylabel='Accuracy',
    title='Model vs Baby Accuracy by Image Category',
)
ax.set_xticks(x)
ax.set_xticklabels(accuracy_df['Image_Category'], rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Show the path stored under index label 0 as a quick sanity check.
image_df['Images'][0]

Q3)

The t-SNE visualization of the image embeddings (the projected output of the vision encoder, obtained with `get_image_features`) displays the clustering of images in a 2D space. Similar images, such as objects of the same category (for example airplanes, chairs, and cups), are grouped closely together, reflecting the model's ability to encode visual features effectively.

Realistic and simplified conditions (like silhouettes and geons) are also organized into distinct regions, showing how the model differentiates between visual variations.

 This clustering highlights the model's capability to learn meaningful visual representations, preserving semantic relationships across different image categories and conditions.

In [None]:


# Plain Python list of the image file paths, in DataFrame row order.
image_paths = list(image_df["Images"])


def get_clip_embeddings(image_paths):
    """Compute one CLIP image embedding per file.

    Uses the notebook-level ``model`` and ``processor``.

    Args:
        image_paths: Iterable of paths to image files readable by PIL.

    Returns:
        A NumPy array with one row per input path, in input order. An empty
        input yields an empty array.
    """
    embeddings = []
    # Inference only, so autograd is disabled for the whole loop.
    with torch.no_grad():
        for image_path in image_paths:
            # Close the file handle as soon as the pixels are in memory;
            # convert() returns an independent copy.
            with Image.open(image_path) as img_file:
                image = img_file.convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            embedding = model.get_image_features(**inputs)
            # One image per call, so drop the leading batch dimension.
            embeddings.append(embedding.squeeze(0).cpu().numpy())

    return np.array(embeddings)


# Embed every image with CLIP.
embeddings = get_clip_embeddings(image_paths)


# Project the embeddings to 2-D with t-SNE, using a fixed random_state.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embeddings)


def plot_tsne_with_images(tsne_results, image_paths, zoom=0.01):
    """Scatter-plot 2-D t-SNE coordinates using the images themselves as markers.

    Args:
        tsne_results: Array of (x, y) coordinates, one row per image.
        image_paths: Image file paths, indexed in the same order as
            ``tsne_results``.
        zoom: Scale factor applied to each image thumbnail.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    for i, (x, y) in enumerate(tsne_results):
        # Read the pixels and release the file handle immediately. Converting
        # to RGBA gives matplotlib a colour array for every source mode
        # (greyscale and palette images would otherwise be passed as raw 2-D data).
        with Image.open(image_paths[i]) as img_file:
            pixels = np.asarray(img_file.convert("RGBA"))
        thumbnail = OffsetImage(pixels, zoom=zoom)
        ab = AnnotationBbox(thumbnail, (x, y), frameon=False)
        ax.add_artist(ab)
    # Fully transparent points, drawn so the axes limits cover all coordinates.
    ax.scatter(tsne_results[:, 0], tsne_results[:, 1], alpha=0)
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")
    ax.set_title("t-SNE of Image Embeddings with Image Overlays")
    plt.show()

# Draw the t-SNE map with each image placed at its 2-D coordinates.
plot_tsne_with_images(tsne_results, image_paths)