In [1]:
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPProcessor
from torchvision.models import mobilenet_v3_small
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
from PIL import Image
import os

In [2]:
teacher_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
teacher_model.eval()

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [3]:
student_model = mobilenet_v3_small(pretrained=True)
student_model.classifier[3]=nn.Linear(student_model.classifier[3].in_features,768)

Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth
100%|██████████| 9.83M/9.83M [00:00<00:00, 171MB/s]


In [4]:
device=torch.device('cuda'  if torch.cuda.is_available() else "cpu")
teacher_model.eval().to(device)
student_model.to(device)
print(device)

cuda


In [5]:
transform=transforms.Compose([
    transforms.Resize((224,224)),
    # transforms.RandomHorizontalFlip(),
    # transforms.ColorJitter(brightness=0.4,contrast=0.4,saturation=0.4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466,0.4578275,0.40821073],std=[0.26862954,0.26130258,0.275777111])
])

In [6]:
class CustomDataset(Dataset):
    def __init__(self,root_dir,transform=None):
        super().__init__()
        self.root_dir=root_dir
        self.transform=transform
        self.image_paths=[os.path.join(root_dir,f_name) for f_name in os.listdir(root_dir) if f_name.endswith(('.jpg','.jpeg','.png'))]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self,idx):
        img_path=self.image_paths[idx]
        image=Image.open(img_path).convert('RGB')
        if self.transform:
            image_tensor=self.transform(image)
        else:
            image_tensor=transforms.ToTensor()(image)
        return image_tensor,image
        

In [7]:
def custom_collate(batch):
    image_tensors=[item[0] for item in batch]
    pil_images=[item[1] for item in batch]
    image_tensors=torch.stack(image_tensors)
    return  image_tensors,pil_images

In [8]:
dataset=CustomDataset(root_dir='/kaggle/input/coco-train-dataset/train2014',transform=transform)
dataloader=DataLoader(dataset,batch_size=128,shuffle=True,num_workers=4,collate_fn=custom_collate)

## For 5 Epochs

In [9]:
criterion=nn.MSELoss()
optimizer=optim.Adam(student_model.parameters(), lr=0.005)
# scheduler=optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=5)

In [10]:
num_epochs=5
for epoch in range(num_epochs):
    student_model.train()
    running_loss=0.0
    for i,(images,images_pil) in enumerate(dataloader):
        images=images.to(device)
        with torch.no_grad():
            pil_images=list(images_pil)
            inputs=processor(images=pil_images,return_tensors='pt').to(device)
            teacher_embeddings=teacher_model.get_image_features(**inputs)

        student_embeddings=student_model(images)
        loss=criterion(student_embeddings,teacher_embeddings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        if i%25==24:
            print(f"Epoch {epoch+1}, Batch {i+1}, Loss: {running_loss/25:.4f}")
            running_loss=0.0
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs} completed")

Epoch 1, Batch 25, Loss: 0.3992
Epoch 1, Batch 50, Loss: 0.2276
Epoch 1, Batch 75, Loss: 0.2159
Epoch 1, Batch 100, Loss: 0.2084
Epoch 1, Batch 125, Loss: 0.2054
Epoch 1, Batch 150, Loss: 0.1988
Epoch 1, Batch 175, Loss: 0.1964
Epoch 1, Batch 200, Loss: 0.1958
Epoch 1, Batch 225, Loss: 0.1930
Epoch 1, Batch 250, Loss: 0.1922
Epoch 1, Batch 275, Loss: 0.1896
Epoch 1, Batch 300, Loss: 0.1865
Epoch 1, Batch 325, Loss: 0.1857
Epoch 1, Batch 350, Loss: 0.1832
Epoch 1, Batch 375, Loss: 0.1835
Epoch 1, Batch 400, Loss: 0.1848
Epoch 1, Batch 425, Loss: 0.1821
Epoch 1, Batch 450, Loss: 0.1805
Epoch 1, Batch 475, Loss: 0.1818
Epoch 1, Batch 500, Loss: 0.1803
Epoch 1, Batch 525, Loss: 0.1779
Epoch 1, Batch 550, Loss: 0.1783
Epoch 1, Batch 575, Loss: 0.1761
Epoch 1, Batch 600, Loss: 0.1776
Epoch 1, Batch 625, Loss: 0.1770
Epoch 1/5 completed
Epoch 2, Batch 25, Loss: 0.1725
Epoch 2, Batch 50, Loss: 0.1734
Epoch 2, Batch 75, Loss: 0.1720
Epoch 2, Batch 100, Loss: 0.1734
Epoch 2, Batch 125, Loss: 0.1

In [11]:
torch.save(student_model.state_dict(), 'mobilenet_v3_small_distilled_state_dict.pth')

## For 10 Epochs

In [9]:
criterion=nn.MSELoss()
optimizer=optim.Adam(student_model.parameters(), lr=0.005)
scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=10)

num_epochs=10
for epoch in range(num_epochs):
    student_model.train()
    running_loss=0.0
    for i,(images,images_pil) in enumerate(dataloader):
        images=images.to(device)
        with torch.no_grad():
            pil_images=list(images_pil)
            inputs=processor(images=pil_images,return_tensors='pt').to(device)
            teacher_embeddings=teacher_model.get_image_features(**inputs)

        student_embeddings=student_model(images)
        loss=criterion(student_embeddings,teacher_embeddings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss+=loss.item()
        if i%25==24:
            print(f"Epoch {epoch+1}, Batch {i+1}, Loss: {running_loss/25:.4f}")
            running_loss=0.0
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs} completed")

Epoch 1, Batch 25, Loss: 0.3970
Epoch 1, Batch 50, Loss: 0.2279
Epoch 1, Batch 75, Loss: 0.2143
Epoch 1, Batch 100, Loss: 0.2081
Epoch 1, Batch 125, Loss: 0.2042
Epoch 1, Batch 150, Loss: 0.2009
Epoch 1, Batch 175, Loss: 0.1981
Epoch 1, Batch 200, Loss: 0.1949
Epoch 1, Batch 225, Loss: 0.1939
Epoch 1, Batch 250, Loss: 0.1900
Epoch 1, Batch 275, Loss: 0.1888
Epoch 1, Batch 300, Loss: 0.1893
Epoch 1, Batch 325, Loss: 0.1876
Epoch 1, Batch 350, Loss: 0.1840
Epoch 1, Batch 375, Loss: 0.1841
Epoch 1, Batch 400, Loss: 0.1836
Epoch 1, Batch 425, Loss: 0.1812
Epoch 1, Batch 450, Loss: 0.1814
Epoch 1, Batch 475, Loss: 0.1807
Epoch 1, Batch 500, Loss: 0.1806
Epoch 1, Batch 525, Loss: 0.1783
Epoch 1, Batch 550, Loss: 0.1793
Epoch 1, Batch 575, Loss: 0.1778
Epoch 1, Batch 600, Loss: 0.1795
Epoch 1, Batch 625, Loss: 0.1781
Epoch 1/10 completed
Epoch 2, Batch 25, Loss: 0.1749
Epoch 2, Batch 50, Loss: 0.1733
Epoch 2, Batch 75, Loss: 0.1729
Epoch 2, Batch 100, Loss: 0.1726
Epoch 2, Batch 125, Loss: 0.

In [10]:
torch.save(student_model.state_dict(), 'mobilenet_v3_small_distilled_new_state_dict.pth')

In [11]:
student_model.eval()
image=Image.open('/kaggle/input/coco-train-dataset/train2014/COCO_train2014_000000000009.jpg').convert('RGB')
input_tensor=processor(images=image,return_tensors='pt').to(device)
input_tensor=input_tensor['pixel_values']
with torch.no_grad():
    teacher_embedding=teacher_model.get_image_features(input_tensor)
with torch.no_grad():
    student_embedding=student_model(input_tensor)
    

In [12]:
print("Teacher embedding shape:", teacher_embedding.shape)
print("Student embedding shape:", student_embedding.shape)

teacher_norm = torch.norm(teacher_embedding, p=2, dim=1)
student_norm = torch.norm(student_embedding, p=2, dim=1)

print("Teacher L2 norm:", teacher_norm.item())
print("Student L2 norm:", student_norm.item())

Teacher embedding shape: torch.Size([1, 768])
Student embedding shape: torch.Size([1, 768])
Teacher L2 norm: 19.493438720703125
Student L2 norm: 16.585071563720703


In [13]:
teacher_embedding# Assuming your code from earlier
with torch.no_grad():
    teacher_embedding = teacher_model.get_image_features(input_tensor)  # or encode_image
    student_embedding = student_model(input_tensor)

# Normalize
teacher_normalized = nn.functional.normalize(teacher_embedding, p=2, dim=1)
student_normalized = nn.functional.normalize(student_embedding, p=2, dim=1)

# Check norms (should be 1)
print("Teacher normalized norm:", torch.norm(teacher_normalized, p=2, dim=1).item())
print("Student normalized norm:", torch.norm(student_normalized, p=2, dim=1).item())

# Compute normalized MSE and cosine similarity
mse_normalized = nn.MSELoss()(student_normalized, teacher_normalized)
cos_sim = nn.functional.cosine_similarity(student_normalized, teacher_normalized).mean()

print("Normalized MSE:", mse_normalized.item())
print("Cosine Similarity:", cos_sim.item())

Teacher normalized norm: 1.0
Student normalized norm: 0.9999999403953552
Normalized MSE: 0.00020227096683811396
Cosine Similarity: 0.9223280549049377


In [14]:
with torch.no_grad():
    teacher_embedding = teacher_model.get_image_features(input_tensor)  # or encode_image
    student_embedding = student_model(input_tensor)

# Normalize
teacher_normalized = nn.functional.normalize(teacher_embedding, p=2, dim=1)
student_normalized = nn.functional.normalize(student_embedding, p=2, dim=1)

# Compute metrics
mse_normalized = nn.MSELoss()(teacher_normalized, student_normalized)
cos_sim = nn.functional.cosine_similarity(teacher_normalized, student_normalized).mean()

print("Teacher normalized norm:", torch.norm(teacher_normalized, p=2, dim=1).item())
print("Student normalized norm:", torch.norm(student_normalized, p=2, dim=1).item())
print("Normalized MSE:", mse_normalized.item())
print("Cosine Similarity:", cos_sim.item())

Teacher normalized norm: 1.0
Student normalized norm: 0.9999999403953552
Normalized MSE: 0.00020227096683811396
Cosine Similarity: 0.9223280549049377


In [15]:
diff = (teacher_normalized - student_normalized).abs().mean()
print("Mean absolute difference:", diff.item())

Mean absolute difference: 0.011231629177927971
