<a href="https://colab.research.google.com/github/AreeshaRaza/VVAI/blob/main/CLIP_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the runtime dependencies with %pip so they land in the environment of
# the running kernel (a bare `!pip` can target a different interpreter).
# NOTE(review): versions are unpinned here - pin them once a known-good set is confirmed.
%pip install -q torch torchvision ftfy regex tqdm

Collecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m526.4 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that the dataset and
# checkpoint paths used later in this notebook resolve as ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install CLIP pinned to the commit this notebook was last run against, so a
# fresh runtime gets the same code; %pip targets the running kernel's environment.
%pip install -q git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-7kqvrxt2
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-7kqvrxt2
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=14493dafb0187224b5f84ba3ae4a176ac1bef2ef9bea2ce019a499e9426e96a8
  Stored in directory: /tmp/pip-ephem-wheel-cache-i6bgy53d/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Installing collected packages: clip
Successfully installed clip-1.0


In [None]:
#Load pre-trained CLIP Model
import torch
import clip
from torchvision import transforms
from PIL import Image
import json

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the ViT-B/32 model together with the `preprocess` callable
# that is later used as the image transform.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 90.4MiB/s]


In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
import os

In [None]:
# Dataset layout on the mounted Drive: an `images` folder and a `captions.json`
# file sitting side by side under the Fashion directory.
base_dir = '/content/drive/My Drive/Datasets/Fashion'
img_dir, captions_file = (
    os.path.join(base_dir, leaf) for leaf in ('images', 'captions.json')
)

In [None]:
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training

In [None]:
# Custom dataset class
class FashionDataset(Dataset):
    """Image/caption pairs backed by an image folder and a JSON captions file.

    The captions file must contain a list of records, each providing an
    ``image_id`` (the image file name without its ``.jpg`` extension) and a
    ``text`` caption. Item ``i`` of the dataset corresponds to record ``i``.

    Args:
        img_dir: Directory holding the ``<image_id>.jpg`` files.
        captions_file: Path to the JSON captions file.
        transform: Optional callable applied to the RGB PIL image.

    Attributes:
        captions: The parsed JSON records.
        imgs: ``image_id`` of every record, in file order.
        texts: ``image_id`` -> caption mapping, kept for backward
            compatibility. When an ``image_id`` occurs more than once it only
            retains the last caption, so ``__getitem__`` does not use it.
    """

    def __init__(self, img_dir, captions_file, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # JSON is UTF-8 by specification; do not depend on the platform's locale.
        with open(captions_file, 'r', encoding='utf-8') as file:
            self.captions = json.load(file)
        self.imgs = [entry['image_id'] for entry in self.captions]
        self.texts = {entry['image_id']: entry['text'] for entry in self.captions}
        # Per-record captions: keeps every caption distinct even when several
        # records reference the same image.
        self._captions_by_index = [entry['text'] for entry in self.captions]

    def __len__(self):
        """Number of caption records."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return ``(image, tokens)`` for record ``idx``.

        Returns:
            The RGB image (after ``transform`` when one was given) and the
            tokenized caption with the leading batch dimension removed.

        Raises:
            FileNotFoundError: If ``<img_dir>/<image_id>.jpg`` does not exist.
        """
        img_name = self.imgs[idx]
        img_path = os.path.join(self.img_dir, f"{img_name}.jpg")

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")

        # convert() produces an independent in-memory image, so the underlying
        # file handle can be released immediately instead of waiting for GC.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Use this record's own caption rather than the image_id-keyed dict.
        caption = self._captions_by_index[idx]

        # Tokenize and handle truncation; drop only the batch dimension that
        # tokenize adds for a single string.
        text = clip.tokenize(caption, truncate=True).squeeze(0)

        return image, text

# Transform and dataset split
transform = transforms.Compose([preprocess])
dataset = FashionDataset(img_dir=img_dir, captions_file=captions_file, transform=transform)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split: training is resumed from a checkpoint across sessions, and an
# unseeded split would re-draw the partition each time, leaking validation
# samples into training and making the reported validation loss meaningless.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Use a smaller batch size to reduce memory usage
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# clip.load() hands back half-precision weights on CUDA. Mixed-precision training
# needs FP32 parameters (GradScaler.step refuses to unscale FP16 gradients), so
# cast the model before the optimizer captures its parameters. No-op on CPU.
model.float()

# Optimizer and training loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-4)
loss_img = torch.nn.CrossEntropyLoss()
loss_txt = torch.nn.CrossEntropyLoss()

# GradScaler for mixed precision training
scaler = GradScaler()

# Function to save a checkpoint
def save_checkpoint(model, optimizer, scaler, epoch, checkpoint_path):
    """Persist model, optimizer and scaler state plus the epoch index.

    The checkpoint is first written to ``<checkpoint_path>.tmp`` and then moved
    over the destination, so an interrupted or failed write never destroys the
    previous checkpoint (the only resume point).

    Args:
        model: Object exposing ``state_dict()``.
        optimizer: Object exposing ``state_dict()``.
        scaler: Object exposing ``state_dict()``.
        epoch: Epoch index stored under ``'epoch'``.
        checkpoint_path: Destination path; a non-path target (e.g. an open
            binary file object) is written to directly.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'epoch': epoch
    }

    if isinstance(checkpoint_path, (str, os.PathLike)):
        tmp_path = f"{os.fspath(checkpoint_path)}.tmp"
        try:
            torch.save(checkpoint, tmp_path)
            # Atomic swap: readers see either the old or the new checkpoint.
            os.replace(tmp_path, checkpoint_path)
        finally:
            # Only still present when the save or the swap failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    else:
        torch.save(checkpoint, checkpoint_path)

    print(f"Checkpoint saved at epoch {epoch}")

# Function to load a checkpoint
def load_checkpoint(model, optimizer, scaler, checkpoint_path):
    """Restore model, optimizer and scaler state from ``checkpoint_path``.

    Tensors are mapped onto the device the model's parameters currently live
    on, so a checkpoint written on a GPU runtime can be resumed on a CPU-only
    one (and vice versa).

    Args:
        model: Module whose ``load_state_dict`` receives ``'model_state_dict'``.
        optimizer: Receives ``'optimizer_state_dict'``.
        scaler: Receives ``'scaler_state_dict'``.
        checkpoint_path: File previously written with the same four keys.

    Returns:
        The epoch index to resume from, i.e. the stored epoch plus one.
    """
    # A parameter-less model gives no hint; fall back to torch.load's default.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None

    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from epoch {epoch}")
    return epoch + 1

def _clip_batch_loss(model, images, texts, device):
    """Average of the image->text and text->image cross-entropy for one batch.

    The target for row ``i`` is column ``i``: each image is paired with the
    caption at the same position in the batch.
    """
    logits_per_image, logits_per_text = model(images, texts)
    ground_truth = torch.arange(len(images), dtype=torch.long, device=device)

    loss_i = loss_img(logits_per_image, ground_truth)
    loss_t = loss_txt(logits_per_text, ground_truth)
    return (loss_i + loss_t) / 2


def train_clip(model, train_dataloader, val_dataloader, optimizer, device, epochs=10, checkpoint_path='/content/drive/My Drive/clip_checkpoint2.pth', grad_scaler=None):
    """Fine-tune ``model`` with mixed precision, validating and checkpointing each epoch.

    If ``checkpoint_path`` exists, training resumes from the epoch after the
    one stored in it. Relies on the module-level ``loss_img``, ``loss_txt``,
    ``autocast``, ``save_checkpoint`` and ``load_checkpoint``.

    Args:
        model: Callable as ``model(images, texts)`` returning
            ``(logits_per_image, logits_per_text)``.
        train_dataloader: Yields ``(images, texts)`` training batches.
        val_dataloader: Yields ``(images, texts)`` validation batches.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to.
        epochs: Total number of epochs (not the number of additional epochs).
        checkpoint_path: Where the per-epoch checkpoint is read and written.
        grad_scaler: Gradient scaler to use; defaults to the module-level
            ``scaler`` so existing calls behave as before.
    """
    amp_scaler = scaler if grad_scaler is None else grad_scaler
    start_epoch = 0

    # Load checkpoint if exists
    if os.path.exists(checkpoint_path):
        start_epoch = load_checkpoint(model, optimizer, amp_scaler, checkpoint_path)

    # Make the "already finished" case visible instead of silently doing nothing.
    if start_epoch >= epochs:
        print(f"Checkpoint already covers {start_epoch} epoch(s); nothing left to train for epochs={epochs}.")
        return

    for epoch in range(start_epoch, epochs):
        model.train()
        train_loss = 0.0
        num_train_batches = len(train_dataloader)

        for images, texts in train_dataloader:
            images = images.to(device)
            texts = texts.to(device)

            with autocast():
                loss = _clip_batch_loss(model, images, texts, device)

            optimizer.zero_grad()
            amp_scaler.scale(loss).backward()
            amp_scaler.step(optimizer)
            amp_scaler.update()

            train_loss += loss.item()

        # An empty loader reports nan rather than raising ZeroDivisionError.
        avg_train_loss = train_loss / num_train_batches if num_train_batches else float('nan')
        print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        num_val_batches = len(val_dataloader)

        with torch.no_grad():
            for images, texts in val_dataloader:
                images = images.to(device)
                texts = texts.to(device)

                val_loss += _clip_batch_loss(model, images, texts, device).item()

        # A tiny dataset can leave the validation split empty; report nan then.
        avg_val_loss = val_loss / num_val_batches if num_val_batches else float('nan')
        print(f"Epoch [{epoch+1}/{epochs}], Validation Loss: {avg_val_loss:.4f}")

        # Save checkpoint at the end of each epoch
        save_checkpoint(model, optimizer, amp_scaler, epoch, checkpoint_path)

# Fine-tune for 10 epochs in total; train_clip checkpoints to Drive after every
# epoch and resumes from that checkpoint when it already exists.
train_clip(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    device=device,
    epochs=10,
    checkpoint_path='/content/drive/My Drive/clip_checkpoint2.pth',
)

# Store just the model weights (no optimizer or scaler state) as the final artifact.
torch.save(model.state_dict(), '/content/drive/My Drive/checkpoint_clip2.pth')




Checkpoint loaded from epoch 8




Epoch [10/10], Training Loss: 0.1085
Epoch [10/10], Validation Loss: 0.1068
Checkpoint saved at epoch 9


In [None]:
import torch
import clip

# Choose the compute device: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Start from a freshly loaded ViT-B/32 CLIP model and its preprocess function;
# the fine-tuned weights are applied in the following cells.
model, preprocess = clip.load("ViT-B/32", device=device)

# Training checkpoint written to Drive by the fine-tuning run.
checkpoint_path = '/content/drive/My Drive/clip_checkpoint2.pth'

In [None]:
# map_location keeps the load working when the checkpoint was written on a GPU
# runtime but this session only has a CPU (or the other way round).
checkpoint = torch.load(checkpoint_path, map_location=device)

In [None]:
# Copy the fine-tuned weights into the freshly loaded model. Kept as the cell's
# last expression so the key-matching result is displayed as the cell output.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [None]:
# Switch the module to evaluation mode before using it for inference.
model.eval()  # Set to evaluation mode

print("Model loaded and set to evaluation mode.")

Model loaded and set to evaluation mode.
