[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# AI for System Engineers and Project Managers

## Deep Learning - Multi Modal - Contrastive Language Image Pre Training (CLIP)

Demonstrates the use of a _Zero Shot Model_ for _Image Classification_.

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 07/03/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0037FeaturesTransform.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning

# Deep Learning
import clip
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torchvision
from torchvision.transforms import v2 as TorchVisionTrns

# Image Processing
import skimage as ski

# Miscellaneous
import os
import onedrivedownloader #<! https://github.com/loribonna/onedrivedownloader
from platform import python_version
import random
import time

# Typing
from typing import Callable, Dict, List, Optional, Self, Set, Tuple, Union

# Visualization
import matplotlib.pyplot as plt

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

 ```python
 valToFill = ???
 ```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seed every random number generator used by the notebook for reproducibility
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)
torch.manual_seed(seedNum) #<! The PyTorch generator drives `DataLoader(shuffle = True)` during fine tuning

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #<! Apply SeaBorn theme

# `True` when the kernel string identifies a Google Colab session
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

# Plotting defaults (Presumably shared across the course notebooks; not referenced by the cells of this notebook)
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Names used to build the data / model folders relative to `BASE_FOLDER`
PROJECT_NAME      = 'FixelCourses'
DATA_FOLDER_PATH  = 'DataSets'
MODEL_FOLDER_PATH = 'Models'

# The working directory truncated right after the last (Case insensitive) occurrence of `PROJECT_NAME`.
# The search is done on the reversed strings, so the index is measured from the end of the path.
# If the name is not found, `find()` returns -1 and the slice keeps the whole working directory.
BASE_FOLDER      = os.getcwd()[:len(os.getcwd()) - (os.getcwd()[::-1].lower().find(PROJECT_NAME.lower()[::-1]))]

# Image file extensions (Not referenced by the cells of this notebook)
L_IMG_EXT = ['.png', '.jpeg', '.jpg']

In [None]:
# Courses Packages



In [None]:
# General Auxiliary Functions

class CLIPDataset(Dataset):
    """
    Data set of (Image, Caption) pairs listed in a CSV file.
    The CSV file must have the columns `image` (File name relative to the
    data folder) and `caption` (Text). All captions are tokenized once at
    construction; images are read from disk on access.
    """
    def __init__( self, dataFolderPath: str, dataFileName: str, oProc: Optional[Callable], oTokenizer: Optional[Callable], uniqueImages: bool = False ) -> None:
        """
        Constructor for the CLIP Dataset class
        Input:
            dataFolderPath - Folder which holds the CSV file and the images.
            dataFileName   - Name of the CSV file (Columns: `image`, `caption`).
            oProc          - Callable applied to the loaded image (Float32 NumPy array).
                             If `None`, the image is returned as loaded.
            oTokenizer     - Callable which maps a list of strings into a container
                             indexable per caption. If `None`, `clip.tokenize` is used.
            uniqueImages   - If `True`, keeps only the first caption of each image.
        """

        dfDataCaptions = pd.read_csv(os.path.join(dataFolderPath, dataFileName))
        if uniqueImages:
            dfDataCaptions = dfDataCaptions.drop_duplicates(subset = 'image')
            dfDataCaptions = dfDataCaptions.reset_index(drop = True)

        # Honor the injected tokenizer (It used to be stored yet ignored)
        hTokenizer = oTokenizer if oTokenizer is not None else clip.tokenize

        self._dataFolderPath = dataFolderPath
        self._dataFileName   = dataFileName
        self._dfDataCaptions = dfDataCaptions
        self._oProc          = oProc
        self._oTokenizer     = oTokenizer
        self._numSamples     = len(dfDataCaptions)

        # Tokenize all captions once so `__getitem__()` is a simple lookup
        self._lTokenizedCaption = hTokenizer(dfDataCaptions['caption'].to_list())

    def __getitem__(self, idx: int):
        """
        Returns the pair (Processed image, Tokenized caption) of the sample `idx`.
        """

        tI = self.GetImage(idx) #<! Single place which reads images from disk
        if self._oProc is not None:
            tI = self._oProc(tI)

        tTxt = self._lTokenizedCaption[idx]

        return tI, tTxt

    def __len__(self) -> int:
        """
        Returns the number of (Image, Caption) pairs.
        """

        return self._numSamples

    def GetCaption(self, idx: int) -> str:
        """
        Returns the raw caption text of the sample `idx`.
        """

        # Positional access: Out of range raises `IndexError` (Sequence protocol)
        return self._dfDataCaptions['caption'].iloc[idx]

    def GetImage(self, idx: int) -> np.ndarray:
        """
        Returns the image of the sample `idx` as a Float32 array (No pre processing).
        """

        imgFileName = str(self._dfDataCaptions['image'].iloc[idx])
        mI = ski.io.imread(os.path.join(self._dataFolderPath, imgFileName))
        mI = ski.util.img_as_float32(mI)

        return mI


def ModelToFloat32( oModel: nn.Module ) -> None:
    """
    Casts, in place, the data of each parameter of the model (And the data
    of its gradient, when one exists) to Float32.
    Input:
        oModel - The model to convert. Only `oModel.parameters()` are visited.
    """
    for tParam in oModel.parameters():
        tParam.data = tParam.data.float()
        tGrad = tParam.grad
        if tGrad is None:
            continue #<! No gradient was accumulated for this parameter
        tGrad.data = tGrad.data.float()

def GetTopKImages( mSim: np.ndarray, paramK: int ) -> List[np.ndarray]:
    """
    Get the indices of the top K images (Rows) for each text (Column).
    Input:
        mSim   - Similarity matrix, images as rows and texts as columns.
        paramK - Number of images to select per text. Values larger than
                 the number of images are clamped to the number of images.
    Output:
        lTopKInd - List with one vector of row indices per column of `mSim`.
                   Each vector is sorted by ascending similarity (Best match last).
    Raises:
        ValueError - If `paramK` is negative.
    """

    if paramK < 0:
        raise ValueError(f'The parameter `paramK` must be non negative, got {paramK}')

    # Clamp so asking for more images than available returns all of them
    numTop = min(paramK, mSim.shape[0])

    lTopKInd = []

    for ii in range(mSim.shape[1]):
        vSim = mSim[:, ii]

        if numTop == 0:
            # `v[-0:]` is the whole vector, hence zero must be handled explicitly
            vIndTopK = np.empty(0, dtype = np.intp)
        else:
            # https://stackoverflow.com/a/23734295
            vIndTopK = np.argpartition(vSim, -numTop)[-numTop:] #<! Unordered top K
            vIndTopK = vIndTopK[np.argsort(vSim[vIndTopK])] #<! Order the K candidates only

        lTopKInd.append(vIndTopK)

    return lTopKInd


## Contrastive Learning

Contrastive Learning is a _self supervised_ learning technique which learns an embedding that clusters the data based on the knowledge of which samples are similar in some sense.

![](https://i.imgur.com/wH4Yc5c.png)
<!-- ![](https://i.postimg.cc/9M6SymRV/Picture1.png) -->

### OpenAI CLIP Model

The `CLIP` model learns to match _Text_ and _Image_.  
During training it learned:
 - Embedding Text.
 - Embedding Images.
 - Match Text (Embedding) and Image (Embedding).

![](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/Contrastive_Language-Image_Pretraining.png/800px-Contrastive_Language-Image_Pretraining.png)

Applications:

 - _Zero Shot Classification_.
 - Retrieval Systems - Extract images from a DB given text.
 - Pre Processor - For text in the context of image generation or images for featurization in text context.


* <font color='brown'>(**#**)</font> [OpenAI CLIP](https://github.com/openai/CLIP) ([Wikipedia - CLIP](https://en.wikipedia.org/wiki/Contrastive_Language-Image_Pre-training), [OpenAI CLIP Page](https://openai.com/index/clip)).
* <font color='brown'>(**#**)</font> [OpenCLIP](https://github.com/mlfoundations/open_clip/) is an open model which includes _Fine Tuning_ models and training scripts.
* <font color='brown'>(**#**)</font> [The Stanford AI Lab Blog - Understanding Deep Learning Algorithms that Leverage Unlabeled Data, Part 2: Contrastive Learning](https://ai.stanford.edu/blog/understanding-contrastive-learning).
* <font color='brown'>(**#**)</font> [Ankesh Anand - Contrastive Self Supervised Learning](https://ankeshanand.com/blog/2020/01/26/contrative-self-supervised-learning.html).
* <font color='brown'>(**#**)</font> [Lilian Weng - Contrastive Representation Learning](https://lilianweng.github.io/posts/2021-05-31-contrastive).
* <font color='brown'>(**#**)</font> [Szymon Palucha - Understanding OpenAI’s CLIP Model](https://scribe.rip/6b52bade3fa3).
* <font color='brown'>(**#**)</font> [Kerry Halupka - Getting started with OpenAI’s CLIP](https://scribe.rip/a3b8f5277867).
* <font color='brown'>(**#**)</font> [Moein Shariatnia - Simple Implementation of OpenAI CLIP Model: A Tutorial](https://scribe.rip/ace6ff01d9f2).
* <font color='brown'>(**#**)</font> [Shashank Vats - A Guide to Fine Tuning CLIP Models with Custom Data](https://scribe.rip/6c7c0d1416fb).

In [None]:
# Parameters

# Data
datasetName = 'Flickr8K' #<! Name of the data set folder under the data folder
datasetUrl  = 'https://technionmail-my.sharepoint.com/:u:/g/personal/royia_technion_ac_il/EZxtZtYu1s9AgopNp5YSXYAB4tRzJWmoQuvItw8gd3GKcA?e=kPqVOM' #<! OneDrive link of the zipped data set
datasetFile = 'captions.txt' #<! CSV file with the `image` and `caption` columns

# Pre Processing

# Training
batchSize = 32 #<! Used by the retrieval data loader (Overridden before the fine tuning)
numEpochs = 3

# Model
modelFolderPath = os.path.join(BASE_FOLDER, MODEL_FOLDER_PATH) #<! Passed to `clip.load()` as the download folder
inputSize       = 224 #<! Must match the input resolution of the loaded model (Verified after loading)
paramK          = 3 #<! K Top Images

# Data Visualization


## Generate / Load Data

Data is based on the [Kaggle - Flickr 8K Data Set](https://www.kaggle.com/datasets/adityajn105/flickr8k).



In [None]:
# Verify Data is Available

# The data set is expected as a folder named `datasetName` under the data folder
dataSetPath = os.path.join(BASE_FOLDER, DATA_FOLDER_PATH, datasetName)

# Download only when the folder is missing (An existing, even partial, folder is not verified)
if not os.path.isdir(dataSetPath):
    # Download, unzip and remove ZIP file
    # NOTE(review): Presumably the archive is extracted next to the ZIP file and contains the `datasetName` folder - Confirm
    onedrivedownloader.download(datasetUrl, os.path.join(BASE_FOLDER, DATA_FOLDER_PATH, datasetName + '.zip'), unzip = True, clean = True)

In [None]:
# Loader Transform
# Image pre processing applied by the data set to each loaded image.
# It uses the same steps and normalization constants as the reference transform quoted below.

oTrns = TorchVisionTrns.Compose([
    TorchVisionTrns.ToImage(), #<! Convert the input into a TorchVision image tensor
    TorchVisionTrns.ToDtype(torch.float, scale = True),
    TorchVisionTrns.Resize(inputSize, interpolation = torchvision.transforms.InterpolationMode.BICUBIC),
    TorchVisionTrns.CenterCrop(inputSize), #<! Output is `inputSize` x `inputSize`
    TorchVisionTrns.Normalize(mean = (0.48145466, 0.4578275, 0.40821073), std = (0.26862954, 0.26130258, 0.27577711)),
])

# Reference transform (Presumably the one built by the CLIP package - Confirm against its source):
# return Compose([
#         Resize(n_px, interpolation=BICUBIC),
#         CenterCrop(n_px),
#         _convert_image_to_rgb,
#         ToTensor(),
#         Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
#     ])

In [None]:
# Pre Processors of CLIP

oProc      = oTrns #<! Image pre processing. Should match `_transform()` of the `clip` package (The 2nd output of `clip.load()`), not `clip.tokenize()`
oTokenizer = clip.tokenize #<! Text tokenizer of the `clip` package

In [None]:
# Data Set
# One sample per image, iterated in a fixed order, so the row index of the
# stacked similarity matrix equals the data set index.

dataFolder = os.path.join(BASE_FOLDER, DATA_FOLDER_PATH, datasetName)
dsTrain    = CLIPDataset(dataFolderPath = dataFolder, dataFileName = datasetFile, oProc = oProc, oTokenizer = oTokenizer, uniqueImages = True) #<! Only unique images
# dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = True, batch_size = batchSize, num_workers = 2, persistent_workers = True)
dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = False, batch_size = batchSize) #<! Keep indices aligned
numSamples = len(dsTrain)

print(f'The number of training samples: {numSamples}')

In [None]:
# Image + Caption Pair
dsTrain._dfDataCaptions.head()

### Plot Data

In [None]:
# Plot the Data
# Shows a 3x3 grid of randomly drawn images with their caption as the title.

hF, vHa = plt.subplots(nrows = 3, ncols = 3, figsize = (9, 9))
vHa = vHa.flat #<! Flat iterator over the 3x3 grid of axes

for ii in range(9):
    hA = vHa[ii]

    # Drawn with replacement, the same sample may appear more than once
    sampleIdx  = random.randrange(numSamples)
    mI         = dsTrain.GetImage(sampleIdx) #<! Raw image, no pre processing
    captionTxt = dsTrain.GetCaption(sampleIdx)

    hA.imshow(mI)
    hA.set_title(captionTxt, {'fontsize': 6}, wrap = True) #<! Small wrapped font for long captions
    hA.axis('off')

* <font color='brown'>(**#**)</font> Some of the images are annotated with multiple captions.

## Load Model

Loading the CLIP model by OpenAI.

In [None]:
# Check GPU Availability

runDevice   = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #<! The 1st CUDA device

In [None]:
# Models of CLIP
clip.available_models()

In [None]:
# Model
# By default, the model is loaded in Float16
# NOTE(review): The official package presumably keeps Float16 weights only on CUDA devices (Float32 on CPU) - Confirm for the installed version

# The 2nd output (The image pre processor of the package) is discarded, `oTrns` is used instead
oModel, _ = clip.load('ViT-B/16', device = runDevice, jit = False, download_root = modelFolderPath) #<! The JIT must be disabled for training

# Fail early if the transform output size does not match the model input
if oModel.visual.input_resolution != inputSize:
    raise ValueError(f'The input size of the model is {oModel.visual.input_resolution} and not {inputSize}')

## Image Retrieval  

In [None]:
# Iterate on the Loader
# The first batch.
# `tX` holds the pre processed images, `tY` holds the tokenized captions (Not class labels)
tX, tY = next(iter(dlTrain)) #<! PyTorch Tensors

print(f'The batch features dimensions: {tX.shape}')
print(f'The batch labels dimensions: {tY.shape}')

In [None]:
# Forward pass of a single batch to inspect the outputs of the model
tX = tX.to(runDevice)
tY = tY.to(runDevice)

# The outputs are not back propagated here, hence skip building the computational graph
with torch.inference_mode():
    tSimImage, tSimTxt = oModel(tX, tY) #<! Similarity between the image and the text (tSimImage -> Images as rows, tSimTxt -> Text as rows)

In [None]:
# Retrieval Text

lRetText = ['a car', 'trees or grass']
tRetText = clip.tokenize(lRetText).to(runDevice)


In [None]:
# Image Retrieval
# Scores each image of the data set against the retrieval texts, batch by batch.
# The loader is not shuffled, so the appended batches follow the data set order.

lLogImage = [] #<! One matrix of scores per batch

with torch.inference_mode():
    for tI, tTxt in dlTrain: #<! The captions (`tTxt`) are not used, the images are scored against `tRetText`

        tI = tI.to(runDevice)

        tLogImage, _ = oModel(tI, tRetText) #<! Similarity of each image (Rows) per text (Columns) (batchSize, len(lRetText))
        mLogImage    = tLogImage.cpu().detach().numpy()

        lLogImage.append(mLogImage)

In [None]:
# Locate the Top K (`paramK`) Images
mSim    = np.vstack(lLogImage) #<! Stack the batches: Images as rows, texts as columns
lTopIdx = GetTopKImages(mSim, paramK) #<! List with a vector of image indices per text

In [None]:
# Display the Top K Images

hF = plt.figure(constrained_layout = True)
hF.suptitle(f'Top {paramK} Images per Text')

# Create Sub Figures: A row per text
# `squeeze = False` keeps an array even for a single text
vHSF = hF.subfigures(nrows = len(lRetText), ncols = 1, squeeze = False)[:, 0] #<! Sub Figures
for rowIdx, hSubF in enumerate(vHSF):
    hSubF.suptitle(f'Text: {lRetText[rowIdx]}')

    # Create Subplots per Sub Figure: A column per retrieved image (Follows K instead of a hard coded 3)
    numTop = len(lTopIdx[rowIdx])
    vHa = hSubF.subplots(nrows = 1, ncols = numTop, squeeze = False)[0] #<! Keeps an array even for K = 1
    for colIdx, hA in enumerate(vHa):
        imgIdx = lTopIdx[rowIdx][colIdx]
        hA.imshow(dsTrain.GetImage(imgIdx))
        hA.axis('off')
        hA.set_title(f'Index: {imgIdx}, Score: {mSim[imgIdx, rowIdx]:.2f}')


## Fine Tuning

In [None]:
# Data Set
# All (Image, Caption) pairs are used for the fine tuning (An image may appear with several captions).

batchSize = 16 #<! Overrides the batch size set in the parameters cell

dsTrain    = CLIPDataset(dataFolderPath = dataFolder, dataFileName = datasetFile, oProc = oProc, oTokenizer = oTokenizer)
dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = True, batch_size = batchSize, drop_last = True) #<! `drop_last` -> Each batch holds exactly `batchSize` pairs
# dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = False, batch_size = batchSize)
numSamples = len(dsTrain)

print(f'The number of training samples: {numSamples}')

In [None]:
# Model

ModelToFloat32(oModel)


In [None]:
# Loss Functions
hLImg = nn.CrossEntropyLoss()
hLTxt = nn.CrossEntropyLoss()


In [None]:
# Optimizer
# Set Hyper Parameters. Follow the paper with a lower Learning Rate which is "safer" for fine tuning (Transfer Learning)
# The paper uses Adam with _decoupled_ weight decay, namely `AdamW`.
# With `Adam`, `weight_decay` is an L2 term added to the gradient (Coupled), which is a different regularization.
oOpt = torch.optim.AdamW(oModel.parameters(), lr = 2e-5, betas = (0.9, 0.98), eps = 1e-6, weight_decay = 0.2)

In [None]:
# Training Loop
# Symmetric contrastive loss: Within a batch the i-th image matches the i-th text,
# hence the target of row `i` of both similarity matrices is the class `i`.

numBatches = len(dlTrain)

for epochIdx in range(numEpochs):
    startTime       = time.time()
    epochLoss       = 0.0 #<! Accumulates the loss weighted by the batch size
    numSamplesEpoch = 0   #<! Number of samples processed in the epoch
    for ii, (tI, tTxt) in enumerate(dlTrain):
        itrBatchSize = tI.shape[0]
        oOpt.zero_grad()

        tI   = tI.to(runDevice)
        tTxt = tTxt.to(runDevice)

        tSimImg, tSimTxt = oModel(tI, tTxt)

        tY = torch.arange(itrBatchSize, dtype = torch.long, device = runDevice) #<! Matching pairs are on the diagonal

        valLoss = (hLImg(tSimImg, tY) + hLTxt(tSimTxt, tY)) / 2

        valLoss.backward() #<! Backward Propagation
        oOpt.step()

        batchLoss        = valLoss.item() #<! Single device -> host transfer per iteration
        epochLoss       += itrBatchSize * batchLoss
        numSamplesEpoch += itrBatchSize

        print(f'\rIteration: {(ii + 1):3d} / {numBatches}, loss: {batchLoss:.6f}', end = '')

    print('', end = '\r')
    epochTime = time.time() - startTime
    epochLoss = epochLoss / max(numSamplesEpoch, 1) #<! Report the mean loss per sample (Was the raw sum)

    print('Epoch '              f'{(epochIdx + 1):4d} / ' f'{numEpochs}', end = '')
    print(' | Train Loss: '     f'{epochLoss:6.3f}', end = '')
    print(' | Epoch Time: '     f'{epochTime:5.2f}', end = '')
    print(' |')

* <font color='brown'>(**#**)</font> One could train an additional classifier layer on top of the output features of the model instead of fine tuning the whole model:

```python
import torch.nn as nn

# Modify the model to include a classifier for subcategories
class CLIPFineTuner(nn.Module):
    """
    Linear classifier on top of the (Frozen) image features of a CLIP model.
    Only the linear layer receives gradients in `forward()`.
    """
    def __init__(self, oClipModel: nn.Module, numCls: int):
        """
        Input:
            oClipModel - Model which exposes `encode_image()` and `visual.output_dim`.
            numCls     - Number of classes (Output size of the classifier).
        """
        super(CLIPFineTuner, self).__init__()
        self.oModel     = oClipModel
        self.oClsModel  = nn.Linear(oClipModel.visual.output_dim, numCls)
    
    def forward(self, x):
        # `no_grad()` and not `inference_mode()`: Inference tensors can not be
        # saved for the backward pass of the trainable classifier.
        with torch.no_grad():
            # Calculate the Features of the CLIP model
            tF = self.oModel.encode_image(x).float()  #<! The CLIP model may be Float16 -> Convert to Float32
        return self.oClsModel(tF) #<! The classifier attribute is `oClsModel`
```

<!-- * <font color='brown'>(**#**)</font> [A Beginner's Guide to Fine-Tuning CLIP Models](https://github.com/mlfoundations/open_clip/discussions/911) or [Fine Tuning CLIP Models](https://www.marqo.ai/course/fine-tuning-clip-models). -->

In [None]:
# Data Set
# Back to one sample per image in a fixed order for the retrieval after the fine tuning.

dataFolder = os.path.join(BASE_FOLDER, DATA_FOLDER_PATH, datasetName)
dsTrain    = CLIPDataset(dataFolderPath = dataFolder, dataFileName = datasetFile, oProc = oProc, oTokenizer = oTokenizer, uniqueImages = True) #<! Only unique images
# dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = True, batch_size = batchSize, num_workers = 2, persistent_workers = True)
dlTrain    = torch.utils.data.DataLoader(dsTrain, shuffle = False, batch_size = batchSize) #<! Keep indices aligned
numSamples = len(dsTrain)

print(f'The number of training samples: {numSamples}')

In [None]:
# Retrieval Text

lRetText = ['a car', 'trees or grass']
tRetText = clip.tokenize(lRetText).to(runDevice)


In [None]:
# Image Retrieval
# Scores each image of the data set against the retrieval texts using the fine tuned model.
# The loader is not shuffled, so the appended batches follow the data set order.

lLogImage = [] #<! One matrix of scores per batch

with torch.inference_mode():
    for tI, tTxt in dlTrain: #<! The captions (`tTxt`) are not used, the images are scored against `tRetText`

        tI = tI.to(runDevice)

        tLogImage, _ = oModel(tI, tRetText) #<! Similarity of each image (Rows) per text (Columns) (batchSize, len(lRetText))
        mLogImage    = tLogImage.cpu().detach().numpy()

        lLogImage.append(mLogImage)

# Locate the Top 5 Images

In [None]:
# Locate the Top K (`paramK`) Images
mSim    = np.vstack(lLogImage) #<! Stack the batches: Images as rows, texts as columns
lTopIdx = GetTopKImages(mSim, paramK) #<! List with a vector of image indices per text

In [None]:
# Display the Top K Images

hF = plt.figure(constrained_layout = True)
hF.suptitle(f'Top {paramK} Images per Text')

# Create Sub Figures: A row per text
# `squeeze = False` keeps an array even for a single text
vHSF = hF.subfigures(nrows = len(lRetText), ncols = 1, squeeze = False)[:, 0] #<! Sub Figures
for rowIdx, hSubF in enumerate(vHSF):
    hSubF.suptitle(f'Text: {lRetText[rowIdx]}')

    # Create Subplots per Sub Figure: A column per retrieved image (Follows K instead of a hard coded 3)
    numTop = len(lTopIdx[rowIdx])
    vHa = hSubF.subplots(nrows = 1, ncols = numTop, squeeze = False)[0] #<! Keeps an array even for K = 1
    for colIdx, hA in enumerate(vHa):
        imgIdx = lTopIdx[rowIdx][colIdx]
        hA.imshow(dsTrain.GetImage(imgIdx))
        hA.axis('off')
        hA.set_title(f'Index: {imgIdx}, Score: {mSim[imgIdx, rowIdx]:.2f}')
