### Universal Fake Detection

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Testing OpenCLIP

In [2]:
import torch
from PIL import Image
import open_clip
import sys
import os

# Put the parent directory on sys.path so the project-local `models` package
# can be imported from this notebook's working directory.
sys.path.append(os.path.abspath(".."))
# NOTE: a later cell in this notebook defines its own `CLIPModel` class, which
# shadows the one imported here once that cell has run.
from models.clip_models import CLIPModel


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/scur0555/.local/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/scur0555/.local/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/scur0555/.local/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/scur0555

In [3]:
# Architecture name and pretrained-checkpoint tag handed to OpenCLIP.
name, pretrained = "ViT-L/14", "dfn2b"
model, _, preprocess = open_clip.create_model_and_transforms(
    name,
    pretrained=pretrained,
)
# The model comes back in train mode by default, which matters for models with
# BatchNorm or stochastic depth active; eval() returns the same module.
model = model.eval()
tokenizer = open_clip.get_tokenizer('ViT-L-14')

Downloading (…)ip_pytorch_model.bin: 100%|██████████| 1.71G/1.71G [00:06<00:00, 263MB/s]
  return self.fget.__get__(instance, owner)()


In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Dummy dataset of random floats
class DummyDataset(Dataset):
    """Tiny in-memory dataset of random image tensors with random binary labels.

    Args:
        size: Number of samples to generate (default 2).
        seed: Optional integer seed. When given, images and labels are drawn
            from a private ``torch.Generator`` seeded with it, so two datasets
            built with the same seed are identical and the global torch RNG is
            left untouched. When ``None`` (the default) the global torch RNG
            is used, exactly as before.

    Attributes:
        data: Float tensor of shape ``(size, 3, 224, 224)`` with values in [0, 1).
        labels: Float tensor of shape ``(size, 1)`` holding 0.0 or 1.0.
    """

    def __init__(self, size=2, seed=None):
        # generator=None makes torch fall back to the global RNG.
        rng = None
        if seed is not None:
            rng = torch.Generator().manual_seed(seed)
        # random images
        self.data = torch.rand(size, 3, 224, 224, generator=rng)
        # binary labels
        self.labels = torch.randint(0, 2, (size, 1), generator=rng).float()

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]



# Provided validate function
def validate(model, loader, find_thres=False):
    """Run ``model`` over ``loader`` and collect sigmoid scores next to the labels.

    Args:
        model: Callable mapping an image batch to logits; the result must
            support ``.sigmoid()``. If it is an ``nn.Module`` it is switched to
            eval mode for the duration of the pass (its previous mode is
            restored afterwards) and batches are moved to the device of its
            first parameter.
        loader: Sized iterable yielding ``(img, label)`` batches of tensors.
        find_thres: Currently unused; kept so existing calls keep working.

    Returns:
        Tuple ``(y_true, y_pred)`` of 1-D NumPy arrays: the flattened labels
        and the flattened sigmoid scores, in iteration order. The same arrays
        are also printed.
    """
    is_module = isinstance(model, nn.Module)

    # Inference should not run with train-time behaviour (dropout, BatchNorm
    # updates, stochastic depth); remember the mode so it can be restored.
    was_training = is_module and model.training
    if was_training:
        model.eval()

    # Feed batches on the device the weights live on; plain callables and
    # parameter-free modules get CPU tensors, as before.
    device = torch.device("cpu")
    if is_module:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    y_true, y_pred = [], []
    # NOTE: len(loader) is the number of batches, which equals the number of
    # samples only when batch_size == 1.
    print("Length of dataset: %d" % (len(loader)))
    try:
        with torch.no_grad():
            for img, label in loader:
                in_tens = img.to(device)
                y_pred.extend(model(in_tens).sigmoid().flatten().tolist())
                y_true.extend(label.flatten().tolist())
    finally:
        if was_training:
            model.train()

    y_true, y_pred = np.array(y_true), np.array(y_pred)
    print("True labels:", y_true)
    print("Predicted scores:", y_pred)
    return y_true, y_pred

In [20]:
# Feature width associated with each encoder name. Kept as a reference table;
# CLIPModel.forward decides whether to project from the actual feature width,
# so alternative spellings of a name (e.g. "ViT-H-14") behave the same.
CHANNELS = {
    "RN50" : 1024,
    "ViT-L/14" : 768,
    "ViT-H/14" : 1024,
    "ViT-g/14" : 1024,
}

class CLIPModel(nn.Module):
    """CLIP image encoder followed by a linear classification head.

    Note: this notebook-local class shadows the ``CLIPModel`` imported from
    ``models.clip_models`` earlier in the notebook.

    Args:
        name: Encoder architecture name passed to the model factory.
        pretrained: OpenCLIP pretrained tag. When truthy the encoder is built
            with ``open_clip.create_model_and_transforms``; otherwise it is
            loaded with ``clip.load``.
        num_classes: Output width of the final linear layer (default 1).

    Attributes:
        model: The underlying CLIP network (loaded on CPU).
        preprocess: Image transform returned by the model factory.
        project: ``nn.Linear(1024, 768)`` applied to 1024-wide features.
        fc: ``nn.Linear(768, num_classes)`` classification head.
    """

    def __init__(self, name, pretrained=None, num_classes=1):
        super().__init__()
        self.name = name
        # self.preprocess will not be used during training, which is handled in Dataset class
        if pretrained:
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                name,
                pretrained=pretrained,
                device="cpu",
            )
        else:
            # `clip` is not imported anywhere else in this notebook, so import
            # it lazily here instead of failing with a NameError.
            import clip
            self.model, self.preprocess = clip.load(name, device="cpu")

        # add a linear layer to the model (hard-coded for ViT)
        self.project = nn.Linear(1024, 768)
        self.fc = nn.Linear(768, num_classes)

    def forward(self, x, return_feature=False):
        """Encode ``x`` and return class logits, or the features themselves.

        Args:
            x: Image batch accepted by ``self.model.encode_image``.
            return_feature: If True, return the (possibly projected) features
                instead of the output of ``self.fc``.

        Returns:
            ``self.fc(features)``, or ``features`` when ``return_feature`` is True.
        """
        features = self.model.encode_image(x)
        # Project whenever the encoder emits features as wide as the projection
        # expects (1024). Keying on the width instead of the exact name string
        # keeps hyphenated names such as "ViT-H-14" from skipping the
        # projection and then failing inside self.fc.
        if features.shape[-1] == self.project.in_features:
            features = self.project(features)
        if return_feature:
            return features
        return self.fc(features)


In [21]:
name = "ViT-H/14"
pretrained = "laion2b_s32b_b79k"

# Create data loader and model
dataset = DummyDataset()
loader = DataLoader(dataset, batch_size=1)
# Load model
model = CLIPModel(name, pretrained)
# The checkpoint is only used as a state dict for the fc head, so restrict
# unpickling to tensors and plain containers instead of arbitrary objects.
state_dict = torch.load(
    "../pretrained_weights/fc_weights.pth",
    map_location='cpu',
    weights_only=True,
)
model.fc.load_state_dict(state_dict)
model = model.cpu()

# Run test
validate(model, loader)

Length of dataset: 2
True labels: [0. 1.]
Predicted scores: [0.36211863 0.43808025]


In [None]:
# OpenCLIP test (optional)
# By the time this cell runs, `model` may have been rebound to the CLIPModel
# wrapper, which keeps the CLIP network in `.model` and does not itself define
# encode_image / encode_text. Use the wrapped network when there is one.
clip_backbone = getattr(model, "model", model)

image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad(), torch.autocast("cuda"):
    image_features = clip_backbone.encode_image(image)
    text_features = clip_backbone.encode_text(text)
    # L2-normalise so the dot product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

In [3]:
# Show the (architecture, pretrained tag) pairs that open_clip knows about.
# Relies on `open_clip` having been imported by an earlier cell.
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_text_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_basic_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_s13m_b4k'),
 ('ViT-

In [8]:
# Scratch check: dict.get ignores the fallback (768) when the key is present.
dic = dict(a=1)
dic.get("a", 768)

1