In [None]:
import pandas as pd
import torch
import numpy as np
from torchvision.datasets import ImageFolder
from torch.utils.data import ConcatDataset, DataLoader
from IPython.display import display
from clip_retrieval.clip_client import ClipClient, Modality
from IPython.display import Image, display
from base64 import b64decode
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from copy import deepcopy
import json
import open_clip
from PIL import Image as PILImage
import os
import wandb
import cv2
import itertools
from pl_bolts.transforms.dataset_normalizations import imagenet_normalization
import torchvision.transforms as T
import pytorch_lightning as pl

# Make /workspace the working directory; the project-local imports and the
# relative data/experiment paths used in this notebook are presumably resolved
# from there — TODO confirm.
os.chdir('/workspace')
from rtpt.rtpt import setproctitle
# Give the kernel process a recognisable title.
setproctitle('@Clipping_Privacy_CC3M_Notebook')

from facescrub_training.pl_models.resnet import ResNet50
from facescrub_training.datasets import FaceScrubCropped
from utils import TQDMParallel

First, the clip-retrieval backend has to be started.

# Prepare the Clip Retrieval Library

In [None]:
# When True the clip-retrieval backend is queried; when False the cached JSON
# results are loaded from disk instead.
QUERY_BACKEND = False
# Number of images requested from the backend per query.
NUM_IMAGES = 50
# After starting the docker containers, get their IP addresses using the following command:
# docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' <docker_container_name>
CLIENT_URLS = [
    'http://172.17.0.3:1337/knn-service'
]

In [None]:
# One ClipClient per backend URL; every client uses the same index and query
# settings and returns NUM_IMAGES results per query.
clients = [
    ClipClient(
        url=backend_url,
        indice_name='CC3M-Train',
        aesthetic_weight=0,
        modality=Modality.IMAGE,
        use_safety_model=False,
        use_violence_detector=False,
        deduplicate=False,
        num_images=NUM_IMAGES
    )
    for backend_url in CLIENT_URLS
]
len(clients)

def log_result(result):
    """Print the id and similarity of one retrieval result and show its image.

    Args:
        result: mapping with at least the keys 'image', 'id' and 'similarity'.
            'image' is passed through ``b64decode`` before being rendered, so
            it is expected to hold base64-encoded image data.
    """
    # Only the keys that are actually shown are looked up; the local is not
    # called `id` so the builtin of that name stays usable.
    result_id = result['id']
    similarity = result['similarity']
    print(f"id: {result_id}")
    print(f"similarity: {similarity}")
    display(Image(b64decode(result['image'])))

# Test Clip Retrieval to make sure it works properly

In [None]:
# Smoke test of the backend; skipped entirely when QUERY_BACKEND is False.
if QUERY_BACKEND:
    # Text query against the last configured client.
    cat = clients[-1].query(text='an image of a cat')
    print(len(cat))
    # Print and render the first returned result.
    log_result(cat[0])

# Load the FaceScrub Dataset

In [None]:
# Load the FaceScrub actors and actresses image folders (no transform is
# applied, so indexing a dataset yields the loaded image and its class index).
actors_dataset = ImageFolder(root='./data/facescrub/actors/images')
actresses_dataset = ImageFolder(root='./data/facescrub/actresses/images')

In [None]:
# Sanity check: show the first image of the actors and then of the actresses dataset.
for facescrub_dataset in (actors_dataset, actresses_dataset):
    plt.imshow(facescrub_dataset[0][0])
    plt.show()

# Get the NUM_IMAGES most similar images to each of the images in the FaceScrub dataset

In [None]:
def get_similar_images_from_dataset(dataset, clip_retrieval_client):
    """Query the retrieval backend once for every image of a dataset.

    Args:
        dataset: dataset exposing ``imgs`` as a sequence of ``(path, class)``
            pairs and supporting ``len()`` (e.g. the ImageFolder datasets
            loaded above).
        clip_retrieval_client: client whose ``query(image=...)`` returns a list
            of result dicts.

    Returns:
        dict mapping each image path to its list of results (with the 'image'
        payload removed). A query that raised maps to an empty list.
    """
    similar_images = {}
    for img, _cls in tqdm(dataset.imgs, desc='Getting Similar Images', total=len(dataset)):
        # Single-image querying, payload stripping and error handling live in
        # get_similar_images so both entry points behave the same.
        similar_images[img] = get_similar_images(img, clip_retrieval_client)

    return similar_images

def get_similar_images(img, clip_retrieval_client):
    """Return the retrieval results for a single query image.

    The 'image' entry is removed from every result dict. Any exception raised
    while querying or stripping is printed, and whatever has been collected up
    to that point (an empty list if the query itself failed) is returned.
    """
    matches = []
    try:
        matches = clip_retrieval_client.query(image=img)
        for match in matches:
            # drop the 'image' entry; a missing key is not an error
            match.pop('image', None)
    except Exception as err:
        print(f"Exception: {err}")

    return matches

In [None]:
# Cache file of the retrieval results for the actors. The path is built once
# (as an f-string, so NUM_IMAGES is actually substituted) and shared by the
# write and the read branch so both always refer to the same file.
actors_sim_imgs_path = f'cc3m_experiments/face_scrub_top{NUM_IMAGES}_similar_conceptual_caption_images_actors.json'

if QUERY_BACKEND:
    # get the similar images as list; the images are distributed round-robin over the clients
    sim_imgs_actors = TQDMParallel(n_jobs=len(clients), total=len(actors_dataset))(delayed(get_similar_images)(actors_dataset.imgs[i][0], clients[i%len(clients)]) for i in range(len(actors_dataset)))

    # convert the list to a dictionary keyed by the FaceScrub image path
    actors_sim_imgs = {}
    for i, (img, cls) in enumerate(actors_dataset.imgs):
        actors_sim_imgs[img] = sim_imgs_actors[i]

    # save the dictionary as a json file
    with open(actors_sim_imgs_path, 'w') as json_file:
        json_file.write(json.dumps(actors_sim_imgs))
else:
    # load the cached results of an earlier run
    with open(actors_sim_imgs_path, 'r') as json_file:
        actors_sim_imgs = json.load(json_file)

In [None]:
# Cache file of the retrieval results for the actresses, built as an f-string
# so that NUM_IMAGES is substituted (consistent with the actors cell).
actresses_sim_imgs_path = f'cc3m_experiments/face_scrub_top{NUM_IMAGES}_similar_conceptual_caption_images_actresses.json'
# Earlier versions of this cell lacked the f-prefix and therefore wrote and
# read a file whose name contains the literal text "{NUM_IMAGES}". That name
# is only used as a read fallback.
legacy_actresses_sim_imgs_path = 'cc3m_experiments/face_scrub_top{NUM_IMAGES}_similar_conceptual_caption_images_actresses.json'

if QUERY_BACKEND:
    # do the same as above for the actresses
    # get the similar images as list; the images are distributed round-robin over the clients
    sim_imgs_actresses = TQDMParallel(n_jobs=len(clients), total=len(actresses_dataset))(delayed(get_similar_images)(actresses_dataset.imgs[i][0], clients[i%len(clients)]) for i in range(len(actresses_dataset)))

    # convert the list to a dictionary keyed by the FaceScrub image path
    actresses_sim_imgs = {}
    for i, (img, cls) in enumerate(actresses_dataset.imgs):
        actresses_sim_imgs[img] = sim_imgs_actresses[i]

    # save the dictionary as a json file
    with open(actresses_sim_imgs_path, 'w') as json_file:
        json_file.write(json.dumps(actresses_sim_imgs))
else:
    # prefer the correctly named cache; fall back to the legacy file name if only that one exists
    if not os.path.exists(actresses_sim_imgs_path) and os.path.exists(legacy_actresses_sim_imgs_path):
        actresses_sim_imgs_path = legacy_actresses_sim_imgs_path
    with open(actresses_sim_imgs_path, 'r') as json_file:
        actresses_sim_imgs = json.load(json_file)

In [None]:
def create_df(sim_imgs):
    """Flatten the per-image retrieval results into one long DataFrame.

    Args:
        sim_imgs: dict mapping a query image path to its list of result dicts.

    Returns:
        DataFrame with one row per result. The result columns appear in
        reversed order after the query path column 'image', followed by a
        'name' column: the query file name without its extension and without
        its last '_'-separated part.
    """
    per_query_frames = [
        pd.DataFrame(results).assign(image=query_image)
        for query_image, results in sim_imgs.items()
    ]
    combined = pd.concat(per_query_frames, ignore_index=True)

    # reverse the column order so that the query image path comes first
    combined = combined[combined.columns[::-1]]

    file_names = combined.image.str.split('/').str[-1]
    stems = file_names.str.split('.').str[:-1].str.join('.')
    names = stems.str.split('_').str[:-1].str.join('_')

    return combined.assign(name=names)

# Build the long-format result frames for both groups; the actors frame is
# shown as the cell output.
actresses_df = create_df(actresses_sim_imgs)
actors_df = create_df(actors_sim_imgs)
actors_df

# Predict the Persons in the Images using the OpenCLIP Model

In [None]:
class ImageListDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads the images referenced by a sequence of paths.

    Args:
        image_list: iterable of image file paths (list, pandas Series, ...).
        transform: optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, image_list, transform=None):
        super().__init__()
        # Materialise to a plain list so that ``idx`` is always positional.
        # A pandas Series would be indexed by label and fail for any index
        # that is not exactly 0..n-1.
        self.img_list = list(image_list)
        self.transform = transform

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):
        img_pth = self.img_list[idx]
        # The context manager releases the file handle; convert() loads the
        # pixel data before the file is closed and yields RGB images, as
        # ImageListDatasetWithBB does.
        with PILImage.open(img_pth) as opened:
            img = opened.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)

        return img

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the OpenCLIP "ViT-B-32-quickgelu" model with the 'laion400m_e32' weights; of the two
# returned transforms only the last one is kept as `preprocess`.
clip, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32-quickgelu", pretrained='laion400m_e32', device=device)
# The class names with '_' replaced by ' ' are used as the text inputs (no prompt template) and tokenized.
actors_split_class_names = [x.replace("_", " ") for x in actors_dataset.classes]
actors_label_context_vecs = open_clip.tokenize(actors_split_class_names).to(device)
actresses_split_class_names = [x.replace("_", " ") for x in actresses_dataset.classes]
actresses_label_context_vecs = open_clip.tokenize(actresses_split_class_names).to(device)

In [None]:
# Lookup tables indexed by class index with a single 'class' column (the class name),
# one per dataset; used below to turn predicted indices back into names.
index_to_classes_actors = pd.DataFrame(actors_dataset.class_to_idx.items(), columns=['class', 'idx']).set_index('idx')
index_to_classes_actresses = pd.DataFrame(actresses_dataset.class_to_idx.items(), columns=['class', 'idx']).set_index('idx')
display(index_to_classes_actors)
display(index_to_classes_actresses)

In [None]:
# Classify every retrieved image of the actors frame with CLIP, using the
# tokenized actor names as the candidate texts.
dataset = ImageListDataset(actors_df['image_path'], transform=preprocess)
dataloader = DataLoader(dataset, batch_size=1024, num_workers=8, pin_memory=device=='cuda')

with torch.no_grad():
    preds = []
    for x in tqdm(dataloader):
        x = x.to(device)
        # NOTE(review): this unpacking assumes the model's forward pass returns exactly
        # (image features, text features, scale) — confirm for the installed open_clip version.
        image_features, text_features, logits_scale = clip(x, actors_label_context_vecs)
        # we have to calculate the cosine similarity manually. OpenAI does this internally.
        logits_per_image = logits_scale  * image_features @ text_features.T
        # keep the index of the highest-scoring class name for each image
        preds.append(logits_per_image.argmax(-1).cpu())

    preds = torch.cat(preds)

    # translate the predicted class indices into class names
    actors_df['openclip_prediction'] = index_to_classes_actors.reindex(preds.tolist())['class'].tolist()
# Persist the predictions and reload them from disk.
# NOTE(review): the file name hard-codes "top50" instead of using NUM_IMAGES.
actors_df.to_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actors.csv')
actors_df = pd.read_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actors.csv', index_col=0)
# every row of this frame belongs to the actors group
actors_df['gender'] = 'm'
actors_df

In [None]:
# Classify every retrieved image of the actresses frame with CLIP, using the
# tokenized actress names as the candidate texts.
dataset = ImageListDataset(actresses_df['image_path'], transform=preprocess)
dataloader = DataLoader(dataset, batch_size=1024, num_workers=8, pin_memory=device=='cuda')

with torch.no_grad():
    preds = []
    for x in tqdm(dataloader):
        x = x.to(device)
        # NOTE(review): this unpacking assumes the model's forward pass returns exactly
        # (image features, text features, scale) — confirm for the installed open_clip version.
        image_features, text_features, logits_scale = clip(x, actresses_label_context_vecs)
        # we have to calculate the cosine similarity manually. OpenAI does this internally.
        logits_per_image = logits_scale  * image_features @ text_features.T
        # keep the index of the highest-scoring class name for each image
        preds.append(logits_per_image.argmax(-1).cpu())

    preds = torch.cat(preds)

    # translate the predicted class indices into class names
    actresses_df['openclip_prediction'] = index_to_classes_actresses.reindex(preds.tolist())['class'].tolist()
# Persist the predictions and reload them from disk.
# NOTE(review): the file name hard-codes "top50" instead of using NUM_IMAGES.
actresses_df.to_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actresses.csv')
actresses_df = pd.read_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actresses.csv', index_col=0)
# every row of this frame belongs to the actresses group
actresses_df['gender'] = 'f'
actresses_df

# Detect Faces in the similar images

In [None]:
def get_face_bb(image_pth_list):
    """Detect frontal faces in a list of images with OpenCV's Haar cascade.

    Args:
        image_pth_list: iterable of image file paths.

    Returns:
        A list with one entry per input path, in input order. Each entry is a
        list of bounding boxes ``[x, y, w, h]``. It is empty when no face was
        found or when the image could not be read.
    """
    face_cascade = cv2.CascadeClassifier(os.path.join(cv2.data.haarcascades, 'haarcascade_frontalface_default.xml'))

    bbs = []
    for image_path in image_pth_list:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising. Record "no faces" so that one bad file
            # does not abort the whole chunk, and keep the output aligned
            # with the input paths.
            print(f"Could not read image: {image_path}")
            bbs.append([])
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(32, 32))
        # an empty result has no .tolist(), hence the explicit length check
        bbs.append(faces.tolist() if len(faces) > 0 else [])

    return bbs

# Face detection over the actors frame runs in parallel: the image paths are
# split into chunks of `chunk_size` and each chunk is handled by one
# get_face_bb call.
num_workers = 64
chunk_size = 500
image_path_list_chunks = [actors_df['image_path'][i:i+chunk_size].tolist() for i in range(0, len(actors_df['image_path']), chunk_size)]
# The result is one list of per-image bounding-box lists per chunk.
face_bbs = TQDMParallel(
    n_jobs=num_workers, total=len(image_path_list_chunks)
)(
    delayed(get_face_bb)(chunk) for chunk in image_path_list_chunks
)

In [None]:
# Flatten the per-chunk results into one entry per row of actors_df.
# NOTE(review): `face_bbs` is overwritten here, so re-running this cell without
# re-running the detection cell would flatten the already flat list once more.
face_bbs = list(itertools.chain(*face_bbs))
actors_df['face_bbs'] = face_bbs
# Persist and reload; after the CSV round trip the bounding boxes are strings.
actors_df.to_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actors_with_BB.csv')
actors_df = pd.read_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actors_with_BB.csv', index_col=0)
# convert the string arrays to numpy arrays
# reshape(-1, 4) gives one row of four values per bounding box
# (presumably an image without faces becomes a (0, 4) array — TODO confirm)
actors_df['face_bbs'] = actors_df['face_bbs'].apply(lambda x: np.asarray(np.matrix(x)).reshape(-1, 4))
actors_df

In [None]:
# Same parallel face detection as for the actors, now over the actresses
# frame; `chunk_size` and `num_workers` are reused from the actors cell.
image_path_list_chunks = [actresses_df['image_path'][i:i+chunk_size].tolist() for i in range(0, len(actresses_df['image_path']), chunk_size)]
# The result is one list of per-image bounding-box lists per chunk.
face_bbs = TQDMParallel(
    n_jobs=num_workers, total=len(image_path_list_chunks)
)(
    delayed(get_face_bb)(chunk) for chunk in image_path_list_chunks
)

In [None]:
# Flatten the per-chunk results into one entry per row of actresses_df.
# NOTE(review): `face_bbs` is overwritten here, so re-running this cell without
# re-running the detection cell would flatten the already flat list once more.
face_bbs = list(itertools.chain(*face_bbs))
actresses_df['face_bbs'] = face_bbs
# Persist and reload; after the CSV round trip the bounding boxes are strings.
actresses_df.to_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actresses_with_BB.csv')
actresses_df = pd.read_csv('cc3m_experiments/facescrub_top50_images_predictions_VitB32_OpenCLIP_actresses_with_BB.csv', index_col=0)
# convert the string arrays to numpy arrays; reshape(-1, 4) gives one row of four values per bounding box
actresses_df['face_bbs'] = actresses_df['face_bbs'].apply(lambda x: np.asarray(np.matrix(x)).reshape(-1, 4))
actresses_df

# Predict the Persons on the Images using the ResNet50 trained on FaceScrub

In [None]:
# Restore the ResNet50 classifier from the FaceScrub checkpoint file.
resnet50 = ResNet50.load_from_checkpoint('facescrub_training/pretrained_models/rn50_facescrub.ckpt')

In [None]:
# Run the loaded ResNet50 on a FaceScrubCropped dataset through the Lightning
# test loop. `test_set.classes` is also used later to name the predictions.
# NOTE(review): the positional `False` presumably selects the test split —
# confirm against FaceScrubCropped.
test_set = FaceScrubCropped(
    False, 
    transform=T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), imagenet_normalization()])
)
# Single-device trainer with deterministic=True; only used for testing here.
trainer = pl.Trainer(
    accelerator='auto',
    devices=1,
    deterministic=True
)
trainer.test(resnet50, dataloaders=DataLoader(test_set, batch_size=128, num_workers=8, pin_memory=True))

In [None]:
class ImageListDatasetWithBB(torch.utils.data.Dataset):
    """Dataset of image crops: item ``idx`` is image ``idx`` cropped to bounding box ``idx``.

    ``image_list`` and ``bb_list`` are parallel sequences. Each bounding box
    is ``(x, y, w, h)``. The optional ``transform`` is applied to the RGB crop.
    """

    def __init__(self, image_list, bb_list, transform=None):
        super().__init__()
        self.img_list, self.bb_list, self.transform = image_list, bb_list, transform

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):
        image = PILImage.open(self.img_list[idx]).convert("RGB")
        left, top, width, height = self.bb_list[idx]
        # PIL expects (left, upper, right, lower) instead of (x, y, w, h)
        face = image.crop((left, top, left + width, top + height))
        return face if self.transform is None else self.transform(face)

def convert_to_lists(image_pths, face_bbs):
    """Expand per-image bounding-box lists into two parallel flat lists.

    Every bounding box yields one entry in each returned list: the path of the
    image it belongs to and the box itself. Images without boxes contribute
    nothing.

    Returns:
        ``(input_img_list, input_face_bb_list)``, two lists of equal length.
    """
    pairs = [
        (img_pth, bb)
        for img_pth, faces in zip(image_pths, face_bbs)
        for bb in faces
    ]
    input_img_list = [img_pth for img_pth, _bb in pairs]
    input_face_bb_list = [bb for _img_pth, bb in pairs]

    return input_img_list, input_face_bb_list

def convert_to_df(predicted_classes, face_bbs):
    """Regroup the flat prediction sequence into one slice per image.

    ``face_bbs`` holds the bounding boxes of each image. The i-th returned
    element is the slice of ``predicted_classes`` that belongs to the boxes of
    image i (an empty slice for images without boxes). Despite the name, a
    plain list is returned, not a DataFrame.
    """
    # TODO: check return value
    stops = list(itertools.accumulate(len(faces) for faces in face_bbs))
    starts = [0] + stops[:-1]
    return [predicted_classes[start:stop] for start, stop in zip(starts, stops)]


def get_predictions(model, dataframe):
    """Predict the class name of every detected face in ``dataframe``.

    Args:
        model: classifier that is called on a batch of normalised 224x224
            crops; the argmax over the last output dimension is used as the
            index into ``test_set.classes``.
        dataframe: frame with an 'image_path' column and a 'face_bbs' column
            holding the bounding boxes of each image.

    Returns:
        A list with one entry per row of ``dataframe``, each holding the
        predicted class names for that row's bounding boxes (empty when the
        row has none).

    Relies on the notebook globals ``device`` and ``test_set``.
    """
    input_img_list, input_face_bb_list = convert_to_lists(dataframe['image_path'], dataframe['face_bbs'])
    dataset = ImageListDatasetWithBB(input_img_list, input_face_bb_list, transform=T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), imagenet_normalization()]))
    dataloader = DataLoader(dataset, batch_size=1024, num_workers=16, pin_memory=device=='cuda')

    # built once instead of once per batch
    class_names = np.array(test_set.classes)

    predicted_classes = []
    model = model.to(device)
    # Inference must run in eval mode (batch-norm / dropout). Which mode the
    # model is in at this point depends on what ran before, so it is set
    # explicitly and restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, desc='Predicting batches'):
                batch = batch.to(device)
                # use the model that was passed in, not the global `resnet50`
                preds = model(batch).cpu().argmax(-1)
                predicted_classes.extend(class_names[preds.numpy()])
    finally:
        model.train(was_training)

    return convert_to_df(predicted_classes, dataframe['face_bbs'])

In [None]:
# Predict a class name for every detected face in both frames; each result
# holds one list of predictions per row of the respective frame.
resnet50_actors_predictions = get_predictions(resnet50, actors_df)
resnet50_actresses_predictions = get_predictions(resnet50, actresses_df)

In [None]:
# Attach the per-row prediction lists to the frames as a new column.
actors_df['resnet50_predictions'] = resnet50_actors_predictions
actresses_df['resnet50_predictions'] = resnet50_actresses_predictions

# Analyze the predictions of the CLIP and the ResNet50 Model

In [None]:
# The combined frame is cached on disk because building it takes some time.
# It is built only when the cache file does not exist yet, so a first run works
# without editing this cell. Delete the file to force a rebuild after the
# upstream frames have changed.
concat_csv_path = 'cc3m_experiments/facescrub_top50_similar_images_cc_VitB32.csv'
if not os.path.exists(concat_csv_path):
    concat_df = pd.concat([actors_df, actresses_df], ignore_index=True)
    # CLIP is correct when its single prediction equals the person's name
    concat_df['openclip_prediction_correct'] = concat_df['name'] == concat_df['openclip_prediction']
    # the ResNet50 is correct when any of the detected faces was predicted as that person
    concat_df['resnet50_prediction_correct'] = concat_df.apply(lambda x: x['name'] in x['resnet50_predictions'], axis=1)
    concat_df.to_csv(concat_csv_path)
# always continue with the frame as stored on disk
concat_df = pd.read_csv(concat_csv_path, index_col=0)
concat_df

In [None]:
# Number of rows (similar images) per person, largest first. The count is
# taken from the 'image' column directly instead of counting every column and
# dropping a hard-coded list of the others, which raised a KeyError whenever
# the columns of concat_df changed.
actors_counts = (
    concat_df.groupby('name')['image']
    .count()
    .sort_values(ascending=False)
    .rename('count')
    .reset_index()
)
print('Number of similar images per actor/actress')
display(actors_counts)

In [None]:
# filter all images where no face could be detected
def _has_detected_face(face_bbs):
    """Return True when a 'face_bbs' entry holds at least one bounding box."""
    if isinstance(face_bbs, str):
        # concat_df is read back from CSV, so the boxes arrive as text. An
        # entry without boxes is expected to be stored as "[]", whose len()
        # is 2, so a plain length check would keep every row.
        return face_bbs.strip() not in ('', '[]')
    return len(face_bbs) > 0

preds_df = concat_df[concat_df['face_bbs'].map(_has_detected_face)].reset_index(drop=True)
preds_df

In [None]:
# One row per person: the number of similar images that remain in preds_df,
# plus the number and share of correct predictions of both models.
prediction_df = preds_df.groupby('name').name.count().to_frame(name='num_similar_samples').reset_index()

# The per-person statistics come from concat_df and are looked up by name.
# Assigning them by row position would silently shift values to the wrong
# person as soon as a name present in concat_df is missing from preds_df.
# Summing the boolean flags counts the True values and also works when nobody
# has a correct prediction.
names_in_concat = concat_df['name']
openclip_num_correct = concat_df['openclip_prediction_correct'].eq(True).groupby(names_in_concat).sum()
resnet50_num_correct = concat_df['resnet50_prediction_correct'].eq(True).groupby(names_in_concat).sum()
gender_per_name = concat_df.groupby('name').gender.first()

prediction_df['openclip_num_correct_preds'] = prediction_df['name'].map(openclip_num_correct)
prediction_df['resnet50_num_correct_preds'] = prediction_df['name'].map(resnet50_num_correct)
prediction_df['openclip_percentage_correct_preds'] = prediction_df['openclip_num_correct_preds'] / prediction_df['num_similar_samples']
prediction_df['resnet50_percentage_correct_preds'] = prediction_df['resnet50_num_correct_preds'] / prediction_df['num_similar_samples']
prediction_df['gender'] = prediction_df['name'].map(gender_per_name)
prediction_df

In [None]:
# read the csv files that contain the most similar images of the laion 400M dataset to each of the facescrub images
# NOTE(review): the file names say "top200" although this comment originally said top 50 — confirm which is right
laion_similar_images_to_facescrub_actors = pd.read_csv("laion400m_experiments/facescrub_top200_similar_laion400m_images_actors.csv", index_col=0)
laion_similar_images_to_facescrub_actresses = pd.read_csv("laion400m_experiments/facescrub_top200_similar_laion400m_images_actresses.csv", index_col=0)
# stack actors and actresses into one frame with a fresh index
laion_similar_images_to_facescrub = pd.concat([laion_similar_images_to_facescrub_actors, laion_similar_images_to_facescrub_actresses], ignore_index=True)

In [None]:
# get only the persons where the caption contains the names and count how many images there are with their name in the caption
# keep only the rows whose 'caption_contains_name' flag is set
images_with_names = laion_similar_images_to_facescrub[laion_similar_images_to_facescrub['caption_contains_name']]
# per class_name, count the flagged rows; the resulting count column is labelled with the boolean True
num_images_with_names = images_with_names.groupby('class_name').caption_contains_name.value_counts().unstack(fill_value=0).reset_index().set_index('class_name')[True].reset_index()
# sort by that count (largest first) and rename the columns to 'name' and 'num_samples_with_name_in_cap'
num_images_with_names = num_images_with_names.sort_values(True, ascending=False, ignore_index=True).rename(columns={'class_name': 'name', True: 'num_samples_with_name_in_cap'}).reset_index(drop=True)
num_images_with_names

In [None]:
# get the persons where there is a name in the caption from the predictions on the cc dataset
# join on 'name' (pd.merge defaults to an inner join), so only persons present in both frames remain
inner_join = pd.merge(num_images_with_names, prediction_df, on='name')
inner_join

In [None]:
# summary statistics of the joined frame
inner_join.describe()

In [None]:
# keep only the persons with at most 20% correct OpenCLIP predictions and at most 15% correct ResNet50 predictions
cc_non_members = inner_join[(inner_join['openclip_percentage_correct_preds'] <= 0.20) & (inner_join['resnet50_percentage_correct_preds'] <= 0.15)]
# filter out those persons who have less than 100 images with their name in the caption
cc_non_members = cc_non_members[cc_non_members['num_samples_with_name_in_cap'] >= 100]
# per gender, keep the (up to) 100 persons with the lowest ResNet50 percentage
cc_non_members = cc_non_members.sort_values(['resnet50_percentage_correct_preds'], ascending=True, ignore_index=True).groupby('gender').head(100).reset_index(drop=True)
# put both percentages into 20 quantile bins
cc_non_members['openclip_percentage_correct_preds_bins'] = pd.qcut(cc_non_members['openclip_percentage_correct_preds'], q=20)
cc_non_members['resnet50_percentage_correct_preds_bins'] = pd.qcut(cc_non_members['resnet50_percentage_correct_preds'], q=20)
# order by gender, then by the OpenCLIP bin, then by the ResNet50 bin
cc_non_members = cc_non_members.sort_values(['gender', 'openclip_percentage_correct_preds_bins', 'resnet50_percentage_correct_preds_bins'])

In [None]:
# all selected actors (gender 'm'), in the order established above
actor_non_members = cc_non_members.groupby('gender').get_group('m')
# save the first half of the actors as non-members
actor_non_members = actor_non_members[:int(len(actor_non_members)/2)].reset_index(drop=True)
actor_non_members.to_csv('cc3m_experiments/conceptual_captions_facescrub_member_info/actors_non_members.csv')
actor_non_members.head(10)

In [None]:
# all selected actors (gender 'm'), in the order established above
# NOTE(review): the variable is still called `actor_non_members`, but from here on it holds the members half
actor_non_members = cc_non_members.groupby('gender').get_group('m')
# save the last half of the actors as members
actor_non_members = actor_non_members[int(len(actor_non_members)/2):].reset_index(drop=True)
actor_non_members.to_csv('cc3m_experiments/conceptual_captions_facescrub_member_info/actors_members.csv')
actor_non_members.head(10)

In [None]:
# all selected actresses (gender 'f'), in the order established above
actresses_non_members = cc_non_members.groupby('gender').get_group('f')
# save the first half of the actresses as non-members
actresses_non_members = actresses_non_members[:int(len(actresses_non_members)/2)].reset_index(drop=True)
# written under cc3m_experiments/ like the other three member-info files
actresses_non_members.to_csv('cc3m_experiments/conceptual_captions_facescrub_member_info/actresses_non_members.csv')
actresses_non_members.head(10)

In [None]:
# all selected actresses (gender 'f'), in the order established above
# NOTE(review): the variable is still called `actresses_non_members`, but from here on it holds the members half
actresses_non_members = cc_non_members.groupby('gender').get_group('f')
# save the last half of the actresses as members
actresses_non_members = actresses_non_members[int(len(actresses_non_members)/2):].reset_index(drop=True)
actresses_non_members.to_csv('cc3m_experiments/conceptual_captions_facescrub_member_info/actresses_members.csv')
actresses_non_members.head(10)

In [None]:
# see how the percentage of correct predictions is for the actresses (gender 'f'): show the last 15 rows
cc_non_members.groupby('gender').get_group('f').tail(15)