In [None]:
import torch
import open_clip
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as T
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset as PTConcatDataset
from tqdm.notebook import tqdm
from scipy.stats import entropy
from IPython.display import display
import seaborn as sns
import os
import bisect

# Set the title of the notebook's process.
from rtpt.rtpt import setproctitle
setproctitle('@Clipping_Privacy_LAION400M_Notebook')
# Every relative path used below ('./data/...', './laion400m_experiments/...') is resolved
# against this working directory.
# NOTE(review): hard-coded absolute path — presumably a container mount point; adjust when
# running the notebook elsewhere.
os.chdir('/workspace')

# Imported only after the chdir above; presumably a project-local `datasets` module that
# lives in /workspace — TODO confirm.
from datasets import FaceScrub, SingleClassSubset

%matplotlib inline

# Cap the number of rows pandas renders when a DataFrame is displayed.
pd.set_option('display.max_rows', 15)

# Prepare the CLIP model

In [None]:
# Select the compute device and load every CLIP variant (LAION-400M weights) exactly once.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

MODEL_NAMES = ['ViT-B-32', 'ViT-B-16', 'ViT-L-14']
models, preprocessings = {}, {}
for model_name in MODEL_NAMES:
    # Three values are returned; the second one is not needed in this notebook.
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name,
        pretrained='laion400m_e32',
    )
    preprocessings[model_name] = preprocess
    # Switch the module to evaluation mode before storing it.
    model = model.eval()
    models[model_name] = model

# Load the Members and Non-Members

In [None]:
def _load_membership_table(csv_path):
    """Read one membership CSV and return it as a cleaned DataFrame.

    The stored index is replaced by a fresh 0..n-1 index, the 'False' and 'True'
    columns are dropped, and a 'name' column is added that holds 'class_name'
    with underscores replaced by spaces.
    """
    table = (
        pd.read_csv(csv_path, index_col=0)
        .reset_index(drop=True)
        .drop(['False', 'True'], axis='columns')
    )
    table['name'] = table['class_name'].str.replace("_", " ", regex=False)
    return table


laion_non_members = _load_membership_table('./laion400m_experiments/laion400m_non_members.csv')
print('Non-Members')
display(laion_non_members)

laion_members = _load_membership_table('./laion400m_experiments/laion400m_members.csv')
# Label the second table as well so the two displayed frames can be told apart.
print('Members')
display(laion_members)

# Define the Concatenated Dataset Class (FaceScrub + LAION Non-Members)

In [None]:
class ConcatDataset(PTConcatDataset):
    """Concatenation of class-labelled datasets that share one global label space.

    Every wrapped dataset has to expose ``classes`` (sequence of class names) and
    ``targets`` (one integer label per sample). The labels of a wrapped dataset are
    shifted by the number of classes of all datasets in front of it, so that a
    label is a valid index into the concatenated ``classes`` list.
    """

    def _class_offset(self, dataset_idx):
        """Return the number of classes contributed by the datasets before ``dataset_idx``."""
        return sum(len(dataset.classes) for dataset in self.datasets[:dataset_idx])

    @property
    def classes(self):
        """Class names of all wrapped datasets, in concatenation order."""
        classes = []
        for dataset in self.datasets:
            classes.extend(dataset.classes)

        return classes

    @property
    def targets(self):
        """Labels of all samples, shifted into the global label space."""
        targets = []
        offset = 0
        for dataset in self.datasets:
            targets.extend((np.array(dataset.targets) + offset).tolist())
            # The next dataset's labels start right after this dataset's classes.
            offset += len(dataset.classes)

        return targets

    def __getitem__(self, idx):
        """Return ``(sample, global_label)`` for the sample at position ``idx``.

        Negative indices count from the end.

        Raises:
            ValueError: if a negative ``idx`` reaches further back than the dataset length.
        """
        if idx < 0:
            if -idx > len(self):
                raise ValueError("absolute value of index should not exceed dataset length")
            idx = len(self) + idx

        # cumulative_sizes holds the running end position of every wrapped dataset, so
        # bisect_right yields the dataset that contains the global index.
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]

        x, y = self.datasets[dataset_idx][sample_idx]

        return x, y + self._class_offset(dataset_idx)

In [None]:
# Build every dataset once per CLIP variant, each with that variant's own preprocessing.
# The transform is looked up in `preprocessings`, not taken from the `preprocess` variable
# left over from the model-loading loop (which only holds the last model's transform).
facescrub_datasets = {
    model_name: FaceScrub(
        root='./data/facescrub',
        group='all',
        train=True,
        cropped=False,
        transform=preprocessings[model_name],
    )
    for model_name in MODEL_NAMES
}
laion_german_non_members_actors = {
    model_name: ImageFolder(
        './data/laion_german_non_members/actors/images',
        transform=preprocessings[model_name],
    )
    for model_name in MODEL_NAMES
}
laion_german_non_members_actresses = {
    model_name: ImageFolder(
        './data/laion_german_non_members/actresses/images',
        transform=preprocessings[model_name],
    )
    for model_name in MODEL_NAMES
}

In [None]:
# Chain FaceScrub with the two laion_german_non_members image folders, once per CLIP variant.
concat_datasets = {}
for model_name in MODEL_NAMES:
    parts = [
        facescrub_datasets[model_name],
        laion_german_non_members_actors[model_name],
        laion_german_non_members_actresses[model_name],
    ]
    concat_datasets[model_name] = ConcatDataset(parts)

In [None]:
# Print size and class overview of the concatenated dataset of every model.
for model_name in MODEL_NAMES:
    concat_dataset = concat_datasets[model_name]
    all_class_names = concat_dataset.classes
    summary_lines = [
        f'---------- {model_name} ----------',
        f'Dataset size: {len(concat_dataset)}',
        f'First Few Classes: {all_class_names[:10]}',
        f'Last Few Classes: {all_class_names[-10:]}',
        f'Total Number of Classes: {len(all_class_names)}',
    ]
    for summary_line in summary_lines:
        print(summary_line)

In [None]:
# For every model, create one SingleClassSubset per class of its concatenated dataset.
dataset_class_subset_per_model = {}
for model_name in MODEL_NAMES:
    concat_dataset = concat_datasets[model_name]
    num_classes = len(concat_dataset.classes)
    dataset_class_subsets = [
        SingleClassSubset(concat_dataset, class_idx)
        for class_idx in range(num_classes)
    ]
    dataset_class_subset_per_model[model_name] = dataset_class_subsets

In [None]:
# Show the first preprocessed image of the first class and of the last class (first model).
first_model_subsets = dataset_class_subset_per_model[MODEL_NAMES[0]]
for class_subset in (first_model_subsets[0], first_model_subsets[-1]):
    # class_subset[0] is the first (image, label) pair; [0] picks the image tensor.
    image_tensor = class_subset[0][0]
    # Move the first axis to the end before plotting (presumably CHW -> HWC — TODO confirm).
    plt.imshow(image_tensor.permute(1, 2, 0).numpy())
    plt.show()

# Run the Model for Test Purposes on the first Actor

In [None]:
# Tokenize the human-readable class names; these are the candidate labels given to CLIP.
split_class_names = {}
label_context_vecs = {}
for model_name in MODEL_NAMES:
    readable_names = [
        class_name.replace("_", " ")
        for class_name in concat_datasets[model_name].classes
    ]
    split_class_names[model_name] = readable_names
    label_context_vecs[model_name] = open_clip.tokenize(readable_names)

In [None]:
# Show the last 20 human-readable class names of the first model.
split_class_names[MODEL_NAMES[0]][-20:]

In [None]:
# define a function to get the predictions for an actor/actress
@torch.no_grad()
def get_preds_for_dataset(model, subset, context, batch_size=8, num_workers=8, device=device):
    """Predict, for every sample in ``subset``, the index of the best matching label.

    Args:
        model: CLIP model; calling ``model(images, context)`` must return
            ``(image_features, text_features, logit_scale)``.
        subset: dataset yielding ``(image, label)`` pairs; the labels are ignored.
        context: tokenized candidate labels, one row per label.
        batch_size: number of images per forward pass.
        num_workers: number of DataLoader worker processes.
        device: device the model, the context and the images are moved to.

    Returns:
        1-D CPU tensor with one predicted label index per sample, in dataset order.
        An empty ``subset`` gives an empty ``torch.long`` tensor.

    Note:
        The model is moved to ``device`` for the computation and moved to the CPU
        afterwards, also when the loop raises.
    """
    # Pinned host memory only helps for CUDA targets; accept 'cuda', 'cuda:0' and torch.device.
    use_pinned_memory = torch.device(device).type == 'cuda'
    dataloader = DataLoader(subset, batch_size=batch_size, num_workers=num_workers, pin_memory=use_pinned_memory)

    context = context.to(device)
    model = model.to(device)

    preds = []
    try:
        for x, _ in tqdm(dataloader, desc='Iterating Dataset'):
            x = x.to(device)
            image_features, text_features, logits_scale = model(x, context)
            # The model returns features rather than logits, so the scaled similarity is
            # computed here (the original note says OpenAI's implementation does this internally).
            logits_per_image = logits_scale * image_features @ text_features.T
            preds.append(logits_per_image.argmax(-1).cpu())
    finally:
        # Free the device for the next model even if the loop was interrupted.
        model.cpu()

    if not preds:
        # torch.cat raises on an empty list; report "no predictions" explicitly instead.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(preds)


In [None]:
# Sanity check on the class at index 3 of the first model: majority vote over its images.
first_model_name = MODEL_NAMES[0]
test_subset_dataset = dataset_class_subset_per_model[first_model_name][3]
test_preds = get_preds_for_dataset(
    models[first_model_name],
    test_subset_dataset,
    label_context_vecs[first_model_name],
)
unique_vals, counts = test_preds.unique(return_counts=True)
# The most frequently predicted label is taken as the prediction for the whole class.
prediction = unique_vals[counts.argmax()]
all_class_names = concat_datasets[first_model_name].classes
print(f'Prediction: {all_class_names[prediction]}\t Correct Class: {all_class_names[test_subset_dataset.target_class]}')

In [None]:
# Same sanity check for the third-to-last class of the first model.
first_model_name = MODEL_NAMES[0]
test_subset_dataset = dataset_class_subset_per_model[first_model_name][-3]
test_preds = get_preds_for_dataset(
    models[first_model_name],
    test_subset_dataset,
    label_context_vecs[first_model_name],
)
unique_vals, counts = test_preds.unique(return_counts=True)
# The most frequently predicted label is taken as the prediction for the whole class.
prediction = unique_vals[counts.argmax()]
all_class_names = concat_datasets[first_model_name].classes
print(f'Prediction: {all_class_names[prediction]}\t Correct Class: {all_class_names[test_subset_dataset.target_class]}')

# Run the CLIP model on each Actress/Actor

In [None]:
# Run every model over all class subsets in one pass; predictions come back in subset order.
preds_per_model = {}
for model_name in MODEL_NAMES:
    dataset = PTConcatDataset(dataset_class_subset_per_model[model_name])
    print(f'{model_name}:')
    preds = get_preds_for_dataset(
        models[model_name],
        dataset,
        label_context_vecs[model_name],
        batch_size=256,
        num_workers=64,
    )
    # One prediction per sample is required for the per-class split that follows.
    assert len(preds) == len(dataset)
    preds_per_model[model_name] = preds

In [None]:
# Cut the flat prediction tensor of every model into one slice per class subset.
preds_per_model_per_subset = {}
for model_name, preds in preds_per_model.items():
    preds_per_subset = []
    start = 0
    for subset in dataset_class_subset_per_model[model_name]:
        stop = start + len(subset)
        preds_per_subset.append(preds[start:stop])
        # The next subset's predictions begin where this one's end.
        start = stop
    preds_per_model_per_subset[model_name] = preds_per_subset

In [None]:
# Build one long-format table per model: one row per (class, sample) with the predicted class.
preds_df_per_model = {}
for model_name in models.keys():
    # `classes` rebuilds its list on every access, so fetch it once per model.
    class_names = concat_datasets[model_name].classes
    # Use this model's own subsets rather than the `dataset_class_subsets` variable left
    # over from the cell that created them (which only refers to the last model).
    subsets_and_preds = zip(dataset_class_subset_per_model[model_name], preds_per_model_per_subset[model_name])

    df_list = []
    for group_idx, (dataset_subset, preds_subset) in enumerate(subsets_and_preds):
        class_name = class_names[dataset_subset.target_class]
        for sample_idx, pred in enumerate(preds_subset):
            df_list.append({
                'group_idx': group_idx,
                'class_name': class_name,
                'sample_idx': sample_idx,
                'prediction': class_names[int(pred)]
            })
    preds_df = pd.DataFrame(df_list)
    preds_df_per_model[model_name] = preds_df

In [None]:
# Inspect the prediction table of the last model. The key is spelled out instead of relying
# on the `model_name` loop variable that leaked out of an earlier cell.
preds_df_per_model[MODEL_NAMES[-1]]

In [None]:
# Attach the ground-truth membership to every prediction row. Rows whose class appears in
# neither list are dropped by the inner merges.
for model_name, preds_df in preds_df_per_model.items():
    members = preds_df.merge(laion_members['class_name'], on='class_name')
    members['actual_membership'] = 'member'

    non_members = preds_df.merge(laion_non_members['class_name'], on='class_name')
    non_members['actual_membership'] = 'non_member'

    labelled_preds = pd.concat([members, non_members], ignore_index=True)
    preds_df_per_model[model_name] = labelled_preds

preds_df_per_model[model_name]

In [None]:
# Peek at ViT-B-32: two randomly sampled predictions per class, first ten rows only.
preds_df_per_model['ViT-B-32'].groupby('class_name').sample(2).head(10)

In [None]:
# Number of distinct classes left after the membership merge (last model). The key is
# spelled out instead of relying on a loop variable leaked from an earlier cell.
len(preds_df_per_model[MODEL_NAMES[-1]].groupby('class_name'))

In [None]:
# Candidate subsample sizes per model: 1, 2, 4, ... up to the smallest number of images
# that any class has.
subsample_sizes_per_model = {}
for model_name, preds_df in preds_df_per_model.items():
    # `.min()` replaces `sort_values()[0]`, which indexed a label-indexed Series by position.
    min_num_images = int(preds_df['class_name'].value_counts().min())
    subsample_sizes = np.arange(0, min_num_images + 1, 2).tolist()
    # A subsample of size 0 is meaningless; start with a single image instead.
    subsample_sizes[0] = 1
    subsample_sizes_per_model[model_name] = subsample_sizes

In [None]:
# For every model and subsample size, repeatedly draw `sample_size` images per class, infer
# membership from the majority prediction and record the confusion counts and rates.
subsampled_dfs_per_model = {}
sample_draws = 20
SUBSAMPLE_SEED = 0  # fixed seed so that re-running the cell draws the same subsamples
for model_name, preds_df in preds_df_per_model.items():
    subsample_sizes = subsample_sizes_per_model[model_name]
    # One generator per model; its state advances with every draw, so draws still differ.
    rng = np.random.RandomState(SUBSAMPLE_SEED)

    # Ground truth per class and the class totals do not depend on the draw: compute them once.
    actual_membership_per_class = preds_df.groupby('class_name')['actual_membership'].agg(pd.Series.mode)
    membership_counts = actual_membership_per_class.value_counts()
    num_member, num_non_member = membership_counts['member'], membership_counts['non_member']

    subsampled_dfs = []
    for sample_size in tqdm(subsample_sizes):
        for i in range(sample_draws):
            sampled_preds = preds_df.groupby('class_name').sample(sample_size, random_state=rng)
            # Most frequent prediction(s) per class, always stored as a list so that ties
            # (several modes) and unique modes can be handled uniformly.
            membership_prediction_df = (
                sampled_preds.groupby('class_name')['prediction']
                .agg(pd.Series.mode)
                .apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else [x])
                .reset_index()
            )
            # A class counts as 'member' only if the unique majority prediction is the class itself.
            membership_prediction_df['membership_prediction'] = membership_prediction_df.apply(
                lambda x: 'member' if len(x['prediction']) == 1 and x['class_name'] in x['prediction'] else 'non_member',
                axis='columns',
            )
            membership_prediction_df = pd.merge(membership_prediction_df, actual_membership_per_class, on='class_name')

            predicted_member = membership_prediction_df['membership_prediction'] == 'member'
            actual_member = membership_prediction_df['actual_membership'] == 'member'
            actual_non_member = membership_prediction_df['actual_membership'] == 'non_member'

            # 'membership_prediction' is either 'member' or 'non_member', so the negation
            # of `predicted_member` selects exactly the 'non_member' predictions.
            tp = int((predicted_member & actual_member).sum())
            fp = int((predicted_member & actual_non_member).sum())
            fn = int((~predicted_member & actual_member).sum())
            tn = int((~predicted_member & actual_non_member).sum())

            subsampled_dfs.append({
                'sample_size': sample_size,
                'draw': i,
                'tpr': tp / num_member,
                'fnr': fn / num_member,
                'fpr': fp / num_non_member,
                'tnr': tn / num_non_member,
                'tp': tp,
                'fn': fn,
                'fp': fp,
                'tn': tn
            })
    subsampled_dfs_per_model[model_name] = subsampled_dfs

In [None]:
# Turn the per-draw records of every model into a table indexed by the number of samples.
rate_column_labels = {
    'tpr': 'True Positive Rate',
    'fnr': 'False Negative Rate',
    'fpr': 'False Positive Rate',
    'tnr': 'True Negative Rate',
}
for model_name, subsampled_dfs in subsampled_dfs_per_model.items():
    results_table = (
        pd.DataFrame(subsampled_dfs)
        .set_index('sample_size')
        .drop('draw', axis='columns')
        .rename(columns=rate_column_labels)
    )
    results_table.index.name = 'Number of Samples'
    subsampled_dfs_per_model[model_name] = results_table

In [None]:
# Inspect the subsampling results table of ViT-B-32.
subsampled_dfs_per_model['ViT-B-32']

In [None]:
# Use the CSV files as the source of truth for the result tables. A file that does not
# exist yet is written from the freshly computed table first, so nothing has to be
# uncommented by hand on a first run. Existing files are never overwritten: delete them to
# store new results.
loaded_dfs_per_model = {}
for model_name in models:
    csv_path = f'laion400m_experiments/prediction_dfs/predictions_laion_{model_name}.csv'
    if not os.path.exists(csv_path):
        # The computed tables are only needed here, so this cell still works on its own
        # (without re-running the experiment) whenever the files are already present.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        subsampled_dfs_per_model[model_name].to_csv(csv_path)
    loaded_dfs_per_model[model_name] = pd.read_csv(csv_path, index_col=0)
subsampled_dfs_per_model = loaded_dfs_per_model

In [None]:
# Show the last three rows of every model's result table. The dictionary key is the model
# name, so it is named accordingly and printed to label each table.
for model_name, subsampled_dfs in subsampled_dfs_per_model.items():
    print(model_name)
    display(subsampled_dfs.tail(3))

In [None]:
# Plot the four rates against the number of samples (mean line with a standard-deviation
# band) and store each figure as PDF and PNG.
# NOTE(review): newer seaborn releases replace `ci='sd'` with `errorbar='sd'` — confirm
# against the installed version before changing it.
rate_columns = ['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']
for model_name, subsampled_dfs in subsampled_dfs_per_model.items():
    # Start from an empty figure so the curves of the previous model do not leak in.
    plt.clf()
    rates_only = subsampled_dfs[rate_columns]
    ax = sns.lineplot(data=rates_only, ci='sd')
    plt.tight_layout()

    figure = ax.get_figure()
    figure.savefig(f'./laion400m_experiments/plots/subsample_plot_LAION400M_{model_name}.pdf')
    figure.savefig(f'./laion400m_experiments/plots/subsample_plot_LAION400M_{model_name}.png', dpi=100)

    print(model_name)
    plt.show()

In [None]:
# Draw one confusion matrix per model from the last group of the result table (the last
# 'Number of Samples' value after grouping), annotated with mean ± std of counts and rates.
count_columns = ['tp', 'fn', 'fp', 'tn']
rate_columns = ['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']
for model_name, subsampled_dfs in subsampled_dfs_per_model.items():
    # Aggregate once per model instead of repeating the same groupby for every statistic.
    per_sample_size = subsampled_dfs.groupby('Number of Samples')
    last_mean = per_sample_size.mean().iloc[-1]
    last_std = per_sample_size.std().iloc[-1]

    tp_std, fn_std, fp_std, tn_std = last_std[count_columns]
    tp, fn, fp, tn = last_mean[count_columns]

    tpr_std, fnr_std, fpr_std, tnr_std = last_std[rate_columns]
    tpr, fnr, fpr, tnr = last_mean[rate_columns]

    # Rows are the actual membership, columns the predicted membership.
    normalized_conf_mat = (
        pd.DataFrame({'member': [tpr, fpr], 'non_member': [fnr, tnr]}, index=['member', 'non_member'])
        .rename_axis('Actual Membership', axis='index')
        .rename_axis('Predicted Membership', axis='columns')
    )

    group_names = ['TP','FN','FP','TN']
    group_counts = ["{0:0.0f} \u00B1 {1:0.2f}".format(mean, std) for mean, std in zip([tp, fn, fp, tn], [tp_std, fn_std, fp_std, tn_std])]
    percentage = ["{0:0.2f}% \u00B1 {1:0.02f}%".format(mean * 100, std * 100) for mean, std in zip([tpr, fnr, fpr, tnr], [tpr_std, fnr_std, fpr_std, tnr_std])]
    # Label order TP, FN, FP, TN matches the row-major layout of the 2x2 matrix above.
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, percentage)]
    plt.clf()
    ax = sns.heatmap(normalized_conf_mat, annot=np.asarray(labels).reshape(2, 2), fmt='', cbar=False, cmap='Blues')
    plt.tight_layout()
    figure = ax.get_figure()
    figure.savefig(f'./laion400m_experiments/plots/confusion_matrix_LAION400M_{model_name}.pdf')
    figure.savefig(f'./laion400m_experiments/plots/confusion_matrix_LAION400M_{model_name}.png', dpi=100)
    print(model_name)
    plt.show()