In [None]:
import os

os.chdir('../')
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

import torch
import open_clip
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as T
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset as PTConcatDataset
from tqdm.notebook import tqdm
from scipy.stats import entropy
from IPython.display import display
import seaborn as sns
import os
import bisect
import random
from joblib import Parallel, delayed
from matplotlib.ticker import FormatStrFormatter

from rtpt.rtpt import setproctitle
setproctitle('@Clip_Notebook')

from datasets import FaceScrub, SingleClassSubset

%matplotlib inline

# limit how many rows a displayed DataFrame shows
pd.set_option('display.max_rows', 15)

# run on the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# open_clip architectures that are evaluated (each is loaded with the 'laion400m_e32' weights below)
MODEL_NAMES = ['ViT-B-32', 'ViT-B-16', 'ViT-L-14']
# total number of candidate names: the dataset classes are filled up with randomly combined names
NUM_TOTAL_NAMES = 1_000
# prompt templates; '{0}' is replaced by the (space separated) name of a person
PROMPTS = [
    '{0}',
    'an image of {0}', 
    'a photo of {0}', 
    '{0} on a photo', 
    'a photo of a person named {0}', 
    'a person named {0}', 
    'a man named {0}',
    'a woman named {0}',
    'the name of the person is {0}', 
    'a photo of a person with the name {0}', 
    '{0} at a gala', 
    'a photo of the celebrity {0}', 
    'actor {0}',
    'actress {0}',
    'a colored photo of {0}',
    'a black and white photo of {0}',
    'a cool photo of {0}',
    'a cropped photo of {0}',
    'a cropped image of {0}',
    '{0} in a suit',
    '{0} in a dress'
]
# random state used when sampling the additional random names
SEED = 42
MIN_NUM_IMAGES_AVAILABLE = 30 # the number of images of a person that have to be available to include the person in the experiments. Persons with fewer images are not included.
MAX_NUM_TRAINING_SAMPLES = 300 # individuals that occur this many times or more often in the training set are excluded from the experiments
MIN_NUM_CORRECT_PROMPT_PREDS = 1 # the number of prompts for which the majority prediction has to be correct (tau in the paper)

# True: read the cached per-image predictions from CSV instead of running the models
LOAD_PREDICTIONS_FROM_FILE = True
# True: read the cached subsampling metrics from file instead of recomputing them
LOAD_PREDICTION_METRICS_FROM_FILE = True

# Prepare the CLIP model

In [None]:
# init clip
# Load every architecture in MODEL_NAMES with the 'laion400m_e32' weights and keep the model
# (switched to eval mode) and its image preprocessing transform in dicts keyed by model name.
models = {}
preprocessings = {}
for model_name in MODEL_NAMES:
    model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='laion400m_e32')
    preprocessings[model_name] = preprocess
    model = model.eval()
    models[model_name] = model
# NOTE(review): `model` and `preprocess` remain bound to the last architecture after the loop,
# so any later cell reading these names sees the values for MODEL_NAMES[-1].

In [None]:
# define a function to get the predictions for an actor/actress
@torch.no_grad()
def get_text_embeddings(model, context, context_batchsize=10_000, use_tqdm=False):
    """Encode tokenized prompts into text embeddings, one batch at a time.

    Args:
        model: Object exposing ``encode_text(tokens, normalize=True)`` (the open_clip
            models of this notebook).
        context: Token tensor whose last dimension is the sequence length. A tensor with
            fewer than three dimensions gets a leading axis of size one prepended.
        context_batchsize: Number of token sequences encoded per forward pass and per
            visible CUDA device.
        use_tqdm: Whether to show a progress bar over the batches.

    Returns:
        CPU tensor shaped like ``context`` (after the optional unsqueeze) where the
        sequence dimension is replaced by the embedding dimension.
    """
    # Scale the batch size with the number of GPUs. On a CPU-only machine device_count()
    # is 0, which would make the step of the range() below 0 and raise a ValueError,
    # so "no GPU" is treated like a single device.
    context_batchsize = context_batchsize * max(1, torch.cuda.device_count())

    # if the context has no leading batch dimension, add one
    if context.dim() < 3:
        context = context.unsqueeze(0)

    # flatten all leading dimensions so that every row is one token sequence
    seq_len = context.shape[-1]
    viewed_context = context.view(-1, seq_len)

    text_features = []
    for context_batch_idx in tqdm(range(0, len(viewed_context), context_batchsize), desc="Calculating Text Embeddings", disable=not use_tqdm):
        context_batch = viewed_context[context_batch_idx:context_batch_idx + context_batchsize]
        # move every batch to the CPU right away so the embeddings do not pile up on the GPU
        batch_text_features = model.encode_text(context_batch, normalize=True).cpu()

        text_features.append(batch_text_features)

    # restore the leading dimensions of the (unsqueezed) context
    text_features = torch.cat(text_features).view(list(context.shape[:-1]) + [-1])

    return text_features

@torch.no_grad()
def get_preds_for_dataset(model, subset, context, batch_size=8, num_workers=8, device=device, context_batchsize=10_000, no_tqdm=False, text_embeddings=None):
    """Predict, for every image of ``subset``, the index of the best matching text embedding.

    Args:
        model: Model exposing ``encode_image``, ``encode_text`` and ``logit_scale``.
        subset: Dataset yielding ``(image, label)`` pairs; the labels are ignored.
        context: Tokenized prompts; only used when ``text_embeddings`` is None.
        batch_size: Number of images per batch.
        num_workers: Number of DataLoader worker processes.
        device: Device the image batches are moved to.
        context_batchsize: Batch size forwarded to ``get_text_embeddings``.
        no_tqdm: Disable the progress bar.
        text_embeddings: Precomputed text embeddings (CPU tensor).

    Returns:
        Tensor with the argmax indices of all batches concatenated along the last
        dimension. For the stacked prompt embeddings used in this notebook this is
        expected to be (number of prompts, number of images) -- see the shape asserts
        in the cell that runs the models on all individuals.
    """
    if text_embeddings is None:
        text_embeddings = get_text_embeddings(model, context, context_batchsize=context_batchsize)

    loader = DataLoader(subset, batch_size=batch_size, num_workers=num_workers, pin_memory=device == 'cuda')

    batch_predictions = []
    for images, _ in tqdm(loader, desc='Iterating Dataset', disable=no_tqdm):
        # embed the images and add a leading axis so they broadcast against every prompt set
        image_embeddings = model.encode_image(images.to(device), normalize=True).cpu().unsqueeze(0)

        # we have to calculate the cosine similarity manually. OpenAI does this internally.
        temperature = model.logit_scale.exp().cpu()
        similarity_logits = temperature * image_embeddings @ text_embeddings.swapaxes(-1, -2)
        batch_predictions.append(similarity_logits.argmax(-1))

    return torch.cat(batch_predictions, dim=-1)

# Get the Datasets

In [None]:
# define a dataset class that concatenates the FaceScrub dataset with the datasets containing the European individuals
class ConcatDataset(PTConcatDataset):
    """Concatenation of labelled datasets with one shared, contiguous label space.

    Every wrapped dataset has to provide ``classes`` and ``targets``. The labels of a
    dataset are shifted by the number of classes of all datasets in front of it.
    """

    def _label_offset(self, dataset_idx):
        """Return the number of classes of all datasets located before ``dataset_idx``."""
        return sum(len(earlier.classes) for earlier in self.datasets[:dataset_idx])

    @property
    def classes(self):
        """Class names of all datasets, in dataset order."""
        return [name for dataset in self.datasets for name in dataset.classes]

    @property
    def class_to_idx(self):
        """Mapping from class name to its index in ``classes``."""
        names = self.classes
        return dict(zip(names, range(len(names))))

    @property
    def targets(self):
        """Labels of all samples, shifted into the shared label space."""
        shifted_targets = []
        for position, dataset in enumerate(self.datasets):
            offset = self._label_offset(position)
            shifted_targets += (np.array(dataset.targets) + offset).tolist()

        return shifted_targets

    def __getitem__(self, idx):
        """Return ``(sample, shifted_label)`` for a global, possibly negative, index."""
        if idx < 0:
            if -idx > len(self):
                raise ValueError("absolute value of index should not exceed dataset length")
            idx += len(self)

        # locate the dataset that contains the global index and the index inside of it
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        sample_idx = idx if dataset_idx == 0 else idx - self.cumulative_sizes[dataset_idx - 1]

        sample, label = self.datasets[dataset_idx][sample_idx]

        return sample, label + self._label_offset(dataset_idx)

In [None]:
# All datasets share a single image transform. Look it up explicitly instead of relying on
# the `preprocess` loop variable left over from the model initialization cell; that variable
# holds the transform of the last entry of MODEL_NAMES, so the transform used is unchanged.
dataset_transform = preprocessings[MODEL_NAMES[-1]]

facescrub_datasets = FaceScrub(root='./data/facescrub', group='all', train=True, cropped=False, transform=dataset_transform)
print(f'FaceScrub Dataset size: {len(facescrub_datasets)}')
print(f'FaceScrub Total Number of Classes: {len(facescrub_datasets.classes)}')

laion_european_actors = ImageFolder('./data/laion_european_celebs/actors/images', transform=dataset_transform)
laion_european_actresses = ImageFolder('./data/laion_european_celebs/actresses/images', transform=dataset_transform)
print(f'European Celebs Dataset size: {len(laion_european_actors) + len(laion_european_actresses)}')
print(f'European Celebs Total Number of Classes: {len(laion_european_actors.classes) + len(laion_european_actresses.classes)}')

####################################
# IMPORTANT: Not all of the classes in the dataset will be used for the experiments. Only classes with at least MIN_NUM_IMAGES_AVAILABLE images available are used for the experiments.
# See below (after the predictions) on how many classes are really used in the experiments.
####################################

In [None]:
# combine the three datasets; the custom ConcatDataset shifts the labels of the later datasets
# so that every class keeps a unique index
concat_datasets = ConcatDataset([facescrub_datasets, laion_european_actors, laion_european_actresses])

In [None]:
# summary of the combined dataset
print(f'Dataset size: {len(concat_datasets)}')
print(f'First Few Classes: {concat_datasets.classes[:10]}')
print(f'Last Few Classes: {concat_datasets.classes[-10:]}')
print(f'Total Number of Classes: {len(concat_datasets.classes)}')

In [None]:
# one subset per class of the combined dataset, ordered by class index
dataset_class_subsets = [
    SingleClassSubset(concat_datasets, class_idx)
    for class_idx in range(len(concat_datasets.classes))
]

In [None]:
# visualize the first preprocessed image of the first class and of the last class
# (the image tensor is permuted so that its first axis becomes the last one before plotting)
plt.imshow(dataset_class_subsets[0][0][0].permute(1,2,0).numpy())
plt.show()
plt.imshow(dataset_class_subsets[-1][0][0].permute(1,2,0).numpy())
plt.show()

# Load the Members and Non-Members

In [None]:
# load how often every individual occurs in the laion dataset
laion_individuals = pd.read_csv('laion400m_experiments/laion_membership_occurence_count.csv', index_col=0)
# class names use underscores where the person names use spaces
laion_individuals['class_name'] = laion_individuals['name'].str.replace(" ", "_", regex=False)

# keep only the individuals that are also part of the image dataset
all_class_names = concat_datasets.classes
dataset_class_names = [all_class_names[class_subset.target_class] for class_subset in dataset_class_subsets]
is_in_dataset = laion_individuals['class_name'].isin(dataset_class_names)
laion_individuals = laion_individuals[is_in_dataset].reset_index(drop=True)
laion_individuals

In [None]:
# split the individuals by their membership label; rows with any other label end up in neither frame
membership_labels = laion_individuals['membership']

laion_non_member = laion_individuals.loc[membership_labels == 'non_member'].copy().reset_index(drop=True)
print('Non-Member')
display(laion_non_member)

laion_member = laion_individuals.loc[membership_labels == 'member'].copy().reset_index(drop=True)
print('Member')
display(laion_member.head(4))

# Combine Random First and Last Names to have more Possible Classes

In [None]:
# load the first names
# list was taken from https://github.com/hadley/data-baby-names/blob/master/baby-names.csv which contains the top 1k names for the years 1880-2008 released by the US social security administration
first_names_df = (
    pd.read_csv('./data/common_first_names.csv')
    .drop(columns=['year'])
    .drop_duplicates(['name', 'sex'])
)
# encode the sex as 'm' (for 'boy') or 'f' (everything else)
first_names_df['sex'] = first_names_df['sex'].map(lambda sex: 'm' if sex == 'boy' else 'f')
# take the top 1k male and female names
first_names_df = (
    first_names_df
    .sort_values('percent', ascending=False)
    .groupby('sex')
    .head(1000)
    .reset_index(drop=True)
    .drop(columns=['percent'])
    .rename(columns={'name': 'first_name'})
)
first_names_df

In [None]:
# load the last names
# list was taken from the US census bureau at https://www.census.gov/topics/population/genealogy/data/2010_surnames.html and contains the top 1k surnames
last_names_df = (
    pd.read_csv('./data/common_last_names_US_2010.csv')
    .dropna()[['SURNAME', 'FREQUENCY (COUNT)']]
    .rename(columns={'SURNAME': 'last_name', 'FREQUENCY (COUNT)': 'count'})
)
# title-case the surnames and turn the comma separated count strings into integers
last_names_df = last_names_df.assign(
    last_name=last_names_df['last_name'].str.title(),
    count=last_names_df['count'].str.replace(',', '').astype(int),
)
last_names_df

In [None]:
# get the cross product of the first and last names
full_names_df = pd.merge(first_names_df[['first_name', 'sex']], last_names_df['last_name'], how='cross')

# sample as many names as needed, the same number for each sex
num_names_per_sex = int((NUM_TOTAL_NAMES - len(concat_datasets.classes)) / 2)
sampled_full_names_df = full_names_df.groupby('sex').sample(num_names_per_sex, random_state=SEED).reset_index()
sampled_full_names_list = [
    f'{first_name} {last_name}'
    for first_name, last_name in zip(sampled_full_names_df['first_name'], sampled_full_names_df['last_name'])
]
print(f'Length List: {len(sampled_full_names_list)}')
sampled_full_names_list[:10]

In [None]:
# combine the names from the datasets with the sampled names and shuffle them
possible_names = [x.replace("_", " ") for x in concat_datasets.classes] + sampled_full_names_list
print(possible_names[:10], len(possible_names))
# shuffle with a dedicated generator seeded by SEED so that the order of the candidate names
# (and therefore the index of every name in the prompt table) is the same on every run
possible_names = random.Random(SEED).sample(possible_names, k=len(possible_names))
print(f'Length Possible Names: {len(possible_names)}')
possible_names[:10]

# Run the Model for Test Purposes on the first Actor

In [None]:
# prepare and fill the prompt templates: one row per candidate name with its class name
# (underscores instead of spaces) followed by one filled-in column per prompt template
prompts = pd.DataFrame([
    {
        'class_name': "_".join(name.split(" ")),
        **{f'prompt_{prompt_idx}': prompt.format(name) for prompt_idx, prompt in enumerate(PROMPTS)},
    }
    for name in possible_names
])
prompts

In [None]:
# tokenize the filled-in prompts of all candidate names, one token tensor per prompt template,
# and stack them along a new leading dimension
if not LOAD_PREDICTIONS_FROM_FILE:
    label_context_vecs = torch.stack([
        open_clip.tokenize(prompts[f'prompt_{i}'].to_numpy())
        for i in range(len(PROMPTS))
    ])

In [None]:
# calculate the embeddings for each of the models
if not LOAD_PREDICTIONS_FROM_FILE:
    # the tokens have to live on the same device as the model while they are encoded
    label_context_vecs = label_context_vecs.to(device)

    text_embeddings_per_model = {}
    for model_name, model in models.items():
        # only one model is moved to the device at a time; it is moved back to the CPU afterwards
        model = model.to(device)
        text_embeddings = get_text_embeddings(model, label_context_vecs, use_tqdm=True, context_batchsize=5_000)
        text_embeddings_per_model[model_name] = text_embeddings
        model = model.cpu()

    # move the tokens back to the CPU after all models have been processed
    label_context_vecs = label_context_vecs.cpu()

In [None]:
# sanity check: predict the name of the first individual listed as a member with every model
if not LOAD_PREDICTIONS_FROM_FILE:
    member_class_idx = concat_datasets.class_to_idx[laion_member['class_name'][0]]
    test_subset_dataset = dataset_class_subsets[member_class_idx]
    for model_name, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, test_subset_dataset, label_context_vecs, num_workers=2, text_embeddings=text_embeddings_per_model[model_name])
        model = model.cpu()

        # for every prompt take the name index that was predicted for the most images
        predictions = []
        for prompt_preds in preds:
            predicted_indices, index_counts = prompt_preds.unique(return_counts=True)
            most_frequent = index_counts.topk(1, sorted=True)[1]
            predictions.append(int(predicted_indices[most_frequent]))
        print(f'Prediction Model {model_name}: {prompts["class_name"].iloc[predictions].to_list()}\t Correct Class: {concat_datasets.classes[test_subset_dataset.target_class]}')

In [None]:
# sanity check: predict the name of the first individual listed as a non-member with every model
if not LOAD_PREDICTIONS_FROM_FILE:
    non_member_class_idx = concat_datasets.class_to_idx[laion_non_member['class_name'][0]]
    test_subset_dataset = dataset_class_subsets[non_member_class_idx]
    for model_name, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, test_subset_dataset, label_context_vecs, num_workers=2, text_embeddings=text_embeddings_per_model[model_name])
        model = model.cpu()

        # for every prompt take the name index that was predicted for the most images
        predictions = []
        for prompt_preds in preds:
            predicted_indices, index_counts = prompt_preds.unique(return_counts=True)
            most_frequent = index_counts.topk(1, sorted=True)[1]
            predictions.append(int(predicted_indices[most_frequent]))
        print(f'Prediction Model {model_name}: {prompts["class_name"].iloc[predictions].to_list()}\t Correct Class: {concat_datasets.classes[test_subset_dataset.target_class]}')

# Run the CLIP model on each Actress/Actor

In [None]:
if not LOAD_PREDICTIONS_FROM_FILE:
    # Keep only the class subsets of individuals listed in laion_individuals. The class list
    # and the set of known names are built once up front: `concat_datasets.classes` rebuilds
    # its list on every access and a membership test on a list is linear.
    all_class_names = concat_datasets.classes
    known_class_names = set(laion_individuals['class_name'])
    filtered_subsets = [
        subset for subset in dataset_class_subsets
        if all_class_names[subset.target_class] in known_class_names
    ]

    # run every model once over all remaining images
    concat_dataset = PTConcatDataset(filtered_subsets)
    preds_per_model = {}
    for model_name, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, concat_dataset, label_context_vecs, batch_size=128, num_workers=32, text_embeddings=text_embeddings_per_model[model_name])
        model = model.cpu()
        assert preds.shape[1] == len(concat_dataset)
        assert preds.shape[0] == len(PROMPTS)
        # transpose the predictions such that we have len(PROMPTS) predictions for each sample
        preds = preds.T
        preds_per_model[model_name] = preds

In [None]:
# split the large list of all predictions into prediction lists for every class
if not LOAD_PREDICTIONS_FROM_FILE:
    # the predictions are ordered like the concatenated subsets, so consecutive slices match the classes
    subset_lengths = [len(class_subset) for class_subset in filtered_subsets]
    preds_per_model_per_subset = {}
    for model_name, preds in preds_per_model.items():
        preds_per_subset = []
        start = 0
        for subset_length in subset_lengths:
            stop = start + subset_length
            subset_preds = preds[start:stop]
            assert len(subset_preds) == subset_length
            preds_per_subset.append(subset_preds)
            start = stop
        preds_per_model_per_subset[model_name] = preds_per_subset

In [None]:
# build one table per model with one row per image: the person, its training sample count and
# the name predicted for every prompt
if not LOAD_PREDICTIONS_FROM_FILE:
    all_class_names = concat_datasets.classes
    prompt_class_names = prompts['class_name']
    preds_df_per_model = {}
    for model_name in models.keys():
        df_list = []
        for group_idx, (dataset_subset, preds_subset) in enumerate(zip(filtered_subsets, preds_per_model_per_subset[model_name])):
            class_name = all_class_names[dataset_subset.target_class]
            training_data_samples_info = laion_individuals[laion_individuals['class_name'] == class_name]
            training_sample_count = training_data_samples_info['count'].values[0]
            training_sample_count_bin = training_data_samples_info['bin'].values[0]
            for sample_idx, pred in enumerate(preds_subset):
                result_dict = {
                    'group_idx': group_idx,
                    'class_name': class_name,
                    'sample_idx': sample_idx,
                    'training_sample_count': training_sample_count,
                    'training_sample_count_bin': training_sample_count_bin
                }
                # translate the predicted name indices back into class names
                result_dict.update({
                    f'name_prediction_prompt_{i}': prompt_class_names.iloc[int(pred_idx)]
                    for i, pred_idx in enumerate(pred)
                })
                df_list.append(result_dict)
        preds_df = pd.DataFrame(df_list)
        preds_df_per_model[model_name] = preds_df

In [None]:
# preview the freshly computed predictions of the first model
if not LOAD_PREDICTIONS_FROM_FILE:
    display(preds_df_per_model[MODEL_NAMES[0]])

In [None]:
# only keep the rows of the members and non-members and label them with their actual membership
if not LOAD_PREDICTIONS_FROM_FILE:
    individuals_per_membership = {'member': laion_member, 'non_member': laion_non_member}
    for model_name, preds_df in preds_df_per_model.items():
        labelled_parts = []
        for membership_label, individuals in individuals_per_membership.items():
            # the merge keeps exactly the predictions of the individuals with this label
            labelled_part = pd.merge(preds_df, individuals['class_name'], on='class_name')
            labelled_part['actual_membership'] = membership_label
            labelled_parts.append(labelled_part)
        preds_df_per_model[model_name] = pd.concat(labelled_parts).reset_index(drop=True)

In [None]:
# save the predictions to file if necessary to prevent long runtimes
if not LOAD_PREDICTIONS_FROM_FILE:
    # make sure the output directory exists before writing into it
    os.makedirs('laion400m_experiments/prediction_dfs', exist_ok=True)
    for model_name, preds_df in preds_df_per_model.items():
        preds_df.to_csv(f'laion400m_experiments/prediction_dfs/predictions_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv')
else:
    preds_df_per_model = {}
    # only the model names are needed here; iterating the keys avoids binding a model object
    # to the name `preds_df`
    for model_name in models:
        preds_df_per_model[model_name] = pd.read_csv(f'laion400m_experiments/prediction_dfs/predictions_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv', index_col=0)

In [None]:
# preview the prediction table of the first model
preds_df_per_model[MODEL_NAMES[0]]

In [None]:
# drop all individuals for which fewer than the specified number of images are available
for model_name in list(preds_df_per_model):
    preds_df = preds_df_per_model[model_name]
    # number of rows (images) of the person each row belongs to
    num_samples = preds_df.groupby('class_name').transform('size')
    has_enough_images = num_samples >= MIN_NUM_IMAGES_AVAILABLE
    preds_df_per_model[model_name] = preds_df[has_enough_images]

In [None]:
# keep only individuals that occur less often in the training set than the specified amount
for model_name in list(preds_df_per_model):
    preds_df = preds_df_per_model[model_name]
    is_rare_enough = preds_df['training_sample_count'] < MAX_NUM_TRAINING_SAMPLES
    preds_df_per_model[model_name] = preds_df[is_rare_enough]

In [None]:
# determine the numbers of images per person that are evaluated for every model
subsample_sizes_per_model = {}
for model_name, preds_df in preds_df_per_model.items():
    # smallest number of images any person has; .min() avoids indexing the label-indexed
    # counts with the position 0, which newer pandas versions no longer treat as positional
    min_num_images = int(preds_df['class_name'].value_counts().min())
    assert min_num_images == MIN_NUM_IMAGES_AVAILABLE
    # every second size starting at 1, plus the maximum size itself if the step skipped it
    subsample_sizes = list(range(1, min_num_images + 1, 2))
    if subsample_sizes[-1] != min_num_images:
        subsample_sizes.append(min_num_images)
    subsample_sizes_per_model[model_name] = subsample_sizes

In [None]:
def get_membership_metrics(df, sample_size, sample_draws):
    """Repeatedly subsample the predictions and score the membership predictions.

    In every draw ``sample_size`` rows are sampled per person. A person is predicted to be
    a member when, for at least ``MIN_NUM_CORRECT_PROMPT_PREDS`` prompts, the single most
    frequently predicted name is the person's own name.

    Args:
        df: Prediction table with the columns ``class_name``, ``actual_membership``
            ('member' / 'non_member') and one ``name_prediction_prompt_{i}`` column per
            entry of ``PROMPTS``.
        sample_size: Number of rows sampled per person in every draw.
        sample_draws: Number of draws. The sampling is not seeded.

    Returns:
        Tuple ``(metrics, frames)``. ``metrics`` holds one dict per draw with the rates
        ('tpr', 'fnr', 'fpr', 'tnr') and counts ('tp', 'fn', 'fp', 'tn'); a rate is NaN
        when its denominator is zero. ``frames`` holds the per-person frame of every draw.
    """
    # get the column names of the different prompts
    prompt_column_names = [f'name_prediction_prompt_{i}' for i in range(len(PROMPTS))]
    unique_name_columns = [f'unique_name_predictions_prompt_{i}' for i in range(len(prompt_column_names))]
    unique_count_columns = [f'unique_name_prediction_count_prompt_{i}' for i in range(len(prompt_column_names))]

    def get_name_predictions(predictions: pd.Series, values_only=False, counts_only=False):
        """Takes a series of predictions and returns the unique values and the number of prediction occurrences in descending order."""
        values, counts = np.unique(predictions, return_counts=True)
        descending_counts_indices = counts.argsort()[::-1]

        if values_only:
            return values[descending_counts_indices]
        elif counts_only:
            return counts[descending_counts_indices]
        else:
            return values[descending_counts_indices], counts[descending_counts_indices]

    def check_for_correct_prompt_majority(row: pd.Series):
        """Takes a row of the dataframe and checks whether the correct name was predicted the majority of the time."""
        num_correct_prompts = 0
        for prompt_idx in range(len(prompt_column_names)):
            unique_predictions = row[f'unique_name_predictions_prompt_{prompt_idx}']
            prediction_counts = row[f'unique_name_prediction_count_prompt_{prompt_idx}']

            # get the indices of the most often predicted names
            idx_most_often_pred_names = np.argwhere(prediction_counts == prediction_counts.max()).flatten()

            # if two or more names are predicted equally often, there is no clear majority prediction and the prompt is skipped
            if len(idx_most_often_pred_names) > 1:
                continue

            # the row is indexed by the class name, so row.name is the correct name
            if unique_predictions[idx_most_often_pred_names[0]] == row.name:
                num_correct_prompts += 1

        # return true if the number of prompts is greater or equal to the threshold
        return num_correct_prompts >= MIN_NUM_CORRECT_PROMPT_PREDS

    def rate(count, total):
        """Return count / total, or NaN when there is nothing to divide by."""
        return count / total if total else float('nan')

    subsampled_dfs = []
    subsampled_metrics_dfs = []
    for i in range(sample_draws):
        # sample the same number of images/predictions for each person
        # (from the frame that was passed in, not from a notebook-level variable)
        name_predictions_df = df.groupby('class_name').sample(sample_size).reset_index(drop=True)

        # get the number of members and non_members; the counts are looked up by label because
        # value_counts() orders by frequency, so positional unpacking would swap the two numbers
        # whenever the non-members outnumber the members
        membership_per_person = name_predictions_df[['class_name', 'actual_membership']].drop_duplicates()
        membership_counts = membership_per_person['actual_membership'].value_counts()
        num_member = int(membership_counts.get('member', 0))
        num_non_member = int(membership_counts.get('non_member', 0))

        # collect the predictions per person and prompt and count the unique predicted names
        name_prediction_count_df = name_predictions_df.groupby('class_name')[prompt_column_names].agg(list)
        name_prediction_count_df[unique_name_columns] = name_prediction_count_df[prompt_column_names].apply(lambda x: x.apply(lambda y: get_name_predictions(y, values_only=True)))
        name_prediction_count_df[unique_count_columns] = name_prediction_count_df[prompt_column_names].apply(lambda x: x.apply(lambda y: get_name_predictions(y, counts_only=True)))

        # get the actual membership by merging with the sampled dataframe
        name_prediction_count_df = pd.merge(name_prediction_count_df, membership_per_person.set_index('class_name'), how='inner', on='class_name')

        name_prediction_count_df['correct_majority_prediction'] = name_prediction_count_df.apply(check_for_correct_prompt_majority, axis=1)
        name_prediction_count_df['membership_prediction'] = name_prediction_count_df['correct_majority_prediction'].apply(lambda x: 'member' if x else 'non_member')
        name_prediction_count_df['sample_size'] = sample_size
        name_prediction_count_df['draw'] = i

        predicted_member = name_prediction_count_df['membership_prediction'] == 'member'
        predicted_non_member = name_prediction_count_df['membership_prediction'] == 'non_member'
        actual_member = name_prediction_count_df['actual_membership'] == 'member'
        actual_non_member = name_prediction_count_df['actual_membership'] == 'non_member'

        tp = int((predicted_member & actual_member).sum())
        fp = int((predicted_member & actual_non_member).sum())
        fn = int((predicted_non_member & actual_member).sum())
        tn = int((predicted_non_member & actual_non_member).sum())

        subsampled_metrics_dfs.append({
            'sample_size': sample_size,
            'draw': i,
            'tpr': rate(tp, num_member),
            'fnr': rate(fn, num_member),
            'fpr': rate(fp, num_non_member),
            'tnr': rate(tn, num_non_member),
            'tp': tp,
            'fn': fn,
            'fp': fp,
            'tn': tn
        })
        subsampled_dfs.append(name_prediction_count_df)

    return subsampled_metrics_dfs, subsampled_dfs


class TQDMParallel(Parallel):
    """``Parallel`` variant that mirrors its task counters in a tqdm progress bar.

    Args:
        progress_bar: Whether the progress bar is shown.
        total: Total number of tasks; when None the number of dispatched tasks is used.
        *args, **kwargs: Forwarded to ``Parallel``.
    """

    def __init__(self, progress_bar=True, total=None, *args, **kwargs):
        self.progress_bar = progress_bar
        self.total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # the bar only lives for the duration of the parallel call
        self.pbar = tqdm(disable=not self.progress_bar, total=self.total)
        with self.pbar:
            return Parallel.__call__(self, *args, **kwargs)

    def print_progress(self):
        """Copy the dispatched/completed task counters into the progress bar."""
        progress = self.pbar
        if self.total is None:
            progress.total = self.n_dispatched_tasks
        progress.n = self.n_completed_tasks
        progress.refresh()

# compute the subsampled metrics of every model in parallel, one task per subsample size
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    sample_draws = 20
    subsampled_metrics_dfs_per_model = {}
    subsampled_dfs_per_model = {}
    for model_name, preds_df in preds_df_per_model.items():
        subsample_sizes = subsample_sizes_per_model[model_name]
        arguments_list = [(preds_df, sample_size, sample_draws) for sample_size in subsample_sizes]

        print(f'{model_name} with {len(arguments_list)} predictions')
        parallel_runner = TQDMParallel(n_jobs=16, total=len(arguments_list))
        results = parallel_runner(
            delayed(get_membership_metrics)(*arguments) for arguments in arguments_list
        )

        # every task returns the metrics and frames of all its draws; flatten them per model
        flattened_subsampled_metrics_dfs = []
        flattened_subsampled_dfs = []
        for task_metrics, task_dfs in results:
            flattened_subsampled_metrics_dfs.extend(task_metrics)
            flattened_subsampled_dfs.extend(task_dfs)
        subsampled_metrics_dfs_per_model[model_name] = flattened_subsampled_metrics_dfs
        subsampled_dfs_per_model[model_name] = flattened_subsampled_dfs

In [None]:
# turn the collected per-draw results into one metrics frame and one subsample frame per model
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    rate_column_names = {
        'tpr': 'True Positive Rate',
        'fnr': 'False Negative Rate',
        'fpr': 'False Positive Rate',
        'tnr': 'True Negative Rate',
    }
    for model_name, subsampled_metrics_dfs in subsampled_metrics_dfs_per_model.items():
        metrics_df = (
            pd.DataFrame(subsampled_metrics_dfs)
            .set_index('sample_size')
            .drop('draw', axis='columns')
            .rename(columns=rate_column_names)
        )
        metrics_df.index.name = 'Number of Samples'
        subsampled_metrics_dfs_per_model[model_name] = metrics_df

    for model_name, subsampled_dfs in subsampled_dfs_per_model.items():
        subsampled_dfs_per_model[model_name] = pd.concat(subsampled_dfs)

In [None]:
# cache the metrics on disk, or read the cached files instead of recomputing them
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    for model_name in subsampled_metrics_dfs_per_model:
        metrics_csv_path = f'./laion400m_experiments/prediction_metrics_dfs/multiprompt_prediction_metrics_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv'
        subsamples_feather_path = f'./laion400m_experiments/prediction_metrics_dfs/multiprompt_prediction_subsamples_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.feather'
        subsampled_metrics_dfs_per_model[model_name].to_csv(metrics_csv_path)
        # the index is reset before writing the feather file
        subsampled_dfs_per_model[model_name].reset_index().to_feather(subsamples_feather_path)
else:
    subsampled_metrics_dfs_per_model = {}
    subsampled_dfs_per_model = {}
    for model_name in models:
        metrics_csv_path = f'./laion400m_experiments/prediction_metrics_dfs/multiprompt_prediction_metrics_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv'
        subsamples_feather_path = f'./laion400m_experiments/prediction_metrics_dfs/multiprompt_prediction_subsamples_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.feather'
        subsampled_metrics_dfs_per_model[model_name] = pd.read_csv(metrics_csv_path, index_col=0)
        subsampled_dfs_per_model[model_name] = pd.read_feather(subsamples_feather_path)

In [None]:
# preview the metrics of the first model and the per-person subsample results of the last model
display(subsampled_metrics_dfs_per_model[MODEL_NAMES[0]])
display(subsampled_dfs_per_model[MODEL_NAMES[-1]].reset_index())

# Evaluate the Predictions

In [None]:
# show the last three metric rows of every model
for model_name, df in subsampled_metrics_dfs_per_model.items():
    display(df.tail(3))

In [None]:
# calculate the accuracy for each sampling: correct decisions divided by all decisions
for model_name, subsampled_dfs in subsampled_metrics_dfs_per_model.items():
    num_correct = subsampled_dfs['tp'] + subsampled_dfs['tn']
    num_decisions = num_correct + subsampled_dfs['fp'] + subsampled_dfs['fn']
    subsampled_metrics_dfs_per_model[model_name]['Accuracy'] = num_correct / num_decisions

In [None]:
# only use 3 or more images since we need a majority vote
for model_name in list(subsampled_metrics_dfs_per_model):
    metrics_df = subsampled_metrics_dfs_per_model[model_name]
    subsamples_df = subsampled_dfs_per_model[model_name]
    # the metrics frame is indexed by the sample size, the subsample frame stores it in a column
    subsampled_metrics_dfs_per_model[model_name] = metrics_df[metrics_df.index >= 3]
    subsampled_dfs_per_model[model_name] = subsamples_df[subsamples_df['sample_size'] >= 3]

In [None]:
# plot the metrics of every model against the number of images used for the attack
sns.set_style('darkgrid')
show_y_axis_only_for_first_model = True
show_legend_only_in_first_plot = True
show_legend = False

# create the output directory once up front; makedirs(exist_ok=True) also creates missing
# parent directories and does not fail when the directory already exists
os.makedirs('./laion400m_experiments/plots', exist_ok=True)

for model_name, subsampled_dfs in subsampled_metrics_dfs_per_model.items():
    plt.clf()
    data = subsampled_dfs[[ 'Accuracy', 'True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']]
    data = data.rename(columns={
        'Accuracy': 'Acc',
        'True Positive Rate': 'TPR', 
        'False Negative Rate': 'FNR', 
        'False Positive Rate': 'FPR', 
        'True Negative Rate': 'TNR'
        }
    )
    display(data.groupby('Number of Samples').mean())
    # one line per metric; the band shows the standard deviation over the draws
    ax = sns.lineplot(data=data, errorbar='sd', palette='colorblind')

    ax.set_xlabel("Number of Images used for IDIA", weight="bold", size=16)
    # a tick every 5 images, but start at 3 because smaller sample sizes were filtered out
    x_ticks = list(range(0, data.index.unique().max()+1, 5))
    x_ticks[0] = 3
    ax.set_xticks(x_ticks)
    ax.set_xticklabels([int(x) for x in ax.get_xticks()], size=16)

    ax.set_yticks(list(np.arange(0, 1+0.1, 0.1)))
    ax.set_yticklabels(ax.get_yticks(), size=16)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    
    # show only every second y tick label
    plt.setp(ax.yaxis.get_ticklabels()[1::2], visible=False)

    if show_legend or (show_legend_only_in_first_plot and model_name == MODEL_NAMES[0]):
        h, l = ax.get_legend_handles_labels()
        ax.legend(h, l, ncol=3, loc='upper center', fontsize=16, bbox_to_anchor=(0, -0.05, 1, 1))
    else:
        # only remove the legend when the plot actually has one
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    if show_y_axis_only_for_first_model and model_name != MODEL_NAMES[0]:
        ax.set_yticklabels([])

    plt.tight_layout()
    ax.get_figure().savefig(f'./laion400m_experiments/plots/num_idia_samples_plot_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf')
    ax.get_figure().savefig(f'./laion400m_experiments/plots/num_idia_samples_plot_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100)
    print(model_name)
    plt.show()

In [None]:
# plot one confusion matrix per model for the largest number of images used
show_y_axis_only_for_first_model = True

# make sure the output directory exists even when the previous plotting cell was not run
os.makedirs('./laion400m_experiments/plots', exist_ok=True)

count_columns = ['tp', 'fn', 'fp', 'tn']
rate_columns = ['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']
for model_name, subsampled_dfs in subsampled_metrics_dfs_per_model.items():
    # aggregate once over the draws and take the row of the last (largest) number of samples
    grouped_by_num_samples = subsampled_dfs.groupby('Number of Samples')
    last_group_mean = grouped_by_num_samples.mean().iloc[-1]
    last_group_std = grouped_by_num_samples.std().iloc[-1]

    tp_std, fn_std, fp_std, tn_std = last_group_std[count_columns]
    tp, fn, fp, tn = last_group_mean[count_columns]

    tpr_std, fnr_std, fpr_std, tnr_std = last_group_std[rate_columns]
    tpr, fnr, fpr, tnr = last_group_mean[rate_columns]

    # rows are the actual membership, columns the predicted membership
    normalized_conf_mat = pd.DataFrame({'Member': [tpr, fpr], 'Non-Member': [fnr, tnr]}, index=['Member', 'Non-Member'])
    normalized_conf_mat.index.set_names('Actual Membership', inplace=True)
    normalized_conf_mat = normalized_conf_mat.rename_axis('Predicted Membership', axis='columns')

    # every cell is annotated with its name, mean count +- std and mean rate +- std
    group_names = ['TP','FN','FP','TN']
    group_counts = ["{0:0.0f} \u00B1 {1:0.2f}".format(mean, std) for mean, std in zip([tp, fn, fp, tn], [tp_std, fn_std, fp_std, tn_std])]
    percentage = ["{0:0.2f}% \u00B1 {1:0.02f}%".format(mean * 100, std * 100) for mean, std in zip([tpr, fnr, fpr, tnr], [tpr_std, fnr_std, fpr_std, tnr_std])]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, percentage)]
    plt.clf()
    ax = sns.heatmap(normalized_conf_mat, annot=np.asarray(labels).reshape(2, 2), fmt='', cbar=False, cmap='Blues', annot_kws={'fontsize': 16})

    ax.set_yticklabels(ax.get_yticklabels(), size=16)
    ax.set_xticklabels(ax.get_xticklabels(), size=16)

    if show_y_axis_only_for_first_model and model_name != MODEL_NAMES[0]:
        ax.set_yticklabels([])
        plt.ylabel('')
    else:
        plt.ylabel('Actual Membership', fontsize=16, weight='bold')

    plt.xlabel('Predicted Membership', fontsize=16, weight='bold')
    plt.tight_layout()
    ax.get_figure().savefig(f'./laion400m_experiments/plots/confusion_matrix_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf')
    ax.get_figure().savefig(f'./laion400m_experiments/plots/confusion_matrix_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100)
    print(model_name)
    plt.show()

# Plot the Metrics Against the Number of Samples in the Training Set

In [None]:
import re

# For every model, plot the true positive / false negative rate of the
# membership prediction against the number of images per person (the 'bin'
# column of `laion_individuals`). One figure per model is saved as PDF and PNG.
show_legend_only_for = MODEL_NAMES[-1]
show_y_axis_only_for = MODEL_NAMES[-1]
rows_to_plot = ['True Positive Rate', 'False Negative Rate']

for model_name in models.keys():
    data = subsampled_dfs_per_model[model_name]
    # keep only the rows that were computed with the largest available sample size
    largest_sample_size = data['sample_size'].max()
    data = data[data['sample_size'] == largest_sample_size].copy(deep=True)
    data = pd.merge(laion_individuals[['class_name', 'count', 'bin']], data, on='class_name')

    # compute the metrics separately for every (draw, bin) combination
    group_metrics = []
    num_samples_per_group = []
    for (draw, bin_label), group in data.groupby(['draw', 'bin']):
        # we want to skip the non_member here
        if bin_label == '[0, 1)':
            continue

        # Count the members explicitly. Indexing value_counts() with the integer 0
        # relies on a deprecated positional fallback for string-labelled Series and
        # yields the count of the most frequent label, not of the 'member' label.
        individuals = group[['class_name', 'actual_membership']].drop_duplicates()
        num_member = int((individuals['actual_membership'] == 'member').sum())
        if num_member == 0:
            # the rates are undefined for a group without any member
            continue

        is_member = group['actual_membership'] == 'member'
        tp = int(((group['membership_prediction'] == 'member') & is_member).sum())
        fn = int(((group['membership_prediction'] == 'non_member') & is_member).sum())

        # parse a label such as '[0, 1)' once into a left-closed interval so that
        # the bins sort by their bounds instead of alphabetically
        left, right = map(int, re.sub(r"(\[|\))", "", bin_label).split(", "))
        bin_interval = pd.Interval(left, right, closed='left')

        group_metrics.append({
            'draw': draw,
            'bin': bin_interval,
            'True Positive Rate': tp / num_member,
            'False Negative Rate': fn / num_member
        })
        num_samples_per_group.append({
            'num_samples': len(group),
            'bin': bin_interval,
            'draw': draw
        })

    df = (
        pd.DataFrame(group_metrics)
        .rename(columns={'bin': 'Number of Samples'})
        .set_index('Number of Samples')
    )
    # one row per distinct (num_samples, bin) pair; the draw is dropped before de-duplicating
    num_samples_df = (
        pd.DataFrame(num_samples_per_group)[['num_samples', 'bin']]
        .drop_duplicates()
        .rename(columns={'bin': 'Number of Samples'})
        .set_index('Number of Samples')
    )

    # map every interval to its rank so that the bins are equally spaced on the x axis
    interval_index_mapping = {x: i for i, x in enumerate(df.index.sort_values().unique())}
    df['idx'] = [interval_index_mapping[interval] for interval in df.index]
    num_samples_df['idx'] = [interval_index_mapping[interval] for interval in num_samples_df.index]

    print(model_name)
    display(df[rows_to_plot].groupby(df.index).mean())

    # plot the graph on a fresh figure (no plt.clf() needed: it would only touch an unrelated figure)
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    plot_data = (
        df[['idx'] + rows_to_plot]
        .rename(columns={'idx': 'Number of Training Samples per Person'})
        .set_index('Number of Training Samples per Person')
    )
    # the spread over the rows sharing an x position is drawn as a standard deviation band
    ax = sns.lineplot(data=plot_data, errorbar='sd', ax=ax)

    ax.set_xticks(df['idx'].sort_values().unique())
    ax.set_xticklabels(df.index.sort_values().unique().astype(str).tolist(), rotation=45, ha="right", rotation_mode="anchor", size=16)
    ax.set_xlabel("Number of Images\nper Person in LAION-400M", weight="bold", size=16)

    ax.set_yticks(np.arange(0, 1 + 0.1, 0.1))
    ax.set_yticklabels(ax.get_yticks(), size=16)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_ylim(-0.025, 1.025)

    # show only every second y tick label
    plt.setp(ax.yaxis.get_ticklabels()[1::2], visible=False)

    if model_name == show_legend_only_for:
        handles, legend_labels = ax.get_legend_handles_labels()
        ax.legend(handles, legend_labels, ncol=1, fontsize=16)
    else:
        ax.legend_.remove()

    if model_name != show_y_axis_only_for:
        ax.set_yticklabels([])

    plt.tight_layout()
    fig.savefig(f'./laion400m_experiments/plots/num_training_samples_plot_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf', bbox_inches='tight')
    fig.savefig(f'./laion400m_experiments/plots/num_training_samples_plot_multiprompt_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100, bbox_inches='tight')
    plt.show()
    # release the figure so that figures do not pile up across models
    plt.close(fig)

# Bar chart of the 'num_samples' value per bin.
# NOTE: `num_samples_df` is the frame left over from the last model processed by
# the loop above.
display(num_samples_df)
print(f'Total number of individuals used: {num_samples_df["num_samples"].sum()}')

individuals_per_bin = (
    num_samples_df[['num_samples']]
    .rename(columns={'num_samples': 'Number of Individuals'})
    .sort_values('Number of Samples')
    .reset_index()
)

# draw on an explicitly created figure instead of whatever figure happens to be current
fig, ax = plt.subplots()
sns.barplot(data=individuals_per_bin, x='Number of Samples', y='Number of Individuals', ax=ax)
ax.set_xticklabels(num_samples_df.index.sort_values().unique().astype(str).tolist(), size=16, rotation=45, ha="right", rotation_mode="anchor")
# Only change the font size of the y tick labels. Re-assigning the current label
# texts would freeze them while the tick locator stays automatic, so the labels
# could end up next to the wrong ticks once the layout changes.
ax.tick_params(axis='y', labelsize=16)
ax.set_xlabel("Number of Images\nper Individual in LAION-400M", size=16, weight="bold")
ax.set_ylabel("Number of\nIndividuals", size=16, weight="bold")

fig.tight_layout()
fig.savefig(f'./laion400m_experiments/plots/num_training_samples_histogram_laion400_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf')
fig.savefig(f'./laion400m_experiments/plots/num_training_samples_histogram_laion400_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100)
plt.show()


# Plot a Heatmap to Visualize Influence of Number of Attack/Training Samples

In [None]:
# imported here as well so that the cell does not depend on another cell having imported `re`
import re

# For every model, draw a heatmap of the true positive rate with the sample size
# on the y axis and the number of images per person (the 'bin' column of
# `laion_individuals`) on the x axis. One figure per model is saved as PDF and PNG.

# NOTE(review): these two flags are not read anywhere in this cell; they are kept
# in case other cells rely on them - confirm and remove if unused.
show_cbar = False
show_ylabel = True

# axis labels, defined up front because they are needed after the loops as well
num_training_samples_label = "Number of Images\nper Person in LAION-400M"
attack_samples_label = 'Number of Images\nused for IDIA'

for model_name in models.keys():
    data = subsampled_dfs_per_model[model_name]
    data = pd.merge(laion_individuals[['class_name', 'count', 'bin']], data, on='class_name')

    dfs = []
    for sample_size, sample_size_group in data.groupby('sample_size'):
        # compute the metrics separately for every (draw, bin) combination
        group_metrics = []
        for (draw, bin_label), group in sample_size_group.groupby(['draw', 'bin']):
            # we want to skip the non_member here
            if bin_label == '[0, 1)':
                continue

            # Count the members explicitly. Indexing value_counts() with the integer 0
            # relies on a deprecated positional fallback for string-labelled Series and
            # yields the count of the most frequent label, not of the 'member' label.
            individuals = group[['class_name', 'actual_membership']].drop_duplicates()
            num_member = int((individuals['actual_membership'] == 'member').sum())
            if num_member == 0:
                # the rates are undefined for a group without any member
                continue

            is_member = group['actual_membership'] == 'member'
            tp = int(((group['membership_prediction'] == 'member') & is_member).sum())
            fn = int(((group['membership_prediction'] == 'non_member') & is_member).sum())

            # parse a label such as '[0, 1)' into a left-closed interval so that
            # the bins sort by their bounds instead of alphabetically
            left, right = map(int, re.sub(r"(\[|\))", "", bin_label).split(", "))

            group_metrics.append({
                'draw': draw,
                'bin': pd.Interval(left, right, closed='left'),
                'True Positive Rate': tp / num_member,
                'False Negative Rate': fn / num_member,
                'sample_size': sample_size
            })

        # average over the draws: one row per bin for this sample size
        dfs.append(
            pd.DataFrame(group_metrics)
            .rename(columns={'bin': num_training_samples_label})
            .set_index(num_training_samples_label)
            .groupby(num_training_samples_label)
            .mean()
        )

    combined_df = pd.concat(dfs)
    # rows: sample size, columns: bin, cells: mean true positive rate
    pivoted_df = (
        combined_df
        .reset_index()
        .rename(columns={'sample_size': attack_samples_label})
        .pivot(index=attack_samples_label, columns=num_training_samples_label, values="True Positive Rate")
    )
    # the group mean turned the sample sizes into floats; show them as integers again
    pivoted_df.index = pivoted_df.index.astype(int)

    # draw on a fresh figure (no plt.clf() needed: it would only touch an unrelated figure)
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    is_last_model = model_name == MODEL_NAMES[-1]
    # the colorbar is only drawn for the last model
    ax = sns.heatmap(pivoted_df, yticklabels=2, vmin=0, vmax=1, cmap="Blues", cbar=is_last_model, ax=ax)

    if is_last_model:
        # the colorbar axes is the last axes that was added to the figure
        ax.figure.axes[-1].set_ylabel("True Positive Rate", weight='bold', size=16)
        ax.collections[0].colorbar.ax.tick_params(labelsize=16)

    ax.set_xlabel(ax.get_xlabel(), weight="bold", size=16)
    ax.set_ylabel(ax.get_ylabel(), weight="bold", size=16)

    ax.set_xticklabels([x.get_text() for x in ax.get_xticklabels()], size=16, rotation=45, ha="right", rotation_mode="anchor")

    ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 16)

    # put the smallest sample size at the bottom
    ax.invert_yaxis()

    # only the first model keeps its y tick labels and y axis label
    if model_name != MODEL_NAMES[0]:
        ax.set(yticklabels=[" " for x in ax.get_yticklabels()], ylabel=" ")

    plt.tight_layout()

    fig.savefig(f'./laion400m_experiments/plots/heatmap_num_training_samples_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf', bbox_inches='tight')
    fig.savefig(f'./laion400m_experiments/plots/heatmap_num_training_samples_laion400_{model_name}_{NUM_TOTAL_NAMES}_{MAX_NUM_TRAINING_SAMPLES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100, bbox_inches='tight')

    plt.show()
    # release the figure so that figures do not pile up across models
    plt.close(fig)