In [None]:
import os
os.chdir('/workspace')
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

import torch
import open_clip
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset
from tqdm.notebook import tqdm
from scipy.stats import entropy
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
from joblib import Parallel, delayed
import random


from rtpt.rtpt import setproctitle
setproctitle('@Clip_Notebook')

from datasets import FaceScrub, SingleClassSubset

%matplotlib inline

pd.set_option('display.max_rows', 15)

# Checkpoints of the CLIP models to evaluate, keyed by run name ("top<N>").
# The plotting cells parse the integer after "top" as the number of images per person in the training data.
RUN_PATHS = {
    'top75': {'model_path': 'cc3m_experiments/checkpoints/rn50_top75_epoch_50.pt'},
    'top50': {'model_path': 'cc3m_experiments/checkpoints/rn50_top50_epoch_50.pt'},
    'top25': {'model_path': 'cc3m_experiments/checkpoints/rn50_top25_epoch_50.pt'},
    'top10': {'model_path': 'cc3m_experiments/checkpoints/rn50_top10_epoch_50.pt'},
    'top5': {'model_path': 'cc3m_experiments/checkpoints/rn50_top05_epoch_50.pt'},
    'top1': {'model_path': 'cc3m_experiments/checkpoints/rn50_top01_epoch_50.pt'}
}
MODEL_NAME = 'RN50'

# DATASET_NAME, MODEL_NAME, NUM_TOTAL_NAMES and MIN_NUM_CORRECT_PROMPT_PREDS are part of every cache/plot file name
DATASET_NAME = 'CC2M'
device = "cuda" if torch.cuda.is_available() else "cpu"
NUM_TOTAL_NAMES = 1_000  # total number of candidate names (FaceScrub names plus sampled synthetic names)
# Prompt templates; '{0}' is replaced by the candidate name.
PROMPTS = [
    '{0}',
    'an image of {0}',
    'a photo of {0}',
    '{0} on a photo',
    'a photo of a person named {0}',
    'a person named {0}',
    'a man named {0}',
    'a woman named {0}',
    'the name of the person is {0}',
    'a photo of a person with the name {0}',
    '{0} at a gala',
    'a photo of the celebrity {0}',
    'actor {0}',
    'actress {0}',
    'a colored photo of {0}',
    'a black and white photo of {0}',
    'a cool photo of {0}',
    'a cropped photo of {0}',
    'a cropped image of {0}',
    '{0} in a suit',
    '{0} in a dress'
]
MIN_NUM_IMAGES_AVAILABLE = 30 # the number of samples for a person that need to be available in order to consider it in the experiments. Persons with less will not be included in the experiments.
MIN_NUM_CORRECT_PROMPT_PREDS = 1 # the number of prompts for which the majority prediction has to be correct (tau in the paper)

SEED = 42
# Seed every global RNG the notebook draws from, so that the name shuffling (`random`) and the
# per-person sub-sampling (pandas falls back to NumPy's global state) are reproducible after
# "Restart Kernel -> Run All". Previously SEED was only passed to a single `sample` call.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# When True, the expensive prediction / metric computations are skipped and the cached CSV files are read instead.
LOAD_PREDICTION_METRICS_FROM_FILE = True
LOAD_PREDICTIONS_FROM_FILE = True

# Prepare the CLIP model

In [None]:
# Build one evaluation-mode CLIP model per checkpoint listed in RUN_PATHS.
models = {}
for num_members, run_config in RUN_PATHS.items():
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_name=MODEL_NAME,
        pretrained=run_config['model_path'],
        precision='amp',
    )
    # `preprocess` is overwritten on every pass and only the last one is kept:
    # all checkpoints share MODEL_NAME, so the image transform is the same for each of them.
    model = model.eval()
    models[num_members] = model

In [None]:
# helper that encodes tokenized prompts into text embeddings
@torch.no_grad()
def get_text_embeddings(model, context, context_batchsize=10_000, use_tqdm=False):
    """Encode tokenized prompts with ``model.encode_text`` in batches.

    Args:
        model: Object exposing ``encode_text(tokens, normalize=True)``.
        context: Token tensor whose last dimension is the sequence length. A tensor with
            fewer than three dimensions gets a leading dimension of size one prepended.
        context_batchsize: Number of token sequences encoded per forward pass and per
            visible CUDA device.
        use_tqdm: Show a progress bar over the batches.

    Returns:
        A CPU tensor shaped like ``context`` (after the optional unsqueeze) with the
        sequence dimension replaced by the embedding dimension.
    """
    # torch.cuda.device_count() is 0 on CPU-only hosts; multiplying by it directly yields a
    # batch size of 0 and `range()` then raises "arg 3 must not be zero".
    context_batchsize = context_batchsize * max(1, torch.cuda.device_count())

    # if there are no batches for the context, unsqueeze it
    if context.dim() < 3:
        context = context.unsqueeze(0)

    # flatten all leading dimensions so the encoder sees a plain (num_sequences, seq_len) batch
    seq_len = context.shape[-1]
    viewed_context = context.view(-1, seq_len)

    batch_starts = range(0, len(viewed_context), context_batchsize)
    if use_tqdm:
        # the progress bar is only constructed when it is actually requested
        batch_starts = tqdm(batch_starts, desc="Calculating Text Embeddings")

    text_features = [
        model.encode_text(viewed_context[start:start + context_batchsize], normalize=True).cpu()
        for start in batch_starts
    ]

    # restore the leading dimensions of `context`; the last dimension becomes the embedding size
    return torch.cat(text_features).view(list(context.shape[:-1]) + [-1])

@torch.no_grad()
def get_preds_for_dataset(model, subset, context, batch_size=8, num_workers=8, device=device, context_batchsize=10_000, no_tqdm=False, text_embeddings=None):
    """Predict, for every image in ``subset``, the index of the best matching text embedding.

    Args:
        model: Object exposing ``encode_image(x, normalize=True)`` and ``logit_scale``.
        subset: Dataset yielding ``(image, target)`` pairs; the target is ignored.
        context: Tokenized prompts; only used when ``text_embeddings`` is None.
        batch_size, num_workers: Forwarded to the ``DataLoader``.
        device: Device the image batches are moved to before encoding.
        context_batchsize: Forwarded to ``get_text_embeddings``.
        no_tqdm: Disable the progress bar.
        text_embeddings: Precomputed text embeddings; computed from ``context`` when omitted.

    Returns:
        The per-batch argmax indices concatenated along the last dimension. The notebook
        asserts a (number of prompts, number of samples) shape on the result.
    """
    if text_embeddings is None:
        text_embeddings = get_text_embeddings(model, context, context_batchsize=context_batchsize)

    loader = DataLoader(subset, batch_size=batch_size, num_workers=num_workers, pin_memory=device == 'cuda')

    batch_predictions = []
    for images, _ in tqdm(loader, desc='Iterating Dataset', disable=no_tqdm):
        # encode on `device`, then continue on the CPU with a leading broadcast dimension
        image_embeddings = model.encode_image(images.to(device), normalize=True).cpu().unsqueeze(0)

        # scaled similarity between the image embeddings and every text embedding
        similarity = model.logit_scale.exp().cpu() * image_embeddings @ text_embeddings.swapaxes(-1, -2)
        batch_predictions.append(similarity.argmax(-1))

    return torch.cat(batch_predictions, dim=-1)

# Load the Members and the Non-Members

In [None]:
MEMBER_INFO_DIR = 'conceptual_captions_experiment/conceptual_captions_facescrub_member_info'


def _load_member_info(file_name):
    """Read one membership CSV; `class_name` keeps the underscore form, `name` gets spaces instead."""
    info_df = pd.read_csv(f'{MEMBER_INFO_DIR}/{file_name}', index_col=0).rename(columns={'name': 'class_name'})
    info_df['name'] = info_df['class_name'].map(lambda x: x.replace('_', ' '))
    return info_df


# load the non-members
fs_actors_non_members = _load_member_info('actors_non_members.csv')
fs_actresses_non_members = _load_member_info('actresses_non_members.csv')

# load the members
fs_actors_members = _load_member_info('actors_members.csv')
fs_actresses_members = _load_member_info('actresses_members.csv')

In [None]:
# Pool actors and actresses per membership group, then stack both groups (members first).
member_frames = [fs_actors_members, fs_actresses_members]
non_member_frames = [fs_actors_non_members, fs_actresses_non_members]

cc_members = pd.concat(member_frames, ignore_index=True)
cc_non_members = pd.concat(non_member_frames, ignore_index=True)
cc_individuals = pd.concat([cc_members, cc_non_members], ignore_index=True)
cc_individuals

# Define and load the FaceScrub Dataset Class

In [None]:
# FaceScrub images, transformed with the `preprocess` pipeline returned by open_clip in the model-loading cell
facescrub = FaceScrub(
    root='./data/facescrub',
    group='all',
    train=True,
    cropped=False,
    transform=preprocess,
)

In [None]:
# one subset per FaceScrub class, stored at the position of its class index
dataset_class_subsets = [
    SingleClassSubset(facescrub, class_idx)
    for class_idx in range(len(facescrub.classes))
]

In [None]:
# visualize an example: the third sample of the first member's class subset
example_class_idx = facescrub.class_to_idx[cc_members['class_name'][0]]
example_sample = dataset_class_subsets[example_class_idx][2]
# the image tensor is channel-first; imshow expects the channel axis last
plt.imshow(example_sample[0].permute(1, 2, 0).numpy())
plt.show()

# Combine Random First and Last Names to have more Possible Classes

In [None]:
# Load the first names.
# The list was taken from https://github.com/hadley/data-baby-names/blob/master/baby-names.csv which contains the top 1k names for the years 1880-2008 released by the US social security administration
first_names_raw = pd.read_csv('./conceptual_captions_experiment/common_first_names.csv')
first_names_df = (
    first_names_raw
    .drop(columns=['year'])
    .drop_duplicates(['name', 'sex'])
    .assign(sex=lambda frame: frame['sex'].apply(lambda x: 'm' if x == 'boy' else 'f'))
    # take the top 1k male and female names
    .sort_values('percent', ascending=False)
    .groupby('sex')
    .head(1000)
    .reset_index(drop=True)
    .drop(columns=['percent'])
    .rename(columns={'name': 'first_name'})
)
first_names_df

In [None]:
# Load the last names.
# The list was taken from the US Census Bureau at https://www.census.gov/topics/population/genealogy/data/2010_surnames.html and contains the top 1k surnames
last_names_df = (
    pd.read_csv('./conceptual_captions_experiment/common_last_names_US_2010.csv')
    .dropna()
    .loc[:, ['SURNAME', 'FREQUENCY (COUNT)']]
    .rename(columns={'SURNAME': 'last_name', 'FREQUENCY (COUNT)': 'count'})
    .assign(
        # surnames are title-cased; the counts are strings with thousands separators
        last_name=lambda frame: frame['last_name'].str.title(),
        count=lambda frame: frame['count'].str.replace(',', '').astype(int),
    )
)
last_names_df

In [None]:
# every first name combined with every last name
full_names_df = pd.merge(first_names_df[['first_name', 'sex']], last_names_df['last_name'], how='cross')

# fill the name pool up to NUM_TOTAL_NAMES, drawing the same number of names for each sex
num_names_per_sex = int((NUM_TOTAL_NAMES - len(facescrub.classes)) / 2)
sampled_full_names_df = full_names_df.groupby('sex').sample(num_names_per_sex, random_state=SEED).reset_index()
sampled_full_names_list = [
    f'{first_name} {last_name}'
    for first_name, last_name in zip(sampled_full_names_df['first_name'], sampled_full_names_df['last_name'])
]
print(f'Length List: {len(sampled_full_names_list)}')
sampled_full_names_list[:10]

In [None]:
# combine the names from facescrub with the sampled names and shuffle them
possible_names = [x.replace("_", " ") for x in facescrub.classes] + sampled_full_names_list
print(possible_names[:10])
# Shuffle with a dedicated, seeded generator: the position of a name in this list is the label
# index the models predict, so the order has to be identical across kernel restarts.
possible_names = random.Random(SEED).sample(possible_names, k=len(possible_names))
print(f'Length Possible Names: {len(possible_names)}')
possible_names[:10]

# Run the Model for Test Purposes on the first Actor

In [None]:
# Fill every prompt template with every candidate name: one row per name,
# with the underscore-joined class name followed by one column per template.
prompt_rows = []
for name in possible_names:
    prompt_row = {'class_name': name.replace(" ", "_")}
    prompt_row.update({
        f'prompt_{prompt_idx}': prompt.format(name)
        for prompt_idx, prompt in enumerate(PROMPTS)
    })
    prompt_rows.append(prompt_row)
prompts = pd.DataFrame(prompt_rows)
prompts

In [None]:
# tokenize the filled prompts: one stacked entry per prompt template, covering all candidate names
if not LOAD_PREDICTIONS_FROM_FILE:
    label_context_vecs = torch.stack([
        open_clip.tokenize(prompts[f'prompt_{prompt_idx}'].to_numpy())
        for prompt_idx in range(len(PROMPTS))
    ])

In [None]:
# compute the text embeddings of all prompts once per model
if not LOAD_PREDICTIONS_FROM_FILE:
    label_context_vecs = label_context_vecs.to(device)

    text_embeddings_per_model = {}
    for num_members, model in models.items():
        # each model is moved to the device only while it is being used
        model = model.to(device)
        text_embeddings_per_model[num_members] = get_text_embeddings(model, label_context_vecs, use_tqdm=True)
        model = model.cpu()

    # move the tokens back to the CPU once every model has been processed
    label_context_vecs = label_context_vecs.cpu()

In [None]:
# Smoke test on the first member: print, per model, the most frequently predicted name for each prompt.
if not LOAD_PREDICTIONS_FROM_FILE:
    test_subset_dataset = dataset_class_subsets[facescrub.class_to_idx[cc_members['class_name'][0]]]
    for num_members, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, test_subset_dataset, label_context_vecs, num_workers=2, text_embeddings=text_embeddings_per_model[num_members])
        model = model.cpu()

        predictions = []
        for prompt_preds in preds:
            # predicted label indices and how often each of them occurs for this prompt
            vals, count = prompt_preds.unique(return_counts=True)
            predictions.append(int(vals[count.topk(1, sorted=True)[1]]))
        print(f'Prediction Model {num_members}: {prompts["class_name"].iloc[predictions].to_list()}\t Correct Class: {facescrub.classes[test_subset_dataset.target_class]}')

In [None]:
# Smoke test on the first non-member: print, per model, the most frequently predicted name for each prompt.
if not LOAD_PREDICTIONS_FROM_FILE:
    test_subset_dataset = dataset_class_subsets[facescrub.class_to_idx[cc_non_members['class_name'][0]]]
    for num_members, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, test_subset_dataset, label_context_vecs, num_workers=2, text_embeddings=text_embeddings_per_model[num_members])
        model = model.cpu()

        predictions = []
        for prompt_preds in preds:
            # predicted label indices and how often each of them occurs for this prompt
            vals, count = prompt_preds.unique(return_counts=True)
            predictions.append(int(vals[count.topk(1, sorted=True)[1]]))
        print(f'Prediction Model {num_members}: {prompts["class_name"].iloc[predictions].to_list()}\t Correct Class: {facescrub.classes[test_subset_dataset.target_class]}')

# Run the CLIP model on all Actors
Filter for (Non-)Members afterwards using pandas

In [None]:
# Run every model on the images of all individuals listed in the member / non-member files.
if not LOAD_PREDICTIONS_FROM_FILE:
    # build the lookup once; the list was previously rebuilt and scanned for every single class subset
    relevant_class_names = set(cc_individuals['class_name'])
    filtered_subsets = [
        subset for subset in dataset_class_subsets
        if facescrub.classes[subset.target_class] in relevant_class_names
    ]

    # the concatenation keeps the order of `filtered_subsets`; the later per-class split relies on it
    concat_dataset = ConcatDataset(filtered_subsets)
    preds_per_model = {}
    for num_members, model in models.items():
        model = model.to(device)
        preds = get_preds_for_dataset(model, concat_dataset, label_context_vecs, batch_size=128, num_workers=32, text_embeddings=text_embeddings_per_model[num_members])
        model = model.cpu()
        assert preds.shape[1] == len(concat_dataset)
        assert preds.shape[0] == len(PROMPTS)
        # transpose the predictions such that we have len(PROMPTS) predictions for each sample
        preds_per_model[num_members] = preds.T

In [None]:
# cut the flat prediction tensor of each model back into one chunk per class subset
if not LOAD_PREDICTIONS_FROM_FILE:
    preds_per_model_per_subset = {}
    for num_members, preds in preds_per_model.items():
        preds_per_subset = []
        start = 0
        for subset in filtered_subsets:
            # the samples of a subset occupy a contiguous range, in the order of `filtered_subsets`
            stop = start + len(subset)
            subset_preds = preds[start:stop]
            assert len(subset_preds) == len(subset)
            preds_per_subset.append(subset_preds)
            start = stop
        preds_per_model_per_subset[num_members] = preds_per_subset

In [None]:
# Turn the predicted label indices into one table per model:
# one row per image, one predicted class name per prompt.
if not LOAD_PREDICTIONS_FROM_FILE:
    candidate_class_names = prompts['class_name']
    preds_df_per_model = {}
    for num_members in models.keys():
        records = []
        subsets_with_preds = zip(filtered_subsets, preds_per_model_per_subset[num_members])
        for group_idx, (dataset_subset, preds_subset) in enumerate(subsets_with_preds):
            class_name = facescrub.classes[dataset_subset.target_class]
            for sample_idx, pred in enumerate(preds_subset):
                record = {
                    'group_idx': group_idx,
                    'class_name': class_name,
                    'sample_idx': sample_idx
                }
                # translate each predicted label index into the class name at that position of the prompt table
                record.update({
                    f'name_prediction_prompt_{prompt_idx}': candidate_class_names.iloc[int(pred_idx)]
                    for prompt_idx, pred_idx in enumerate(pred)
                })
                records.append(record)
        preds_df = pd.DataFrame(records)
        preds_df_per_model[num_members] = preds_df

In [None]:
# Preview the freshly computed predictions of the 'top75' model.
# Skipped when predictions are loaded from file: in that case preds_df_per_model is only populated by the load cell further below.
if not LOAD_PREDICTIONS_FROM_FILE:
    display(preds_df_per_model['top75'])

In [None]:
# keep only the rows of known members / non-members and tag each row with its actual membership
if not LOAD_PREDICTIONS_FROM_FILE:
    membership_groups = (('member', cc_members), ('non_member', cc_non_members))
    for num_members, preds_df in preds_df_per_model.items():
        labelled_frames = []
        for membership_label, individuals in membership_groups:
            # merging on class_name keeps only the predictions of the individuals in this group
            labelled = pd.merge(preds_df, individuals['class_name'], on='class_name')
            labelled['actual_membership'] = membership_label
            labelled_frames.append(labelled)
        # members first, then non-members; only existing keys are reassigned while iterating
        preds_df_per_model[num_members] = pd.concat(labelled_frames)

In [None]:
# Cache the predictions on disk after computing them, or read the cached files instead of recomputing.
if not LOAD_PREDICTIONS_FROM_FILE:
    for num_members, preds_df in preds_df_per_model.items():
        preds_df.to_csv(f'conceptual_captions_experiment/prediction_dfs/predictions_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv')
else:
    # Only the run names are needed to locate the files, so iterate RUN_PATHS (the source of the
    # `models` keys) instead of binding the loaded model objects to the name `preds_df`.
    preds_df_per_model = {
        num_members: pd.read_csv(f'conceptual_captions_experiment/prediction_dfs/predictions_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv', index_col=0)
        for num_members in RUN_PATHS
    }

In [None]:
# Preview the prediction table of the 'top75' model (freshly computed or loaded from the cache).
preds_df_per_model['top75']

In [None]:
# Numbers of attack images to evaluate: every odd size below the maximum, plus the maximum itself.
subsample_sizes = np.arange(1, MIN_NUM_IMAGES_AVAILABLE + 1, 2).tolist()
# The maximum was previously appended as a literal 30, which only matched MIN_NUM_IMAGES_AVAILABLE == 30;
# it is now taken from the constant and only added when the odd steps did not already reach it.
if not subsample_sizes or subsample_sizes[-1] != MIN_NUM_IMAGES_AVAILABLE:
    subsample_sizes.append(MIN_NUM_IMAGES_AVAILABLE)

# every model is evaluated on the same sizes; each gets its own list object
subsample_sizes_per_model = {num_members: list(subsample_sizes) for num_members in preds_df_per_model}

In [None]:
def get_membership_metrics(df, sample_size, sample_draws, num_prompts=None, min_num_correct_prompt_preds=None):
    """Run the membership attack on repeated sub-samples and collect confusion-matrix metrics.

    In every draw, ``sample_size`` prediction rows are sampled per person (``class_name``).
    A person is predicted to be a ``'member'`` when, for at least
    ``min_num_correct_prompt_preds`` prompts, the single most frequent name prediction
    equals the person's own ``class_name``. Prompts with a tie for the most frequent
    prediction do not count.

    Args:
        df: Prediction frame with the columns ``class_name``, ``actual_membership``
            (``'member'`` / ``'non_member'``) and ``name_prediction_prompt_<i>`` per prompt.
        sample_size: Number of rows sampled per person in each draw.
        sample_draws: Number of draws.
        num_prompts: Number of prompt columns to evaluate; defaults to ``len(PROMPTS)``.
        min_num_correct_prompt_preds: Required number of correct prompts; defaults to
            ``MIN_NUM_CORRECT_PROMPT_PREDS``.

    Returns:
        A list with one dict per draw containing ``sample_size``, ``draw``, the rates
        ``tpr``/``fnr``/``fpr``/``tnr`` and the counts ``tp``/``fn``/``fp``/``tn``.
        The rates of a membership group that is absent from ``df`` are NaN.
    """
    if num_prompts is None:
        num_prompts = len(PROMPTS)
    if min_num_correct_prompt_preds is None:
        min_num_correct_prompt_preds = MIN_NUM_CORRECT_PROMPT_PREDS

    # column names of the raw predictions and of the derived per-person summaries
    prompt_column_names = [f'name_prediction_prompt_{i}' for i in range(num_prompts)]
    unique_name_columns = [f'unique_name_predictions_prompt_{i}' for i in range(num_prompts)]
    unique_count_columns = [f'unique_name_prediction_count_prompt_{i}' for i in range(num_prompts)]

    def get_name_predictions(predictions: pd.Series, values_only=False, counts_only=False):
        """Takes a series of predictions and returns the unique values and the number of prediction occurrences in descending order."""
        values, counts = np.unique(predictions, return_counts=True)
        descending_counts_indices = counts.argsort()[::-1]

        if values_only:
            return values[descending_counts_indices]
        elif counts_only:
            return counts[descending_counts_indices]
        else:
            return values[descending_counts_indices], counts[descending_counts_indices]

    def check_for_correct_prompt_majority(row: pd.Series):
        """Takes a row of the dataframe and checks whether the correct name was predicted the majority of the time."""
        num_correct_prompts = 0
        for prompt_idx in range(num_prompts):
            unique_predictions = row[unique_name_columns[prompt_idx]]
            prediction_counts = row[unique_count_columns[prompt_idx]]

            # get the indices of the most often predicted names
            idx_most_often_pred_names = np.argwhere(prediction_counts == prediction_counts.max()).flatten()

            # if two or more names share the top count there is no clear majority prediction, so skip this prompt
            if len(idx_most_often_pred_names) > 1:
                continue

            # the row label is the person's class name
            if unique_predictions[idx_most_often_pred_names[0]] == row.name:
                num_correct_prompts += 1

        # return true if the number of correct prompts is greater or equal to the threshold
        return num_correct_prompts >= min_num_correct_prompt_preds

    def safe_rate(count, total):
        """Return count / total, or NaN when the group is empty."""
        return count / total if total else float('nan')

    subsampled_metrics_dfs = []
    for i in range(sample_draws):
        # Sample the same number of images/predictions for each person from the frame that was
        # passed in (the function previously read the notebook-global `preds_df` here).
        name_predictions_df = df.groupby('class_name').sample(sample_size).reset_index(drop=True)

        # Number of members and non-members, looked up by label: value_counts() orders its result
        # by frequency, so positional unpacking swaps the two whenever non-members are in the majority.
        membership_per_person = name_predictions_df[['class_name', 'actual_membership']].drop_duplicates()
        membership_counts = membership_per_person['actual_membership'].value_counts()
        num_member = int(membership_counts.get('member', 0))
        num_non_member = int(membership_counts.get('non_member', 0))

        # per person and prompt: the list of sampled predictions, then the unique names and their counts
        name_prediction_count_df = name_predictions_df.groupby('class_name')[prompt_column_names].agg(list)
        name_prediction_count_df[unique_name_columns] = name_prediction_count_df[prompt_column_names].apply(lambda column: column.apply(lambda preds: get_name_predictions(preds, values_only=True)))
        name_prediction_count_df[unique_count_columns] = name_prediction_count_df[prompt_column_names].apply(lambda column: column.apply(lambda preds: get_name_predictions(preds, counts_only=True)))

        # get the actual membership by merging with the sampled dataframe
        name_prediction_count_df = pd.merge(name_prediction_count_df, membership_per_person.set_index('class_name'), how='inner', on='class_name')

        name_prediction_count_df['correct_majority_prediction'] = name_prediction_count_df.apply(check_for_correct_prompt_majority, axis=1)
        name_prediction_count_df['membership_prediction'] = name_prediction_count_df['correct_majority_prediction'].apply(lambda x: 'member' if x else 'non_member')

        predicted_member = name_prediction_count_df['membership_prediction'] == 'member'
        predicted_non_member = name_prediction_count_df['membership_prediction'] == 'non_member'
        actual_member = name_prediction_count_df['actual_membership'] == 'member'
        actual_non_member = name_prediction_count_df['actual_membership'] == 'non_member'

        tp = int((predicted_member & actual_member).sum())
        fp = int((predicted_member & actual_non_member).sum())
        fn = int((predicted_non_member & actual_member).sum())
        tn = int((predicted_non_member & actual_non_member).sum())

        subsampled_metrics_dfs.append({
            'sample_size': sample_size,
            'draw': i,
            'tpr': safe_rate(tp, num_member),
            'fnr': safe_rate(fn, num_member),
            'fpr': safe_rate(fp, num_non_member),
            'tnr': safe_rate(tn, num_non_member),
            'tp': tp,
            'fn': fn,
            'fp': fp,
            'tn': tn
        })

    return subsampled_metrics_dfs


class TQDMParallel(Parallel):
    """``Parallel`` subclass that mirrors its task counters in a tqdm progress bar.

    Args:
        progress_bar: Show the bar when True; the bar is created disabled otherwise.
        total: Expected number of tasks. When None, the bar total follows the number of
            dispatched tasks.
        *args, **kwargs: Forwarded unchanged to ``Parallel``.
    """

    def __init__(self, progress_bar=True, total=None, *args, **kwargs):
        self.progress_bar = progress_bar
        self.total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        # the bar lives exactly as long as the parallel run and is closed when the block exits
        self.pbar = tqdm(disable=not self.progress_bar, total=self.total)
        with self.pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        # NOTE(review): presumably invoked by joblib while tasks complete - confirm against the installed joblib version
        if self.total is None:
            self.pbar.total = self.n_dispatched_tasks
        self.pbar.n = self.n_completed_tasks
        self.pbar.refresh()

# Compute the attack metrics of every model for every sub-sample size (skipped when cached metrics are loaded).
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    subsampled_dfs_per_model = {}
    sample_draws = 20
    for num_member_identities, preds_df in preds_df_per_model.items():
        # one task per sub-sample size
        arguments_list = [
            (preds_df, sample_size, sample_draws)
            for sample_size in subsample_sizes_per_model[num_member_identities]
        ]

        print(f'{num_member_identities} with {len(arguments_list)} predictions')
        subsampled_dfs = TQDMParallel(n_jobs=1, total=len(arguments_list))(
            delayed(get_membership_metrics)(*arguments) for arguments in arguments_list
        )

        # every task returns a list with one metrics dict per draw; merge them into a single flat list
        subsampled_dfs_per_model[num_member_identities] = [
            metrics for task_metrics in subsampled_dfs for metrics in task_metrics
        ]

In [None]:
# Convert the metric records of every model into a frame indexed by the number of attack samples.
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    rate_column_labels = {'tpr': 'True Positive Rate', 'fnr': 'False Negative Rate', 'fpr': 'False Positive Rate', 'tnr': 'True Negative Rate'}
    for num_members, subsampled_dfs in subsampled_dfs_per_model.items():
        metrics_df = (
            pd.DataFrame(subsampled_dfs)
            .set_index('sample_size')
            .drop('draw', axis='columns')
            .rename(columns=rate_column_labels)
        )
        metrics_df.index.name = 'Number of Samples'
        # only existing keys are reassigned while iterating
        subsampled_dfs_per_model[num_members] = metrics_df

In [None]:
# Cache the metric frames on disk after computing them, or read the cached files instead.
if not LOAD_PREDICTION_METRICS_FROM_FILE:
    for num_members, metrics_df in subsampled_dfs_per_model.items():
        metrics_df.to_csv(f'./conceptual_captions_experiment/prediction_metrics_dfs/multiprompt_prediction_metrics_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv')
else:
    # the keys of `models` are the run names; the model objects themselves are not needed here
    subsampled_dfs_per_model = {
        num_members: pd.read_csv(f'./conceptual_captions_experiment/prediction_metrics_dfs/multiprompt_prediction_metrics_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.csv', index_col=0)
        for num_members in models
    }

In [None]:
# Preview the metric table of the 'top75' model (freshly computed or loaded from the cache).
subsampled_dfs_per_model['top75']

# Evaluate the Predictions

In [None]:
# per model: metrics averaged over all draws, shown for the first five sample sizes
for num_members, metrics_df in subsampled_dfs_per_model.items():
    print(num_members)
    mean_per_sample_size = metrics_df.groupby('Number of Samples').mean()
    display(mean_per_sample_size.head(5))

In [None]:
# add the accuracy of every draw: correct decisions divided by all decisions
for metrics_df in subsampled_dfs_per_model.values():
    num_correct = metrics_df['tp'] + metrics_df['tn']
    num_total = num_correct + metrics_df['fp'] + metrics_df['fn']
    # the frames stored in the dict are extended in place
    metrics_df['Accuracy'] = num_correct / num_total

In [None]:
# Per model: line plot of accuracy and the four confusion-matrix rates against the number of attack images.
# Each plot is saved as PDF and PNG below ./conceptual_captions_experiment/plots/<MODEL_NAME>/.
sns.set_style('darkgrid')
from matplotlib.ticker import FormatStrFormatter
# y tick labels are only drawn for the RN50 model
show_y_axis = MODEL_NAME == 'RN50'
show_legend = True
for num_members, subsampled_dfs in subsampled_dfs_per_model.items():
    # clear the current pyplot figure before drawing the next model
    plt.clf()
    data = subsampled_dfs[['Accuracy', 'True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']]
    # short column names become the legend labels
    data = data.rename(columns={
        'True Positive Rate': 'TPR', 
        'False Negative Rate': 'FNR', 
        'False Positive Rate': 'FPR', 
        'True Negative Rate': 'TNR', 
        'Accuracy': 'Acc'
        }
    )
    # wide-form input: the index ('Number of Samples') is the x axis, one line per column
    ax = sns.lineplot(data=data, errorbar='sd', palette='colorblind')

    ax.set_xlabel("Number of Images used for IDIA", weight="bold", size=16)
    # x ticks every 5 samples up to the largest sample size
    ax.set_xticks([i for i in range(0, data.index.unique().max()+1, 5)])
    ax.set_xticklabels([int(x) for x in ax.get_xticks()], size=16)

    h, l = ax.get_legend_handles_labels()
    # y ticks from 0 to 1 in steps of 0.1, formatted with two decimals
    ax.set_yticks([i for i in np.arange(0, 1+0.1, 0.1)])
    ax.set_yticklabels(ax.get_yticks(), size=16)
    ax.legend(h, l, ncol=2, fontsize=16)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))

    # show only every second y tick label
    plt.setp(ax.yaxis.get_ticklabels()[1::2], visible=False)

    # the legend was already created above; it is rebuilt here or removed again, depending on the flag
    if show_legend:
        h, l = ax.get_legend_handles_labels()
        ax.legend(h, l, ncol=2, fontsize=16)
    else:
        ax.legend_.remove()

    if not show_y_axis:
        ax.set_yticklabels([])

    plt.tight_layout()
    ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/num_idia_samples_plot_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf')
    ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/num_idia_samples_plot_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100)
    print(num_members)
    plt.show()

In [None]:
# Per model: confusion matrix (mean and standard deviation over the draws) for the last 'Number of Samples' group.
# Each matrix is saved as PDF and PNG below ./conceptual_captions_experiment/plots/<MODEL_NAME>/.
for num_members, subsampled_dfs in subsampled_dfs_per_model.items():
    # .iloc[-1] selects the last row of the grouped result, i.e. the last 'Number of Samples' group
    tp_std, fn_std, fp_std, tn_std = subsampled_dfs.groupby('Number of Samples').std().iloc[-1][['tp', 'fn', 'fp', 'tn']]
    tp, fn, fp, tn = subsampled_dfs.groupby('Number of Samples').mean().iloc[-1][['tp', 'fn', 'fp', 'tn']]

    tpr_std, fnr_std, fpr_std, tnr_std = subsampled_dfs.groupby('Number of Samples').std().iloc[-1][['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']]
    tpr, fnr, fpr, tnr = subsampled_dfs.groupby('Number of Samples').mean().iloc[-1][['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate']]

    # rows are the actual membership, columns the predicted membership: [[TPR, FNR], [FPR, TNR]]
    normalized_conf_mat = pd.DataFrame({'Member': [tpr, fpr], 'Non-Member': [fnr, tnr]}, index=['Member', 'Non-Member'])
    normalized_conf_mat.index.set_names('Actual Membership', inplace=True)
    normalized_conf_mat = normalized_conf_mat.rename_axis('Predicted Membership', axis='columns')

    # cell annotations: cell name, mean count +/- std, mean rate +/- std in percent
    group_names = ['TP','FN','FP','TN']
    group_counts = ["{0:0.0f} \u00B1 {1:0.2f}".format(mean, std) for mean, std in zip([tp, fn, fp, tn], [tp_std, fn_std, fp_std, tn_std])]
    percentage = ["{0:0.2f}% \u00B1 {1:0.02f}%".format(mean * 100, std * 100) for mean, std in zip([tpr, fnr, fpr, tnr], [tpr_std, fnr_std, fpr_std, tnr_std])]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, percentage)]
    # clear the current pyplot figure before drawing the next model
    plt.clf()
    # the reshape to 2x2 puts the labels in the same [[TP, FN], [FP, TN]] layout as the matrix
    ax = sns.heatmap(normalized_conf_mat, annot=np.asarray(labels).reshape(2, 2), fmt='', cbar=False, cmap='Blues', annot_kws={'fontsize': 16})

    ax.set_yticklabels(ax.get_yticklabels(), size=16)
    ax.set_xticklabels(ax.get_xticklabels(), size=16)

    plt.ylabel('Actual Membership', fontsize=16, weight='bold')
    
    plt.xlabel('Predicted Membership', fontsize=16, weight='bold')
    plt.tight_layout()
    ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/confusion_matrix_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf')
    ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/confusion_matrix_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{num_members}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100)
    print(num_members)
    plt.show()

# Plot the Metrics Against the Number of Samples in the Training Set

In [None]:
# TPR / FNR at the largest number of attack samples, plotted against the number of training images per person.
from matplotlib.ticker import FormatStrFormatter

show_legend = True
show_y_axis = True
rows = []
for num_members in subsampled_dfs_per_model.keys():
    # No plt.clf() here: this loop only prepares data, and clearing a not yet existing figure
    # makes pyplot create an extra, empty figure that is shown next to the real plot.
    data = subsampled_dfs_per_model[num_members].loc[:, ['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate', 'Accuracy']].copy()
    data["Number of Images\nper Person in CC3M"] = int(num_members.replace("top", ""))
    # get the last group (30 attack samples) to calculate mean and std
    grouped_by_sample_size = data.groupby('Number of Samples')
    last_sample_size = grouped_by_sample_size.last().iloc[-1].name
    rows.append(grouped_by_sample_size.get_group(last_sample_size).set_index("Number of Images\nper Person in CC3M"))

df = pd.concat(rows).rename(columns={
    'True Positive Rate': 'TPR',
    'False Negative Rate': 'FNR',
    'False Positive Rate': 'FPR',
    'True Negative Rate': 'TNR',
    'Accuracy': 'Acc'
    })

# plot the graph
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
rows_to_plot = ['TPR', 'FNR']
display(df[rows_to_plot].groupby(df.index).mean())
# wide-form input: the index (training images per person) is the x axis, one line per column
ax = sns.lineplot(data=df[rows_to_plot], errorbar='sd')

# one x tick per evaluated model, in ascending order
ax.set_xticks([int(x.replace("top", "")) for x in subsampled_dfs_per_model.keys()][::-1])
ax.set_xticklabels(ax.get_xticks(), size=16)

ax.set_xlabel(df.index.name, weight='bold', size=16)

# y ticks from 0 to 1 in steps of 0.1, formatted with two decimals
ax.set_yticks([i for i in np.arange(0, 1+0.1, 0.1)])
ax.set_yticklabels(ax.get_yticks(), size=16)
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.set_ylim(-0.025, 1.025)

# show only every second y tick label
plt.setp(ax.yaxis.get_ticklabels()[1::2], visible=False)

if not show_y_axis:
    ax.set_yticklabels([])

if show_legend:
    h, l = ax.get_legend_handles_labels()
    ax.legend(h, l, ncol=2, loc='lower center', fontsize=16)
else:
    ax.legend_.remove()

# the axes fill the whole figure, so bbox_inches='tight' is used instead of plt.tight_layout()
ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/num_training_samples_plot_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf', bbox_inches='tight')
ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/num_training_samples_plot_multiprompt_{DATASET_NAME}_{MODEL_NAME}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100, bbox_inches='tight')
print(MODEL_NAME)
plt.show()

# Plot a Heatmap to Visualize the Influence of the Number of Attack/Training Samples

In [None]:
# Heatmap of the mean true positive rate: number of attack samples (rows) vs. training images per person (columns).
from matplotlib.ticker import FuncFormatter
show_cbar = False
show_ylabel = True
dfs = []
for num_members in subsampled_dfs_per_model.keys():
    # No plt.clf() here: this loop only prepares data, and clearing a not yet existing figure
    # makes pyplot create an extra, empty figure that is shown next to the heatmap.
    data = subsampled_dfs_per_model[num_members].loc[:, ['True Positive Rate', 'False Negative Rate', 'False Positive Rate', 'True Negative Rate', 'Accuracy']].copy()
    data["Number of Images\nper Person in CC3M"] = int(num_members.replace("top", ""))
    # average over the draws of every sample size
    data = data.groupby("Number of Samples").mean()
    dfs.append(data)

combined_df = pd.concat(dfs)
pivoted_df = combined_df.reset_index().pivot(index="Number of Samples", columns="Number of Images\nper Person in CC3M", values="True Positive Rate")

# The wider figure for the colorbar variant has to be the figure that holds the axes; the size was
# previously given to a second, empty figure created after the axes already existed.
fig = plt.figure(figsize=(6.2, 5)) if show_cbar else plt.figure()
ax = fig.add_axes([0,0,1,1])

ax = sns.heatmap(pivoted_df, yticklabels=2, vmin=0, vmax=1, cmap="Blues", cbar=show_cbar, ax=ax)

if show_cbar:
    # the colorbar axes is the last axes of the figure
    ax.figure.axes[-1].set_ylabel("True Positive Rate", weight='bold', size=16)
    ax.collections[0].colorbar.ax.tick_params(labelsize=16)

ax.set_xlabel(ax.get_xlabel(), weight="bold", size=16)
ax.set_ylabel(ax.get_ylabel(), weight="bold", size=16)

ax.set_yticklabels([x.get_text() for x in ax.get_yticklabels()], size=16)
# the column labels are parsed via float() first, then shown as integers
ax.set_xticklabels([int(float(x.get_text())) for x in ax.get_xticklabels()], size=16)

# smallest number of attack samples at the bottom
ax.invert_yaxis()

if not show_ylabel:
    ax.set(yticklabels=[" " for x in ax.get_yticklabels()], ylabel=" ")

plt.tight_layout()

ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/heatmap_num_training_samples_{DATASET_NAME}_{MODEL_NAME}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.pdf', bbox_inches='tight')
ax.get_figure().savefig(f'./conceptual_captions_experiment/plots/{MODEL_NAME}/heatmap_num_training_samples_{DATASET_NAME}_{MODEL_NAME}_{NUM_TOTAL_NAMES}_{MIN_NUM_CORRECT_PROMPT_PREDS}.png', dpi=100, bbox_inches='tight')

plt.show()