In [None]:
import pandas as pd
import torch
import numpy as np
from torchvision.datasets import ImageFolder
from torch.utils.data import ConcatDataset, DataLoader
from IPython.display import display
from clip_retrieval.clip_client import ClipClient, Modality
from IPython.display import Image, display
from base64 import b64decode
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from copy import deepcopy
import json
import open_clip
from PIL import Image as PILImage
import os
import itertools
import wandb

# Switch the working directory to the project root; the relative './...' paths
# used by the later cells of this notebook are resolved against it.
# NOTE(review): hard-coded absolute path — presumably the container mount point; confirm.
os.chdir('/workspace')
# NOTE(review): imported after the chdir, presumably so project-local packages resolve from /workspace — confirm.
from rtpt.rtpt import setproctitle
# Give this kernel process a recognizable title (presumably what process listings show — verify against rtpt).
setproctitle('@Clip_Notebook')

# NOTE(review): `facescrub_training` and `datasets` look like project-local packages — confirm `datasets` is not the third-party package of the same name.
from facescrub_training.pl_models.resnet import ResNet50
from datasets import FaceScrub

# Load the FaceScrub Dataset

In [None]:
# FaceScrub split selected by train=False, for group 'all'; only its `.classes`
# attribute is used by the later cells of this notebook.
facescrub = FaceScrub(train=False, group='all')

# Load the names of the German actors and actresses

In [None]:
# One ImageFolder per image directory (actors first, then actresses).
european_actors_dataset, european_actresses_dataset = (
    ImageFolder(root=image_dir)
    for image_dir in (
        './data/laion_european_celebs/actors/images',
        './data/laion_european_celebs/actresses/images',
    )
)

# Analyze the dataset by counting the occurrences of the names

In [None]:
# Caption-search results; the first CSV column is used as the index.
# The grouping cell below relies on a 'name' column being present.
df = pd.read_csv(
    './laion400m_experiments/names_found_in_laion400m_caption_search.csv',
    index_col=0,
)
df

In [None]:
# Number of rows per person in the caption-search results, ascending by count.
# Every name that appears at least once is labelled a 'member'.
count_df = (
    df.groupby('name')
    .size()
    .reset_index(name='count')
    .sort_values('count')
    .assign(membership='member')
)
# Display only: the re-indexed copy is shown but not assigned back to count_df.
count_df.reset_index(drop=True)

In [None]:
# Append every dataset identity that has no row in count_df, with a count of zero.
# Underscores in the class names are turned into spaces before comparing them
# against the 'name' column of count_df.
names_df = pd.DataFrame({
    'name': [
        class_name.replace('_', ' ')
        for class_name in facescrub.classes + european_actors_dataset.classes + european_actresses_dataset.classes
    ]
})
# Names absent from count_df are labelled 'non_member' with zero occurrences.
missing_names = names_df[~names_df['name'].isin(count_df['name'])].assign(count=0, membership='non_member')
count_df = pd.concat([count_df, missing_names]).sort_values('count').reset_index(drop=True)
count_df

In [None]:
# Peek at the 15 rows with the smallest counts (count_df is sorted ascending by 'count').
count_df.iloc[:15]

In [None]:
# Bucket the occurrence counts into left-closed intervals [lo, hi).
# Counts of 1,000,000 or more fall outside the last interval and get a missing bin.
# NOTE(review): the edges step by 25 except 150 -> 200 (there is no 175 edge) — confirm this is intended.
count_df = count_df.assign(
    bin=pd.cut(
        count_df['count'],
        bins=[0, 25, 50, 75, 100, 125, 150, 200, 225, 250, 275, 300, 1_000_000],
        right=False,
    )
).reset_index(drop=True)
count_df

In [None]:
# Number of persons in each occurrence bin, indexed by the bin interval.
# `transform('size').unique()` only returned the distinct group sizes: it dropped
# the bin labels and merged bins that happen to share the same size.
# observed=False keeps empty bins in the result (with a count of 0).
count_df.groupby('bin', observed=False).size()

In [None]:
# Persist the per-person occurrence counts (with membership label and bin) as CSV.
# The DataFrame index is written as well, since `index` is left at its default.
count_df.to_csv(path_or_buf='./laion400m_experiments/laion_membership_occurence_count.csv')