In [1]:
import pandas as pd
import torch
import numpy as np
from torchvision.datasets import ImageFolder
from torch.utils.data import ConcatDataset, DataLoader
from IPython.display import display
from clip_retrieval.clip_client import ClipClient, Modality
from IPython.display import Image, display
from base64 import b64decode
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from copy import deepcopy
import json
import open_clip
from PIL import Image as PILImage
import os
import itertools
import wandb

os.chdir('/workspace')
from rtpt.rtpt import setproctitle
setproctitle('@DH_Clip_FaceScrub_Notebook')

from facescrub_training.pl_models.resnet import ResNet50
from datasets import FaceScrub

# Load the FaceScrub Dataset

In [2]:
facescrub = FaceScrub(group='all', train=False)

# Load the names of the German actors and actresses

In [3]:
german_actors_dataset = ImageFolder(root='./data/laion_german_celebs/actors/images')
german_actresses_dataset = ImageFolder(root='./data/laion_german_celebs/actresses/images')

# Anaylze the dataset by counting the occurences of the names

In [4]:
df = pd.read_csv('laion_experiment/names_found_in_laion400m_caption_search.csv', index_col=0)
df

Unnamed: 0,caption,url,NSFW,i,name
0,Adam Sandler and Paz Vega in Columbia Pictures...,http://l3.yimg.com/bt/api/res/1.2/2gkaVQL1bP31...,UNLIKELY,8445,Adam Sandler
1,"Ray Romano, Andy Garcia for 'Rob the Mob'",http://i2.cdnds.net/12/42/87x65/ustv_ray_roman...,UNLIKELY,66009,Andy Garcia
2,"Jodie Foster + Anthony Hopkins, Oscar-winners ...",https://i.pinimg.com/736x/2d/1f/83/2d1f83b15fb...,UNLIKELY,18264,Anthony Hopkins
3,"""Images from the new Arnold Schwarzenegger fil...",https://mediadevour.files.wordpress.com/2014/0...,UNLIKELY,9781,Arnold Schwarzenegger
4,<a gi-track='captionPersonalityLinkClicked' hr...,http://media.gettyimages.com/photos/arnold-sch...,UNLIKELY,77592,Arnold Schwarzenegger
...,...,...,...,...,...
1388683,Teri Hatcher and Oprah Winfrey,https://static.oprah.com/images/tows/200605/20...,UNLIKELY,407265310,Teri Hatcher
1388684,Tia Carrere completed her summer-chic look wit...,https://www1.pictures.stylebistro.com/gi/2nd+A...,UNSURE,407278026,Tia Carrere
1388685,"Hot in Cleveland: The Complete Series , Valeri...",http://media.aent-m.com/Graphics/Items/sdImage...,UNLIKELY,407274049,Valerie Bertinelli
1388686,Victoria Justice (L-R) Stars of Nickelodeon's ...,https://www2.pictures.zimbio.com/gi/Victoria+J...,UNLIKELY,407314395,Victoria Justice


In [5]:
# get the counts of occurences for each person
count_df = df.groupby('name').size().reset_index(name='count').sort_values('count')
count_df['membership'] = 'member'
count_df.reset_index(drop=True)

Unnamed: 0,name,count,membership
0,Jenilee Harrison,1,member
1,Joanna García,1,member
2,Sabine Vitua,1,member
3,Guido Cantz,2,member
4,Taylor Atelian,2,member
...,...,...,...
537,Selena Gomez,24778,member
538,Johnny Depp,27342,member
539,Brad Pitt,30524,member
540,Robert Pattinson,31238,member


In [6]:
# add the persons that have a count of zero
names_df = pd.DataFrame(facescrub.classes + german_actors_dataset.classes + german_actresses_dataset.classes)
names_df = names_df[0].apply(lambda x: " ".join(x.split("_"))).reset_index(name='name').drop(columns='index')
missing_names = names_df[~names_df['name'].isin(count_df['name'])].copy(deep=True)
missing_names['count'] = 0
missing_names['membership'] = 'non_member'
count_df = pd.concat([count_df, missing_names]).sort_values('count').reset_index(drop=True)
count_df

Unnamed: 0,name,count,membership
0,Malu Leicher,0,non_member
1,Bernhard Hoëcker,0,non_member
2,Adrianne León,0,non_member
3,Harriet Herbig-Matten,0,non_member
4,Jenilee Harrison,1,member
...,...,...,...
541,Selena Gomez,24778,member
542,Johnny Depp,27342,member
543,Brad Pitt,30524,member
544,Robert Pattinson,31238,member


In [7]:
count_df.head(15)

Unnamed: 0,name,count,membership
0,Malu Leicher,0,non_member
1,Bernhard Hoëcker,0,non_member
2,Adrianne León,0,non_member
3,Harriet Herbig-Matten,0,non_member
4,Jenilee Harrison,1,member
5,Joanna García,1,member
6,Sabine Vitua,1,member
7,Bettina Lamprecht,2,member
8,Guido Cantz,2,member
9,Taylor Atelian,2,member


In [8]:
count_df['bin'] = pd.cut(count_df['count'], bins=[0, 25, 50, 75, 100, 125, 150, 200, 225, 250, 275, 300, 1000_000], right=False)
count_df = count_df.reset_index(drop=True)
count_df

Unnamed: 0,name,count,membership,bin
0,Malu Leicher,0,non_member,"[0, 25)"
1,Bernhard Hoëcker,0,non_member,"[0, 25)"
2,Adrianne León,0,non_member,"[0, 25)"
3,Harriet Herbig-Matten,0,non_member,"[0, 25)"
4,Jenilee Harrison,1,member,"[0, 25)"
...,...,...,...,...
541,Selena Gomez,24778,member,"[300, 1000000)"
542,Johnny Depp,27342,member,"[300, 1000000)"
543,Brad Pitt,30524,member,"[300, 1000000)"
544,Robert Pattinson,31238,member,"[300, 1000000)"


In [9]:
# get the count for each of the bins
count_df.groupby('bin').transform('size').unique()

array([ 29,  21,  15,  13,  14,  17,   9,  12, 375])

In [10]:
# save the dataframe to a csv file
count_df.to_csv('laion_experiment/laion_membership_occurence_count.csv')