In [None]:
from clip_retrieval.clip_client import ClipClient, Modality
import pandas as pd
import os
from IPython.display import Image, display
import torch
import numpy as np
import itertools
from tqdm.auto import tqdm
from time import sleep
from random import uniform
import requests
from io import BytesIO
from PIL import Image as PILImage
import PIL
from joblib import Parallel, delayed

# Switch to the project root; the relative paths used by the following cells
# (e.g. 'cc3m_experiments/...', './embeddings/...') are resolved against it.
# NOTE(review): presumably this is also what makes the `rtpt` package importable — confirm.
os.chdir('/workspace')
from rtpt.rtpt import setproctitle
# NOTE(review): judging by its name this labels the kernel process — confirm against rtpt.
setproctitle('@Clipping_Privacy_CC3M_Notebook')

In [None]:
# Switch between live querying and the cached result: when True, the cells below
# query the kNN backend and write 'cc3m_experiments/laion5b_similar_imgs_to_facescrub.csv';
# when False, that CSV is read instead and no request is sent.
QUERY_BACKEND = False

# Load the csv files of the members and non-members

In [None]:
def _load_member_info(file_stem):
    """Read one membership CSV, print its label and show its first three rows.

    The file ``cc3m_experiments/conceptual_captions_facescrub_member_info/<file_stem>.csv``
    is read with its first column as index. The original ``name`` column is renamed to
    ``class_name`` and a new ``name`` column is derived from it by replacing
    underscores with spaces.

    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(
        f'cc3m_experiments/conceptual_captions_facescrub_member_info/{file_stem}.csv',
        index_col=0,
    ).rename(columns={'name': 'class_name'})
    frame['name'] = frame['class_name'].map(lambda class_name: class_name.replace('_', ' '))
    print(file_stem)
    display(frame.head(3))
    return frame


# load the non-members
fs_actors_non_members = _load_member_info('actors_non_members')
fs_actresses_non_members = _load_member_info('actresses_non_members')
# load the members
fs_actors_members = _load_member_info('actors_members')
fs_actresses_members = _load_member_info('actresses_members')

# Get similar images with the captions from the LAION-5B dataset

In [None]:
# Number of results requested per query; handed to the client as `num_images`.
NUM_IMAGES_TO_QUERY_FOR=999
# Per-person target: querying for a person stops once this many caption-matched
# images have been collected.
MIN_NUM_IMGS_PER_PERSON=100

In [None]:
# Client for the kNN service at the URL below, using the 'laion5B' index and
# requesting NUM_IMAGES_TO_QUERY_FOR results per query.
# NOTE(review): the meaning of the remaining options (aesthetic weight, safety model,
# violence detector, deduplication) is inferred from their names — confirm against
# the clip-retrieval documentation.
client = ClipClient(
    url='https://knn5.laion.ai/knn-service',
    indice_name='laion5B',
    aesthetic_weight=0,
    modality=Modality.IMAGE,
    use_safety_model=False,
    use_violence_detector=True,
    deduplicate=True,
    num_images=NUM_IMAGES_TO_QUERY_FOR
)

def log_result(result):
    """Print the metadata of a single query result and display its image.

    ``result`` must provide the keys "id", "caption", "url" and "similarity";
    all four are read before anything is printed. The image is displayed from
    ``result["url"]``.
    """
    fields = ("id", "caption", "url", "similarity")
    values = {field: result[field] for field in fields}
    for field in fields:
        print(f"{field}: {values[field]}")
    display(Image(url=values["url"], unconfined=True))

In [None]:
# load the facescrub embeddings calculated with the openai clip model
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load embedding files that come from a trusted source.
embedding_dict = torch.load('./embeddings/openai_facescrub.pt')

# Class names that get rewritten, mapped to their replacement.
class_name_fixes = {
    'Freddy_Prinze_Jr.': 'Freddy_Prinze_Jr',  # remove the dot from Freddy Prinze Jr
    'Leslie_Neilsen': 'Leslie_Nielsen',       # fix typo in Leslie Nielsen
    'Robert_Di_Niro': 'Robert_De_Niro',       # fix typo in Robert De Niro
    'Tatyana_M._Ali': 'Tatyana_Ali',          # remove middle name from Tatyana Ali
}
class_list = np.array(embedding_dict['classes'])
for old_class_name, new_class_name in class_name_fixes.items():
    class_list[class_list == old_class_name] = new_class_name
embedding_dict['classes'] = class_list.tolist()

# get the data as a df: one row per embedded image
embeddings_df = pd.DataFrame(
    {
        'class_name': embedding_dict['classes'],
        'image_paths': embedding_dict['image_paths'],
        'embeddings': list(embedding_dict['embeddings'].numpy()),
    }
)
embeddings_df['name'] = embeddings_df['class_name'].apply(lambda class_name: class_name.replace('_', ' '))
# show the first row of each of the first ten persons
embeddings_df.groupby('name').head(1).head(10)

In [None]:
# get only the actors/actresses that are used as members and non-members
# The four membership tables are stacked and then inner-joined with the embeddings
# on 'class_name', so only embedding rows of persons listed in those tables remain.
# NOTE(review): a class_name occurring more than once in concat_dataset would duplicate
# the matching embedding rows in the merge — confirm each person is listed only once.
concat_dataset = pd.concat([fs_actors_members, fs_actors_non_members, fs_actresses_members, fs_actresses_non_members], ignore_index=True)
chosen_persons_for_experiment = pd.merge(embeddings_df, concat_dataset['class_name'], on='class_name', how='inner')
chosen_persons_for_experiment

In [None]:
if QUERY_BACKEND:
    # Smoke test: run a single query for the first image of the first row.
    print(f'Testing on {chosen_persons_for_experiment["name"][0]}')
    # The value is a path to an image file, so it has to be sent as an image query
    # (as get_images does), not as a text query.
    test = client.query(image=chosen_persons_for_experiment['image_paths'][0])
    print(len(test))
    # Only show the first hit when the backend returned one.
    if len(test) > 0:
        log_result(test[0])

In [None]:
def get_images(client, image_path):
    """Run an image query on ``client`` and return whatever it returns.

    Any exception raised by the query is printed and swallowed; in that case
    an empty list is returned.
    """
    try:
        return client.query(image=image_path)
    except Exception as e:
        print(f"Exception: {e}")
        return []

def filter_imgs_for_name_in_cap(name, res):
    """Keep the results whose caption contains ``name`` (case-insensitive).

    Args:
        name: the person's name to look for.
        res: iterable of result dicts; the caption is read from the 'caption' key.

    Returns:
        A list with the matching result dicts in their original order. Each
        matching dict is modified in place: its 'name' key is set to ``name``.
        Results without a caption, or whose caption is not a string (e.g. ``None``),
        are skipped instead of raising.
    """
    needle = name.lower()
    results = []
    for result in res:
        caption = result.get('caption')
        # A missing or null caption cannot contain the name.
        if not isinstance(caption, str):
            continue
        if needle in caption.lower():
            result['name'] = name
            results.append(result)
    return results

def query_backend(name, image_paths, client_instance, min_num_images=MIN_NUM_IMGS_PER_PERSON):
    """Collect results for one person by querying with their images one after another.

    Each path in ``image_paths`` is sent as an image query. Only results whose caption
    contains ``name`` are kept, and results with a duplicate url or duplicate caption
    are dropped. Querying stops as soon as at least ``min_num_images`` results have
    been collected or all paths have been used.

    Args:
        name: the person's name; used for the caption filter and the progress bar labels.
        image_paths: sized iterable of image paths to query with.
        client_instance: client passed on to ``get_images``.
        min_num_images: number of collected results at which querying stops.

    Returns:
        A list of result dicts (possibly fewer than ``min_num_images``, possibly empty).
    """
    result = []
    # The context manager closes the manually driven bar, also on errors.
    with tqdm(total=min_num_images, desc=f'Total Images Retrieved for {name}', leave=False) as pbar:
        for path in tqdm(image_paths, total=len(image_paths), desc=f'Total Amount of Queries for {name}', leave=False):
            res = get_images(client_instance, path)
            # random pause between two requests
            sleep(uniform(1,5))
            res = filter_imgs_for_name_in_cap(name, res)
            # NOTE(review): presumably the client can change its own num_images
            # during a query, hence the reset — confirm against clip-retrieval.
            client_instance.num_images = NUM_IMAGES_TO_QUERY_FOR

            result.extend(res)
            # filter out duplicate urls and duplicate captions; an empty list would
            # yield a frame without a 'url' column and make drop_duplicates raise
            if result:
                result = pd.DataFrame(result).drop_duplicates('url').drop_duplicates('caption').to_dict('records')
            # update the progress bar
            pbar.n = len(result)
            pbar.refresh()
            if len(result) >= min_num_images:
                break

    print(f'{len(result)} images found for {name}')
    return result

if QUERY_BACKEND:
    # One group per person; each group carries the image paths of that person.
    groups = chosen_persons_for_experiment.groupby('name')
    similar_images = []
    person_progress = tqdm(groups, total=len(groups), desc='Total Progress')
    for name, group in person_progress:
        similar_images.append(
            query_backend(name, group['image_paths'], client, min_num_images=MIN_NUM_IMGS_PER_PERSON)
        )
        # random pause before moving on to the next person
        sleep(uniform(1, 10))

In [None]:
if not QUERY_BACKEND:
    # No fresh query results: fall back to the CSV written by an earlier run.
    df = pd.read_csv('cc3m_experiments/laion5b_similar_imgs_to_facescrub.csv', index_col=0)
else:
    # Flatten the per-person result lists into a single list of records.
    results = list(itertools.chain.from_iterable(similar_images))
    df = pd.DataFrame(results)
    # drop duplicate urls first, then duplicate captions
    df = df.drop_duplicates('url', ignore_index=True)
    df = df.drop_duplicates('caption', ignore_index=True)
    df.to_csv('cc3m_experiments/laion5b_similar_imgs_to_facescrub.csv')

In [None]:
# Count the non-null entries per column for each person and sort ascending by the
# caption count, so the persons with the fewest images come first.
print('Number of images per person')
df.groupby('name').count().sort_values('caption')

In [None]:
# Keep at most the first 100 rows of every person, in the order they appear in df.
# NOTE(review): the 100 is hard-coded here although MIN_NUM_IMGS_PER_PERSON holds the
# same value — confirm whether the two are meant to stay in sync.
first_100_imgs_per_person = df.groupby('name').head(100)
first_100_imgs_per_person[['caption', 'url', 'name']]

# Download the images of the LAION-5B dataset that are most similar to the FaceScrub images

In [None]:
def grab(uid, caption, url, output_root_folder, actor_name):
    """
    Taken from https://github.com/mlfoundations/open_clip/blob/main/src/data/gather_cc.py and adjusted to take caption and url separately.
    Download a single image and store it as ``<output_root_folder>/<actor_name>/<uid:04d>.jpg``.

    If the target file already exists, nothing is downloaded; the file is only
    checked for being decodable. Otherwise the image is fetched from ``url``,
    shrunk to fit into 512x512 and saved to the target path.

    Args:
        uid: integer used for the zero-padded file name.
        caption: caption belonging to the image; returned unchanged.
        url: address the image is downloaded from.
        output_root_folder: folder that contains one sub-folder per person.
        actor_name: name of the sub-folder; returned unchanged.

    Returns:
        ``(caption, file_path, actor_name)`` when the image file exists and can be
        decoded, otherwise ``None``. Errors are printed, never raised.
    """
    output_folder = os.path.join(output_root_folder, actor_name)
    # exist_ok avoids the check-then-create race when several workers run at once
    os.makedirs(output_folder, exist_ok=True)

    file_path = os.path.join(output_folder, f'{uid:04d}.jpg')
    if os.path.exists(file_path):
        try:
            # Decoding into an array forces the whole file to be read.
            with PILImage.open(file_path) as cached:
                np.array(cached)

            print("Finished", uid, actor_name, url)
            return caption, file_path, actor_name
        except Exception as e:
            print("Failed", uid, actor_name, url, e)
            return None

    # Let's not crash if anything weird happens
    try:
        header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        dat = requests.get(url, timeout=20, headers=header)
        if dat.status_code != 200:
            # Report the real status instead of calling every failure a 404.
            print(f"HTTP {dat.status_code} file", url)
            return None

        # Try to parse this as an Image file, we'll fail out if not
        im = PILImage.open(BytesIO(dat.content))
        im.thumbnail((512, 512), PIL.Image.BICUBIC)
        # Reject extreme aspect ratios: short side below a third of the long side.
        if min(*im.size) < max(*im.size)/3:
            print("Too small", url)
            return None

        # The target is always a .jpg file; modes the JPEG writer does not accept
        # (e.g. RGBA or palette images) are converted to RGB instead of being lost.
        if im.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            im = im.convert("RGB")
        im.save(file_path)

        # Another try/catch just because sometimes saving and re-loading
        # the image is different than loading it once.
        try:
            with PILImage.open(file_path) as saved:
                o = np.array(saved)

            print("Success", o.shape, uid, actor_name, url)
            return caption, file_path, actor_name
        except Exception as e:
            print("Failed", uid, actor_name, url, e)
            return None

    except Exception as e:
        print("Unknown error", e)
        return None

In [None]:
# From here on all relative paths ('image_data', the CSV files written below,
# '../conceptual_captions/...') are resolved against this directory.
os.chdir('/workspace/data/conceptual_captions_laion5b')
def grab_actor_images(name, captions, urls, root_dir):
    """Download all images of one person, one after another.

    ``captions`` and ``urls`` are paired up element-wise; the position of each
    pair is used as the image id. The person's folder name is ``name`` with
    spaces replaced by underscores.

    Returns a list with the return value of ``grab`` for every pair, in order.
    """
    folder_name = name.replace(" ", "_")
    return [
        grab(position, caption, url, root_dir, folder_name)
        for position, (caption, url) in enumerate(zip(captions, urls))
    ]

class TQDMParallel(Parallel):
    """``Parallel`` subclass that mirrors its task counters in a tqdm bar.

    Args:
        progress_bar: when False the bar is created disabled.
        total: fixed length of the bar; when None the number of dispatched
            tasks is used as the total.
        *args, **kwargs: forwarded to ``Parallel``.
    """

    def __init__(self, progress_bar=True, total=None, *args, **kwargs):
        self.progress_bar = progress_bar
        self.total = total
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Run the parallel call with a progress bar that lives for its duration."""
        bar = tqdm(disable=not self.progress_bar, total=self.total)
        with bar as self.pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        """Copy the dispatched/completed task counters onto the progress bar.

        NOTE(review): presumably joblib calls this hook itself while tasks
        complete — confirm against the installed joblib version.
        """
        bar = self.pbar
        if self.total is None:
            bar.total = self.n_dispatched_tasks
        bar.n = self.n_completed_tasks
        bar.refresh()

# One task per person: each task downloads that person's images sequentially into
# 'image_data/<Person_Name>/'. `results` ends up as one list of grab() return values per person.
# NOTE(review): n_jobs=200 requests up to 200 concurrent workers for network-bound
# work — confirm this is intended for the machine and joblib backend in use.
dfg = first_100_imgs_per_person.groupby('name')
results = TQDMParallel(n_jobs=200, total=len(dfg))(
    delayed(grab_actor_images)(name, group['caption'], group['url'], 'image_data') for name, group in dfg
)

In [None]:
# Flatten the per-person lists. grab() returns None for every image that could not
# be downloaded or decoded; those entries are dropped here, because they would
# otherwise turn into all-null rows (and a leading None can break the DataFrame constructor).
chained_results = [
    entry for entry in itertools.chain.from_iterable(results) if entry is not None
]
# Every remaining entry is a (caption, file path, folder name) tuple.
results_df = pd.DataFrame(chained_results, columns=['title', 'filepath', 'class_name'])
results_df['name'] = results_df['class_name'].map(lambda x: x.replace("_", " ") if x else None)
results_df

In [None]:
# Count the non-null entries per person and column, take the minimum of every column
# over all persons, and keep the minimum of the 'title' column.
print('Lowest number of pictures for a person:')
lowest_imgs_for_person = results_df.groupby('name').count().min()['title']
lowest_imgs_for_person

In [None]:
# since the maximum number of downloaded images is x for some individuals, save the first x image text pairs for each person
# x is lowest_imgs_for_person; the result is written as a tab-separated file without
# the index, and x is part of the file name.
same_num_imgs_per_person = results_df.groupby('name').head(lowest_imgs_for_person)
same_num_imgs_per_person.to_csv(f'{lowest_imgs_for_person}_images_per_person_training.csv', sep='\t', index=False)
same_num_imgs_per_person

In [None]:
# read the cc train csv file
# The file is tab-separated and is located relative to the current working directory.
# NOTE(review): later cells rely on it having 'filepath' (and presumably 'title') columns — confirm.
cc_train = pd.read_csv('../conceptual_captions/Train_GCC-training_output.csv', sep='\t')
cc_train

In [None]:
# Restrict the downloaded image/caption pairs to persons listed as members:
# stack the two member tables and inner-join them with the images on 'name'.
members = pd.concat([fs_actors_members, fs_actresses_members])
members = pd.merge(same_num_imgs_per_person, members, how='inner', on='name').reset_index()
# Only these three columns are needed below; this also drops the column added by reset_index().
members = members[['title', 'filepath', 'name']]
members

In [None]:
# create a cc csv file with x images for each member
# For every x in the list, the first x rows of each member are written as a
# tab-separated file with the columns 'title' and 'filepath'.
# NOTE(review): head(x) returns fewer than x rows for a member with fewer images —
# confirm that lowest_imgs_for_person is at least the largest x used here.
NUM_IMAGES_PER_PERSON = [75, 50, 25, 10, 1]
for num in NUM_IMAGES_PER_PERSON:
    top_members = members.groupby('name').head(num)
    top_members[['title', 'filepath']].to_csv(f'top_{num}_images_members_conceptual_captions.csv', sep='\t', index=False)

In [None]:
# Prefix the file paths with the dataset folder so that they resolve from the
# current working directory. The column is rewritten in place, so the prefix is
# only added where it is still missing; re-running this cell therefore does not
# prepend it a second time.
cc_dir_prefix = '../conceptual_captions/'
cc_train['filepath'] = cc_train['filepath'].map(
    lambda x: x if x.startswith(cc_dir_prefix) else cc_dir_prefix + x
)
cc_train

In [None]:
# append the images of the persons to the cc train csv
for num in NUM_IMAGES_PER_PERSON:
    # Use a separate name so the `members` frame of the earlier cell is not
    # overwritten; that cell needs its 'name' column when it is re-run.
    members_subset = pd.read_csv(f'top_{num}_images_members_conceptual_captions.csv', sep='\t')
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    combined_df = pd.concat([cc_train, members_subset])
    # resample the dataframe to shuffle the rows (fixed seed for reproducibility)
    shuffled_df = combined_df.sample(frac=1, random_state=42)
    shuffled_df.to_csv(f'cc_top_{num}_members_train.csv', sep='\t', index=False)