# Fashion Image Captioner

This notebook fine-tunes a general-purpose visual language model (VLM) that can already caption images, so that it generates more specific, search-relevant descriptions of images from thrifted fashion item inventories.

# Prepare Environment

In [None]:
# nbdev: EXPORT
#Install packages needed first time in Environment
if False:
    # !pip install pandas
    # !pip install scikit-learn
    # !pip install torch torchvision
    # !pip install transformers
    !pip install datasets #Hugging Face
    !pip install tqdm
    !pip install Pillow
    !pip install peft
    !pip install rouge_score
    !pip install evaluate
    # !apt-get install nvtop

In [None]:
#Import Libraries

#General
import os
import gc

#Standard ML
from pandas import read_csv
from pandas import DataFrame
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

#Data Visualization
from torchvision.transforms import ToPILImage
from PIL import Image as pilImage
from IPython.display import display

#HuggingFace Libraries
from datasets import Dataset as hfDataset
from datasets import Image
from accelerate import Accelerator

#The Model
from transformers import Blip2ForConditionalGeneration #visual language model (VLM) can do multimodal tasks like captioning or answering questions about images.
from transformers import Blip2Processor

#Extras
from tqdm import tqdm #cute loading percent bar


In [None]:
# Check GPU Availability
# Device will be reset automatically later by accelerator, but this is a good sanity check to know what we should be working with

# Check for MPS availability
# Report which accelerator torch can see, preferring MPS, then CUDA, then falling back to CPU.
# NOTE: the `device` variable assigned here is only a sanity check; the cells that move tensors
# later in this notebook use `device_g`, which is read from the Accelerator.
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Use Apple's MPS
    print(f"Using torch version {torch.__version__}")
    print("Found MPS is available for use") # Is MPS even available? macOS 12.3+
    print(f"Current version on Pytorch was built with MPS activation: {torch.backends.mps.is_built()}") # Was the current version of PyTorch built with MPS activated?
elif torch.cuda.is_available():
    device = torch.device("cuda")
    nGPUs = torch.cuda.device_count() # number of CUDA devices torch reports
    print(f"Found {nGPUs} cuda GPUs available for use")
else:
    device = torch.device("cpu")
    print("No GPU device found, running on cpu")

# Import Data

To use as-is, a root project folder should contain
* a subproject folder containing this notebook (can have any name)
* a data folder, called Data, containing
  *  CSV_FILENAME: a csv file with at least two columns
     *   IMAGE_PATH_COL: contains image names
     *   CAPTION_COL: contains image captions
  * IMAGE_FOLDERNAME: A folder containing images with the names referenced in IMAGE_PATH_COL
 
The logic for this structure is that there can be multiple subprojects that need to access the same dataset.

In [None]:
#SET IMAGE LOCATION INFO

#Specify Data Locations
CSV_FILENAME_g = 'large_fashion_descriptions.csv' #'ThredupData.csv'
CAPTION_COL_g = "description"
IMAGE_PATH_COL_g = "filename"
IMAGE_FOLDERNAME_g = 'Extracted_Images' #'StanfordJpegs'

DATA_FOLDER_g = os.path.join('..', 'Data') #'..' means the folder one up from current notebook's folder, which happens to be where I have my data folder
IMAGE_FOLDER_g = os.path.join(DATA_FOLDER_g, IMAGE_FOLDERNAME_g)
CSV_FILEPATH_g = os.path.join(DATA_FOLDER_g, CSV_FILENAME_g)

In [None]:
#IMPORT RAW DATA TO DATAFRAME

def getDatasetDF(csv_filepath = CSV_FILEPATH_g, image_folder = IMAGE_FOLDER_g):
    """
    Load the raw dataset spreadsheet into a dataframe.

    csv_filepath: path of the csv file to read. An empty path returns None without reading anything.
    image_folder: accepted for interface compatibility; it is not used by this function.

    Returns the dataframe produced by read_csv with its index named "df_id". That index is the
    id used to match datapoints across the imported dataframe, the hugging face train/test sets
    and the torch dataloaders. The image-path and caption columns are derived from it later.
    """
    nothing_to_load = len(csv_filepath) == 0
    if nothing_to_load:
        return None

    raw_df = read_csv(csv_filepath)
    raw_df.index.name = "df_id" #shared id across dataframe, hugging face datasets and dataloaders
    return raw_df
    

data_df_g = getDatasetDF()
data_df_g.head(20)

In [None]:
#Optionally manually edit dataframe to explore effect of individual changes

changeDataManually_g = False

def changeRows(data_df = data_df_g, colToChange = CAPTION_COL_g, should_change=changeDataManually_g):
    """
    Optionally overwrite a few hand-picked captions to explore the effect of individual changes.

    data_df: dataframe whose index holds the item ids listed in changeDict below.
    colToChange: name of the column that receives the replacement text.
    should_change: when falsy, data_df is returned untouched.

    Returns data_df itself when should_change is falsy, otherwise an edited copy
    (the input dataframe is not modified).
    """
    if not should_change:
        return data_df

    # item id -> replacement caption
    changeDict = {
        846: "a black ankleboot with fringe detailing on the side",
        4815: "a yellow beanie on a white background",
        5093: "a cage-style heel with floral print",
        2835: "a black leather heeled boot with a pointed toe"
        }

    # Previously colToChange was overwritten with the literal "Captions" here, which ignored the
    # caller's argument and wrote the edits into a brand new "Captions" column.
    edited_df = data_df.copy()
    for itemId, content in changeDict.items():
        if itemId not in edited_df.index:
            # .loc with an unknown label would silently append a mostly-empty row
            print(f"changeRows: id {itemId} not found in dataframe index, skipping")
            continue
        edited_df.loc[itemId, colToChange] = content
    return edited_df


data_df_g = changeRows()

In [None]:
#FILTER DATAFRAME TO JUST 3 COLUMNS: df_id, image_paths, captions

def imagePathColCreator(data_df, image_folder, image_path_col):
    """
    Build the list of image file paths for every row of data_df.

    Each value of data_df[image_path_col] is joined onto image_folder. The dataframe's
    columns are printed first as a quick visual check.
    (The original ThredupData.csv instead derived names as 'item' + Item_Id.)
    """
    print(data_df.columns)
    image_paths = []
    for file_name in data_df[image_path_col]: #file names are expected to already include their extension
        image_paths.append(os.path.join(image_folder, file_name))
    return image_paths

def captionColCreator(data_df, caption_col):
    """
    Return the caption column of data_df as-is.

    (With the original ThredupData.csv the captions were instead built as
    Category_Type + ", " + Description.)
    """
    return data_df[caption_col]

def createImageAndCaptionDf(data_df = data_df_g, image_folder = IMAGE_FOLDER_g, image_path_col= IMAGE_PATH_COL_g, caption_col = CAPTION_COL_g ):
    """
    Derive the "image_paths" and "captions" columns and drop rows whose image file is missing.

    data_df: raw dataframe containing image_path_col and caption_col.
    image_folder: folder the image file names are relative to.
    image_path_col / caption_col: names of the source columns in data_df.

    Returns (valid_df, invalid_data_df):
      valid_df        - rows whose image path exists on disk, reduced to ["captions", "image_paths"]
      invalid_data_df - rows whose image path does not exist, with all columns kept for inspection
    Both keep the original index labels (df_id). The input dataframe is left unmodified.
    """
    # Work on a copy so the new columns are not added to the caller's dataframe
    working_df = data_df.copy()

    #Create image paths
    working_df["image_paths"] = imagePathColCreator(working_df, image_folder, image_path_col)

    #Create captions
    working_df["captions"] = captionColCreator(working_df, caption_col)

    #Discard rows without valid image paths
    print(f"Original samples in dataframe: {len(working_df)}")
    # A boolean mask aligned on the index replaces the old mix of enumerate() positions and
    # index labels, which was only correct for a default 0..n-1 index and scanned a list per row.
    has_image = working_df["image_paths"].map(os.path.exists).astype(bool)

    invalid_data_df = working_df.loc[~has_image]
    print(f"Samples with invalid image paths in dataframe: {len(invalid_data_df)}")

    #Keep valid rows and discard all other columns
    valid_df = working_df.loc[has_image, ["captions", "image_paths"]]
    print(f"Samples with valid image paths in dataframe: {len(valid_df)}")

    return valid_df, invalid_data_df

data_df_g, invalid_paths_df_g = createImageAndCaptionDf()

data_df_g.head()
    

# Separate Train/Valid/Test Dataframes

In [None]:
#Training Toggles

VALIDATION_PORTION_g = 0.2
TEST_PORTION_g = .1
SEED_g = 22

In [None]:
# Initialize variables for train/validation/test dataframes, huggingface datasets, and torch data loaders
train_df_g = None
train_hfset_g = None
train_dataloader_g = None

validation_df_g = None
validation_hfset_g = None
validation_dataloader_g = None

test_df_g = None
test_hfset_g = None
test_dataloader_g = None


In [None]:
#Populate train/validation/test dataframes

# Split off training data
holdout_portion_g = VALIDATION_PORTION_g + TEST_PORTION_g #share of all data held out for validation + test
train_df_g, test_and_validation_df_g = train_test_split(data_df_g, test_size=holdout_portion_g, random_state=SEED_g)

#Split remaining into validation and test data
# test_size is a fraction of the frame being split (the holdout), not of the full dataset, so
# TEST_PORTION_g must be rescaled; passing it directly left only ~3% of the data for testing.
validation_df_g, test_df_g = train_test_split(test_and_validation_df_g, test_size=TEST_PORTION_g / holdout_portion_g, random_state=SEED_g)

print(f"""
Train Size: {len(train_df_g)},
Validation Size: {len(validation_df_g)},
Test Size: {len(test_df_g)} 
""")
train_df_g.head()

# Create Hugging Face Datasets


In [None]:
#Create train, valid, test huggingface datasets

def createHuggingfaceDataset(data_df):
    """
    Convert a dataframe with "image_paths" and "captions" columns into a Hugging Face dataset.

    The resulting dataset has three columns:
      "image" - built from the image paths and cast to the Image feature so the linked
                files can be accessed as images
      "text"  - the captions
      "df_id" - the dataframe index labels, kept to trace items back to the dataframe
    """
    columns = {
        "image": list(data_df["image_paths"]),
        "text": list(data_df["captions"]),
        "df_id": data_df.index.tolist(),
    }

    dataset_hf = hfDataset.from_dict(columns).cast_column("image", Image())

    #Drop local references and collect garbage to free up memory (may not be necessary)
    del columns, data_df
    gc.collect()

    return dataset_hf

train_hfset_g = createHuggingfaceDataset(train_df_g)
print("training hugging face dataset created")

validation_hfset_g = createHuggingfaceDataset(validation_df_g)
print("validation hugging face dataset created")

test_hfset_g = createHuggingfaceDataset(test_df_g)
print("test hugging face dataset created")

# Make Functions to Go Between df and hf

In [None]:
#Create function for selecting train/test/valid dataset

def select_df_hf(segment,
                   train_df = train_df_g,
                   train_hfset = train_hfset_g,
                   validation_df = validation_df_g,
                   validation_hfset = validation_hfset_g,
                   test_df = test_df_g,
                   test_hfset = test_hfset_g):
    """
    Return the (dataframe, hugging face dataset) pair for one data segment.

    segment: "train", "validation" or "test".
    The remaining parameters default to the notebook-level objects as they existed when this
    function was defined.

    Raises ValueError for any other segment name. Previously None was returned silently,
    which surfaced later as a confusing unpacking error in the caller.
    """
    segments = {
        "train": (train_df, train_hfset),
        "validation": (validation_df, validation_hfset),
        "test": (test_df, test_hfset),
    }
    if segment not in segments:
        raise ValueError(f"Unknown segment {segment!r}; expected one of {list(segments)}")
    return segments[segment]

#train_df, train_hfset = select_df_hf("train")

In [None]:
# Create table with df_id, hf_segment_id for conversion

def create_uid_df(train_hfset, validation_hfset, test_hfset):
    """
    Build a lookup table between dataframe ids and positions inside each segment dataset.

    Each argument must support ["df_id"] column access returning the df ids in row order
    (as the Hugging Face datasets created in this notebook do).

    Returns a DataFrame indexed by df_id with columns:
      "df_id"          - id of the item in the original dataframe
      "segment"        - "train", "validation" or "test"
      "hf_segment_id"  - row position of the item inside that segment's dataset
    df ids are assumed to be unique across the three segments.
    """
    hf_segments = {
        "train": train_hfset,
        "validation": validation_hfset,
        "test": test_hfset
    }

    # Read only the df_id column instead of iterating full rows, and build the table in one
    # go instead of growing the DataFrame cell by cell with .loc (quadratic).
    records = [
        (item_id, hf_segment_name, row_id)
        for hf_segment_name, hf_segment in hf_segments.items()
        for row_id, item_id in enumerate(hf_segment["df_id"])
    ]

    uid_df = DataFrame(records, columns = ["df_id", "segment", "hf_segment_id"])
    uid_df.index = uid_df["df_id"].tolist() #index by df_id, as before (index left unnamed)

    return uid_df

uid_df_g = create_uid_df(train_hfset_g, validation_hfset_g, test_hfset_g)
uid_df_g.head()
    

In [None]:
# Function to convert from hf_id to uid

def get_uid_from_hf_segment_id(segment, hf_segment_id, uid_df = uid_df_g):
    """
    Look up the dataframe id (uid) of the item at position hf_segment_id within a segment.

    Asserts that exactly one row of uid_df matches the (segment, hf_segment_id) pair.
    """
    in_segment = uid_df["segment"] == segment
    at_position = uid_df["hf_segment_id"] == hf_segment_id
    matches = uid_df.loc[in_segment & at_position, "df_id"]
    assert len(matches) == 1
    return matches.values[0]

#Example 
segment_g = "train"
hf_id_g = 0
print(f"The uid of hf {segment_g}, ex {hf_id_g}:")
print(get_uid_from_hf_segment_id(segment_g, hf_id_g))

# View Labeled Data Examples

In [None]:
#Select image to view huggingface and original dataframe data

segment_to_view_g = "train"
segment_ind_g = 6 #index of image to look at in train/validation/test segment

def view_labeled_data_example(segment = segment_to_view_g, hf_id = segment_ind_g):
    """
    Show one labeled example: its image and caption from the hugging face dataset,
    followed by the matching row of the original dataframe.

    segment: "train", "validation" or "test"
    hf_id: position of the example inside that segment's hugging face dataset
    """
    dfset, hfset = select_df_hf(segment)
    df_id = get_uid_from_hf_segment_id(segment, hf_id)
    example = hfset[hf_id]

    #hf image
    thumbnail = example["image"].resize((252,252))
    print("HUGGING FACE IMAGE:")
    display(thumbnail)

    #hf caption
    caption = example["text"]
    print(f'HUGGING FACE CAPTION:\n{caption}\n')

    #original dataframe info
    original_row = dfset.loc[df_id]
    print(f'ORIGINAL DF INFO:\n{original_row}')

view_labeled_data_example()

# Import Datasets to Dataloaders

In [None]:
#Create Pytorch Dataset Class for input to Dataloaders

class ImageCaptioningDataset(Dataset): #this class inherits functionalities and structure of the pytorch Dataset class
    """
    Torch dataset wrapping a hugging face dataset whose items have "image" and "text" entries.

    Each item returned by indexing is a dict holding:
      - whatever the processor returns for the image, with the leading batch dimension removed
      - "text": the untouched caption string (tokenized later, per batch, in the collate function)
      - "hf_idx": the index of the item in the wrapped hugging face dataset
    """

    def __init__(self, huggingFaceDataset, processor):
        """Store the wrapped dataset and the processor used for image pre-processing."""
        self.dataset = huggingFaceDataset
        self.processor = processor

    def __len__(self):
        """Number of items in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed image tensors plus caption text and index for item idx."""
        item = self.dataset[idx] #each item in self.dataset has an image and text

        #process image (returns a dict-like with the processed image)
        encoding = self.processor(images = item["image"],
                                  return_tensors = "pt" #return as pytorch tensor dtype
                                 )

        # Remove only the leading batch dimension. A bare squeeze() would also drop any other
        # dimension of size 1 (e.g. a single-channel image), changing the tensor's rank.
        # assumes every processor output is a tensor with a leading batch dim of 1 - TODO confirm
        encoding = {key: value.squeeze(0) for key, value in encoding.items()}

        #add text info
        encoding["text"] = item["text"]

        # Include the original index (from hugging face) in the returned data
        encoding['hf_idx'] = idx

        return encoding

    def get_original_image(self, idx):
        """Return the unprocessed image stored in the wrapped dataset at idx."""
        return self.dataset[idx]["image"]



In [None]:
#Create Processor which does image and caption pre-processing
# (used in creation of pytorch dataset and collate functions)

def getProcessor(pretraining = 'Salesforce/blip2-opt-2.7b'):
    """
    Load the BLIP-2 processor used for image and caption pre-processing.

    pretraining: name or path of the pretrained processor to load; defaults to the
                 checkpoint this notebook has always used, so getProcessor() is unchanged.
    """
    return Blip2Processor.from_pretrained(pretraining)

processor_g = getProcessor()

In [None]:
def collate_fn(batch, processor = processor_g):
    """
    Merge a list of dataset items (dicts sharing the same keys) into one batch dict.

    Output keys, driven by the keys of the first item:
      "pixel_values"        - per-item image tensors stacked along a new first dimension
      "tokenized_captions"  - token ids of the captions, padded/truncated to 50 tokens
      "attention_mask"      - attention mask returned by the tokenizer for those captions
      "hf_idx"              - plain list of the items' hugging face indices
    Any other key is reported with a printed message and left out of the batch.
    """
    collated = {}

    #all examples share the same keys, so the first example's keys are enough
    for field in batch[0].keys():
        if field == "pixel_values":
            images = [example[field] for example in batch]
            collated[field] = torch.stack(images)
            continue

        if field == "text":
            #tokenize: break each caption into chunks and map every chunk to its id
            captions = [example["text"] for example in batch]
            encoded = processor.tokenizer(
                captions,
                padding = "max_length",
                max_length = 50,
                return_tensors = "pt",
                truncation = True,
            )
            collated["tokenized_captions"] = encoded["input_ids"] #input_ids is the conventional Hugging Face name for tokenized text
            collated["attention_mask"] = encoded["attention_mask"]
            continue

        if field == "hf_idx":
            collated[field] = [example[field] for example in batch]
            continue

        print(f"{field} KEY NOT ACOUNTED FOR !!")

    return collated

In [None]:
#CREATE PYTORCH DATA LOADERS

BATCH_SIZE_g = 16 #(32+ Too Big on my Vast.AI instance) #56 #64 #128 #512 #1024
# processor_g was already created in the processor cell above; it is reused here instead of
# being loaded a second time.

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader_g = DataLoader(dataset = ImageCaptioningDataset(train_hfset_g, processor_g), 
                              shuffle = True, 
                              batch_size = BATCH_SIZE_g, 
                              collate_fn = collate_fn,
                             )

validation_dataloader_g = DataLoader(dataset = ImageCaptioningDataset(validation_hfset_g, processor_g), 
                               batch_size = BATCH_SIZE_g, 
                               collate_fn = collate_fn,
                              )

# Bug fix: the test loader previously wrapped validation_hfset_g, so the test segment was
# never actually served by any dataloader.
test_dataloader_g = DataLoader(dataset = ImageCaptioningDataset(test_hfset_g, processor_g), 
                               batch_size = BATCH_SIZE_g, 
                               collate_fn = collate_fn,
                              )

# del train_hfset

In [None]:
def select_dataset(segment,
                   train_dataloader = train_dataloader_g,
                   validation_dataloader = validation_dataloader_g,
                   test_dataloader = test_dataloader_g):
    """
    Return (dataframe, hugging face dataset, dataloader) for the requested segment.

    segment: "train", "validation" or "test"
    The dataloader parameters default to the notebook-level loaders as they existed when
    this function was defined.
    """
    selected_df, selected_hfset = select_df_hf(segment)

    loaders = {
        "train": train_dataloader,
        "validation": validation_dataloader,
        "test": test_dataloader,
    }
    if segment in loaders:
        return selected_df, selected_hfset, loaders[segment]

#train_df, train_hfset, train_dataloader = select_dataset("train")

# Get Model

In [None]:
# LOAD MODEL
model_g = Blip2ForConditionalGeneration.from_pretrained("ybelkada/blip2-opt-2.7b-fp16-sharded", 
                                                      torch_dtype = torch.float32
                                                     )

In [None]:
# LOAD LoRA CONFIG

from peft import LoraConfig, get_peft_model   
    
config_g = LoraConfig(
    r = 10, #32, #10, #18, #16, # Rank size determined number of trainable parameters.
    lora_alpha = 32, # scaling factor sets magnitude LoRA updates have on the original weights
    lora_dropout = 0.05, #percent of cells not enabled at any time
    bias = "none",
    target_modules = ["q_proj", "k_proj"] #focus model's adaptation on the attention mechanism
)

model_g = get_peft_model(model_g, config_g) #peft model is the hugging face name for their library that uses LORA "parameter efficient fine-tuning"
    
model_g.print_trainable_parameters()

In [None]:
# OPTIMIZER
lr_g = 1e-4 #2e-4 #5e-4 #8e-4
optimizer_g = torch.optim.Adam(model_g.parameters(), lr = lr_g)

In [None]:
# ACCELERATOR
# optimizes training and evaluation loops for the available hardware in a device agnostic way,
# eliminating need to use .to(device) or wrap model with torch.nn.DataParallel for multi-GPU training

# Hand the model, optimizer and the train/validation dataloaders to Accelerate and rebind the
# notebook-level names to the prepared objects.
# NOTE(review): test_dataloader_g is not passed to prepare(), so it stays as originally created.
# NOTE(review): select_dataset() bound train_dataloader_g / validation_dataloader_g as default
# arguments when it was defined (before this cell), so it keeps returning the un-prepared
# loaders; unpackBatch() moves pixel values to device_g explicitly.
accelerator_g = Accelerator() #hugging face optimization handles device placement and optimizes training speed
model_g, optimizer_g, train_dataloader_g, validation_dataloader_g  = accelerator_g.prepare(model_g, optimizer_g, train_dataloader_g, validation_dataloader_g)

# Device chosen by the accelerator; used by later cells when moving tensors manually
device_g = accelerator_g.device
print(f'Device: {device_g}')


# Set up to Compare Before and After Examples

In [None]:
#SUBFUNCTIONS TO COMPARE LABELS, UNTUNED AND TUNED CAPTIONS

def getSubBatch(dataloader, nSamples):
    """
    Take the first batch from dataloader and keep at most nSamples items of every entry.

    dataloader: iterable of dict batches that also exposes a sized .dataset attribute.
    nSamples: maximum number of samples wanted; capped by the dataset size and, through
              slicing, by the number of items in the first batch.

    Returns (subset_batch, n_samples) where n_samples is the number of samples actually kept.
    Raises StopIteration if the dataloader yields no batches.
    """
    nSamples = min(nSamples, len(dataloader.dataset))
    batch = next(iter(dataloader)) # Fetch first full batch
    subset_batch = {key: value[:nSamples] for key, value in batch.items()}

    # Bug fix: len(subset_batch) counted the dict's keys, not the samples. Measure one of
    # the sliced entries instead (all entries of a batch have the same number of items).
    n_samples = len(next(iter(subset_batch.values()))) if subset_batch else 0
    return subset_batch, n_samples

def unpackBatch(batch):
    """
    Split a collated batch into (hf_indices, tokenized_captions, pixel_values).

    The pixel values are cast to float32 and moved to device_g; the other two entries
    are returned as stored in the batch.
    """
    caption_ids = batch["tokenized_captions"]
    images = batch["pixel_values"].to(torch.float32).to(device_g)
    return batch['hf_idx'], caption_ids, images

def getModelPredictions(model, processor, pixel_values):
    """
    Generate captions for a batch of images and return them as decoded text.

    The model is switched to eval mode first and generation is limited to 100 new tokens.
    The generated token ids are then decoded by the processor with special tokens skipped.
    """
    model.eval()
    generated_ids = model.generate(pixel_values, max_new_tokens = 100) #captions as token ids
    return processor.batch_decode(generated_ids, skip_special_tokens = True) #captions as text

def decipherLabels(processor, tokenized_captions):
    """Decode tokenized label captions back into text, skipping special tokens."""
    return processor.batch_decode(tokenized_captions, skip_special_tokens = True)

def getImage(hf_dataset, hf_ind):
    """Fetch the image stored at hf_ind in hf_dataset, resized to 252x252."""
    example = hf_dataset[hf_ind]
    return example["image"].resize((252, 252))

def showImage(image):
    """Display the given image in the notebook output, resized to 252x252."""
    thumbnail = image.resize((252,252))
    display(thumbnail)

def showCaptionComparison(df_id, hf_ind, image, prediction_untuned, prediction_tuned, label):
    """
    Print one example's ids, show its image, then print its untuned, tuned and desired
    captions followed by a separator line.
    """
    #Where the example lives
    print(f"DF Index:{df_id}, Segment index: {hf_ind}")

    #What the example looks like
    showImage(image)

    #What each source says about it
    print(f"UNTUNED CAPTION: {prediction_untuned}\n")
    print(f"TUNED CAPTION: {prediction_tuned}\n")
    print(f"DESIRED CAPTION: {label}\n")

    print('==========\n\n')

def showCaptionComparisons(segment, hfset, hf_indices, predictions_untuned, predictions_tuned, labels):
    """
    Show a caption comparison for every example in the demo batch.

    One comparison is produced per entry of predictions_untuned; the other sequences
    are read at the same position.
    """
    for demo_ind, untuned_caption in enumerate(predictions_untuned):
        hf_ind = hf_indices[demo_ind]
        showCaptionComparison(
            get_uid_from_hf_segment_id(segment, hf_ind),
            hf_ind,
            getImage(hfset, hf_ind),
            untuned_caption,
            predictions_tuned[demo_ind],
            labels[demo_ind],
        )

In [None]:
def compareCaptionsBefore(segment, maxSamples, model=model_g, processor=processor_g ):
    """
    Caption up to maxSamples examples from the first batch of a segment with the current
    model and display them next to the desired captions.

    Because no tuned model exists yet, the same predictions are shown as both the
    untuned and the tuned caption.

    Returns (segment, hf_indices, pixel_value_inputs, predictions_untuned, labels) so the
    same examples can be captioned again after training.
    """
    #Dataset segment as hfset and dataloader (the dataframe is not needed here)
    _, hfset, dataloader = select_dataset(segment)

    #Images and labels from the segment dataloader
    subset_batch, _ = getSubBatch(dataloader, maxSamples)
    hf_indices, tokenized_captions, pixel_value_inputs = unpackBatch(subset_batch)

    #Predicted captions from the model as it currently is
    predictions_untuned = getModelPredictions(model, processor, pixel_value_inputs)

    #Desired captions decoded from the labels
    labels = decipherLabels(processor, tokenized_captions)

    #Show comparison (untuned predictions stand in for the tuned ones)
    showCaptionComparisons(segment, hfset, hf_indices, predictions_untuned, predictions_untuned, labels)

    return segment, hf_indices, pixel_value_inputs, predictions_untuned, labels

def compareCaptionsAfter(comparison_segment,
                         comparison_hf_indices,
                         comparison_pixel_value_inputs,
                         comparison_predictions_untuned,
                         comparison_labels,
                         model=model_g,
                         processor=processor_g):
    """
    Re-caption the examples captured by compareCaptionsBefore with the model and display
    the earlier (untuned) captions, the new captions and the desired captions side by side.
    """
    #Only the hfset is needed, for displaying images
    hfset = select_dataset(comparison_segment)[1]

    #New predictions for the same inputs
    predictions_tuned = getModelPredictions(model, processor, comparison_pixel_value_inputs)

    #Show comparison
    showCaptionComparisons(
        comparison_segment,
        hfset,
        comparison_hf_indices,
        comparison_predictions_untuned,
        predictions_tuned,
        comparison_labels,
    )

In [None]:
#Look at predictions before training
segment_g = "validation"
n_samples_g = 10

comparison_segment_g, comparison_hf_indices_g, comparison_pixel_value_inputs_g, comparison_predictions_untuned_g, comparison_labels_g  = compareCaptionsBefore(segment_g, n_samples_g)

# Train Model

In [None]:
#Define default values

n_epochs_g = 1

In [None]:
def train_model(model, n_epochs = n_epochs_g, train_dataloader = train_dataloader_g, optimizer = optimizer_g, accelerator = accelerator_g): 
    """
    Fine-tune model on the batches of train_dataloader for n_epochs epochs.

    model: called as model(input_ids=..., pixel_values=..., labels=...[, attention_mask=...])
           and expected to return an object with a .loss attribute.
    n_epochs: number of passes over train_dataloader.
    train_dataloader: yields dict batches with "tokenized_captions" and "pixel_values", and
                      optionally "attention_mask" (1 for real tokens, 0 for padding).
    optimizer: optimizer stepped once per batch.
    accelerator: object whose backward(loss) performs the backward pass.

    Prints the epoch number and the average training loss of each epoch. Returns None.
    """
    for epoch in range(n_epochs):
        print(f"{epoch= }")
        
        epoch_accumulated_loss = 0
        nBatches = 0        
            
        model.train()
        for batch in tqdm(train_dataloader): #batch is a dictionary

            optimizer.zero_grad() #single reset per step (the second call after step() was redundant)

            #Get X and y
            tokenized_captions = batch.pop("tokenized_captions") #!!!*** pops instead of indexing to save memory (on old hardware)
            pixel_values = batch.pop("pixel_values").to(torch.float32)
            attention_mask = batch.pop("attention_mask", None)

            # Captions are padded to a fixed length. Without masking, the loss is also computed
            # on the padding positions. Label -100 is the index ignored by the cross-entropy loss.
            labels = tokenized_captions
            model_inputs = {}
            if attention_mask is not None:
                labels = tokenized_captions.masked_fill(attention_mask == 0, -100)
                model_inputs["attention_mask"] = attention_mask #let the model ignore padding tokens

            #Get predictions
            outputs = model(input_ids = tokenized_captions, #input_ids is conventional HF name for tokenized text
                            pixel_values = pixel_values,
                            labels = labels,
                            **model_inputs)
            
            #Calculate loss and train
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()   

            epoch_accumulated_loss += loss.item()
            nBatches+=1
    
        if nBatches > 0:
            print(f"Average Loss: {epoch_accumulated_loss/nBatches}")
        else:
            # an empty dataloader would otherwise raise ZeroDivisionError
            print("Average Loss: n/a (train_dataloader yielded no batches)")
        
        print("!!!***!!! UNCOMMENT VALIDATION CHECK ONCE IMPLEMENTED")        
        # model.eval()
        
        # rouge_score_aggregator = scoring.BootstrapAggregator()
        # bleu_score_aggregator = scoring.BootstrapAggregator()
    
        # for batch in tqdm(validation_dataloader):
    
        #     tokenized_captions = batch.pop("tokenized_captions")#.to(device)
        #     pixel_values = batch.pop("pixel_values").to(torch.float32)
        
        #     predictions = model.generate(pixel_values, max_length = 10)#9 #16 #Here the predictions are text captions written in number encoded words (LUT)
        #     predictions = processor.batch_decode(predictions, skip_special_tokens = True) #now we're in english
        #     labels = processor.batch_decode(tokenized_captions, skip_special_tokens = True) #these are the pre-set captions (decoded from numbers)
    
        #     eval_metric = compute_metrics(predictions, labels) 
        

In [None]:
n_epochs_g = n_epochs_g #Set this to a new number if don't want old n_epochs_g
train_model(model_g, n_epochs_g)

# Look at Before and After

In [None]:
compareCaptionsAfter(comparison_segment_g,
                         comparison_hf_indices_g,
                         comparison_pixel_value_inputs_g,
                         comparison_predictions_untuned_g,
                         comparison_labels_g)

## KARINA TO DO:

* commit that refactored for clarity and addressed bug when displaying before and after demo data
* make a similar items benchmark csv of deduped data so can use this data as a test once have model trained
* don't need a multi-GPU instance unless change code to be able to take advantage
* implement Regi's nbdev approach to organize jupyter notebooks then delete Regis scratchpad from this notebook


## REGIS SCRATCHPAD

Command line to get list of deduplicated files in a folder: `for file in *; do md5sum "$file"; done | sort | awk '!seen[$1]++ {print $2}'`

https://nbdev.fast.ai # from jeremy howard see youtube

### dataloader

### code

In [None]:
# # nbdev: EXPORT
# def load_dataset(path):
#     # return a dataset
#     pass
#     return x

### tests

In [None]:
# # create a folder
# #create 3 images inside
# ds = load_dataset('./test123')
# assert len(ds) == 3
# assert ds[0]['filename'] == 'first.jpg'
# assert ds.isin(..) != ...