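This script processes an input image and a text prompt: it predicts a segmentation mask for the region to edit and uses a Stable Diffusion inpainting model to regenerate that region.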

# Import Required Packages

In [4]:
import argparse

import torch
import clip
import numpy as np


from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

In [8]:
import os
import os.path as osp

## Import Created Modules 
These modules are in the textfit GitHub repository. In order for this code to fully function, we need to either include their classes and functions in this notebook or keep them as modules in our repo.

In [None]:
#from models import create_model
#from utils.options import dict_to_nonedict, parse

# Load Images

This code loads an image, downsamples it by a factor of 2 using nearest-neighbour resizing (to reduce computational cost; WE COULD DO MORE?), converts the image into a NumPy array, transposes it from (height, width, channels) to (channels, height, width), and returns it as float32 values.
This prepares the images to be loaded into the model.

In [7]:
# function to load image
def load_image(image_path, downsample_factor=2):
    """Load an image as a downsampled, channel-first float32 array.

    Args:
        image_path: Path of the image file to open.
        downsample_factor: Positive integer by which the width and height
            are integer-divided before resizing. Defaults to 2.

    Returns:
        A float32 NumPy array of shape (3, height, width). Pixel values are
        cast to float32 without any rescaling.

    Raises:
        ValueError: If ``downsample_factor`` is smaller than 1.
    """
    if downsample_factor < 1:
        raise ValueError(
            f"downsample_factor must be a positive integer, got {downsample_factor!r}"
        )

    with open(image_path, "rb") as f:
        # decode while the file handle is still open
        image = Image.open(f)
        # Force three channels: greyscale/palette images decode to 2-D arrays
        # (which would break the transpose below) and RGBA would yield four
        # channels instead of three.
        image = image.convert("RGB")
        width, height = image.size
        # downsample, never letting a dimension collapse to zero
        width = max(1, width // downsample_factor)
        height = max(1, height // downsample_factor)
        # nearest-neighbour resize to the reduced size
        image = image.resize(size=(width, height), resample=Image.NEAREST)
        # (height, width, channels) -> (channels, height, width)
        pixels = np.array(image).transpose(2, 0, 1)
    # return array
    return pixels.astype(np.float32)

# Main Pipeline Function
This function starts by parsing all the arguments. It takes in the image path, text prompt, output path, and model path. It also loads the ERLM model using the settings in the yaml file. We will likely convert the yaml file to a code cell and call it in this function.

Then it calls the load_image function, and converts the output to a tensor. 

It also takes in the text prompt and tokenizes it.

The function uses the CLIP model to encode the text, and both the image and text embeddings are passed through the encoder.
The decoder uses `argmax` to create a segmentation mask and converts the image to grayscale.

The Stable Diffusion model is then loaded and transferred to the GPU.
The model generates an image using the input image, the segmentation mask from the ERLM model, and the text prompt. The diffusion model is run for 50 steps (more steps = higher-quality image but slower; fewer steps = faster but lower-quality image with more noise).

Then the generated image is blended with the input image and saved.

# Contents of other files 

In [None]:
#parse from utils.options, needed to read the options

def parse(options_dict, is_train=True):
    """Build the processed options from a plain dictionary (no YAML file).

    A shallow copy of ``options_dict`` is extended with an ``is_train`` flag
    and a ``path`` sub-dictionary rooted at the current working directory.
    The GPU ids are joined into a comma-separated string which is either
    exported as ``CUDA_VISIBLE_DEVICES`` or just printed.

    Args:
        options_dict (dict): Model options; must contain ``gpu_ids``.
        is_train (bool): Selects the training or the test path layout.
            Default is True.

    Returns:
        dict: The processed copy of the options.
    """
    # work on a shallow copy so the caller's top-level keys stay untouched
    opt = options_dict.copy()

    # comma-separated GPU ids, e.g. "0,1"
    visible_gpus = ','.join(map(str, opt['gpu_ids']))
    if not opt.get('set_CUDA_VISIBLE_DEVICES', None):
        print('gpu_list: ', visible_gpus, flush=True)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = visible_gpus
        print('export CUDA_VISIBLE_DEVICES=' + visible_gpus, flush=True)

    opt['is_train'] = is_train

    # every output location defaults to a folder below the working directory
    root = os.getcwd()
    paths = {'root': root}
    opt['path'] = paths

    def _resolve(option_key, folder_name):
        # explicit option wins, otherwise fall back to <root>/<folder_name>
        return opt.get(option_key, os.path.join(root, folder_name))

    if not is_train:
        # test mode layout
        paths['results'] = _resolve('results_path', 'results')
        paths['log'] = _resolve('log_path', 'test_logs')
        paths['visualization'] = _resolve('vis_path', 'test_visualizations')
        return opt

    # training mode layout
    paths['models'] = _resolve('models_path', 'models')
    paths['logs'] = _resolve('logs_path', 'logs')
    paths['visualization'] = _resolve('visualization_path', 'visualization')

    # debug mode: validate, print and checkpoint on every step
    if opt.get('debug', False):
        for freq_key in ('val_freq', 'print_freq', 'save_checkpoint_freq'):
            opt[freq_key] = 1

    return opt

In [10]:
# Contents of the YAML config file converted to a dictionary.
# Each key appears exactly once: in a dict literal a repeated key silently
# overrides the earlier one, which hides which value is actually in effect.
options_dict = {
    # general settings
    'name': 'region_gen',
    'use_tb_logger': True,
    'debug_path': False,
    'set_CUDA_VISIBLE_DEVICES': True,
    'gpu_ids': [0],

    # dataset configs
    'batch_size': 8,
    'num_workers': 4,
    'mask_dir': '/path/to/DFMM-Spotlight/mask',
    'train_img_dir': '/path/to/DFMM-Spotlight/train_images',
    'test_img_dir': '/path/to/DFMM-Spotlight/test_images',
    'train_ann_file': '/path/to/DFMM-Spotlight/mask_ann/train_ann_file.jsonl',
    'test_ann_file': '/path/to/DFMM-Spotlight/mask_ann/test_ann_file.jsonl',
    'downsample_factor': 2,

    # model configs
    'model_type': 'ERLM',
    'text_embedding_dim': 512,
    'encoder_in_channels': 3,
    'fc_in_channels': 64,
    'fc_in_index': 4,
    'fc_channels': 64,
    'fc_num_convs': 1,
    'fc_concat_input': False,
    'fc_dropout_ratio': 0.1,
    'fc_num_classes': 2,
    'fc_align_corners': False,

    # training configs
    'val_freq': 5,
    'print_freq': 100,
    'weight_decay': 0,
    'manual_seed': 2023,
    'num_epochs': 100,
    # In YAML, `!!float 1e-4` is a type tag that forces the scalar to be
    # parsed as a float; in Python that is simply the float literal.
    'lr': 1e-4,
    'lr_decay': "step",
    'gamma': 0.1,
    # NOTE(review): an earlier duplicate of this key held [50]; the value in
    # effect was the scalar 50, which is kept here. Confirm which form the
    # learning-rate schedule code expects.
    'step': 50,
}

In [None]:
def main():
    #parse arguments
    parser = argparse.ArgumentParser()
    