This is the pipeline for running the model.
We have to input an image, a text prompt, an output image path, and the correct model paths.

# Import Required Packages

In [2]:
import argparse

import torch
import clip
import numpy as np


from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

In [4]:
import os
import os.path as osp

import glob
import importlib
import logging
import sys
#import mmcv

In [64]:
!python --version

Python 3.11.6


# Load Images

This code loads an image, downsamples it by a factor of 2 using nearest-neighbour resizing (to reduce computational cost — WE COULD DO MORE?), converts the image into a channels-first NumPy array, and returns the array as float32 values.  
This prepares the images to be loaded into the model.

In [5]:
# function to load image
def load_image(image_path, downsample_factor=2):
    """Load an image as a float32, channels-first NumPy array.

    The file is decoded, converted to RGB, shrunk by ``downsample_factor``
    with nearest-neighbour resampling, and transposed from (H, W, C) to
    (C, H, W). Pixel values are cast to float32 but not rescaled.

    Args:
        image_path (str): Path of the image file to read.
        downsample_factor (int): Integer divisor applied to both width and
            height. Defaults to 2, the value that used to be hard-coded.

    Returns:
        numpy.ndarray: float32 array with 3 channels on the first axis.

    Raises:
        ValueError: If ``downsample_factor`` is smaller than 1.
    """
    if downsample_factor < 1:
        raise ValueError(
            f'downsample_factor must be >= 1, got {downsample_factor}')

    with open(image_path, "rb") as f:
        # Force three channels: palette or grayscale files decode to a 2-D
        # array (the transpose below would raise) and RGBA files would yield
        # four channels.
        image = Image.open(f).convert("RGB")
        width, height = image.size
        # Keep at least one pixel per side so resize() never gets a 0 dimension.
        target_size = (
            max(1, width // downsample_factor),
            max(1, height // downsample_factor),
        )
        image = image.resize(size=target_size, resample=Image.NEAREST)
        # np.array(image) is (H, W, C); move the channel axis to the front.
        pixels = np.array(image).transpose(2, 0, 1)
    return pixels.astype(np.float32)

# Contents of other files 
parse function from utils.options  
options from yaml file
create_model from models

## parse function to parse the options

In [6]:
#parse from utils.options, needed to read the options

def parse(options_dict, is_train=True):
    """Parse options from a dictionary instead of a YAML file.

    The input is deep-copied, so neither top-level keys nor nested values
    (for example the 'gpu_ids' or 'step' lists) are shared with the caller.

    Args:
        options_dict (dict): Dictionary containing model options. Must
            contain 'gpu_ids' (an iterable of GPU indices).
        is_train (bool): Indicates whether in training mode. Default is True.

    Returns:
        dict: Processed options. Adds 'is_train' and a 'path' sub-dictionary
            (any existing 'path' entry is replaced). In training mode with a
            truthy 'debug' option, 'val_freq', 'print_freq' and
            'save_checkpoint_freq' are set to 1.

    Raises:
        KeyError: If 'gpu_ids' is missing from ``options_dict``.
    """
    # Local import: `copy` is not among the notebook's top-level imports.
    import copy

    # Deep copy so that later edits to nested values of the returned options
    # cannot leak back into the caller's dictionary.
    opt = copy.deepcopy(options_dict)

    # Comma-separated GPU ids, e.g. [0, 1] -> "0,1".
    gpu_list = ','.join(str(x) for x in opt['gpu_ids'])
    if opt.get('set_CUDA_VISIBLE_DEVICES', None):
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_list
        print('export CUDA_VISIBLE_DEVICES=' + gpu_list, flush=True)
    else:
        print('gpu_list: ', gpu_list, flush=True)

    opt['is_train'] = is_train

    # Output locations default to sub-folders of the current working
    # directory unless the options provide an explicit path.
    root = os.getcwd()
    paths = {'root': root}

    if is_train:
        paths['models'] = opt.get('models_path', os.path.join(root, 'models'))
        paths['logs'] = opt.get('logs_path', os.path.join(root, 'logs'))
        paths['visualization'] = opt.get(
            'visualization_path', os.path.join(root, 'visualization'))

        # Debug mode: validate, print and checkpoint on every iteration.
        if opt.get('debug', False):
            opt['val_freq'] = 1
            opt['print_freq'] = 1
            opt['save_checkpoint_freq'] = 1
    else:  # test
        paths['results'] = opt.get('results_path', os.path.join(root, 'results'))
        paths['log'] = opt.get('log_path', os.path.join(root, 'test_logs'))
        paths['visualization'] = opt.get(
            'vis_path', os.path.join(root, 'test_visualizations'))

    opt['path'] = paths
    return opt

## Options for model setup including paths to models and images
The model paths and image paths should both be changed to where we have them saved

In [21]:
# Settings formerly kept in the YAML file, expressed as keyword arguments.
options_dict = dict(
    name='region_gen',
    use_tb_logger=True,
    debug_path=False,
    set_CUDA_VISIBLE_DEVICES=True,
    gpu_ids=[0],

    # --- dataset ---
    batch_size=8,
    num_workers=4,
    # adjust these to wherever the dataset is stored locally
    mask_dir='../DFMM-Spotlight/mask',
    train_img_dir='../DFMM-Spotlight/train_images',
    test_img_dir='../DFMM-Spotlight/test_images',
    train_ann_file='../DFMM-Spotlight/mask_ann/train_ann_file.jsonl',
    test_ann_file='../DFMM-Spotlight/mask_ann/test_ann_file.jsonl',
    downsample_factor=2,

    # --- model ---
    model_type='ERLM',
    text_embedding_dim=512,
    encoder_in_channels=3,
    fc_in_channels=64,
    fc_in_index=4,
    fc_channels=64,
    fc_num_convs=1,
    fc_concat_input=False,
    fc_dropout_ratio=0.1,
    fc_num_classes=2,
    fc_align_corners=False,

    # --- training ---
    val_freq=5,
    print_freq=100,
    weight_decay=0,
    manual_seed=2023,
    num_epochs=100,
    lr=1e-4,
    lr_decay="step",
    gamma=0.1,
    step=[50],

    # --- text prompt (any description of the desired edit) ---
    text_prompt="a blue dress",

    # --- paths (point these at the local model folders and images) ---
    elrm_model_path='../ELRM/',
    styleswap_model_path='../textfit-model',
    output_path='example_output.png',
    input_image_path='../DFMM-Spotlight/test_images/MEN-Denim-id_00000089-03_7_additional.png',
)
    

In [1]:
def create_model(opt):
    """Instantiate the model class registered under ``opt['model_type']``.

    Args:
        opt (dict): Model configuration; must contain the 'model_type' key.
            The whole dictionary is forwarded to the model constructor.

    Returns:
        object: Instance of the class looked up in ``model_registry``.

    Raises:
        ValueError: If ``model_registry`` has no entry for 'model_type'.
    """
    # NOTE(review): `model_registry` is not defined in the visible notebook
    # cells; it has to exist in the kernel namespace before this is called.
    # A later cell defines `create_model` again and replaces this version
    # when the notebook is run top to bottom.
    requested_type = opt['model_type']
    registered_cls = model_registry.get(requested_type)

    if registered_cls is None:
        raise ValueError(f"Model '{requested_type}' not found in registry.")

    instance = registered_cls(opt)

    # Announce the creation on the root logger (its level is raised to INFO).
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.info(f"Model [{instance.__class__.__name__}] is created.")

    return instance

## Modified model function that loads the ELRM model

In [None]:
def create_model(opt):
    """Create the model named by ``opt['model_type']``.

    File names matching ``*_model.py`` are collected from
    ``opt['elrm_model_path']``. Each one is imported as ``models.<file_name>``
    and the first imported module that exposes an attribute called
    ``opt['model_type']`` supplies the model class.

    Args:
        opt (dict): Configuration. It contains:
            elrm_model_path (str): Folder searched for ``*_model.py`` files.
            model_type (str): Model type (name of the class to instantiate).

    Returns:
        object: The instantiated model, ``model_cls(opt)``.

    Raises:
        ValueError: If no imported module defines ``model_type``, including
            the case where the folder contains no ``*_model.py`` file.
    """
    model_folder = opt['elrm_model_path']
    model_type = opt['model_type']

    # Module names = file names without directory or extension.
    model_filenames = [
        osp.splitext(osp.basename(v))[0]
        for v in glob.glob(osp.join(model_folder, '*_model.py'))
    ]

    # NOTE(review): the folder is only used to discover file names; the
    # import itself always resolves through a package called `models` on
    # sys.path. Confirm that `elrm_model_path` points at that package.
    _model_modules = [
        importlib.import_module(f'models.{file_name}')
        for file_name in model_filenames
    ]

    # Start from None so an empty module list reaches the ValueError below
    # instead of failing on an unbound local variable.
    model_cls = None
    for module in _model_modules:
        model_cls = getattr(module, model_type, None)
        if model_cls is not None:
            break
    if model_cls is None:
        raise ValueError(f'Model {model_type} is not found.')

    model = model_cls(opt)

    # Optional logging, for demonstration (raises the root logger to INFO).
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.info(f"Model [{model.__class__.__name__}] is created.")

    return model

# Main Pipeline Function
This function starts by parsing all the arguments. It takes in the image path, text prompt, output path, and model path. It also loads the ELRM model using the specifics in the yaml file. We will likely convert the yaml file to a code cell and call it in this function. 

Then it calls the load_image function, and converts the output to a tensor. 

It also takes in the text prompt and tokenizes it.

The function uses the CLIP model to encode the text, and both the image and the text embedding are passed through the encoder.
The decoder output is reduced with `argmax` to create a segmentation mask, which is then converted to a grayscale image.

Then the Stable Diffusion model is loaded and transferred to the GPU.
The model generates an image using the input image, the segmentation mask from the ELRM model, and the text prompt. The diffusion model is run for 50 steps (more steps = higher-quality image but slower; fewer steps = faster but lower-quality image with more noise).

Then the generated image is blended with the input image and saved.

In [9]:
def _lookup_option(opt, *keys):
    """Return the value of the first name in ``keys`` that is set in ``opt``."""
    for key in keys:
        if key in opt:
            return opt[key]
    raise KeyError(f'None of the option keys {keys} is set.')


def main():
    """Edit one image according to a text prompt and save the result.

    Steps:
        1. Parse ``options_dict`` and build the region model with
           ``create_model``; load its weights and switch encoder and decoder
           to eval mode.
        2. Load the input image with ``load_image`` and tokenize the prompt
           with CLIP.
        3. Run CLIP text encoding, the encoder and the decoder without
           gradients; an argmax over dim 1 of the logits gives the mask.
        4. Run the Stable Diffusion inpainting pipeline on the resized input
           image with that mask and the prompt (50 inference steps).
        5. Composite the generated image over the original through the mask
           and save it to ``opt['output_path']``.

    Raises:
        KeyError: If the image path, text prompt, inpainting model path or
            output path is missing from the options.
    """
    #parse arguments from dictionary
    # NOTE(review): this is an inference pipeline but parses with
    # is_train=True; kept as-is, confirm the model does not need False here.
    opt = parse(options_dict, is_train=True)

    # The names this function originally read come first; the names actually
    # present in `options_dict` are accepted as fallbacks.
    image_path = _lookup_option(opt, 'img_path', 'input_image_path')
    text_prompt = _lookup_option(opt, 'text_prompt', 'text prompt')
    inpaint_model_path = _lookup_option(
        opt, 'texfit_model_path', 'styleswap_model_path')
    seed = opt.get('manual_seed', 2023)

    #load model configuration
    model = create_model(opt)
    model.load_network()
    model.encoder.eval()
    model.decoder.eval()

    #load image and add a leading batch dimension
    img_tensor = torch.from_numpy(load_image(image_path))
    img_tensor = img_tensor.unsqueeze(dim=0).to(model.device)

    #tokenize the prompt with clip (a batch holding one prompt)
    text_input = clip.tokenize([text_prompt]).to(model.device)

    #pass image and text through encoder and decoder
    with torch.no_grad():
        text_embedding = model.clip.encode_text(text_input)
        encoded = model.encoder(img_tensor, text_embedding)
        seg_logits = model.decoder(encoded)

    #argmax over dim 1, then take the first (only) batch entry
    seg_pred = seg_logits.argmax(dim=1).cpu().numpy()[0]
    #scale the mask values by 255 and store it as an 8-bit grayscale image
    seg_img = Image.fromarray(np.uint8(seg_pred * 255))

    init_image = Image.open(image_path).convert("RGB").resize((256, 512))
    # The pipeline and Image.composite need a mask matching the image size;
    # nearest-neighbour keeps the mask strictly two-valued.
    if seg_img.size != init_image.size:
        seg_img = seg_img.resize(init_image.size, resample=Image.NEAREST)

    #load stable diffusion inpainting model (we can change this if we want)
    # NOTE(review): newer diffusers releases select half-precision weights
    # with variant="fp16" instead of revision="fp16"; confirm against the
    # installed version.
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        inpaint_model_path, revision="fp16",
        torch_dtype=torch.float16,
        safety_checker=None,
        requires_safety_checker=False
    ).to("cuda")  #move model to gpu

    #generate image (seeded generator for reproducible sampling)
    generator = torch.Generator("cuda").manual_seed(seed)
    images = pipe(
        height=512,
        width=256,
        prompt=[text_prompt],
        image=init_image,
        mask_image=seg_img,
        num_inference_steps=50,
        generator=generator
    ).images

    #blend: generated pixels where the mask is white, original elsewhere
    final_img = Image.composite(images[0], init_image, seg_img)
    #save image
    final_img.save(opt['output_path'])
    print('Saved edited result to', opt['output_path'])




In [10]:
# Run the whole pipeline directly in the notebook; main() takes no arguments
# and reads its settings from the module-level `options_dict`.
main()

export CUDA_VISIBLE_DEVICES=0


TypeError: 'str' object is not callable