In [None]:
!pip install torch torchvision transformers pillow sentencepiece

In [None]:
'''
Run this function once, before first use, to download the models and save them locally.
'''
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from PIL import Image

def model_download(*, blip_model_dir='./model/Blip', translate_model_dir='./model/M2M100'):
    """Download the pretrained models and save local copies.

    Loads the BLIP captioning processor/model from
    'Salesforce/blip-image-captioning-base' and the M2M100 translation
    tokenizer/model from 'facebook/m2m100_418M', then writes each pair to its
    own directory with ``save_pretrained``.

    Args:
        blip_model_dir: Directory that receives the BLIP processor and model.
        translate_model_dir: Directory that receives the M2M100 tokenizer and
            model.

    Returns:
        None. The function is used only for its side effect of writing the
        model files.
    """
    blip_checkpoint = 'Salesforce/blip-image-captioning-base'
    translate_checkpoint = 'facebook/m2m100_418M'

    # Load Blip model and processor
    processor = BlipProcessor.from_pretrained(blip_checkpoint)
    model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

    processor.save_pretrained(blip_model_dir)
    model.save_pretrained(blip_model_dir)

    # Load Translate model and tokenizer (used for English to Korean)
    translator_tokenizer = M2M100Tokenizer.from_pretrained(translate_checkpoint)
    translator_model = M2M100ForConditionalGeneration.from_pretrained(translate_checkpoint)

    translator_tokenizer.save_pretrained(translate_model_dir)
    translator_model.save_pretrained(translate_model_dir)


# Fetch both pretrained models and store them on disk so that later cells can
# load them from the local model directories.
model_download()

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}


In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

from PIL import Image

def predict(image_path, *, blip_model_dir='./model/Blip', translate_model_dir='./model/M2M100'):
    """Caption an image in English with BLIP and translate the caption to Korean.

    The models are loaded from the local directories on every call, the
    English caption and its Korean translation are printed, and the
    translation is returned.

    Args:
        image_path: Path of the image file to caption.
        blip_model_dir: Directory holding the saved BLIP processor and model.
        translate_model_dir: Directory holding the saved M2M100 tokenizer and
            model.

    Returns:
        The result of ``batch_decode`` on the translated tokens: a list of
        strings (one entry for the single caption).
    """
    # Load Blip model and processor
    processor = BlipProcessor.from_pretrained(blip_model_dir)
    model = BlipForConditionalGeneration.from_pretrained(blip_model_dir)

    # Load Translate model and tokenizer (English to Korean)
    translator_tokenizer = M2M100Tokenizer.from_pretrained(translate_model_dir)
    translator_model = M2M100ForConditionalGeneration.from_pretrained(translate_model_dir)

    # Set source language and target language
    translator_tokenizer.src_lang = "en"
    translator_tokenizer.tgt_lang = "ko"

    # Load the image; the context manager closes the underlying file, while
    # convert() hands back an independent in-memory RGB copy.
    with Image.open(image_path) as image_file:
        raw_image = image_file.convert('RGB')
    inputs = processor(raw_image, return_tensors='pt')

    # Create English caption (inference only, so no gradients are tracked)
    with torch.no_grad():
        output = model.generate(**inputs, max_length=100, min_length=5)

    # Decode English caption
    english_caption = processor.decode(output[0], skip_special_tokens=True)
    print("생성된 영어 설명:", english_caption)

    # Translate English to Korean. The forced first token is derived from the
    # tokenizer's tgt_lang so the target language is defined in one place.
    translator_inputs = translator_tokenizer(english_caption, return_tensors='pt')
    translated_tokens = translator_model.generate(
        **translator_inputs,
        forced_bos_token_id=translator_tokenizer.get_lang_id(translator_tokenizer.tgt_lang),
    )

    # Decode Korean caption
    korean_caption = translator_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    print("생성된 한국어 설명:", korean_caption)

    return korean_caption

# Example input image to caption.
image_path = '178045.jpg'

# Prints the English caption and its Korean translation; the translation is also returned.
predict(image_path)



생성된 영어 설명: a group of women in traditional dress dancing
생성된 한국어 설명: ['전통적인 드레스 댄스에서 여성 그룹']


['전통적인 드레스 댄스에서 여성 그룹']