In [42]:
# An script written to test the I2S model by
# Combining I2U, U2S and ASR

import datetime
from glob import glob
import json
import os
import sys
import yaml
from tqdm import tqdm
import argparse
import random

import numpy as np
import h5py
from imageio.v2 import imread
# from imageio import imread
from PIL import Image
import resampy
import torch
from torchvision import transforms

### Import functions for synthesizing

`utils_synthesize`: has several functions that help you load model, and generate speech.


`judge_asr`: are used for judge whether the speech contains (name, number, color). You don't have to use this.


`./models`: Contain all the models for Image-to-Speech Synthesizing. The system is based on Transformers. I also used some variations, like `gated decoder` (`TransformerSentenceLM_FixedImg_gated`) layer in `VisualGPT` , `prefix encoder` (`TransformerPrefixLM`) in `prefix-tuning`. Those are not working as well as a basic baseline model, which is a plain Transformer Decoder (`TransformerSentenceLM_FixedImg_Pool`) with pretrained Image Encoder (DINO). Please use this basic model for now.

In [10]:
# from utils_i2u import *
from utils_synthesize import *
from judge_asr import judge_ans

sys.path.append("./models")
# from models import models_modified
from models import TransformerConditionedLM
from models_modified import TransformerSentenceLM_FixedImg_gated # TransformerSentenceLM_FixedImg
from models_modified import TransformerSentenceLM_FixedImg_Pool
# from models_prompt import TransformerPrefixLM, prefix_Transformer

### Define the model paths.

Those are the current best performed models.

In [5]:
# The model paths:

# the i2u model:
model_path = "../../saved_model/I2U/origin_5_captions_256_hubert/hubert_baseline"
word_map_path = "../../saved_model/I2U/WORDMAP_HUBERT.json"

# the Tacotron2 Model trained on hubert_kmeans unit captions:
tts_model_path = "../../gslm_models/u2S/HuBERT_KM100_tts_checkpoint_best.pt"
code_dict_path = "../../gslm_models/u2S/HuBERT_KM100_code_dict"
max_decoder_steps = 500

# Vocoder HifiGAN
hifigan_checkpoint_path = "../../hifigan/LJ_FT_T2_V3/generator_v3"

# A tuned ASR model
asr_checkpoint_path = "../../saved_model/ASR/wav2vec2-base-tuned/checkpoint-3000"

In [16]:
global device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load I2U model

In [32]:
config_path = glob(model_path + "/config*.yml")[0]
model_checkpoint = glob(model_path+"/*BEST*.tar")[0]

with open(word_map_path) as j:
    word_map = json.load(j)
rev_word_map = {v: k for k, v in word_map.items()}  # ix2word
special_words = {"<unk>", "<start>", "<end>", "<pad>"}

# NOTE: load_i2u is from util_synthesize, which only supports
#  model type "TransformerSentenceLM_FixedImg_Pool"
i2u_model = load_i2u(model_checkpoint, config_path, len(word_map))
i2u_model.eval()
i2u_model.to(device)

TransformerSentenceLM_FixedImg_Pool(
  (embed): Embedding(104, 1024)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (LM_decoder): None
  (classifier): Linear(in_features=1024, out_features=104, bias=True)
  (image_encoder): DinoResEncoder_Pool(
    (resnet): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True,

### Load U2S and ASR

In [33]:
tacotron_model, tts_datasets = load_tacotron2_hubert(model_path=tts_model_path, code_dict_path=code_dict_path, max_decoder_steps=max_decoder_steps)
hifigan_model = load_hifigan(hifigan_checkpoint_path, device)
asr_model, asr_processor = load_asr(asr_checkpoint_path, device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Removing weight norm...


### Prepare imgs
For simplicity and efficiency, I always use image data that is resized to (3, 256, 256) and stored to a hdf5 file.

But for your understanding, I'll show you the preprocess of raw Image data.

In [34]:
image_split_path = "../../data/food_image_split.json"
with open(image_split_path, "r") as f:
    image_split = json.load(f)
test_imgs = image_split["test"]

transform = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

def preprocess_img(impath, transform, device):
    # Read, resize and tranform the image to [3, 256, 256] tensor on device
    img = imread(impath)
    resolution = 256
    if len(img.shape) == 2:
        img = img[:, :, np.newaxis]
        img = np.concatenate([img, img, img], axis=2)
    # img = imresize(img, (256, 256))
    # resolution = int(config['data']['image_resolution'])
    img = np.array(Image.fromarray(img).resize((resolution, resolution)))
    img = img.transpose(2, 0, 1)
    assert img.shape == (3, resolution, resolution)
    assert np.max(img) <= 255
    
    img = torch.FloatTensor(img / 255.)
    if transform is not None:
        img = transform(img)
    return img.to(device)

### Generate WAV from an image, and transcribe it.

In [47]:
sampled_img_pth = random.choice(test_imgs)
sampled_img_name = sampled_img_pth.split("/")[-1]
img = preprocess_img(sampled_img_pth, transform, device)
# we add one dimension on pos 0, 
# because we need to treat img as a mini-batch of batch-size 1
img = img.unsqueeze(0)
print(f"Sampled Img Info: {sampled_img_name}")

seqs = i2u_model.decode(image=img, start_unit=word_map["<start>"], end_unit=word_map["<end>"], max_len=150, beam_size=10)
print("Generated pseudo sequence (unit caption):")
print(seqs)

try:
    words = seq2words(seq=seqs, rev_word_map=rev_word_map, special_words=special_words)
    # audio = u2s(
    #     words=words,
    #     tacotron2_model=tacotron_model,
    #     hifigan_model=hifigan_model,
    #     device=device
    #     )
    audio = u2s_hubert(
        words=words,
        tacotron2_model=tacotron_model,
        tts_dataset=tts_datasets,
        hifigan_model=hifigan_model,
        device=device
        )
    print("Generated Speech:")
    import IPython.display as ipd
    display(ipd.Audio(audio, rate=22050))

    trans = s2t(audio=audio, asr_model=asr_model, asr_processor=asr_processor, device=device)
    print(f"Transcription: {trans}")
except:
    trans = None
    print("U2S not successful.")

Sampled Img Info: lemon_bl3_15.jpg
Generated pseudo sequence (unit caption):
[102, 1, 99, 50, 99, 50, 99, 90, 50, 90, 50, 90, 83, 50, 23, 84, 25, 9, 10, 36, 37, 17, 97, 24, 97, 66, 57, 58, 63, 64, 65, 79, 3, 14, 15, 25, 42, 11, 78, 61, 81, 43, 44, 45, 86, 98, 62, 63, 94, 44, 96, 3, 11, 30, 5, 13, 14, 15, 38, 60, 5, 13, 25, 26, 27, 4, 12, 5, 34, 57, 58, 26, 71, 81, 43, 44, 45, 70, 21, 29, 48, 73, 50, 49, 50, 49, 50, 49, 74, 75, 103]
Generated Speech:


Transcription: there are three lemons in a blue backgroere in und
