# [📄 Multimodal Speech Recognition for Language-Guided Embodied Agents](https://arxiv.org/abs/2302.14030)
[Allen Chang](https://www.cylumn.com/), 
[Xiaoyuan Zhu](https://www.linkedin.com/in/xiaoyuan-zhu-38005a224/), 
[Aarav Monga](https://www.linkedin.com/in/aarav-monga-517457246/), 
[Seoho Ahn](https://www.linkedin.com/in/sean-ahn-437423220/),
[Tejas Srinivasan](https://tejas1995.github.io/), 
[Jesse Thomason](https://jessethomason.com/)

## Colab Demo

### Install dependencies and imports

In [None]:
from os.path import exists

if not exists('embodied-multimodal-asr'):
  !git clone -q --depth 1 https://github.com/Cylumn/embodied-multimodal-asr

%cd embodied-multimodal-asr

!pip install git+https://github.com/openai/CLIP.git
!pip install torch==1.12.0
!pip install torchaudio==0.12.0

In [None]:
# Fetch the pretrained decoder checkpoints. The script is run from inside
# `models/` (presumably it writes the `*_pretrained.pt` files into the current
# directory - confirm against download_pretrained.sh).
%cd models
!sh download_pretrained.sh
# Return to the repository root so the relative `media/...` and `models/...`
# paths used by later cells resolve.
%cd ../

In [None]:
import numpy as np
import torch
import torchaudio
from sklearn.preprocessing import LabelEncoder
from PIL import Image
import IPython

from lib.models import UnimodalDecoder, MultimodalDecoder, ASRPipeline

### Load models

In [None]:
# Prefer the GPU, but fall back to the CPU so the demo still runs on a runtime
# without CUDA instead of failing there.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Word-level tokenizer: a LabelEncoder whose classes are restored from the
# vocabulary array shipped with the demo instead of being fit on data.
tokenizer = LabelEncoder()
tokenizer.classes_ = np.load('media/demo/tokenizer.npy')
# Vocabulary size; passed to the decoders below as their output dimension.
n_tokens = tokenizer.classes_.shape[0]

In [None]:
# Build the two ASR pipelines. Both share the tokenizer, the device and the
# same decoder hyperparameters; only the multimodal decoder additionally
# receives a vision feature size (d_vision).

# Pipeline whose decoder is configured with audio features only.
unimodal = ASRPipeline(
    decoder=UnimodalDecoder(
        d_audio=[312, 768],
        d_out=n_tokens,
        depth=4,
        max_target_len=25,
        dropout=0.3,
    ),
    tokenizer=tokenizer,
    device=device,
)
# eval() is called on each pipeline before use (presumably the usual torch
# nn.Module evaluation switch, e.g. disabling dropout - confirm in lib.models).
unimodal.eval()

# Pipeline whose decoder is configured with audio and vision features.
multimodal = ASRPipeline(
    decoder=MultimodalDecoder(
        d_audio=[312, 768],
        d_vision=512,
        d_out=n_tokens,
        depth=4,
        max_target_len=25,
        dropout=0.3,
    ),
    tokenizer=tokenizer,
    device=device,
)
multimodal.eval()

def load_weights():
    """Load the pretrained decoder weights for the current selection.

    Reads the notebook-level `speaker_label` and `noise` values to pick the
    checkpoint files under `models/`, maps them onto `device`, and loads them
    into the decoders of the `unimodal` and `multimodal` pipelines.
    Those globals must be defined before this function is called.
    """
    unimodal_state = torch.load(
        f'models/unimodal_[{speaker_label}_{noise}]_pretrained.pt', map_location=device
    )
    unimodal.decoder.load_state_dict(unimodal_state)

    multimodal_state = torch.load(
        f'models/multimodal_[{speaker_label}_{noise}]_pretrained.pt', map_location=device
    )
    multimodal.decoder.load_state_dict(multimodal_state)

### Inference

Try out different input combinations and see what each model predicts!

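Here is a great combination to try (values are given in the order of the form fields below: `seen_env`, `heard_speaker`, `speaker_label`, `noise`, `waveform`, `image`). Toggle the values indicated by 🔃:\
`"seen", "unheard", "indic", "mask_0.4_nouns", 🔃 {knife/lettuce}.wav, 🔃 {knife/lettuce}.jpeg`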

In [None]:
def get_waveform_path():
    """Return the relative path of the currently selected demo audio clip.

    The path is assembled from the notebook-level selections `seen_env`,
    `heard_speaker`, `speaker_label`, `noise` and `waveform`, which must be
    defined before this function is called.
    """
    waveform_path = f"media/demo/test_{seen_env}_{heard_speaker}/{speaker_label}_{noise}/{waveform}"
    return waveform_path
def get_image_path():
    """Return the relative path of the currently selected demo image.

    The path is assembled from the notebook-level selections `seen_env`,
    `heard_speaker` and `image`, which must be defined before this function
    is called.
    """
    image_path = f"media/demo/test_{seen_env}_{heard_speaker}/{image}"
    return image_path

In [None]:
#@markdown { run: "auto" }
seen_env = "seen"              #@param ["seen", "unseen"]
heard_speaker = "unheard"      #@param ["heard", "unheard"]
speaker_label = "indic"        #@param ["american", "indic"]
noise = "mask_0.4_nouns"       #@param ["clean", "mask_0.4_nouns"]
waveform = "knife.wav"         #@param ["knife.wav", "lettuce.wav"]
image = "knife.jpeg"           #@param ["knife.jpeg", "lettuce.jpeg"]

# Reject the one selection combination the demo does not support.
if heard_speaker == "unheard":
    assert speaker_label != "american", "Unheard tests only apply to Indic English TTS Speakers"

# Ground-truth text for each demo clip. Looked up explicitly so that an unknown
# clip fails loudly instead of leaving `text_instruction` undefined or silently
# reusing the value from a previous run of this cell.
instruction_by_waveform = {
    "knife.wav": "Pick up the knife on the counter.",
    "lettuce.wav": "Pick up the lettuce on the counter.",
}
if waveform not in instruction_by_waveform:
    raise ValueError(f'No ground-truth instruction known for waveform "{waveform}"')
text_instruction = instruction_by_waveform[waveform]

# Load the selected inputs and the checkpoints matching `speaker_label` / `noise`.
audio_path = get_waveform_path()
audio = torchaudio.load(audio_path)[0]  # torchaudio.load returns (waveform, sample_rate)
vision = Image.open(get_image_path())
load_weights()

# Show the image and an audio player for the selected clip.
display(vision)
display(IPython.display.Audio(audio_path))

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    unimodal_transcript = unimodal(audio)
    multimodal_transcript = multimodal(audio, vision)

print(f'Ground-Truth Instruction Text: "{text_instruction}"')
print(f'Unimodal ASR Transcript: "{unimodal_transcript}"')
print(f'Multimodal ASR Transcript: "{multimodal_transcript}"')