# audiocraft-mps debugging notebook

### Local execution environment
- macOS(M1 Pro): Ventura 13.5.1
- Python: 3.11.4

### Setup

To install audiocraft, please check the [official documentation](https://github.com/facebookresearch/audiocraft).

Note: Updating macOS to >=13.0 is required to run LSTM models on MPS.

Run these commands before running this notebook.
```sh-session
$ conda env config vars set PYTORCH_ENABLE_MPS_FALLBACK=1
$ conda activate <AUDIOCRAFT_VIRTUAL_ENV>
```

In [1]:
!pip freeze

aiofiles==23.2.1
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
antlr4-python3-runtime==4.9.3
anyio==3.7.1
appnope==0.1.3
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
-e git+https://github.com/EbaraKoji/audiocraft.git@4ab8bc8e3429800319c10508e1ce52be91dd7e22#egg=audiocraft
audioread==3.0.0
av==10.0.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
bleach==6.0.0
blis==0.7.10
catalogue==2.0.9
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
cloudpickle==2.2.1
colorlog==6.7.0
comm==0.1.4
confection==0.1.1
contourpy==1.1.0
cycler==0.11.0
cymem==2.0.7
Cython==3.0.0
debugpy==1.6.7.post1
decorator==5.1.1
defusedxml==0.7.1
demucs==4.0.0
diffq==0.2.4
docopt==0.6.2
dora-search==0.1.12
einops==0.6.1
encodec==0.1.1
executing==1.2.0
fastapi==0.101.1
fastjsonschema==2.18.0
ffmpy==0.3.1
filelock==3.12.2
flashy==0.0.2
fonttools==4.42.1
fqdn==1.5.1
frozenlist==1.4.0
fsspec==2023.6.0
gradio==3

In [2]:
from audiocraft.models import AudioGen
from audiocraft.utils.notebook import display_audio

# Load the same pretrained checkpoint once per backend so that the cpu and mps
# results can be compared side by side.
pretrained_name = 'facebook/audiogen-medium'
cpu_model = AudioGen.get_pretrained(pretrained_name, device='cpu')
mps_model = AudioGen.get_pretrained(pretrained_name, device='mps')

objc[25745]: Class AVFFrameReceiver is implemented in both /Users/ebarakoji/miniforge3/envs/audiogen/lib/libavdevice.58.8.100.dylib (0x1178b4798) and /Users/ebarakoji/miniforge3/envs/audiogen/lib/python3.11/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x127be4778). One of the two will be used. Which one is undefined.
objc[25745]: Class AVFAudioReceiver is implemented in both /Users/ebarakoji/miniforge3/envs/audiogen/lib/libavdevice.58.8.100.dylib (0x1178b47e8) and /Users/ebarakoji/miniforge3/envs/audiogen/lib/python3.11/site-packages/av/.dylibs/libavdevice.59.7.100.dylib (0x127be47c8). One of the two will be used. Which one is undefined.
  tensor.erfinv_()


In [3]:
# Sampling is turned off (use_sampling=False) so that the tokens generated on cpu and
# on mps can be checked for equality; the downside is that the sound quality deteriorates.
gen_params_dict = dict(use_sampling=False, top_k=250, duration=5)

cpu_model.set_generation_params(**gen_params_dict)
mps_model.set_generation_params(**gen_params_dict)

# Keyword arguments passed to every generate() call in this notebook.
gen_args = dict(
    descriptions=['A crow is cawing'],
    progress=True,
    debug_gen_tokens=True,
)

# generate() was changed to return the tuple (output, gen_tokens) so that the
# decoder can be inspected separately from token generation.
cpu_output, cpu_gen_tokens = cpu_model.generate(**gen_args)
mps_output, mps_gen_tokens = mps_model.generate(**gen_args)

   253 /    250

In [4]:
# Are the generated tokens identical on both backends?
cpu_gen_tokens.eq(mps_gen_tokens.cpu()).all()

tensor(True)

In [5]:
# Are the final generate() outputs identical on both backends?
cpu_output.eq(mps_output.cpu()).all()

tensor(False)

In [6]:
# Run decode_latent on the mps-generated tokens with each backend's compression model
# and compare the results element-wise.
cpu_latent = cpu_model.compression_model.decode_latent(mps_gen_tokens.cpu())
mps_latent = mps_model.compression_model.decode_latent(mps_gen_tokens)
cpu_latent.eq(mps_latent.cpu()).all()

tensor(True)

In [7]:
# Keep the decode_latent results for both backends under their own names and
# confirm once more that they match element-wise.
cpu_codec = cpu_model.compression_model
mps_codec = mps_model.compression_model
cpu_emb = cpu_codec.decode_latent(mps_gen_tokens.cpu())
mps_emb = mps_codec.decode_latent(mps_gen_tokens)
cpu_emb.eq(mps_emb.cpu()).all()

tensor(True)

In [8]:
# Pass each backend's embeddings through that backend's decoder and compare the
# decoder outputs element-wise.
cpu_decoder = cpu_model.compression_model.decoder
mps_decoder = mps_model.compression_model.decoder
cpu_out = cpu_decoder(cpu_emb)
mps_out = mps_decoder(mps_emb)
cpu_out.eq(mps_out.cpu()).all()

tensor(False)

In [9]:
# Display the module structure (repr) of the mps model's decoder for inspection.
mps_model.compression_model.decoder

SEANetDecoder(
  (model): Sequential(
    (0): StreamableConv1d(
      (conv): NormConv1d(
        (conv): Conv1d(128, 1024, kernel_size=(7,), stride=(1,))
        (norm): Identity()
      )
    )
    (1): StreamableLSTM(
      (lstm): LSTM(1024, 1024, num_layers=2)
    )
    (2): ELU(alpha=1.0)
    (3): StreamableConvTranspose1d(
      (convtr): NormConvTranspose1d(
        (convtr): ConvTranspose1d(1024, 512, kernel_size=(16,), stride=(8,))
        (norm): Identity()
      )
    )
    (4): SEANetResnetBlock(
      (block): Sequential(
        (0): ELU(alpha=1.0)
        (1): StreamableConv1d(
          (conv): NormConv1d(
            (conv): Conv1d(512, 256, kernel_size=(3,), stride=(1,))
            (norm): Identity()
          )
        )
        (2): ELU(alpha=1.0)
        (3): StreamableConv1d(
          (conv): NormConv1d(
            (conv): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
            (norm): Identity()
          )
        )
      )
      (shortcut): Identity()


In [10]:
# Both backends are given the same tensor (cpu_out, moved to mps for the mps model),
# so this compares the postprocess step alone.
cpu_postprocess = cpu_model.compression_model.postprocess(cpu_out)
mps_postprocess = mps_model.compression_model.postprocess(cpu_out.to('mps'))
cpu_postprocess.eq(mps_postprocess.cpu()).all()

tensor(True)

In [11]:
# Check that the decoder parameter values are exactly the same on both backends.
cpu_named_params = cpu_model.compression_model.decoder.named_parameters()
mps_named_params = mps_model.compression_model.decoder.named_parameters()

# strict=True (Python >= 3.10) raises ValueError if one decoder yields more parameters
# than the other, instead of silently leaving the extra ones unchecked.
for (param_name, cpu_param), (mps_param_name, mps_param) in zip(cpu_named_params, mps_named_params, strict=True):
    # Name the offending parameter so a failure points at a specific layer.
    assert (cpu_param == mps_param.to('cpu')).all(), \
        f'decoder parameter {param_name!r} differs between cpu and mps'

In [12]:
# Check whether the tokens generated on mps are of good quality once use_sampling
# is set to True. The mps model is loaded again before changing its parameters.
mps_model = AudioGen.get_pretrained('facebook/audiogen-medium', device='mps')

sampling_params = dict(use_sampling=True, top_k=250, duration=5)
mps_model.set_generation_params(**sampling_params)

sampled_mps_output, sampled_mps_gen_tokens = mps_model.generate(**gen_args)

   253 /    250

In [13]:
# Play the output that mps_model.generate() returned for the sampled tokens:
# this audio is badly decoded.
display_audio(sampled_mps_output, sample_rate=16000)

In [14]:
# The same sampled mps tokens decoded by the cpu model are well-composed, so
# generating tokens on mps and decoding on cpu does work.
cpu_decoded_audio = cpu_model.compression_model.decode(sampled_mps_gen_tokens.to('cpu'))
display_audio(cpu_decoded_audio, sample_rate=16000)