In [1]:
import torch
import torch.nn.functional as F
from torch import nn

import whisper
from whisper.audio import (
    log_mel_spectrogram,
    pad_or_trim,
    load_audio,
)

import jiwer
from tqdm import tqdm
from main import *

# Target device for the model and inputs: the GPU when PyTorch can see one,
# otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained "base.en" Whisper checkpoint; this is the model whose
# encoder is adapted in the cells below.
model = whisper.load_model("base.en")

In [3]:
# Collect trainable params.
#
# Everything is frozen first; gradients are then re-enabled only for
#   * the affine parameters (weight / bias) of every LayerNorm inside the encoder
#   * all parameters of the encoder's `conv1` / `conv2` modules
params = []   # Parameter objects that will be handed to the optimizer
names = []    # matching dotted parameter names, kept only for inspection

# Freeze the whole model in one call.
model.requires_grad_(False)

# Parameter names of a LayerNorm that are allowed to train.
trainable = ['weight', 'bias']

# NOTE: loop variables deliberately avoid the name `np`, which would rebind the
# conventional NumPy alias in the notebook's global namespace.
for module_name, module in model.named_modules():
    path = module_name.split('.')
    if path[0] != 'encoder':
        continue

    # train_LN
    if isinstance(module, nn.LayerNorm):
        for param_name, param in module.named_parameters():
            if param_name in trainable:
                param.requires_grad = True
                params.append(param)
                names.append(f"{module_name}.{param_name}")

    # train_feature
    if len(path) > 1 and path[1] in ('conv1', 'conv2'):
        for param_name, param in module.named_parameters():
            param.requires_grad = True
            params.append(param)
            names.append(f"{module_name}.{param_name}")

print(names)
# check trainable parameter
# for name, param in model.named_parameters():
#     print("name: ", name)
#     print("requires_grad: ", param.requires_grad)

['encoder.conv1.weight', 'encoder.conv1.bias', 'encoder.conv2.weight', 'encoder.conv2.bias', 'encoder.blocks.0.attn_ln.weight', 'encoder.blocks.0.attn_ln.bias', 'encoder.blocks.0.mlp_ln.weight', 'encoder.blocks.0.mlp_ln.bias', 'encoder.blocks.1.attn_ln.weight', 'encoder.blocks.1.attn_ln.bias', 'encoder.blocks.1.mlp_ln.weight', 'encoder.blocks.1.mlp_ln.bias', 'encoder.blocks.2.attn_ln.weight', 'encoder.blocks.2.attn_ln.bias', 'encoder.blocks.2.mlp_ln.weight', 'encoder.blocks.2.mlp_ln.bias', 'encoder.blocks.3.attn_ln.weight', 'encoder.blocks.3.attn_ln.bias', 'encoder.blocks.3.mlp_ln.weight', 'encoder.blocks.3.mlp_ln.bias', 'encoder.blocks.4.attn_ln.weight', 'encoder.blocks.4.attn_ln.bias', 'encoder.blocks.4.mlp_ln.weight', 'encoder.blocks.4.mlp_ln.bias', 'encoder.blocks.5.attn_ln.weight', 'encoder.blocks.5.attn_ln.bias', 'encoder.blocks.5.mlp_ln.weight', 'encoder.blocks.5.mlp_ln.bias', 'encoder.ln_post.weight', 'encoder.ln_post.bias']


In [4]:
# Load the test utterance and turn it into a batched log-mel spectrogram.
options = whisper.DecodingOptions(language="en", without_timestamps=True)

# Waveform padded / trimmed by whisper's helper.
audio = pad_or_trim(load_audio(file='./p232_022.wav'))

# Add a leading singleton (batch) axis in front of the spectrogram's own axes.
mel = log_mel_spectrogram(audio).unsqueeze(0)

## Calculate loss and adapt

In [5]:
# Move the model and the input to the target device *before* building the
# optimizer, as recommended by the torch.optim documentation.
model = model.to(DEVICE)
mel = mel.to(DEVICE)

# Only the parameters collected in `params` are optimised.
optimizer, scheduler = setup_optimizer(params, 'AdamW', lr=3e-4, scheduler=None)

# Decode with autograd enabled: the logits in outputs[1] feed the adaptation
# losses computed in the following cells.
outputs = model.decode(mel, options)


[INFO]    optimizer: <class 'torch.optim.adamw.AdamW'>
[INFO]    scheduler: None


In [6]:
# Stack the list of logit tensors in outputs[1] along a new leading axis, then
# swap the first two axes (presumably giving batch, step, vocab — confirm).
# Recorded run: torch.Size([1, 22, 51864]).
result_tensor = torch.stack(outputs[1], dim=0).permute(1, 0, 2)
result_tensor.shape

torch.Size([1, 22, 51864])

In [7]:
# Entropy term of the adaptation objective. `softmax_entropy` comes from
# `main` (not shown in this notebook); its result is averaged over dim 0 and
# then over all remaining elements, yielding a scalar that carries a grad_fn.
e_loss = softmax_entropy(result_tensor).mean(0).mean()
e_loss


tensor(0.2503, device='cuda:0', grad_fn=<MeanBackward0>)

In [8]:
# GPU memory snapshot taken before the MCC loss is computed (baseline for the
# snapshot that follows it).
!nvidia-smi

Sun Jan 21 20:46:20 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:3D:00.0 Off |                    0 |
| N/A   28C    P0    56W / 300W |   3621MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
# Class-confusion term of the adaptation objective (`mcc_loss` comes from `main`).
# The class count is read from the logits' last dimension instead of being
# hard-coded (51864 in the recorded run).
# NOTE(review): in the recorded run GPU memory use rose from 3621 MiB to
# 24145 MiB across this cell — this call is the memory hot spot.
c_loss = mcc_loss(result_tensor, class_num=result_tensor.shape[-1])
c_loss

tensor(0.9996, device='cuda:0', grad_fn=<DivBackward0>)

In [10]:
# GPU memory snapshot taken right after the MCC loss was computed; compare
# with the snapshot taken before it.
!nvidia-smi

Sun Jan 21 20:46:23 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:3D:00.0 Off |                    0 |
| N/A   28C    P0    56W / 300W |  24145MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# Hand cached-but-unused blocks of PyTorch's CUDA allocator back to the driver,
# then re-check usage. Memory still referenced by live tensors (e.g. whatever
# the autograd graph behind e_loss / c_loss keeps alive) is not released.
torch.cuda.empty_cache()
!nvidia-smi

Sun Jan 21 20:46:30 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:3D:00.0 Off |                    0 |
| N/A   28C    P0    66W / 300W |  13883MiB / 32510MiB |     22%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# One manual adaptation step: weighted sum of the entropy term (0.9) and the
# class-confusion term (0.1), followed by a single optimizer update.
loss = 0 + (e_loss*0.9 + c_loss*0.1)
loss.backward()
optimizer.step()

# The scheduler is optional; advance it only when one was created.
if scheduler is not None:
    scheduler.step()

# Clear gradients so they do not accumulate into a later step.
model.zero_grad()


# with torch.no_grad():
#     outputs = model.decode(mel, options)
#     print(outputs)

OutOfMemoryError: CUDA out of memory. Tried to allocate 10.02 GiB. GPU 0 has a total capacty of 31.75 GiB of which 8.17 GiB is free. Process 27609 has 23.58 GiB memory in use. Of the allocated memory 23.12 GiB is allocated by PyTorch, and 83.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
def forward_and_adapt(x, model, optimizer, em_coef=0.9, reweight=False, temp=1., not_blank=True, scheduler=None,
                        div_coef=0, repeat_inference=True, skip_short_thd=None, decode_options=None):
    """Run one test-time-adaptation step on ``x`` and return the decode output.

    The model decodes ``x`` with autograd enabled, a weighted sum of up to
    three losses is computed on the stacked logits, and one optimizer step is
    taken on the parameters registered with ``optimizer``.

    Args:
        x: input passed straight to ``model.decode``.
        model: model exposing ``decode(x, options)``; element ``[1]`` of its
            return value must be a sequence of logit tensors.
        optimizer: optimizer holding the parameters to adapt.
        em_coef: weight of the entropy term; ``1 - em_coef`` weights the
            class-confusion (``mcc_loss``) term. Either term is skipped when
            its weight is not positive.
        reweight: forwarded to ``mcc_loss``.
        temp: temperature the logits are divided by for the entropy and
            class-confusion terms (the diversity term uses the raw logits).
        not_blank: forwarded to ``div_loss``.
        scheduler: optional LR scheduler, stepped once after the optimizer.
        div_coef: weight of the diversity (``div_loss``) term; skipped when
            not positive.
        repeat_inference: when true, decode ``x`` again under ``no_grad``
            after the update and return that result instead.
        skip_short_thd: accepted for interface compatibility; not used here.
        decode_options: decoding options for ``model.decode``. ``None`` (the
            default) falls back to the notebook-level ``options`` object.

    Returns:
        The value returned by ``model.decode``: from the post-update pass when
        ``repeat_inference`` is true, otherwise from the pre-update pass.

    NOTE(review): the previous docstring stated "the index of <pad> in vocab
    is 0". Nothing in this function uses that index; presumably it concerns
    ``div_loss`` / ``not_blank`` — confirm it still holds for this model.
    """
    if decode_options is None:
        # Backward-compatible default: the notebook-global decoding options.
        decode_options = options

    # forward
    outputs = model.decode(x, decode_options)
    # Stack the logit tensors in outputs[1] and swap the first two axes.
    logits = torch.stack(outputs[1], dim=0).permute(1, 0, 2)
    scaled_logits = logits / temp

    # adapt
    loss = 0

    if em_coef > 0:
        e_loss = softmax_entropy(scaled_logits).mean(0).mean()
        loss += e_loss * em_coef

    if 1 - em_coef > 0:
        # Pass the real class count, as the standalone c_loss cell does
        # (class_num=51864); otherwise mcc_loss falls back to its own default.
        # NOTE(review): the recorded OOM asked for 10.02 GiB, which equals
        # 51864**2 float32 values — this suggests mcc_loss materialises a
        # vocab x vocab matrix; confirm in main.mcc_loss.
        c_loss = mcc_loss(scaled_logits, reweight, class_num=logits.shape[-1])
        loss += c_loss * (1 - em_coef)

    if div_coef > 0:
        d_loss = div_loss(logits, not_blank)
        loss += d_loss * div_coef

    # Start from clean gradients: an earlier interrupted backward pass may have
    # left stale values on the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if scheduler is not None:
        scheduler.step()
    model.zero_grad()

    # inference again
    if repeat_inference:
        with torch.no_grad():
            outputs = model.decode(x, decode_options)
    return outputs

In [8]:
# Run a single adaptation step on the loaded utterance using the function's
# default coefficients; the returned decode output is kept in `test1`.
test1 = forward_and_adapt(mel, model, optimizer)

OutOfMemoryError: CUDA out of memory. Tried to allocate 10.02 GiB. GPU 0 has a total capacty of 31.75 GiB of which 5.33 GiB is free. Process 13288 has 26.41 GiB memory in use. Of the allocated memory 25.90 GiB is allocated by PyTorch, and 141.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
# Plain inference: no autograd graph is recorded while decoding.
with torch.no_grad():
    outputs = model.decode(mel, options)

# outputs[0] holds the decoding results; show the transcript of the first one.
print(outputs[0][0].text)

The actual primary rainbow observed is said to be the effect of superimposition of a number of bows.


In [None]:
# for np, p in model.encoder.conv1.named_parameters():
#     if np in trainable:
#         print(p.grad)