# Initialization

In [None]:
!pip install transformers

In [9]:
!pip install datasets




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install librosa




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
!pip install soundfile




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load the pretrained AST and test it

In [78]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset
import torch

# Demo speech split used as test input; sorted by id so row 0 is always the same sample
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
    trust_remote_code=True,
).sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

# Feature extractor and classification model come from the same checkpoint
ast_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(ast_checkpoint)
model = ASTForAudioClassification.from_pretrained(ast_checkpoint)
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

In [11]:
# Turn the first waveform into model inputs (the audio is decoded when the row is accessed)
first_waveform = dataset[0]["audio"]["array"]
inputs = feature_extractor(first_waveform, sampling_rate=sampling_rate, return_tensors="pt")

# Pure inference: no gradients are needed
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest logit, mapped to its label through the model config
predicted_class_ids = logits.argmax(dim=-1).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

'Speech'

In [12]:
# Compute the classification loss against a reference label.
# The target is simply whatever label the loaded checkpoint maps to id 0.
target_label = model.config.id2label[0]
labels = torch.tensor([model.config.label2id[target_label]])

# Only the scalar loss value is displayed, so autograd tracking is skipped.
# The labels are passed as a keyword instead of being written into `inputs`,
# which keeps `inputs` unchanged for any later use.
with torch.no_grad():
    loss = model(**inputs, labels=labels).loss
round(loss.item(), 2)

0.17

# Prompt Tuning

## Testing

In [5]:
from transformers import ASTModel, AutoProcessor
import torch
from datasets import load_dataset

# Same demo split as before, sorted by id so row 0 is always the same sample
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
    trust_remote_code=True,
).sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

# Processor and bare encoder (no classification head) from the same checkpoint.
# Note: this rebinds `model`, which previously held the classification model.
ast_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
processor = AutoProcessor.from_pretrained(ast_checkpoint)

model = ASTModel.from_pretrained(ast_checkpoint)
model



ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTSdpaAttention(
          (attention): ASTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [6]:
# Encode the first waveform (decoded when the row is accessed) and run the bare encoder
first_waveform = dataset[0]["audio"]["array"]
inputs = processor(first_waveform, sampling_rate=sampling_rate, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Shape of the final layer's hidden states, shown as a plain list
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.size())

[1, 1214, 768]

In [51]:
# Display the dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})

In [77]:
# Fetch the first row once instead of three times: each `dataset[0]` access
# builds the row again (the audio is decoded on access).
# NOTE: the printed row includes the absolute local cache path of the audio file;
# clear this output before sharing the notebook.
first_row = dataset[0]
first_waveform = first_row["audio"]["array"]
print(first_row)
print(first_waveform)
print(len(first_waveform))

{'file': 'C:/Users/cerru/.cache/huggingface/datasets/downloads/extracted/f3eb62acb63aa6cb8437f4bdaee2cc3fe72cb5c7d10d1f47223e14d011e73d46/dev_clean/1272/128104\\1272-128104-0000.flac', 'audio': {'path': 'C:/Users/cerru/.cache/huggingface/datasets/downloads/extracted/f3eb62acb63aa6cb8437f4bdaee2cc3fe72cb5c7d10d1f47223e14d011e73d46/dev_clean/1272/128104\\1272-128104-0000.flac', 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ]), 'sampling_rate': 16000}, 'text': 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', 'speaker_id': 1272, 'chapter_id': 128104, 'id': '1272-128104-0000'}
[0.00238037 0.0020752  0.00198364 ... 0.00042725 0.00057983 0.0010376 ]
93680


## Model

In [55]:
from functools import reduce
from operator import mul
import math
import torch
import torch.nn as nn

class AST_PromptTuning(nn.Module):
    """Prompt tuning on top of a frozen, pretrained AST encoder.

    All encoder parameters are frozen. The trainable parameters are the
    classification head and, when prompts are enabled, the prompt embeddings.

    Prompt modes (``prompt_type``):
      * ``None`` (or the string ``"none"``): no prompts, only the head is trained.
      * ``'shallow'``: learnable prompt tokens are inserted once, right after
        the embedding layer.
      * ``'deep'``: as ``'shallow'``, and before every subsequent encoder layer
        the prompt positions are overwritten with a fresh, layer-specific set
        of learnable tokens.

    Args:
        prompt_tokens: number of prompt tokens inserted into the sequence.
        prompt_dropout: dropout probability applied to the prompt tokens.
        prompt_type: ``'deep'``, ``'shallow'`` or ``None``/``"none"``.
        num_classes: size of the classifier output.
        pretrained_name: checkpoint passed to ``ASTModel.from_pretrained``.

    Raises:
        ValueError: if ``prompt_type`` is not one of the supported modes.

    NOTE(review): prompts are inserted directly after token 0. If the AST
    embeddings emit a second special token at index 1, the prompts end up
    between the two special tokens -- confirm this is intended.
    """

    # Supported prompt modes (None means head-only tuning)
    _PROMPT_TYPES = (None, 'shallow', 'deep')

    def __init__(self, prompt_tokens: int = 5, prompt_dropout: float = 0.0, prompt_type: str = 'deep',
                 num_classes: int = 15,
                 pretrained_name: str = "MIT/ast-finetuned-audioset-10-10-0.4593"):
        super().__init__()

        # accept the string "none" as an alias of None (head-only tuning)
        if isinstance(prompt_type, str) and prompt_type.lower() == 'none':
            prompt_type = None
        # validate before the (expensive) checkpoint load; an unknown string
        # used to be silently treated as 'shallow'
        if prompt_type not in self._PROMPT_TYPES:
            raise ValueError(
                f"prompt_type must be 'shallow', 'deep' or None, got {prompt_type!r}"
            )

        # load the pretrained AST encoder
        self.encoder = ASTModel.from_pretrained(pretrained_name)

        # classification head on top of the encoder's hidden size
        # NOTE(review): two stacked Linear layers without an activation in
        # between are equivalent to a single affine map; kept as-is so the
        # parameter count and existing checkpoints stay compatible.
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 384),
            nn.Linear(384, num_classes)
        )

        # freeze the whole encoder
        for p in self.encoder.parameters():
            p.requires_grad = False

        self.prompt_type = prompt_type  # 'shallow', 'deep' or None

        if prompt_type is not None:

            self.prompt_tokens = prompt_tokens  # number of prompt tokens
            self.prompt_dropout = nn.Dropout(prompt_dropout)
            self.prompt_dim = self.encoder.config.hidden_size

            # bound of the uniform initialisation (xavier-uniform style)
            # NOTE(review): the factor 3 looks like a channel count, while the
            # patch projection printed for this model has a single input
            # channel; kept unchanged to preserve the initialisation.
            val = math.sqrt(6. / float(3 * reduce(mul, (self.encoder.config.patch_size, self.encoder.config.patch_size), 1) + self.prompt_dim))

            # learnable prompts inserted after the embedding layer: (1, prompt_tokens, prompt_dim)
            self.prompt_embeddings = nn.Parameter(torch.zeros(1, self.prompt_tokens, self.prompt_dim))
            nn.init.uniform_(self.prompt_embeddings.data, -val, val)

            if self.prompt_type == 'deep':
                self.total_d_layer = self.encoder.config.num_hidden_layers
                # one prompt set per layer after the first one
                # (the first layer already receives the shallow prompts)
                self.deep_prompt_embeddings = nn.Parameter(
                    torch.zeros(self.total_d_layer - 1, self.prompt_tokens, self.prompt_dim)
                )
                nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)

    def train(self, mode=True):
        """Set the training mode; the frozen encoder always stays in eval mode.

        Follows the ``nn.Module.train`` contract: updates ``self.training`` and
        returns ``self`` so that ``model.train()`` / ``model.eval()`` can be chained.
        """
        self.training = mode
        if mode:
            # training: everything except the frozen encoder is switched to train mode
            self.encoder.eval()
            self.classifier.train()
            if self.prompt_type is not None:
                self.prompt_dropout.train()
        else:
            # eval: propagate to every child module
            for module in self.children():
                module.train(False)
        return self

    def incorporate_prompt(self, x, prompt_embeddings, n_prompt: int = 0):
        """Insert (or overwrite) prompt tokens right after the first token of ``x``.

        Args:
            x: tensor of shape (batch size, n_tokens, hidden_dim).
            prompt_embeddings: prompts of shape (1, n_prompt, hidden_dim) or
                (n_prompt, hidden_dim); they are expanded over the batch.
            n_prompt: number of tokens after the first one to drop from ``x``.
                0 inserts the prompts; passing the prompt count replaces the
                prompts inserted by a previous call.

        Returns:
            The concatenation [first token, prompts, remaining tokens] along dim 1.
        """
        B = x.shape[0]

        x = torch.cat((
            x[:, :1, :],
            self.prompt_dropout(prompt_embeddings.expand(B, -1, -1)),
            x[:, (1 + n_prompt):, :]
        ), dim=1)

        return x

    @staticmethod
    def _run_layer(layer, x):
        """Apply one encoder layer and return its hidden states as a tensor."""
        out = layer(x)
        # layers may return either a tuple (hidden_states, ...) or the bare
        # tensor; indexing a bare tensor with [0] would slice the batch axis
        return out[0] if isinstance(out, (tuple, list)) else out

    def forward_features(self, x):
        """Run the prompted encoder and return the layer-normalised token sequence."""
        # embedding layer of the frozen encoder
        x = self.encoder.embeddings(x)

        # insert the shallow prompts
        x = self.incorporate_prompt(x, self.prompt_embeddings)

        if self.prompt_type == 'deep':
            # deep mode: use this module's own encoder layers (not a notebook
            # global) and refresh the prompt positions before each layer
            layers = self.encoder.encoder.layer
            x = self._run_layer(layers[0], x)
            for i in range(1, self.total_d_layer):
                x = self.incorporate_prompt(x, self.deep_prompt_embeddings[i - 1], self.prompt_tokens)
                x = self._run_layer(layers[i], x)
        else:
            # shallow mode: run the whole encoder stack in one call
            x = self.encoder.encoder(x)["last_hidden_state"]

        x = self.encoder.layernorm(x)
        return x

    def forward(self, x):
        """Return the classifier output computed from the first token of the sequence."""
        if self.prompt_type is not None:
            x = self.forward_features(x)[:, 0, :]
        else:
            # no prompts: plain encoder pass, keep the first token
            x = self.encoder(x)["last_hidden_state"][:, 0, :]

        x = self.classifier(x)
        return x

In [56]:
def count_parameters(module, trainable_only=False):
    """Return the number of scalar parameters of `module`, optionally only those with requires_grad."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad or not trainable_only)

# Head-only baseline: total size first, then the trainable part
model = AST_PromptTuning(prompt_type=None)
print("AST params:", count_parameters(model))
print("Head fine-tuning:", count_parameters(model, trainable_only=True))

# Prompt-tuning variants; `model` ends up holding the deep variant
for report_label, mode in (("Shallow prompt-tuning:", 'shallow'), ("Deep prompt-tuning:", 'deep')):
    model = AST_PromptTuning(prompt_type=mode)
    print(report_label, count_parameters(model, trainable_only=True))

AST params: 86488335
Head fine-tuning: 301071
Shallow prompt-tuning: 304911
Deep prompt-tuning: 347151


In [57]:
# Build model inputs from the first waveform (the audio is decoded when the row is accessed)
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

print(inputs)

# Switch to eval mode before predicting so that dropout layers are disabled;
# a freshly constructed nn.Module starts in training mode.
model.eval()
with torch.no_grad():
    outputs = model(inputs['input_values'])

{'input_values': tensor([[[-0.9894, -1.2776, -0.9066,  ..., -0.5855, -0.7328, -0.7346],
         [-0.9942, -1.2776, -0.9058,  ..., -0.6302, -0.7277, -0.8872],
         [-0.8979, -1.2094, -0.8326,  ..., -0.5787, -0.6236, -0.7860],
         ...,
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]]])}


In [58]:
# Display the raw classifier outputs of the forward pass above
outputs

tensor([[-0.7189, -0.0014,  0.2666, -0.5154, -0.0391,  0.0547,  0.0153, -0.1374,
         -0.1847,  0.2537, -0.2873,  0.2529,  0.5561, -0.3804, -0.3067]])

In [59]:
# Index of the largest output along the last dimension, as a Python int
predicted_class_ids = outputs.argmax(dim=-1).item()
predicted_class_ids

12

In [60]:
import torch.nn.functional as F

# Normalise the outputs into probabilities over the last dimension
# (for the 2-D `outputs` tensor this is the same axis as dim=1)
softmax = F.softmax(outputs, dim=-1)
softmax

tensor([[0.0333, 0.0683, 0.0893, 0.0409, 0.0658, 0.0723, 0.0695, 0.0596, 0.0569,
         0.0882, 0.0513, 0.0881, 0.1193, 0.0468, 0.0503]])