# Initialization

## Library installation (only first time)

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install librosa

In [None]:
!pip install soundfile

In [None]:
!pip install --upgrade ipywidgets

## Import libraries

In [None]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification, ASTModel
from datasets import load_dataset
import torch
from functools import reduce
from operator import mul
import math
import torch.nn as nn
import os
import librosa
from torch.utils.data import Dataset
import random
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader


# Import AST Pretrained and test

## Import dataset huggingface

In [None]:
# Load the validation split of the LibriSpeech demo dataset and sort it by its
# "id" column; the sampling rate is read from the "audio" feature.
# NOTE(review): trust_remote_code=True lets the dataset's own loading script run.
dataset_huggingface = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
    trust_remote_code=True,
).sort("id")
sampling_rate = dataset_huggingface.features["audio"].sampling_rate

## Import AST huggingface

In [None]:
# Feature extractor shipped with the AST AudioSet checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

In [None]:
# Pretrained AST together with its audio-classification head
ast_huggingface = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)
# Last expression of the cell: the notebook prints the module structure
ast_huggingface

## Test pretrained model

In [None]:
# Sanity check of the pretrained classifier on the first clip of the demo
# dataset (the audio file is decoded on the fly).
inputs = feature_extractor(dataset_huggingface[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

with torch.no_grad():
    logits = ast_huggingface(**inputs).logits

predicted_class_ids = torch.argmax(logits, dim=-1).item()
predicted_label = ast_huggingface.config.id2label[predicted_class_ids]
print(predicted_label)

# Compute the loss against an arbitrary target: the label whose id is 0.
target_label = ast_huggingface.config.id2label[0]
inputs["labels"] = torch.tensor([ast_huggingface.config.label2id[target_label]])
# Only the scalar value is displayed, so no autograd graph is needed here either.
with torch.no_grad():
    loss = ast_huggingface(**inputs).loss
round(loss.item(), 2)

# Prompt Tuning

## Retrieve Output size

In [None]:
# Bare AST encoder (ASTModel, not the classification wrapper loaded earlier),
# used to inspect the hidden states
ast_model = ASTModel.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)
# Last expression of the cell: the notebook prints the module structure
ast_model

In [None]:
# Encode the first clip (decoded on the fly) and look at the encoder output size
inputs = feature_extractor(
    dataset_huggingface[0]["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
)
with torch.no_grad():
    outputs = ast_model(**inputs)

# Shape of the final hidden states, shown as a plain list
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.size())

## Model and testing

In [None]:
class AST_PromptTuning(nn.Module):
    """Frozen AST encoder with a trainable linear head and optional prompt tokens.

    Depending on ``prompt_type`` the model is tuned in one of three ways:

    * ``None``      -- only the classification head is trainable;
    * ``'shallow'`` -- learnable prompt tokens are inserted once, right after
      the embedding layer;
    * ``'deep'``    -- as shallow, and before every following encoder layer
      the prompt positions are overwritten with that layer's own prompts.

    All encoder parameters have ``requires_grad`` set to ``False``.
    """

    def __init__(self, prompt_tokens: int = 5, prompt_dropout: float = 0.0, prompt_type: str = 'deep', num_classes: int = 15):
        """Build the model.

        Args:
            prompt_tokens: number of prompt tokens inserted per layer.
            prompt_dropout: dropout probability applied to the prompts.
            prompt_type: ``'shallow'``, ``'deep'`` or ``None`` (head tuning only).
            num_classes: size of the classifier output.

        Raises:
            ValueError: if ``prompt_type`` is not one of the supported values.
        """
        super().__init__()

        # Any other value used to fall through silently to the shallow branch.
        if prompt_type not in (None, 'shallow', 'deep'):
            raise ValueError(
                f"Invalid prompt_type {prompt_type!r}: expected 'shallow', 'deep' or None."
            )

        # load the pretrained AST encoder
        self.encoder = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

        # classification head: hidden_size -> 384 -> num_classes
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 384),
            nn.Linear(384, num_classes)
        )

        # freeze the whole encoder
        for p in self.encoder.parameters():
            p.requires_grad = False

        self.prompt_type = prompt_type  # "shallow", "deep" or None

        if prompt_type is not None:

            self.prompt_tokens = prompt_tokens  # number of prompt tokens
            self.prompt_dropout = nn.Dropout(prompt_dropout)
            self.prompt_dim = self.encoder.config.hidden_size

            # bound of the uniform (xavier-style) prompt initialisation
            patch_area = reduce(mul, (self.encoder.config.patch_size, self.encoder.config.patch_size), 1)
            val = math.sqrt(6. / float(3 * patch_area + self.prompt_dim))

            # learnable prompts inserted after the embeddings:
            # prompt_tokens vectors of size prompt_dim
            self.prompt_embeddings = nn.Parameter(torch.zeros(1, self.prompt_tokens, self.prompt_dim))
            nn.init.uniform_(self.prompt_embeddings.data, -val, val)

            if self.prompt_type == 'deep':
                self.total_d_layer = self.encoder.config.num_hidden_layers
                self.deep_prompt_embeddings = nn.Parameter(
                    # - 1 because the first layer already receives the shallow prompts
                    torch.zeros(self.total_d_layer-1, self.prompt_tokens, self.prompt_dim)
                )
                nn.init.uniform_(self.deep_prompt_embeddings.data, -val, val)

    def train(self, mode=True):
        """Switch between training and evaluation mode.

        The frozen encoder is always kept in eval mode; in training mode only
        the remaining sub-modules (head, prompt dropout) are set to train.

        Returns:
            ``self``, like ``nn.Module.train``, so ``.eval()`` can be chained.
        """
        # Let nn.Module update self.training and every child consistently...
        super().train(mode)
        if mode:
            # ...then put the frozen backbone back into eval mode.
            self.encoder.eval()
        return self

    def incorporate_prompt(self, x, prompt_embeddings, n_prompt: int = 0):
        """Insert prompts right after the first (class) token of ``x``.

        Args:
            x: tensor of shape (batch size, n_tokens, hidden_dim).
            prompt_embeddings: prompts broadcastable to
                (batch size, n_prompt, hidden_dim).
            n_prompt: number of tokens following the class token that are
                dropped from ``x`` (the prompts of the previous layer);
                0 keeps every token.

        Returns:
            Tensor laid out as (cls_token + prompts + remaining tokens).
        """
        B = x.shape[0]

        x = torch.cat((
            x[:, :1, :],
            self.prompt_dropout(prompt_embeddings.expand(B, -1, -1)),
            x[:, (1+n_prompt):, :]
        ), dim=1)

        return x

    @staticmethod
    def _layer_output(layer_result):
        """Return the hidden states from an encoder layer's result."""
        # NOTE(review): the layer result is indexed with [0] in the original
        # code (tuple output); a bare tensor is accepted too in case the
        # installed transformers version returns one -- TODO confirm.
        if isinstance(layer_result, (tuple, list)):
            return layer_result[0]
        return layer_result

    def forward_features(self, x):
        """Run the encoder with prompts and return the normalised hidden states."""
        # go through the encoder embeddings
        x = self.encoder.embeddings(x)

        # add the shallow prompts
        x = self.incorporate_prompt(x, self.prompt_embeddings)

        if self.prompt_type == 'deep':
            # deep mode: replace the prompt positions before every layer after the first
            x = self._layer_output(self.encoder.encoder.layer[0](x))
            for i in range(1, self.total_d_layer):
                x = self.incorporate_prompt(x, self.deep_prompt_embeddings[i-1], self.prompt_tokens)
                # use this instance's encoder, not a global `model`
                x = self._layer_output(self.encoder.encoder.layer[i](x))
        else:
            # shallow mode: the prompts simply travel through the whole encoder
            x = self.encoder.encoder(x)["last_hidden_state"]

        x = self.encoder.layernorm(x)
        return x

    def forward(self, x):
        """Return class logits computed from the first (class) token.

        Args:
            x: input handed to the AST encoder (the notebook passes the
                feature extractor's ``input_values``).

        Returns:
            Tensor whose last dimension is the classifier output size.
        """
        if self.prompt_type is not None:
            x = self.forward_features(x)[:, 0, :]
        else:
            # plain encoder pass, take the classification token
            x = self.encoder(x)["last_hidden_state"][:, 0, :]

        x = self.classifier(x)
        return x

In [None]:
# Head-only model: total parameter count and trainable parameter count
ast_prompt = AST_PromptTuning(prompt_type=None)
print("AST params:", sum(map(torch.numel, ast_prompt.parameters())))
print(
    "Head fine-tuning:",
    sum(w.numel() for w in filter(lambda w: w.requires_grad, ast_prompt.parameters())),
)

# Shallow prompt-tuning: trainable parameter count
ast_prompt_shallow = AST_PromptTuning(prompt_type='shallow')
print(
    "Shallow prompt-tuning:",
    sum(w.numel() for w in filter(lambda w: w.requires_grad, ast_prompt_shallow.parameters())),
)

# Deep prompt-tuning: trainable parameter count
ast_prompt_deep = AST_PromptTuning(prompt_type='deep')
print(
    "Deep prompt-tuning:",
    sum(w.numel() for w in filter(lambda w: w.requires_grad, ast_prompt_deep.parameters())),
)

In [None]:
# Forward the first clip (decoded on the fly) through the head-only model
inputs = feature_extractor(
    dataset_huggingface[0]["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
)

with torch.no_grad():
    outputs = ast_prompt(inputs['input_values'])

# Index of the highest-scoring class, displayed by the notebook
predicted_class_ids = outputs.argmax(dim=-1).item()
predicted_class_ids

In [None]:
# Turn the logits into per-class probabilities along dimension 1
softmax = outputs.softmax(dim=1)
softmax

# Implementation

## Utilities

In [None]:
# AutoFeatureExtractor takes the decoded waveform as an array, not a file path:
# this function reads an audio file from disk and returns its samples.
def load_audio(audio_path, sr=16000):
    """Load an audio file with librosa at the requested sampling rate.

    Args:
        audio_path: path of the audio file to read.
        sr: sampling rate handed to ``librosa.load``; defaults to 16000,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(audio, sample_rate)`` exactly as returned by ``librosa.load``.
    """
    audio, sample_rate = librosa.load(audio_path, sr=sr)
    return audio, sample_rate

# this function returns the audio batch preprocessed
def feature_extractor_batch_data(batch):
    """Run the AST feature extractor on every clip of a collated batch.

    Args:
        batch: mapping with the keys ``"audio"``, ``"sample_rate"`` and
            ``"label"``, one entry per sample. The mapping is left untouched.

    Returns:
        Tuple ``(features, labels)``: ``features`` is the stack of the
        per-clip ``input_values`` and ``labels`` is a tensor built from
        ``batch["label"]``.
        NOTE(review): each ``input_values`` presumably carries its own leading
        dimension of size 1, which the callers squeeze away -- TODO confirm.
    """
    features = [
        feature_extractor(
            clip,
            # the collated rate may be a 0-d tensor; hand over a plain int
            sampling_rate=int(rate),
            return_tensors="pt",
        )['input_values']
        for clip, rate in zip(batch["audio"], batch["sample_rate"])
    ]
    # as_tensor accepts both a list and an already-collated tensor
    # without the copy-construct warning raised by torch.tensor
    return torch.stack(features), torch.as_tensor(batch["label"])

# test balanced dataset
def count_class_presence(path):
    """Count how many lines of a label file belong to each class.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the audio reference and its label. Blank lines are skipped.

    Args:
        path: path of the label file.

    Returns:
        Dict mapping each label to its number of occurrences.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    class_count = {}
    with open(path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # blank line (e.g. a trailing newline at the end of the file)
                continue
            _, label = fields
            class_count[label] = class_count.get(label, 0) + 1
    return class_count

# model wants in input an integer for each label --> this function create a dictionary that maps each label to an index
def create_dict_label(path):
    """Map every label of a label file to an integer index.

    Indices are assigned in order of first appearance, starting from 0.
    Each non-blank line must hold exactly two whitespace-separated fields:
    the audio reference and its label. Blank lines are skipped.

    Args:
        path: path of the label file.

    Returns:
        Dict mapping each label to its index.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    class_dict = {}
    with open(path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # blank line (e.g. a trailing newline at the end of the file)
                continue
            _, class_name = fields
            if class_name not in class_dict:
                # the next free index is the number of labels seen so far
                class_dict[class_name] = len(class_dict)
    return class_dict

# look up the integer index assigned to a label
def from_label_to_index(label, dict_label):
    """Return the entry of `dict_label` stored under `label`."""
    index = dict_label[label]
    return index

## TUT17 Dataset

In [None]:
# from folder to PyTorch Dataset
class TUT17(Dataset):
    """Audio dataset read from a folder of clips plus a tab-separated label file.

    The file names found in the audio folder are sorted, shuffled with a
    seeded private RNG and cut into train / val / test parts. Instantiating
    the class once per split with the same seed therefore yields disjoint
    splits.
    """

    def __init__(self, root_dir, audio_folder, label_folder, label_filename, split = 'train', seed = 42, val_frac= 0.1, test_frac= 0.1):
        """Index the audio folder and keep the file names of one split.

        Args:
            root_dir: dataset root directory.
            audio_folder: sub-folder of ``root_dir`` holding the audio files.
            label_folder: sub-folder of ``root_dir`` holding the label file.
            label_filename: name of the label file.
            split: ``'train'``, ``'val'`` or ``'test'``.
            seed: seed of the shuffle that precedes the split.
            val_frac: fraction of the files assigned to validation.
            test_frac: fraction of the files assigned to test.

        Raises:
            ValueError: if ``split`` is not a supported value.
        """
        super().__init__()

        # store path audio and label
        self.label_path = os.path.join(root_dir, label_folder, label_filename)
        self.audio_path = os.path.join(root_dir, audio_folder)

        # label name -> integer index
        self.class_dict = create_dict_label(self.label_path)
        # audio file name -> label name, read once instead of at every access
        self._label_by_file = self._read_labels(self.label_path)

        # os.listdir returns entries in arbitrary order: sort first so that the
        # seeded shuffle gives the same permutation for every instantiation
        audio_names = sorted(os.listdir(self.audio_path))
        # private RNG: the global `random` state is left untouched
        random.Random(seed).shuffle(audio_names)

        self.data = self._select_split(audio_names, split, val_frac, test_frac)

    @staticmethod
    def _read_labels(label_path):
        """Return {audio file name: label} from the tab-separated label file."""
        labels = {}
        with open(label_path, "r") as f:
            for line in f:
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < 2:
                    # blank or malformed line
                    continue
                # keep only the file name, whatever folder prefix or separator is used
                file_name = fields[0].replace('\\', '/').rsplit('/', 1)[-1]
                # first occurrence wins, like the sequential scan it replaces
                labels.setdefault(file_name, fields[1].strip())
        return labels

    @staticmethod
    def _select_split(names, split, val_frac, test_frac):
        """Return the slice of the shuffled ``names`` that belongs to ``split``."""
        num_val = int(len(names) * val_frac)
        num_test = int(len(names) * test_frac)
        num_train = len(names) - num_val - num_test

        if split == 'train':
            return names[:num_train]
        if split == 'val':
            return names[num_train:num_train + num_val]
        if split == 'test':
            # a positive start index: names[-0:] would return the whole list
            return names[num_train + num_val:]
        raise ValueError('Invalid split value.')

    def __len__(self):
        """Number of audio files in this split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            Dict with the keys ``'audio'``, ``'sample_rate'`` (both from
            ``load_audio``) and ``'label'`` (integer index of the class).

        Raises:
            KeyError: if the label file has no entry for the audio file.
        """
        file_name = self.data[idx]
        try:
            label = self._label_by_file[file_name]
        except KeyError:
            raise KeyError(
                f"No label found for audio file '{file_name}' in {self.label_path}"
            ) from None
        audio, sample_rate = load_audio(os.path.join(self.audio_path, file_name))
        return {'audio': audio, 'sample_rate': sample_rate, 'label': from_label_to_index(label, self.class_dict)}
In [None]:
# Root folder of the TUT17 dataset on the local machine: edit this single line
# to run the notebook elsewhere (e.g. "c:/Users/cerru/Desktop/TUT17").
tut17_root = "C:/Users/aldob/Desktop/TUT17"

# Arguments shared by the three splits
tut17_kwargs = dict(
    root_dir=tut17_root,
    audio_folder="Audio",
    label_folder="labels",
    label_filename="evaluate.txt",
)

train_dataset = TUT17(split='train', **tut17_kwargs)
val_dataset = TUT17(split='val', **tut17_kwargs)
test_dataset = TUT17(split='test', **tut17_kwargs)

In [None]:
# Define loaders: same batch size and workers everywhere, incomplete last
# batches are dropped, and only the training loader shuffles
loader_options = dict(batch_size=4, num_workers=0, drop_last=True)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
# Fetch a single batch from the training loader and display it
batch_iterator = iter(train_loader)
next(batch_iterator)

## Train

In [None]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: module to optimise; moved to ``device``.
        train_loader: iterable of batches accepted by
            ``feature_extractor_batch_data``.
        criterion: loss module called as ``criterion(output, labels)``.
        optimizer: optimizer stepping the model parameters.
        device: device on which the computation runs.

    Returns:
        Mean of the per-batch losses over the epoch.
    """
    model.train()
    model.to(device)
    criterion.to(device)
    running_loss = 0.0
    for batch in tqdm(train_loader):
        # reset the gradients of the previous step
        optimizer.zero_grad()

        # preprocess input with the AST feature extractor
        audio_list, labels = feature_extractor_batch_data(batch)

        # send input and labels to the device
        audio_list = audio_list.to(device)
        labels = labels.to(device)

        # Drop only the singleton dimension at index 1: a bare squeeze() would
        # also remove the batch dimension when the batch holds a single sample.
        output = model(audio_list.squeeze(1))

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

In [None]:
def validate(model, val_loader, criterion, device):
    """Compute the mean loss of ``model`` over ``val_loader``.

    Args:
        model: module to evaluate; moved to ``device`` and set to eval mode.
        val_loader: iterable of batches accepted by
            ``feature_extractor_batch_data``.
        criterion: loss module called as ``criterion(output, labels)``.
        device: device on which the computation runs.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    model.to(device)
    criterion.to(device)
    running_loss = 0.0
    # No parameter is updated here, so no autograd graph is needed.
    with torch.no_grad():
        for batch in tqdm(val_loader):
            # preprocess input with the AST feature extractor
            audio_list, labels = feature_extractor_batch_data(batch)

            # send input and labels to the device
            audio_list = audio_list.to(device)
            labels = labels.to(device)

            # Drop only the singleton dimension at index 1: a bare squeeze()
            # would also remove the batch dimension of a one-sample batch.
            output = model(audio_list.squeeze(1))

            loss = criterion(output, labels)
            running_loss += loss.item()
    return running_loss / len(val_loader)

In [None]:
def test(model, test_loader, criterion, device):
    """Compute the accuracy of ``model`` over ``test_loader``.

    Args:
        model: module to evaluate; moved to ``device`` and set to eval mode.
        test_loader: iterable of batches accepted by
            ``feature_extractor_batch_data``.
        criterion: loss module; only moved to ``device``, kept for a signature
            parallel to ``validate``.
        device: device on which the computation runs.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    model.eval()
    model.to(device)
    criterion.to(device)
    labels_l = []
    predictions_l = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for batch in tqdm(test_loader):
            # preprocess input with the AST feature extractor
            audio_list, labels = feature_extractor_batch_data(batch)

            # send input and labels to the device
            audio_list = audio_list.to(device)
            labels = labels.to(device)

            # Drop only the singleton dimension at index 1: a bare squeeze()
            # would also remove the batch dimension of a one-sample batch.
            output = model(audio_list.squeeze(1))
            predictions = torch.argmax(output, dim=1)

            # collect labels and predictions for the final accuracy
            labels_l.append(labels)
            predictions_l.append(predictions)

    labels = torch.cat(labels_l, dim=0)
    predictions = torch.cat(predictions_l, dim=0)

    accuracy = (predictions == labels).sum().item() / len(labels)
    return accuracy

In [None]:

def train(model, train_loader, val_loader, criterion, optimizer, device, train_mode,n_epochs: int = 10):
    """Train ``model`` for ``n_epochs``, resuming from and saving to a checkpoint.

    Args:
        model: module to train.
        train_loader: loader handed to ``train_one_epoch``.
        val_loader: loader handed to ``validate``.
        criterion: loss module.
        optimizer: optimizer stepping the model parameters.
        device: device on which the computation runs.
        train_mode: selects the checkpoint file: 0 -> ``ast.pth``,
            1 -> ``ast_shallow.pth``, 2 -> ``ast_deep.pth``.
        n_epochs: number of epochs to run.

    Raises:
        ValueError: if ``train_mode`` is not 0, 1 or 2.
    """
    checkpoint_names = {0: 'ast.pth', 1: 'ast_shallow.pth', 2: 'ast_deep.pth'}
    if train_mode not in checkpoint_names:
        raise ValueError(f"Invalid train_mode {train_mode!r}: expected 0, 1 or 2.")

    checkpoint_dir = "../models_checkpoint"
    last_model_path = os.path.join(checkpoint_dir, checkpoint_names[train_mode])

    # Move the model first: the optimizer state restored below is placed on the
    # device of the parameters it belongs to.
    model.to(device)

    if os.path.isfile(last_model_path):
        # map_location lets a checkpoint saved on another device be loaded here.
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        checkpoint = torch.load(last_model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    for epoch in tqdm(range(n_epochs)):
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss = validate(model, val_loader, criterion, device)
        # save a checkpoint every other epoch
        if epoch % 2 == 0:
            # the checkpoint folder may not exist on the first run
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
            }, last_model_path)
        print(f'Epoch {epoch+1}/{n_epochs} : Train Loss {train_loss:.4f} : Val Loss {val_loss:.4f}')

In [None]:
# Head fine-tuning: no prompts, only the classifier is trainable
ast_prompt = AST_PromptTuning(prompt_type=None)
criterion = nn.CrossEntropyLoss()
optimizer_normal = torch.optim.Adam(ast_prompt.parameters(), lr=0.0001)

# Use the GPU when one is available
dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(dev)

train(
    model=ast_prompt,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer_normal,
    device=dev,
    train_mode=0,
)
test(ast_prompt, test_loader, criterion, dev)

In [None]:
# Shallow prompt-tuning run
ast_prompt_shallow = AST_PromptTuning(prompt_type='shallow')

optimizer_shallow = torch.optim.Adam(ast_prompt_shallow.parameters(), lr=0.0001)

# Train and evaluate the shallow model itself: the optimizer above was built
# from its parameters, so passing another model would leave it untrained.
train(ast_prompt_shallow, train_loader, val_loader, criterion, optimizer_shallow, dev,train_mode=1)
test(ast_prompt_shallow, test_loader, criterion, dev)

In [None]:
# Deep prompt-tuning run
ast_prompt_deep = AST_PromptTuning(prompt_type='deep')
optimizer_deep = torch.optim.Adam(ast_prompt_deep.parameters(), lr=0.0001)

train(
    model=ast_prompt_deep,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer_deep,
    device=dev,
    train_mode=2,
)
test(ast_prompt_deep, test_loader, criterion, dev)