In [1]:
#!pip install transformers

In [2]:
import os.path as path
import os
import pickle
from enum import Enum
from typing import NoReturn, Optional
from tqdm import tqdm
import random
from pprint import pprint
import pandas as pd
import numpy as np
import time

import librosa
import librosa.feature as audio_F

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as nn_F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import torchaudio.transforms as T

from transformers import ASTFeatureExtractor
from transformers import AutoModelForAudioClassification

import matplotlib.pyplot as plt

from IPython.display import Audio

import warnings
warnings.filterwarnings('ignore')

In [3]:
# feature extraction parameters
sample_rate = 16000  # was 11025
n_fft = 1024
overlap = 4
hop_length = n_fft // overlap
n_mels = 64

device = device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'{device=}')

cpu_count = os.cpu_count()
num_workers = cpu_count if device == "cpu" else 0
print(f'{num_workers=}, {cpu_count=}')

def set_random_state(random_state:int=0) -> NoReturn:
    """Initialize random generators.

    Parameters
    ==========
    random_state : int = 0
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
    """
    torch.manual_seed(random_state)
    random.seed(random_state)
    np.random.seed(random_state)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(random_state)
        torch.cuda.manual_seed(random_state)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    return

random_state = 42
set_random_state(random_state)
torch.use_deterministic_algorithms(True)

device='cuda'
num_workers=0, cpu_count=4


In [4]:
%env CUBLAS_WORKSPACE_CONFIG=:4096:8
%env PYTHONHASHSEED=42

env: CUBLAS_WORKSPACE_CONFIG=:4096:8
env: PYTHONHASHSEED=42


In [5]:
# path
data_path = "./data"

train_csv = data_path + "/train.csv"
train_pickle = data_path + "/train.pickle"
test_pickle = data_path + "/test.pickle"
labels_pickle = data_path + "/labels.pickle"

In [6]:
class SetType(Enum):
    """Set type"""
    TRAIN = 1
    TEST  = 2
    DEV   = 3

class EventDetectionDataset(Dataset):
    def __init__(self, x, y=None, device="cpu", set_type=SetType.TRAIN):
        self.x = x
        self.y = y
        self.set_type = set_type
        
        stretch_factor = 0.85
        mask_param = 20
        self.transforms = nn.Sequential(
            T.TimeStretch(stretch_factor, fixed_rate=True),
            T.FrequencyMasking(freq_mask_param=mask_param),
            T.TimeMasking(time_mask_param=mask_param)
        )

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # добавить сюда prepare_shape()
        # if self.set_type == SetType.TRAIN:
        # x = prepare_shape(self.x[idx])
        # else:
        # x = self.x[idx]
        x = self.x[idx]
        if self.set_type == SetType.TRAIN:
                x = self.transforms(x)
        if self.y is not None:
            x = torch.tensor(x, dtype=torch.float32, device=device).squeeze()
            y = torch.tensor(self.y[idx], dtype=torch.int, device=device)
            
            return x, y

        return torch.tensor(x, dtype=torch.float32, device=device).squeeze()

In [7]:
train_data = pickle.load(open(train_pickle, "rb"))
test_data = pickle.load(open(test_pickle, "rb"))

label_to_id = pickle.load(open(labels_pickle, "rb"))
id_to_label = [label for label, i in label_to_id.items()]

validation_set_size = 800
test_set_size = validation_set_size / len(train_data)


x_, y_ = [], []
for i, row in enumerate(train_data):
    feature = row['feature']
    x_.append(feature)
    y_.append(row['label_id'])

x_tr, x_val, y_tr, y_val = train_test_split(x_, y_, stratify=y_, test_size=test_set_size)

print(f"""
Train set:
    x.shape: {len(x_tr)}
    y.shape: {len(y_tr)}

Validation set:
    x.shape: {len(x_val)}
    y.shape: {len(y_val)}
    
Test set:
    x.shape: {len(test_data)}
    """)

train_dset = EventDetectionDataset(x_tr, y_tr, device=device, set_type=SetType.TRAIN)
val_dset = EventDetectionDataset(x_val, y_val, device=device, set_type=SetType.TEST)


Train set:
    x.shape: 4883
    y.shape: 4883

Validation set:
    x.shape: 800
    y.shape: 800
    
Test set:
    x.shape: 3790
    


In [8]:
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated()/1024**2)
print(torch.cuda.memory_cached()/1024**2)

0.0
0.0


In [10]:
train_loader = DataLoader(train_dset, batch_size=32, shuffle=True, num_workers=num_workers, drop_last=True)
val_loader = DataLoader(val_dset, batch_size=32, shuffle=False, num_workers=num_workers, drop_last=True)

In [11]:
set_random_state(random_state)

sample_x, sample_y = random.choice(val_dset)

sample_y.item(), id_to_label[sample_y]

(26, 'Laughter')

In [12]:
set_random_state(random_state)

sample_x, sample_y = random.choice(train_dset)
sample_y.item(), id_to_label[sample_y]

(12, 'Double_bass')

In [13]:
AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.448") # ast-finetuned-audioset-10-10-0.448

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [19]:
model = AutoModelForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.448", 
                                                        num_labels=len(id_to_label), 
                                                        ignore_mismatched_sizes=True)
model.config.id2label = id_to_label
model.config.label2id = label_to_id
model.config._num_labels = len(id_to_label)
model.config.num_labels = len(id_to_label)
model.requires_grad_(False)
#model.classifier = nn.Linear(in_features=768, 
#                             out_features=len(id_to_label), bias=True)

model.classifier = nn.Sequential(
           nn.Linear(in_features=768, 
                             out_features=256, bias=True),
           nn.Linear(in_features=256, 
                             out_features=len(id_to_label), bias=True)

)

# model.classifier = nn.Sequential(
#           nn.Conv1d(1,20,2), # 767
#           nn.ReLU(),
#           nn.Conv1d(20,64,2), # 766
#           nn.ReLU(),
#           nn.Linear(in_features=49024, out_features=len(id_to_label), bias=True)
#         )
model.classifier.requires_grad_(True)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.448 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([41, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([41]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=256, bias=True)
  (1): Linear(in_features=256, out_features=41, bias=True)
)

In [21]:
model.num_labels, len(id_to_label)

(41, 41)

In [22]:
model.config

ASTConfig {
  "_name_or_path": "MIT/ast-finetuned-audioset-10-10-0.448",
  "_num_labels": 41,
  "architectures": [
    "ASTForAudioClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "frequency_stride": 10,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": [
    "Acoustic_guitar",
    "Applause",
    "Bark",
    "Bass_drum",
    "Burping_or_eructation",
    "Bus",
    "Cello",
    "Chime",
    "Clarinet",
    "Computer_keyboard",
    "Cough",
    "Cowbell",
    "Double_bass",
    "Drawer_open_or_close",
    "Electric_piano",
    "Fart",
    "Finger_snapping",
    "Fireworks",
    "Flute",
    "Glockenspiel",
    "Gong",
    "Gunshot_or_gunfire",
    "Harmonica",
    "Hi-hat",
    "Keys_jangling",
    "Knock",
    "Laughter",
    "Meow",
    "Microwave_oven",
    "Oboe",
    "Saxophone",
    "Scissors",
    "Shatter",
    "Snare_drum",
    "Squeak",
    "Tambourine",
    "Tearing",
    "Telephone",
    "Trumpet",
    "Violin_or_fiddle",


In [23]:
start_time = time.time()

epochs = 20
train_loss = []
val_loss = []
train_f1 = []
val_f1 = []

for epoch in range(epochs):
    
    if epoch % 1 == 0:
        print(f'epoch #{epoch+1}')
    
    model.train()
    loss_list = []
    outs = []
    tgts = []
    
    for i_batch, sample_batched in enumerate(train_loader):
        
        inputs, labels = sample_batched
        outputs = model(inputs, labels=labels.long())
        loss = outputs.loss
        loss_list.append(loss.item())
        outs.append(outputs.logits.argmax(-1))
        tgts.append(labels)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    y_true = torch.hstack(tgts).numpy(force=True)
    y_pred = torch.hstack(outs).numpy(force=True)
    f1 = f1_score(y_true, y_pred, average='weighted')
    train_loss.append(np.mean(loss_list))
    train_f1.append(f1)
    if epoch % 1 == 0:
        print(f'[train] mean loss: {train_loss[-1]}')
        print(f'[train] f1-score:  {f1}')
    
    model.eval()
    loss_list = []
    outs = []
    tgts = []
    
    with torch.no_grad():
        for i_batch, sample_batched in enumerate(val_loader):
            
            inputs, labels = sample_batched
            outputs = model(inputs, labels=labels.long())
            loss = outputs.loss
            loss_list.append(loss.item())
            outs.append(outputs.logits.argmax(-1))
            tgts.append(labels)
        
        y_true = torch.hstack(tgts).numpy(force=True)
        y_pred = torch.hstack(outs).numpy(force=True)
        f1 = f1_score(y_true, y_pred, average='weighted')
        val_loss.append(np.mean(loss_list))
        val_f1.append(f1)
        if epoch % 1 == 0:
            print(f'[val] mean loss:   {val_loss[-1]}')
            print(f'[val] f1-score:    {f1}', end="\n\n")
        

print(f"Execution time: {(time.time() - start_time):.2f} seconds")

epoch #1
[train] mean loss: 3.019329773752313
[train] f1-score:  0.3195733273566459
[val] mean loss:   2.407400035858154
[val] f1-score:    0.5103371015946027

epoch #2


KeyboardInterrupt: 

In [18]:
{id: label for id, label in enumerate(id_to_label)}

{0: 'Acoustic_guitar',
 1: 'Applause',
 2: 'Bark',
 3: 'Bass_drum',
 4: 'Burping_or_eructation',
 5: 'Bus',
 6: 'Cello',
 7: 'Chime',
 8: 'Clarinet',
 9: 'Computer_keyboard',
 10: 'Cough',
 11: 'Cowbell',
 12: 'Double_bass',
 13: 'Drawer_open_or_close',
 14: 'Electric_piano',
 15: 'Fart',
 16: 'Finger_snapping',
 17: 'Fireworks',
 18: 'Flute',
 19: 'Glockenspiel',
 20: 'Gong',
 21: 'Gunshot_or_gunfire',
 22: 'Harmonica',
 23: 'Hi-hat',
 24: 'Keys_jangling',
 25: 'Knock',
 26: 'Laughter',
 27: 'Meow',
 28: 'Microwave_oven',
 29: 'Oboe',
 30: 'Saxophone',
 31: 'Scissors',
 32: 'Shatter',
 33: 'Snare_drum',
 34: 'Squeak',
 35: 'Tambourine',
 36: 'Tearing',
 37: 'Telephone',
 38: 'Trumpet',
 39: 'Violin_or_fiddle',
 40: 'Writing'}

In [19]:
model.config.id2label = {id: label for id, label in enumerate(id_to_label)}
model.save_pretrained("model_weights", from_pt=True)

In [20]:
torch.cuda.empty_cache()
inference = AutoModelForAudioClassification.from_pretrained("model_weights")

outputs_infer = []
output_formated_infer = []

x_test = []
for i, row in enumerate(test_data):
    feature = row['feature']
    x_test.append(feature)
test_dset = EventDetectionDataset(x_test, None, device=device, set_type=SetType.TEST)
test_loader = DataLoader(test_dset, batch_size=8, shuffle=False, 
                         num_workers=num_workers, drop_last=False)


Some weights of the model checkpoint at model_weights were not used when initializing ASTForAudioClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ASTForAudioClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ASTForAudioClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ASTForAudioClassification were not initialized from the model checkpoint at model_weights and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.layernorm.weight', 'classifier.layernorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
inference.to(device)
inference.eval()
with torch.no_grad():
  for i_batch, sample_batched in enumerate(test_loader):
    x = sample_batched

    output_infer = model(x)
    outputs_infer.append(output_infer.logits.argmax(-1))
  output_formated_infer = torch.hstack(outputs_infer).numpy(force=True)

In [22]:
labled_outs = [id_to_label[i] for i in output_formated_infer]
print(labled_outs)

['Oboe', 'Double_bass', 'Clarinet', 'Harmonica', 'Burping_or_eructation', 'Fart', 'Tambourine', 'Bus', 'Hi-hat', 'Snare_drum', 'Telephone', 'Flute', 'Laughter', 'Cough', 'Fireworks', 'Cello', 'Hi-hat', 'Tearing', 'Acoustic_guitar', 'Bass_drum', 'Snare_drum', 'Gong', 'Shatter', 'Cello', 'Tearing', 'Burping_or_eructation', 'Laughter', 'Burping_or_eructation', 'Double_bass', 'Burping_or_eructation', 'Bus', 'Shatter', 'Acoustic_guitar', 'Tearing', 'Flute', 'Snare_drum', 'Chime', 'Knock', 'Snare_drum', 'Violin_or_fiddle', 'Flute', 'Cowbell', 'Applause', 'Glockenspiel', 'Squeak', 'Burping_or_eructation', 'Writing', 'Acoustic_guitar', 'Tearing', 'Snare_drum', 'Cough', 'Clarinet', 'Glockenspiel', 'Fireworks', 'Cough', 'Drawer_open_or_close', 'Trumpet', 'Violin_or_fiddle', 'Writing', 'Cowbell', 'Applause', 'Trumpet', 'Oboe', 'Keys_jangling', 'Bark', 'Snare_drum', 'Gunshot_or_gunfire', 'Burping_or_eructation', 'Squeak', 'Fart', 'Drawer_open_or_close', 'Acoustic_guitar', 'Clarinet', 'Cough', 'Ele

In [23]:
fnames = []
for datum in test_data:
  fnames.append(datum['fname'])

In [24]:
pd.DataFrame({'fname': fnames, 'label': labled_outs}).to_csv('predict_2.csv', index=False)

In [25]:
len(labled_outs)

3790