In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix

import os
import gc
import random
import tqdm.notebook as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torch.utils.tensorboard import SummaryWriter

from transformers import (
    AutoFeatureExtractor,
    Wav2Vec2Processor,
    ASTForAudioClassification,
    WavLMForSequenceClassification,
    AutoModelForSequenceClassification
)

from typing import List, Optional, Union
import librosa
from transformers import SequenceFeatureExtractor, BatchFeature, TensorType

from utils import compute_score, train

%matplotlib inline

In [2]:
def set_seed(SEED):
    """Seed every random number generator used in the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    disables cuDNN autotuning, requests deterministic cuDNN kernels and
    exports ``PYTHONHASHSEED``.

    Args:
        SEED (int): seed value applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

    # Trade cuDNN autotuning speed for repeatable kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # affects processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(SEED)


set_seed(42)

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Просмотр данных и формирование датасета

In [4]:
# Path prefixes that are prepended to each record's audio file path.
DIR_TRAIN, DIR_VAL, DIR_TEST = "wavs/train_", "wavs/val_", "wavs/test_"

# CSV tables describing the train / validation / test splits.
PATH_TRAIN, PATH_VAL, PATH_TEST = "train.csv", "val.csv", "test.csv"

df_train, df_val, df_test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_VAL, PATH_TEST))

# Report how many recordings each split contains.
for template, frame in (("В обучающей выборке {} аудиозаписей", df_train),
                        ("В валидационной выборке {} аудиозаписей", df_val),
                        ("В тестовой выборке {} аудиозаписей", df_test)):
    print(template.format(len(frame)))

В обучающей выборке 9321 аудиозаписей
В валидационной выборке 815 аудиозаписей
В тестовой выборке 833 аудиозаписей


In [5]:
class EmotionDataset(Dataset):
    """Dataset holding the train/val/test tables and serving items from the selected split.

    ``__getitem__`` reads the 'features' and 'emotion' columns of the active
    table; ``load_dataset`` adds the 'features' column to each table.
    """

    def __init__(self, train_df, val_df, test_df):
        """
        Args:
            train_df, val_df, test_df (pd.DataFrame): training, validation and test tables
        """
        self.train_df = train_df
        self.train_size = len(self.train_df)

        self.val_df = val_df
        self.val_size = len(self.val_df)

        self.test_df = test_df
        self.test_size = len(self.test_df)

        # split name -> (table, number of rows)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.val_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset(cls, path_train, path_val, path_test, features_dir=None,
                     dir_train=None, dir_val=None, dir_test=None,
                     feature_extractor=None, load_features=True, save_features=False):
        """Read the split tables and attach a 'features' column to each of them.

        Args:
            path_train, path_val, path_test (str): paths to the train / val / test CSV files
            features_dir (str): path prefix of the stored features; files live at
                ``<features_dir><split>/<hash_id>.npy``
            dir_train, dir_val, dir_test (str): path prefixes prepended to each 'audio_path'
            feature_extractor (callable): called as ``feature_extractor(waveform, sampling_rate)``;
                the first element of the result's ``input_values`` is used as the feature
            load_features (bool): True -> load previously stored features,
                                  False -> extract features from the audio files
            save_features (bool): True -> store extracted features under ``features_dir``

        Returns:
            EmotionDataset instance

        Raises:
            ValueError: if an argument required by the selected mode is missing
        """
        # Fail early with a clear message instead of an obscure path/type error later.
        if load_features:
            if features_dir is None:
                raise ValueError("features_dir is required when load_features=True")
        else:
            if feature_extractor is None:
                raise ValueError("feature_extractor is required when load_features=False")
            if dir_train is None or dir_val is None or dir_test is None:
                raise ValueError("dir_train, dir_val and dir_test are required when load_features=False")
            if save_features and features_dir is None:
                raise ValueError("features_dir is required when save_features=True")

        frames = {
            'train': pd.read_csv(path_train),
            'val': pd.read_csv(path_val),
            'test': pd.read_csv(path_test),
        }
        audio_dirs = {'train': dir_train, 'val': dir_val, 'test': dir_test}

        for mode, df in frames.items():
            if load_features:
                features = cls._load_features(df, mode, features_dir)
            else:
                features = cls._extract_features(df, audio_dirs[mode], feature_extractor,
                                                 mode, features_dir, save_features)
            # Append the features as the last column of the table.
            df.insert(len(df.columns), 'features', features)

        return cls(frames['train'], frames['val'], frames['test'])

    @staticmethod
    def _extract_features(df, files_dir, feature_extractor, mode, features_dir=None, save_features=False):
        """Extract features from every audio file listed in ``df``.

        Args:
            df (pd.DataFrame): table with 'hash_id' and 'audio_path' columns
            files_dir (str): path prefix prepended to each 'audio_path'
            feature_extractor (callable): see ``load_dataset``
            mode (str): split name, used for the progress bar and the output sub-directory
            features_dir (str): path prefix for saved features (needed when ``save_features``)
            save_features (bool): True -> also store each feature as ``<hash_id>.npy``

        Returns:
            list with one feature per row, in row order
        """
        if save_features:
            # np.save does not create missing directories.
            os.makedirs('{0}{1}'.format(features_dir, mode), exist_ok=True)

        features = []
        # Iterate over values positionally so a non-default index cannot mis-address rows.
        pbar = tqdm.tqdm(zip(df['hash_id'], df['audio_path']), total=len(df))
        pbar.set_description(mode + ' dataset loading')
        for hash_id, audio_path in pbar:
            waveform, sampling_rate = torchaudio.load(files_dir + audio_path)
            waveform = waveform.squeeze().numpy()
            feature = feature_extractor(waveform, sampling_rate).input_values[0]
            features.append(feature)
            if save_features:
                np.save('{0}{1}/{2}.npy'.format(features_dir, mode, hash_id), feature)
        return features

    @staticmethod
    def _load_features(df, mode, features_dir):
        """Load previously stored features for every row of ``df``.

        Args:
            df (pd.DataFrame): table with a 'hash_id' column
            mode (str): split name, used for the progress bar and the input sub-directory
            features_dir (str): path prefix of the stored features

        Returns:
            list with one loaded array per row, in row order
        """
        features = []
        pbar = tqdm.tqdm(df['hash_id'], total=len(df))
        pbar.set_description(mode + ' dataset loading')
        for hash_id in pbar:
            features.append(np.load('{0}{1}/{2}.npy'.format(features_dir, mode, hash_id)))
        return features

    def set_split(self, split="train"):
        """Select which table subsequent ``__getitem__`` / ``__len__`` calls use.

        Args:
            split (str): "train"/"val"/"test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[self._target_split]

    def __getitem__(self, idx):
        """Return the features and the emotion label of row ``idx`` of the active split."""
        row = self._target_df.iloc[idx]
        return {"features": row['features'],
                "class": row['emotion']}

    def __len__(self):
        """Return the number of rows in the active split."""
        return self._target_size

## Дообучение Audio Spectrogram Transformer (AST)

In [6]:
# Checkpoint identifier passed to the `from_pretrained` calls below.
MODEL_PATH = "MIT/ast-finetuned-audioset-10-10-0.4593"
# Emotion class names; their count sets the size of the classification head.
CLASSES = ['positive', 'sad', 'angry', 'neutral']
# Path prefix of the stored AST features (passed as `features_dir`).
DIR_FEATURES = 'ast_features/'
# Path prefix for TensorBoard runs and checkpoints of the AST experiments.
EXPERIMENT_DIR = 'ast_experiments/'

### Загрузка датасета

In [7]:
auto_feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)


def feature_extractor(waveform, sampling_rate):
    """Run the pretrained feature extractor on one waveform and return NumPy tensors."""
    return auto_feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="np")


# Features are loaded from DIR_FEATURES. To extract them from the audio files and save them,
# pass dir_train=DIR_TRAIN, dir_val=DIR_VAL, dir_test=DIR_TEST, load_features=False,
# save_features=True instead of load_features=True.
ds = EmotionDataset.load_dataset(PATH_TRAIN, PATH_VAL, PATH_TEST, features_dir=DIR_FEATURES,
                                 feature_extractor=feature_extractor, load_features=True)
                                 

  0%|          | 0/9321 [00:00<?, ?it/s]

  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/833 [00:00<?, ?it/s]

In [9]:
# Experiment 1: fine-tune the whole AST model (no parameters frozen) with Adam.
EPOCH = 10
BATCH_SIZE = 2
SEED = 42
EXP_NUM = 1

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)
model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=4)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
# Experiment 2: frozen AST backbone, classifier head trained with Adam (lr=1e-5).
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 2

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-5)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [11]:
# Experiment 3: frozen AST backbone, classifier head trained with Adam (lr=1e-2).
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 3

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-2)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [12]:
# Experiment 4: frozen AST backbone, classifier head trained with Adam (lr=5e-2).
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 4

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=5e-2)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [13]:
# Experiment 5: frozen AST backbone, classifier head trained with Adam (lr=1e-1).
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 5

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-1)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [14]:
# Experiment 6: frozen AST backbone, classifier head trained with AdamW (lr=1e-2).
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 6

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.classifier.parameters(), lr=1e-2)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [15]:
# Experiment 7: frozen AST backbone, classifier head trained with Adam (lr=1e-2), single late milestone.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 7

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-2)
# Single milestone at EPOCH + 1. NOTE(review): presumably never reached if train() steps the
# scheduler once per epoch, which would keep the LR constant -- confirm against utils.train.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[EPOCH + 1], gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [16]:
# Experiment 8: frozen AST backbone, classifier head trained with AdamW (lr=1e-2), single late milestone.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 8

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.classifier.parameters(), lr=1e-2)
# Single milestone at EPOCH + 1. NOTE(review): presumably never reached if train() steps the
# scheduler once per epoch, which would keep the LR constant -- confirm against utils.train.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[EPOCH + 1], gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [17]:
# Experiment 9: frozen AST backbone, classifier head trained with Adam (lr=1e-2), milestones every 2 steps.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 9

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-2)
# LR is multiplied by gamma at every second scheduler step from 2 up to EPOCH - 1
# (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH, 2)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [18]:
# Experiment 10: frozen AST backbone, classifier head trained with Adam (lr=1e-1), milestones every 2 steps.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 10

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-1)
# LR is multiplied by gamma at every second scheduler step from 2 up to EPOCH - 1
# (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH, 2)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [19]:
# Experiment 11: frozen AST backbone, classifier head trained with Adam (lr=1e-2), gamma=0.75.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 11

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-2)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.75)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [20]:
# Experiment 12: frozen AST backbone, classifier head trained with Adam (lr=5e-1), gamma=0.75.
EPOCH = 5
BATCH_SIZE = 12
SEED = 42
EXP_NUM = 12

set_seed(SEED)

model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)

# Freeze the backbone so that only the classifier head receives gradient updates.
for param in model.audio_spectrogram_transformer.parameters():
    param.requires_grad = False

model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=5e-1)
# LR is multiplied by gamma at scheduler steps 2 .. EPOCH - 1 (plain ints, as MultiStepLR documents).
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.75)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as a sample batch.
    sample_batch = torch.Tensor(np.array([ds[i]['features'] for i in range(BATCH_SIZE)]))
    writer.add_graph(model, input_to_model=sample_batch.to(device, torch.float32))

    train(model, ds, loss_func, optimizer, scheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=1)
finally:
    # Runs even if training fails or is interrupted, so logs are flushed and memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/776 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

In [21]:
# Experiment 13: fine-tune every parameter of the AST model with a single
# Adam optimizer.
EPOCH = 5
BATCH_SIZE = 2
SEED = 42
EXP_NUM = 13

set_seed(SEED)

# ignore_mismatched_sizes=True lets the checkpoint's classifier be replaced by
# a freshly initialised len(CLASSES)-way head; return_dict=False makes the
# model return plain tuples.
model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)
model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-6)
# Milestones are passed as a plain Python list (same values as np.arange(2, EPOCH)),
# which is the type MultiStepLR documents.
# NOTE(review): the name `sсheduler` appears to contain a Cyrillic 'с' instead of a
# Latin 'c'; it is kept unchanged because other cells may reference the same name.
sсheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(range(2, EPOCH)), gamma=0.5)

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as the example input.
    writer.add_graph(model, input_to_model=torch.Tensor(np.array([ds[i]['features']
                                                                  for i in range(BATCH_SIZE)])).to(device, torch.float32))

    train(model, ds, loss_func, optimizer, sсheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=4)
finally:
    # Runs even when training fails or is interrupted, so pending TensorBoard
    # events are written out and cached GPU memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

In [8]:
# Experiment 14: fine-tune the whole AST model with separate optimizers for
# the backbone and the classification head (both at lr=1e-5).
EPOCH = 5
BATCH_SIZE = 3
SEED = 42
EXP_NUM = 14

set_seed(SEED)

# ignore_mismatched_sizes=True lets the checkpoint's classifier be replaced by
# a freshly initialised len(CLASSES)-way head; return_dict=False makes the
# model return plain tuples.
model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)
model.to(device)

loss_func = nn.CrossEntropyLoss()
# Index 0 drives the transformer backbone, index 1 the classifier head.
optimizer = [optim.Adam(model.audio_spectrogram_transformer.parameters(), lr=1e-5),
             optim.AdamW(model.classifier.parameters(), lr=1e-5)
            ]
# Backbone milestones are passed as a plain Python list (same values as
# np.arange(2, EPOCH)), which is the type MultiStepLR documents.
# The head's only milestone is EPOCH + 1; assuming train() steps the schedulers
# once per epoch, it is never reached and the head LR stays constant -- TODO confirm.
# NOTE(review): the name `sсheduler` appears to contain a Cyrillic 'с' instead of a
# Latin 'c'; it is kept unchanged because other cells may reference the same name.
sсheduler = [optim.lr_scheduler.MultiStepLR(optimizer[0], milestones=list(range(2, EPOCH)), gamma=0.5),
             optim.lr_scheduler.MultiStepLR(optimizer[1], milestones=[EPOCH + 1])
            ]

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as the example input.
    writer.add_graph(model, input_to_model=torch.Tensor(np.array([ds[i]['features']
                                                                  for i in range(BATCH_SIZE)])).to(device, torch.float32))

    train(model, ds, loss_func, optimizer, sсheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=4)
finally:
    # Runs even when training fails or is interrupted, so pending TensorBoard
    # events are written out and cached GPU memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

In [9]:
# Experiment 15: same two-optimizer setup as a full fine-tune of the AST
# model, with both learning rates set to 1e-6 and a batch size of 3.
EPOCH = 5
BATCH_SIZE = 3
SEED = 42
EXP_NUM = 15

set_seed(SEED)

# ignore_mismatched_sizes=True lets the checkpoint's classifier be replaced by
# a freshly initialised len(CLASSES)-way head; return_dict=False makes the
# model return plain tuples.
model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)
model.to(device)

loss_func = nn.CrossEntropyLoss()
# Index 0 drives the transformer backbone, index 1 the classifier head.
optimizer = [optim.Adam(model.audio_spectrogram_transformer.parameters(), lr=1e-6),
             optim.AdamW(model.classifier.parameters(), lr=1e-6)
            ]
# Backbone milestones are passed as a plain Python list (same values as
# np.arange(2, EPOCH)), which is the type MultiStepLR documents.
# The head's only milestone is EPOCH + 1; assuming train() steps the schedulers
# once per epoch, it is never reached and the head LR stays constant -- TODO confirm.
# NOTE(review): the name `sсheduler` appears to contain a Cyrillic 'с' instead of a
# Latin 'c'; it is kept unchanged because other cells may reference the same name.
sсheduler = [optim.lr_scheduler.MultiStepLR(optimizer[0], milestones=list(range(2, EPOCH)), gamma=0.5),
             optim.lr_scheduler.MultiStepLR(optimizer[1], milestones=[EPOCH + 1])
            ]

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as the example input.
    writer.add_graph(model, input_to_model=torch.Tensor(np.array([ds[i]['features']
                                                                  for i in range(BATCH_SIZE)])).to(device, torch.float32))

    train(model, ds, loss_func, optimizer, sсheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=4)
finally:
    # Runs even when training fails or is interrupted, so pending TensorBoard
    # events are written out and cached GPU memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

  0%|          | 0/3107 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

In [10]:
# Experiment 16: two-optimizer full fine-tune of the AST model with both
# learning rates set to 1e-6 and a batch size of 2.
EPOCH = 5
BATCH_SIZE = 2
SEED = 42
EXP_NUM = 16

set_seed(SEED)

# ignore_mismatched_sizes=True lets the checkpoint's classifier be replaced by
# a freshly initialised len(CLASSES)-way head; return_dict=False makes the
# model return plain tuples.
model = ASTForAudioClassification.from_pretrained(MODEL_PATH, num_labels=len(CLASSES),
                                                  return_dict=False, ignore_mismatched_sizes=True)
model.to(device)

loss_func = nn.CrossEntropyLoss()
# Index 0 drives the transformer backbone, index 1 the classifier head.
optimizer = [optim.Adam(model.audio_spectrogram_transformer.parameters(), lr=1e-6),
             optim.AdamW(model.classifier.parameters(), lr=1e-6)
            ]
# Backbone milestones are passed as a plain Python list (same values as
# np.arange(2, EPOCH)), which is the type MultiStepLR documents.
# The head's only milestone is EPOCH + 1; assuming train() steps the schedulers
# once per epoch, it is never reached and the head LR stays constant -- TODO confirm.
# NOTE(review): the name `sсheduler` appears to contain a Cyrillic 'с' instead of a
# Latin 'c'; it is kept unchanged because other cells may reference the same name.
sсheduler = [optim.lr_scheduler.MultiStepLR(optimizer[0], milestones=list(range(2, EPOCH)), gamma=0.5),
             optim.lr_scheduler.MultiStepLR(optimizer[1], milestones=[EPOCH + 1])
            ]

writer = SummaryWriter("{0}runs/exp{1}".format(EXPERIMENT_DIR, EXP_NUM))
try:
    # Log the model graph using the first BATCH_SIZE dataset items as the example input.
    writer.add_graph(model, input_to_model=torch.Tensor(np.array([ds[i]['features']
                                                                  for i in range(BATCH_SIZE)])).to(device, torch.float32))

    train(model, ds, loss_func, optimizer, sсheduler, EPOCH, BATCH_SIZE, device, writer, CLASSES,
          "{0}checkpoints/exp{1}".format(EXPERIMENT_DIR, EXP_NUM), gradient_accumulation_steps=4)
finally:
    # Runs even when training fails or is interrupted, so pending TensorBoard
    # events are written out and cached GPU memory is released.
    writer.flush()
    writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/4660 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]