In [1]:
import logging
import hydra

import erc

# Logger obtained from the project's logging helper.
logger = erc.utils.get_logger()

# Compose the Hydra config found under ./config and point the dataset target at
# the KEMDy19 dataset class. `overrides` is a list of "key=value" strings.
with hydra.initialize(version_base=None, config_path="./config"):
    cfg = hydra.compose(
        config_name="config",
        overrides=["dataset._target_=erc.datasets.KEMDy19Dataset"],
    )

In [2]:
# Build the train/valid datasets for one validation fold, then pass each through
# `erc.preprocess.generate_datasets` (per the original note: torch dataset ->
# Hugging Face dataset).
fold_num = 1

# Same dataset config for both splits; only `mode` differs.
train_dataset, valid_dataset = (
    hydra.utils.instantiate(cfg.dataset, mode=split, validation_fold=fold_num)
    for split in ("train", "valid")
)

# Arguments shared by the two generate_datasets calls.
_generate_kwargs = dict(
    save_name='audio_dataset_19',
    validation_fold=fold_num,
    overrides=False,
)

train_ds = erc.preprocess.generate_datasets(train_dataset, mode='train', **_generate_kwargs)
valid_ds = erc.preprocess.generate_datasets(valid_dataset, mode='valid', **_generate_kwargs)

INFO:erc.utils:Instantiate KEMDy19 Dataset
INFO:erc.utils:Instantiate KEMDy19 Dataset


In [3]:
# Tokenize the 'txt' field of a batch.
def map_to_str_with_tokenizer(batch, tokenizer):
    """Replace ``batch['txt']`` with ``tokenizer(batch['txt'])``.

    Args:
        batch: Mapping that holds a ``'txt'`` entry; it is modified in place.
        tokenizer: Callable applied to the current ``'txt'`` value.

    Returns:
        The same ``batch`` object, with ``'txt'`` overwritten.
    """
    tokenized = tokenizer(batch['txt'])
    batch['txt'] = tokenized
    return batch
# test_ds = test_ds.map(map_to_str_with_tokenizer)

# Cast the 'labels' field of a batch with `.long()`.
def map_labels2long(batch):
    """Overwrite ``batch['labels']`` with ``batch['labels'].long()``.

    Args:
        batch: Mapping that holds a ``'labels'`` entry exposing ``.long()``
            (assumed to be a torch tensor -- TODO confirm against the dataset
            format); it is modified in place.

    Returns:
        The same ``batch`` object, with ``'labels'`` replaced by the cast value.
    """
    labels_as_long = batch['labels'].long()
    batch['labels'] = labels_as_long
    return batch
# Run the label cast over the training set and rebind `train_ds` to the result.
# NOTE(review): re-running this cell maps the already-mapped dataset again.
train_ds = train_ds.map(map_labels2long)




In [4]:

# Run the same label cast over the validation set and rebind `valid_ds` to the result.
valid_ds = valid_ds.map(map_labels2long)




## Build Model

In [5]:
# Wav2Vec2 
from transformers import AutoConfig, Wav2Vec2Processor
from erc.constants import idx2emotion, emotion2idx


# Default values for the classifier setup.
model_name_or_path = "kresnik/wav2vec2-large-xlsr-korean"
num_labels = 7
pooling_mode = "mean"  # original note lists "max" or "min" as the alternatives

# Build the model config from the checkpoint, overriding the classification
# fields. `pooling_mode` is taken from the variable above so the value is
# defined in exactly one place.
# NOTE(review): confirm that the model class used below actually reads
# `pooling_mode`; it is not referenced anywhere else in this notebook.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=emotion2idx,
    id2label=idx2emotion,
    finetuning_task="wav2vec2_clf",
    pooling_mode=pooling_mode,
)

# setattr(config, 'pooling_mode', pooling_mode)

In [6]:
from transformers import Wav2Vec2ForSequenceClassification

# Processor for the checkpoint; its `feature_extractor` is handed to the Trainer later on.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)

# Sequence-classification model loaded from the same checkpoint, using the
# `config` object built in the previous cell.
pretrained_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['projector.weight', 'classifier.bias', 'projector.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [7]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

# if is_apex_available():
    # from apex import amp

# if version.parse(torch.__version__) >= version.parse("1.6"):
    # _is_native_amp_available = True

from torch.cuda import amp
from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """``Trainer`` subclass with a hand-written ``training_step``.

    The step performs a plain forward/backward pass: it does not use autocast,
    a gradient scaler, apex or DeepSpeed.
    """

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: Scalar, detached tensor with the training loss on this batch (already divided by
            ``gradient_accumulation_steps`` when that is greater than 1).
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        loss = self.compute_loss(model, inputs)

        # Reduce to a scalar before anything else. When the model is replicated
        # across several GPUs the loss can come back with one entry per replica,
        # and the value returned from this method must be a single number.
        # For an already-scalar loss, mean() leaves the value unchanged.
        loss = loss.mean()

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # NOTE(review): no loss scaling / autocast is applied here, so this step
        # presumably should not be combined with `fp16=True` -- confirm against
        # the installed transformers version before using this class.
        loss.backward()

        return loss.detach()


In [8]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
            Accepted for interface compatibility; ``__call__`` does not read it.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Accepted for interface compatibility; ``__call__`` does not read it.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of ``features`` and attach a ``labels`` tensor.

        Args:
            features: One dict per example, each holding ``"input_values"`` and ``"labels"``.

        Returns:
            The batch returned by ``processor.pad`` with an added ``"labels"`` entry.
            Labels are ``torch.long`` when the first label is an integer scalar
            (Python int, 0-d integer tensor, or numpy integer scalar) and
            ``torch.float`` otherwise.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        # The dtype is decided from the first label only, as before.
        label_dtype = torch.long if self._is_integer_scalar(label_features[0]) else torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=label_dtype)

        return batch

    @staticmethod
    def _is_integer_scalar(label) -> bool:
        """Return True when ``label`` is a scalar integer class id."""
        if isinstance(label, int):
            return True
        if isinstance(label, torch.Tensor):
            # Only 0-d tensors count as scalars; anything with dimensions keeps
            # the float path.
            return label.dim() == 0 and not (label.is_floating_point() or label.is_complex())
        # numpy integer scalars expose `ndim == 0` and a dtype kind of "i"/"u".
        dtype_kind = getattr(getattr(label, "dtype", None), "kind", None)
        return getattr(label, "ndim", None) == 0 and dtype_kind in ("i", "u")

In [9]:
# Collator instance using the processor loaded above, with padding=True.
# NOTE(review): the `Trainer(...)` call in In [14] does not pass `data_collator`,
# so this object looks unused by the training run -- confirm.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [10]:
import numpy as np
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction, is_regression=False):
    """Compute a single evaluation metric from an ``EvalPrediction``.

    Args:
        p: Object exposing ``predictions`` (an array, or a tuple whose first
            element is used) and ``label_ids``.
        is_regression: When true, report mean squared error on the squeezed
            predictions; otherwise report accuracy of the argmax over axis 1.

    Returns:
        ``{"mse": float}`` for regression, ``{"accuracy": float}`` otherwise.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]

    if is_regression:
        squared_error = (np.squeeze(raw_predictions) - p.label_ids) ** 2
        return {"mse": squared_error.mean().item()}

    is_correct = np.argmax(raw_predictions, axis=1) == p.label_ids
    return {"accuracy": is_correct.astype(np.float32).mean().item()}

In [11]:
#  torch.cuda.set_device(2)

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./wav2vec2-xlsr-speech-emotion-classification",
    save_steps=10,
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=1.0,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    # Evaluation and logging cadence (every 10 steps).
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_eval_batch_size=4,
    logging_steps=10,
)

In [13]:
# trainer = CTCTrainer(
#     model=pretrained_model,
#     # data_collator=data_collator,
#     args=training_args,
#     # compute_metrics=compute_metrics,
#     train_dataset=train_ds,
#     eval_dataset=valid_ds,
#     tokenizer= processor.feature_extractor,
# )

In [14]:
from transformers import Trainer
# Stock Trainer wired to the classifier, the mapped datasets and the accuracy
# metric. No `data_collator` is passed; the processor's feature extractor is
# supplied through `tokenizer`.
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)

Using cuda_amp half precision backend


In [15]:
# Launch fine-tuning with the settings in `training_args` (evaluation, logging
# and checkpointing are all configured for every 10 steps there).
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: id. If id are not expected by `Wav2Vec2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7525
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 2
  Total optimization steps = 314
  Number of trainable parameters = 315702919


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: id. If id are not expected by `Wav2Vec2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1835
  Batch size = 12
***** Running Evaluation *****
  Num examples = 1835
  Batch size = 12
Saving model checkpoint to ./wav2vec2-xlsr-speech-emotion-classification/checkpoint-10
Configuration saved in ./wav2vec2-xlsr-speech-emotion-classification/checkpoint-10/config.json
Model weights saved in ./wav2vec2-xlsr-speech-emotion-classification/checkpoint-10/pytorch_model.bin
Feature extractor saved in ./wav2vec2-xlsr-speech-emotion-classification/checkpoint-10/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForSequenceClassification.forward` and have been ignored: id. If id are not expected by `Wav2Vec2ForSequenceC

In [None]:

import torch
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AdamW
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
# Prefer the second GPU ('cuda:1') when it exists; otherwise fall back to the
# default GPU, and finally to the CPU, so the cell still runs on machines with
# fewer devices.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Pre-training Scheme ...

# Checkpoint to fine-tune from. This assignment has to be live: the call below
# reads `pretrain_str`, and nothing else in the notebook defines it.
# NOTE(review): confirm this is the intended checkpoint.
pretrain_str = "w11wo/wav2vec2-xls-r-300m-korean"

# Sequence-classification model with a 7-way head, rebinding `pretrained_model`.
pretrained_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    pretrain_str,
    num_labels=7
    )

In [None]:
import erc

# Pass the freshly loaded model to the project's parameter-count helper.
erc.utils.count_parameters(pretrained_model)

In [None]:
# Loss function; the multi-label variant is kept below for reference.
# criterion = nn.MultiLabelSoftMarginLoss()
criterion = nn.CrossEntropyLoss()

# Move the classifier to the selected device before building the optimizer
# over its parameters.
model = pretrained_model.to(device)
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
)

In [None]:

# Hugging Face `from_pretrained` hands back the model in evaluation mode, so
# switch to training mode explicitly before fine-tuning; otherwise dropout
# stays disabled for the whole loop.
model.train()

total_loss = 0
train_acc_sum = 0
train_loss = []  # running mean of the loss after each step

# NOTE(review): `train_loader` is not created in any visible cell of this
# notebook -- confirm where it is supposed to come from.
for step, batch in enumerate(train_loader):
    optimizer.zero_grad()
    labels = (batch['emotion']).to(device)

    # NOTE(review): the bare .squeeze() removes every size-1 dimension, which
    # would also drop the batch dimension for a batch of one -- confirm the
    # shape the processor returns for `batch["wav"]`.
    input_values = processor(batch["wav"],
                             sampling_rate=16000,
                             return_tensors="pt",
                             return_attention_mask = False)['input_values'].squeeze()
    inputs = {"input_values":input_values,
              "attention_mask":batch['wav_mask'],
    }
    inputs = {key: value.to(device) for key, value in inputs.items()}

    logits = model(**inputs).logits

    loss = criterion(logits, labels.long())
    total_loss += loss.item()
    train_loss.append(total_loss/(step+1))
    loss.backward()
    optimizer.step()

avg_train_loss = total_loss / len(train_loader)
print(f'  Average training loss: {avg_train_loss:.2f}')



In [None]:
# Shapes first, then dtypes, one value per line, for the last batch's logits and labels.
print(
    logits.shape,
    labels.shape,
    labels.dtype,
    logits.dtype,
    sep="\n",
)


In [None]:
model.config.id2label

In [None]:
# Index of the largest logit along dim 1 (displayed only; the result is not stored).
torch.argmax(logits, dim=1)


In [None]:
# Store the argmax over dim 1 of the logits under the name the following cells
# read, then display it.
predicted_class_ids = torch.argmax(logits, dim=1)
predicted_class_ids

In [None]:
# `id2label` has to be indexed with plain ints, not tensors, so convert first.
# A scalar id yields one label; a batch of ids yields a list of labels.
_class_ids = torch.as_tensor(predicted_class_ids)
if _class_ids.dim() == 0:
    predicted_label = model.config.id2label[int(_class_ids)]
else:
    predicted_label = [
        model.config.id2label[int(class_id)] for class_id in _class_ids.reshape(-1)
    ]

In [None]:
predicted_label

In [None]:


class EmotionClassfierWithAudio(nn.Module):
    """Classifier that mean-pools a wrapped model's logits over axis 1 and applies a linear head.

    Args:
        pretrained_model: Module whose output exposes ``.logits``. The original
            note gives that shape as (B, Seq(355), 1025).
        n_classes: Number of output classes of the linear head.
        in_features: Size of the last logits dimension fed to the linear head
            (defaults to 1025, the value previously hard-coded).
    """

    def __init__(self, pretrained_model, n_classes=7, in_features=1025):
        super().__init__()
        # The attribute name (including its spelling) is kept so that existing
        # references and state-dict keys continue to match.
        self.pretrianed_model = pretrained_model
        # for p in self.pretrianed_model.parameters():
            # p.requires_grad = False

        self.fc = nn.Linear(in_features, n_classes)

    def forward(self, x):
        """Return class scores of shape (B, n_classes) for input ``x``."""
        logits = self.pretrianed_model(x).logits
        # Average over axis 1 (the sequence axis per the original note). The
        # batch axis is kept even when B == 1.
        pooled = logits.mean(dim=1)
        return self.fc(pooled)

In [None]:
# !pip install jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# import soundfile as sf
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
# Processor for the Korean wav2vec2 checkpoint (rebinds `processor`).
processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")

# One example from the torch-style training dataset (rebinds `batch`).
batch = train_dataset[2]

# Run the processor on that example's waveform, without an attention mask.
inputs = processor(
    batch["wav"],
    sampling_rate=16000,
    return_tensors="pt",
    return_attention_mask=False,
    # padding="longest"
)

In [None]:
inputs.keys()

In [None]:
inputs['input_values']

In [None]:
batch.keys()

In [None]:
batch['wav']

In [None]:
batch['wav_mask']

In [None]:
batch['wav_mask'].shape

In [None]:
# NOTE(review): `inputs` was produced by a processor call that passes
# return_attention_mask = False, so an 'attention_mask' entry is presumably
# absent here -- confirm before relying on this cell.
inputs['attention_mask'].squeeze()


In [None]:
# NOTE(review): same caveat as any other 'attention_mask' lookup on `inputs`:
# the processor call passes return_attention_mask = False, so the key is
# presumably missing -- confirm.
inputs['attention_mask'].shape

In [None]:



# Load the CTC variant of the checkpoint and move it to the GPU.
# Note that this rebinds `model`.
model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
model = model.to('cuda')

input_values = inputs.input_values
print(input_values.shape)

# Forward pass without gradient tracking.
with torch.no_grad():
    cuda_input = input_values.type(torch.FloatTensor).to("cuda")
    logits = model(cuda_input).logits

# Highest-scoring id along the last axis, decoded by the processor.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)

# Show the stored text next to the decoded output, then their word error rate.
reference_text = batch['txt']
print(reference_text)
print(transcription)
print("WER:", wer(reference_text, transcription))

In [None]:
logits.shape

In [None]:
# Shape after selecting the last index along axis 1 of the logits.
logits[:,-1].shape

In [None]:
logits.shape

In [None]:
# Shape after averaging the logits over axis 1.
logits.mean(axis=1).shape

In [None]:
logits[:,-1]

In [None]:
logits

In [None]:
logits

In [None]:
# Display the full model output for the same input. Unlike the earlier
# inference cell, this call is not wrapped in torch.no_grad().
model(input_values.type(torch.FloatTensor).to("cuda"))

In [None]:
predicted_ids