In [24]:
import logging
import hydra

from erc.preprocess import generate_datasets

# Module-level logger for this notebook, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Compose the project config from ./config and point the dataset target at the
# KEMDy19 dataset class. Hydra's compose API takes overrides as a list of
# "key=value" strings; the previous set literal was the wrong container type
# and would not preserve order once more than one override is given.
with hydra.initialize(version_base=None, config_path="./config"):
    cfg = hydra.compose(
        config_name="config",
        overrides=["dataset._target_=erc.datasets.KEMDy19Dataset"],
    )

ImportError: cannot import name 'generate_datasets' from 'erc.preprocess' (/home/hoesungryu/etri-erc/erc/preprocess.py)

In [None]:
# Instantiate the torch datasets for one validation fold, then hand each one to
# generate_datasets (per the original note: torch dataset -> huggingface dataset).
fold_num = 1
train_dataset, valid_dataset = (
    hydra.utils.instantiate(cfg.dataset, mode=split_name, validation_fold=fold_num)
    for split_name in ("train", "valid")
)

# Arguments that are identical for both splits.
shared_kwargs = dict(
    save_name='audio_dataset_19',
    validation_fold=fold_num,
    overrides=False,
)
train_ds = generate_datasets(train_dataset, mode='train', **shared_kwargs)
valid_ds = generate_datasets(valid_dataset, mode='valid', **shared_kwargs)

In [None]:
# Wav2Vec2 
from transformers import AutoConfig, Wav2Vec2Processor
from erc.constants import idx2emotion, emotion2idx


# Default settings for the audio classifier.
model_name_or_path = "kresnik/wav2vec2-large-xlsr-korean"
num_labels = 7
pooling_mode = "mean"  # original note lists max or min as the alternatives

# Load the checkpoint's config with the emotion label maps attached, then store
# the pooling mode on it as an extra attribute.
config_kwargs = dict(
    num_labels=num_labels,
    label2id=emotion2idx,
    id2label=idx2emotion,
    finetuning_task="wav2vec2_clf",
)
config = AutoConfig.from_pretrained(model_name_or_path, **config_kwargs)
config.pooling_mode = pooling_mode

In [None]:
# Load the processor for the chosen checkpoint and report the sampling rate
# configured on its feature extractor.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
feature_extractor = processor.feature_extractor
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

In [None]:
# Show the field names of the first example in the training dataset.
train_dataset[0].keys()

In [None]:

import torch
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AdamW
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
# Prefer the second GPU (the original hard-coded choice) when the host has one;
# otherwise fall back to the first GPU, then to CPU, so that later .to(device)
# calls do not fail with an invalid device ordinal on smaller machines.
device = torch.device(
    'cuda:1' if torch.cuda.device_count() > 1
    else 'cuda:0' if torch.cuda.is_available()
    else 'cpu'
)




In [None]:
# Pre-training Scheme ... 
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification


# Hub checkpoint used for both the processor and the classifier backbone.
# This assignment had been commented out, which left `pretrain_str` undefined
# and made this cell raise NameError on a fresh kernel.
pretrain_str = "w11wo/wav2vec2-xls-r-300m-korean"

processor = Wav2Vec2Processor.from_pretrained(pretrain_str)
# Sequence-classification model with a 7-way output.
pretrained_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    pretrain_str,
    num_labels=7,
)

In [None]:
import erc

# Run the project's count_parameters helper on the loaded model.
erc.utils.count_parameters(pretrained_model)

In [None]:
# Move the classifier to the selected device and set up loss and optimizer.
model = pretrained_model.to(device)
# Cross-entropy over the class logits; labels are cast to long in the loop.
# (nn.MultiLabelSoftMarginLoss was tried earlier and left disabled.)
criterion = nn.CrossEntropyLoss()
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the old default (torch.optim.AdamW defaults to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)

In [None]:
import torch 
from torch.utils.data import DataLoader

# Shuffle training examples each epoch; the loader previously yielded them in
# dataset order.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)


from accelerate import Accelerator
# device_placement=False: the notebook places the model and the inputs on
# `device` itself. With the default, prepare() would move the model to the
# accelerator's own device while the training loop keeps sending inputs to
# `device`, which breaks as soon as the two differ.
accelerator = Accelerator(device_placement=False)

# Rebind train_loader to the prepared loader so the training loop, which
# iterates `train_loader`, actually uses it. Batches stay on CPU, which the
# processor call in the loop needs.
model, optimizer, train_loader = accelerator.prepare(
    model, optimizer, train_loader)
# Previous name for the prepared loader, kept for any cell that refers to it.
training_dataloader = train_loader


In [None]:

# One pass over the training loader, tracking the running mean of the loss.
total_loss = 0
train_acc_sum = 0  # NOTE(review): never updated in this cell; kept in case other cells read it
train_loss = []  # running mean loss after each step

# from_pretrained() hands the model back in eval mode; enable dropout etc.
# before optimising.
model.train()
# Send tensors to wherever the model's weights actually live instead of
# assuming a fixed device.
model_device = next(model.parameters()).device

for step, batch in enumerate(train_loader):
    optimizer.zero_grad()
    labels = batch['emotion'].to(model_device)
    input_values = processor(batch["wav"],
                             sampling_rate=16000,
                             return_tensors="pt",
                             return_attention_mask=False)['input_values']
    # Flatten any extra leading dims to (batch, time). A bare .squeeze() also
    # dropped the batch dim whenever a batch held a single example.
    input_values = input_values.reshape(-1, input_values.shape[-1])
    inputs = {
        "input_values": input_values,
        "attention_mask": batch['wav_mask'],
    }
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    logits = model(**inputs).logits

    loss = criterion(logits, labels.long())
    total_loss += loss.item()
    train_loss.append(total_loss / (step + 1))
    # Model and optimizer were passed through accelerator.prepare(), so the
    # backward pass goes through the accelerator as well.
    accelerator.backward(loss)
    optimizer.step()

# max(..., 1) avoids a ZeroDivisionError when the loader is empty.
avg_train_loss = total_loss / max(len(train_loader), 1)
print(f'  Average training loss: {avg_train_loss:.2f}')



In [None]:
# Shapes first, then dtypes, of the logits and labels left over from the loop.
print(logits.shape, labels.shape, labels.dtype, logits.dtype, sep="\n")


In [None]:
# Display the current value of `batch` (dumps its full contents).
batch

In [None]:
# Shape of the most recently computed logits.
logits.shape

In [None]:
# id2label mapping stored on the model's config.
model.config.id2label

In [None]:
# Index of the largest logit along dim 1 (displayed only, not stored).
torch.argmax(logits, dim=1)


In [None]:
# The argmax in the previous cell was only displayed, never stored, so this
# name was undefined on a fresh kernel. Bind it here before showing it.
predicted_class_ids = torch.argmax(logits, dim=1)
predicted_class_ids

In [None]:
# Map every predicted class index to its label. Hugging Face configs key
# id2label by plain ints, so each index is converted with int() first;
# looking the dict up with a tensor raises KeyError. The indices are computed
# straight from `logits` so this cell does not depend on a separately stored
# argmax result.
predicted_label = [
    model.config.id2label[int(class_id)]
    for class_id in torch.argmax(logits, dim=-1).reshape(-1)
]

In [None]:
# Display the value bound to predicted_label.
predicted_label

In [None]:


class EmotionClassfierWithAudio(nn.Module):
    """Linear emotion head on top of a wrapped audio model.

    The wrapped model is called on the input and must return an object with a
    ``logits`` attribute. Those logits are averaged over dimension 1 and the
    result is projected to ``n_classes`` scores.

    Args:
        pretrained_model: Backbone module. Its logits must have 1025 features
            in the last dimension to match the linear head (the original note
            gives the logit shape as B, Seq(355), 1025).
        n_classes: Number of output classes.
    """

    def __init__(self,pretrained_model, n_classes=7):
        super().__init__()
        # Attribute name (including its spelling) is kept unchanged so existing
        # references and saved state_dict keys remain valid.
        self.pretrianed_model = pretrained_model
        # To freeze the backbone, set requires_grad = False on its parameters.

        self.fc = nn.Linear(1025, n_classes)

    def forward(self, x):
        """Return class scores of shape (batch, n_classes) for input ``x``."""
        logits = self.pretrianed_model(x).logits
        # Mean over dim 1. The earlier F.adaptive_avg_pool3d(..., axis=1) call
        # was invalid (that function has no `axis` argument), and the trailing
        # .squeeze() would also have removed the batch dim for batch size 1.
        pooled = logits.mean(dim=1)
        return self.fc(pooled)

In [None]:
# !pip install jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# import soundfile as sf
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
# Take one example from the training dataset and run it through the processor.
batch = train_dataset[2]


processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
# Request the attention mask: later cells read inputs['attention_mask'], which
# does not exist when return_attention_mask is False.
inputs = processor(
    batch["wav"],
    sampling_rate=16000,
    return_tensors="pt",
    return_attention_mask=True,
    # padding="longest"
)

In [None]:
# Keys returned by the processor call.
inputs.keys()

In [None]:
# The 'input_values' entry of the processor output.
inputs['input_values']

In [None]:
# Field names available on `batch`.
batch.keys()

In [None]:
# The 'wav' field of `batch` (the value passed to the processor).
batch['wav']

In [None]:
# The 'wav_mask' field of `batch`.
batch['wav_mask']

In [None]:
# Shape of the 'wav_mask' field.
batch['wav_mask'].shape

In [None]:
# Processor attention mask with size-1 dims removed.
# NOTE(review): 'attention_mask' is only present if the processor call that
# built `inputs` requested it (return_attention_mask=True) - confirm.
inputs['attention_mask'].squeeze()


In [None]:
# Shape of the processor attention mask.
# NOTE(review): 'attention_mask' is only present if the processor call that
# built `inputs` requested it (return_attention_mask=True) - confirm.
inputs['attention_mask'].shape

In [None]:



# Load the CTC checkpoint onto the GPU (this rebinds `model`), transcribe the
# processed example, and compare the result with its reference text.
model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
model = model.to('cuda')

input_values = inputs["input_values"]
print(input_values.shape)
with torch.no_grad():
    # Cast to float32 and move to the GPU before the forward pass.
    ctc_output = model(input_values.float().to("cuda"))
    logits = ctc_output.logits
# Pick the highest-scoring id at every position and decode the ids to text.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)
reference_text = batch['txt']
print(reference_text)
print(transcription)
print("WER:", wer(reference_text, transcription))

In [None]:
# Shape of the current logits.
logits.shape

In [None]:
# Shape after selecting the last index along dim 1.
logits[:,-1].shape

In [None]:
# Same shape check as in an earlier cell.
logits.shape

In [None]:
# Shape after averaging the logits over dim 1.
logits.mean(axis=1).shape

In [None]:
# Logits at the last index along dim 1.
logits[:,-1]

In [None]:
# Display the full logits tensor.
logits

In [None]:
# Display the full logits tensor again (same expression as the previous cell).
logits

In [None]:
# Display the model's full output object for the same input.
# NOTE(review): unlike the earlier forward pass, this call is not wrapped in
# torch.no_grad().
model(input_values.type(torch.FloatTensor).to("cuda"))

In [None]:
# Display the value bound to predicted_ids.
predicted_ids