In [3]:
import logging
import hydra
import pandas as pd

from collections import defaultdict
from tqdm import tqdm 

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colormaps
import matplotlib.cm as cm
import matplotlib as mpl
from matplotlib.patches import Ellipse

from erc import drawing_ellipse, split_df_by_gender


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

with hydra.initialize(version_base=None, config_path="./config"):
    cfg = hydra.compose(config_name="config", overrides={"dataset._target_=erc.datasets.KEMDy19Dataset"})

# select 1-fold 
train_dataset = hydra.utils.instantiate(cfg.dataset, mode = "train")
# valid_dataset = hydra.utils.instantiate(cfg.dataset, mode = "valid")

INFO:erc.utils:Instantiate KEMDy19 Dataset
INFO:erc.utils:./data/kemdy19.csv does not exists. Process from raw data


Processing ECG / EDA / Label from /home/hoesungryu/workspace:   0%|          | 0/20 [00:00<?, ?it/s]

INFO:erc.utils:New dataframe saved as data/kemdy19.csv


InstantiationException: Error in call to target 'erc.datasets.KEMDy19Dataset':
OSError("Cannot save file into a non-existent directory: 'data'")
full_key: dataset

In [1]:

import torch
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AdamW
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
device = torch.device('cuda:1')




In [2]:
# Pre-training Scheme ... 
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

pretrain_str = "kresnik/wav2vec2-large-xlsr-korean"
# pretrain_str = "w11wo/wav2vec2-xls-r-300m-korean"

processor= Wav2Vec2Processor.from_pretrained(pretrain_str)
pretrained_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    # "wav2vec2-xls-r-300m-korean",
    pretrain_str,
    num_labels=7
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['classifier.bias', 'projector.weight', 'p

In [3]:
import erc

erc.utils.count_parameters(pretrained_model)

315702919

In [4]:
model = pretrained_model.to(device)
# criterion = nn.MultiLabelSoftMarginLoss()
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr = 1e-5,  eps = 1e-8)



In [7]:
import torch 
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size= 2)


from accelerate import Accelerator
accelerator = Accelerator()

model, optimizer, training_dataloader = accelerator.prepare(
     model, optimizer, train_loader)


2.1555280685424805
1.876920461654663
2.027853488922119
2.0851197242736816
1.8583887815475464
2.0513827800750732
2.1173458099365234
2.0462806224823
1.9930295944213867
2.056407928466797
1.8575515747070312
1.8788491487503052
1.8656747341156006
1.8904151916503906
1.7740397453308105
1.8491510152816772
1.8243505954742432
1.8593350648880005
1.8837928771972656
1.7847356796264648
1.9228465557098389
1.8982136249542236
2.12068772315979
1.873547911643982
1.7592494487762451
1.9251984357833862
2.132913112640381
2.089188575744629
2.152311086654663
2.1025381088256836
2.107727527618408
2.067348003387451
2.1951334476470947
2.1749963760375977
1.983643889427185
2.0064048767089844
1.9801409244537354
1.9811813831329346
2.04870343208313
1.6701644659042358
1.9914988279342651
1.8618470430374146
1.7785487174987793
1.7649991512298584
1.8274312019348145
1.9521520137786865
2.119351387023926
1.9559354782104492
1.8991422653198242
1.9091132879257202
1.9706895351409912
1.881345510482788
2.113330364227295
1.93231034278

KeyboardInterrupt: 

In [None]:

total_loss = 0
train_acc_sum = 0
train_loss = []
for step, batch in enumerate(train_loader): 
    optimizer.zero_grad()
    labels = (batch['emotion']).to(device)
    input_values = processor(batch["wav"],
                             sampling_rate=16000,
                             return_tensors="pt",
                             return_attention_mask = False)['input_values'].squeeze()
    inputs = {"input_values":input_values,
              "attention_mask":batch['wav_mask'],
    }
    inputs = {key: inputs[key].to(device) for key in inputs}


    logits = model(**inputs).logits

    
    # outputs = torch.argmax(logits, dim=-1)
    # print(logi)

    loss = criterion(logits, labels.long())
    total_loss += loss.item()
    train_loss.append(total_loss/(step+1))
    # print(loss.item())
    loss.backward()
    optimizer.step()

avg_train_loss = total_loss / len(train_loader)
print(f'  Average training loss: {avg_train_loss:.2f}')



In [None]:
print(logits.shape)
print(labels.shape)
print(labels.dtype)
print(logits.dtype)


In [None]:
batch

In [None]:
logits.shape

In [None]:
model.config.id2label

In [None]:
torch.argmax(logits, dim=1)


In [None]:
predicted_class_ids

In [None]:
predicted_label = model.config.id2label[predicted_class_ids]

In [None]:
predicted_label

In [None]:


class EmotionClassfierWithAudio(nn.Module):
    def __init__(self,pretrained_model, n_classes=7):
        super().__init__()
        self.pretrianed_model = pretrained_model
        # for p in self.pretrianed_model.parameters():
            # p.requires_grad = False

        self.fc = nn.Linear(1025,n_classes)

    def forward(self, x):
        x = F.adaptive_avg_pool3d(self.pretrianed_model(x).logits , axis=1).squeeze()# mean of 355 // logit output shape: B, Seq(355), 1025
        x = self.fc(x)
        return x

In [None]:
# !pip install jiwer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# import soundfile as sf
import torch
from jiwer import wer # wer metircs 
from transformers import Wav2Vec2Processor
batch = train_dataset[2]


processor = Wav2Vec2Processor.from_pretrained("kresnik/wav2vec2-large-xlsr-korean")
inputs = processor(batch["wav"], sampling_rate=16000, return_tensors="pt", return_attention_mask = False,
                #    padding="longest"
                   )

In [None]:
inputs.keys()

In [None]:
inputs['input_values']

In [None]:
batch.keys()

In [None]:
batch['wav']

In [None]:
batch['wav_mask']

In [None]:
batch['wav_mask'].shape

In [None]:
inputs['attention_mask'].squeeze()


In [None]:
inputs['attention_mask'].shape

In [None]:



model = Wav2Vec2ForCTC.from_pretrained("kresnik/wav2vec2-large-xlsr-korean").to('cuda')

input_values = inputs.input_values
print(input_values.shape)
with torch.no_grad():
    logits = model(input_values.type(torch.FloatTensor).to("cuda")).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(batch['txt'])
print(transcription)
print("WER:", wer(batch['txt'], transcription))

In [None]:
logits.shape

In [None]:
logits[:,-1].shape

In [None]:
logits.shape

In [None]:
logits.mean(axis=1).shape

In [None]:
logits[:,-1]

In [None]:
logits

In [None]:
logits

In [None]:
model(input_values.type(torch.FloatTensor).to("cuda"))

In [None]:
predicted_ids