In [1]:
!pip install pandas
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn
!pip install wandb

[0m

In [2]:
import os
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, WhisperForAudioClassification, Trainer, TrainingArguments
import datetime
from sklearn.metrics import accuracy_score
# sys.path.append("E:/university/FYT/repos/multi_modal_ser")
sys.path.append("/home/multi_modal_ser")
from utils.dataset import MMSERDataset
from datasets import load_metric
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from torch.utils.data import Dataset, DataLoader
import torch
from torch.utils.data import Dataset, Subset

### Log the Notebook

In [3]:
import sys
import logging

nblog = open("hubert_nb.log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

Autosaving every 5 seconds


In [4]:
print(device)
!nvidia-smi

cuda
Sun Oct 15 12:56:57 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:81:00.0 Off |                  Off |
| 46%   34C    P8              19W / 450W |      5MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:82:00.0 Of

In [5]:
# mmser_ds = torch.load("E:/datasets/preprocessed/dataset/mmser_ds.pt")
mmser_ds = torch.load("/home/mmser_ds.pt")
print("Emotion ID: ", mmser_ds.df_["emotion_id"].unique())

Emotion ID:  [2. 1. 3. 0.]


### HUBERT

In [6]:
MODEL_NAME = "facebook/hubert-large-ls960-ft"

In [7]:
from transformers import AutoProcessor, HubertModel
from tqdm import tqdm

class ProcessedDataset(Dataset):
    
    def __init__(self, base_ds, pretrained_model):
        self.base_ds = base_ds
        self.processor = AutoProcessor.from_pretrained(pretrained_model)
        self.__process__()
        
    def __process__(self):
        self.input_values_list = []
        self.attention_mask_list = []
        for raw_audio in tqdm(self.base_ds.raw_list):
            processed = self.processor(raw_audio, 
                                       sampling_rate=16000,
                                       padding='max_length',
                                       max_length=300000,
                                       truncation=True, 
                                      return_tensors="np")
            self.input_values_list.append(processed["input_values"].squeeze())
            self.attention_mask_list.append(processed["attention_mask"].squeeze())
    def __len__(self):
        return len(self.base_ds)
    
    def __getitem__(self, idx):
        base_dict = self.base_ds[idx]
        base_dict["input_values"] = self.input_values_list[idx]
        base_dict["attention_mask"] = self.attention_mask_list[idx]
        del base_dict["audio"]
        return base_dict
        
        

In [8]:
AutoProcessor.from_pretrained(MODEL_NAME)
processed_ds = ProcessedDataset(mmser_ds, MODEL_NAME)

100%|██████████| 5531/5531 [00:12<00:00, 453.20it/s]


### Build Model

In [9]:
from transformers import AutoProcessor, HubertModel, AutoModel
from datasets import load_dataset

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.config.use_weighted_layer_sum

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


False

In [10]:
class CustomClassifier(torch.nn.Module):

    def __init__(self, pretrained_model, num_labels=4):
        super(CustomClassifier, self).__init__()
        self.encoder = AutoModel.from_pretrained(pretrained_model)
        self.config = self.encoder.config
        self.config.num_labels = num_labels
        self.projector = nn.Linear(self.config.hidden_size, self.config.classifier_proj_size)
        self.classifier = nn.Linear(self.config.classifier_proj_size, num_labels)

    def forward(
        self,
        input_values,
        attention_mask = None,
        output_attentions = None,
        output_hidden_states = None,
        return_dict = None,
        labels = None,
    ):
        outputs = self.encoder(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        
        hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)
        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self.encoder._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
        logits = self.classifier(pooled_output)
        return {"logits":logits}
    
model = CustomClassifier(MODEL_NAME, mmser_ds.df_["emotion_id"].nunique())

Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# logits = model(torch.Tensor(processed_ds[:2]["input_values"]).squeeze(), 
#                       torch.Tensor(processed_ds[:2]["attention_mask"]).squeeze()).last_hidden_state
# print(logits.shape)

### Split Dataset

In [12]:
meta_df_ = processed_ds.base_ds.df_
sess_dict = meta_df_.groupby("session").groups
all_indices = set(meta_df_.index.tolist())

In [13]:
sess_ds = {}
for sess in sess_dict:
    sess_ds[sess+"_train"] = Subset(processed_ds, 
                                    indices=list(all_indices-set(sess_dict[sess])))
    sess_ds[sess+"_test"] = Subset(processed_ds, 
                                    indices=sess_dict[sess])
    

In [14]:
processed_ds[2]

{'sess': 'Ses01F',
 'fn': 'Ses01F_impro01_F002',
 'text': 'Is there a problem?',
 'labels': 2.0,
 'input_values': array([-0.3434539 , -0.27715507, -0.23425584, ...,  0.        ,
         0.        ,  0.        ]),
 'attention_mask': array([1, 1, 1, ..., 0, 0, 0], dtype=int32)}

### Custom Trainer, Metrics

In [15]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").type(torch.LongTensor).to(device)
        
        input_values = inputs["input_values"].to(device).to(torch.float32)
        attention_mask = inputs["attention_mask"].to(device).to(torch.float32)
        outputs = model(input_values, 
                       attention_mask)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss() 
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))        
        return (loss, outputs) if return_outputs else loss
    
def weighted_acc(y_true, y_pred):
    return np.sum((np.array(y_pred).ravel() == np.array(y_true).ravel()))*1.0/len(y_true)
    
def unweighted_acc(y_true, y_pred):
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    classes = np.unique(y_true)
    classes_accuracies = np.zeros(classes.shape[0])
    for num, cls in enumerate(classes):
        classes_accuracies[num] = weighted_acc(y_true[y_true == cls], y_pred[y_true == cls])
    return np.mean(classes_accuracies)

def compute_metrics(eval_preds):
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)

    metric_f1 = load_metric("f1")
    metric_acc = load_metric("accuracy")
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)
    f1_ = metric_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    acc_ = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    
    return {"wa":weighted_acc(labels, predictions), 
            "ua":unweighted_acc(labels, predictions),
            "f1":f1_, 
            "accuracy":acc_}

### Set SESS_ID

In [16]:
SESS_ID = list(sess_dict.keys())[2]

def build_ds(sess_id):
    train_size = int(len(sess_ds[sess_id+"_train"])*0.75)
    val_size = len(sess_ds[sess_id+"_train"])-train_size
    train_set, val_set = torch.utils.data.random_split(sess_ds[sess_id+"_train"], [train_size, val_size])
    test_set = sess_ds[sess_id+"_test"]

    print("Train Samples:", len(train_set))
    print("Val Samples:", len(val_set))
    print("Test Samples:", len(test_set))
    
    return train_set, val_set, test_set

In [17]:
train_set, val_set, test_set = build_ds(SESS_ID)

Train Samples: 3769
Val Samples: 1257
Test Samples: 505


##### Load Model

In [18]:
# model.load_state_dict(torch.load(output_dir+"/pytorch_model.bin"))

##### Freeze

In [19]:
model

CustomClassifier(
  (encoder): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Line

In [20]:
for param in model.encoder.parameters():
    param.requires_grad = False

# for param in model.projector.parameters():
#     param.requires_grad = False

In [21]:
output_dir=os.path.join("/home/multi_modal_ser/finetune_encoder/check_pts", "HUBERT", SESS_ID, datetime.datetime.now().date().strftime(format="%Y-%m-%d"))

training_args = TrainingArguments(output_dir,report_to="wandb")
training_args.logging_steps = 250
training_args.eval_steps = 250
training_args.remove_unused_columns=False
training_args.per_device_train_batch_size=4
training_args.per_device_eval_batch_size=16
training_args.evaluation_strategy="steps" 
training_args.logging_strategy="steps"
training_args.load_best_model_at_end=True,
training_args.save_strategy = "no"
training_args.learning_rate=5e-4
training_args.num_train_epochs=30

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

API: 2999b8f99f0f62b4f64c48a1c8be9a16945183e9

In [22]:
print(SESS_ID)

Ses02F


In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmmser[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Wa,Ua,F1,Accuracy
250,1.261,1.210477,0.41607,0.414667,0.380882,0.41607


  metric_f1 = load_metric("f1")


In [None]:
trainer.evaluate()
trainer.save_model(output_dir)

In [None]:
trainer.predict(test_set)