In [1]:
!pip install pandas
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn

[0m

In [2]:
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, WhisperForAudioClassification, Trainer, TrainingArguments
import datetime
from sklearn.metrics import accuracy_score
# sys.path.append("E:/university/Year 5 Spring/FYT/code/multi_modal_ser")
sys.path.append("/home/multi_modal_ser")
from utils.dataset import MMSERDataset
from datasets import load_metric
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print(device)
!nvidia-smi

cuda
Tue Oct 10 16:38:41 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:81:00.0 Off |                  Off |
| 43%   50C    P8              37W / 440W |     16MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:82:00.0 Of

In [1]:
# mmser_ds = torch.load("E:/datasets/preprocessed/dataset/mmser_ds.pt")
mmser_ds = torch.load("/home/mmser_ds.pt")
print("Emotions: ", mmser_ds.df_["emotion_id"].unique())
print("Emotion ID: ", mmser_ds.df_["emotion_id"].unique())

NameError: name 'torch' is not defined

### Build Model

In [7]:
# raw
model = WhisperForAudioClassification.from_pretrained("openai/whisper-base").to(device)
# finetuned
model = WhisperForAudioClassification.from_pretrained("/home/multi_modal_ser/finetune_encoder/2023-10-10").to(device)

model.config.num_labels = mmser_ds.df_["emotion_id"].nunique()
model.classifier = nn.Linear(model.projector.out_features, model.config.num_labels)
model

Emotions:  ['neu' 'ang' 'sad' 'hap' 'exc']


WhisperForAudioClassification(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 512)
    (layers): ModuleList(
      (0-5): 6 x WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affi

##### Freeze

In [None]:
for param in model.encoder.parameters():
    param.requires_grad = False
    
model

### Build Trainer

In [8]:
train_size = int(len(mmser_ds)*0.7)
val_size = int(len(mmser_ds)*0.2)
test_size = len(mmser_ds)-int(len(mmser_ds)*0.7)-int(len(mmser_ds)*0.2)

train_set, val_set = torch.utils.data.random_split(mmser_ds, [train_size, val_size+test_size])
val_set, test_set = torch.utils.data.random_split(val_set, [val_size, test_size])

In [9]:
output_dir=datetime.datetime.now().date().strftime(format="%Y-%m-%d")

training_args = TrainingArguments(output_dir)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").type(torch.LongTensor).to(device)
        outputs = model(input_features=inputs["audio"].to(device))
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss() # weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [12]:
training_args.logging_steps = 50
training_args.eval_steps = 50
training_args.remove_unused_columns=False
training_args.per_device_train_batch_size=100
training_args.per_device_eval_batch_size=100
training_args.evaluation_strategy="steps" 
training_args.logging_strategy="steps"
training_args.load_best_model_at_end=True,
training_args.save_strategy = "no"
training_args.learning_rate=5e-3
training_args.num_train_epochs=200

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

### Build Metrics

In [10]:
def weighted_acc(y_true, y_pred):
    return np.sum((np.array(y_pred).ravel() == np.array(y_true).ravel()))*1.0/len(y_true)
    
def unweighted_acc(y_true, y_pred):
    y_true = np.array(y_true).ravel()
    y_pred = np.array(y_pred).ravel()
    classes = np.unique(y_true)
    classes_accuracies = np.zeros(classes.shape[0])
    for num, cls in enumerate(classes):
        classes_accuracies[num] = weighted_acc(y_true[y_true == cls], y_pred[y_true == cls])
    return np.mean(classes_accuracies)

def compute_metrics(eval_preds):
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)

    metric_f1 = load_metric("f1")
    metric_acc = load_metric("accuracy")
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)
    f1_ = metric_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    acc_ = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    
    return {"wa":weighted_acc(labels, predictions), 
            "ua":unweighted_acc(labels, predictions),
            "f1":f1_, 
            "accuracy":acc_}

### Train

In [15]:
trainer.train()
trainer.save_model(output_dir)

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Wa,Ua,F1,Acc
50,1.4525,1.147513,0.46745,0.465194,{'f1': 0.4660856156216825},{'accuracy': 0.4674502712477396}
100,1.1154,1.080714,0.487342,0.460607,{'f1': 0.4796047341887976},{'accuracy': 0.4873417721518987}
150,1.0803,1.06109,0.517179,0.53266,{'f1': 0.502444637927201},{'accuracy': 0.5171790235081374}
200,1.0898,1.163428,0.449367,0.522294,{'f1': 0.39600447585800963},{'accuracy': 0.44936708860759494}
250,1.0869,1.080725,0.488246,0.451807,{'f1': 0.47585315942380507},{'accuracy': 0.488245931283906}
300,1.0782,1.133318,0.459313,0.482201,{'f1': 0.38191636709828425},{'accuracy': 0.4593128390596745}
350,1.0548,1.031421,0.528029,0.553038,{'f1': 0.5209047144878074},{'accuracy': 0.5280289330922242}
400,1.0564,1.035787,0.536166,0.550564,{'f1': 0.5302753744171311},{'accuracy': 0.5361663652802894}
450,1.0454,1.031998,0.528029,0.545603,{'f1': 0.5137772757040523},{'accuracy': 0.5280289330922242}
500,1.0546,1.012513,0.545208,0.562457,{'f1': 0.5430993139110954},{'accuracy': 0.5452079566003617}


  metric_f1 = load_metric("f1")


In [16]:
trainer.evaluate()

{'eval_loss': 0.9691619277000427,
 'eval_wa': 0.5804701627486437,
 'eval_ua': 0.5992112408067607,
 'eval_F1': {'f1': 0.5782772149553942},
 'eval_acc': {'accuracy': 0.5804701627486437},
 'eval_runtime': 3.8118,
 'eval_samples_per_second': 290.149,
 'eval_steps_per_second': 1.574,
 'epoch': 200.0}

In [25]:
import pickle

with open("log_hist_1010.pkl", 'wb') as f:
    pickle.dump(trainer.state.log_history, f)