In [1]:
!pip install pandas
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn

[0m

In [2]:
import os
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, WhisperForAudioClassification, Trainer, TrainingArguments
import datetime
from sklearn.metrics import accuracy_score
# sys.path.append("E:/university/Year 5 Spring/FYT/code/multi_modal_ser")
sys.path.append("/home/multi_modal_ser")
from utils.dataset import MMSERDataset
from datasets import load_metric
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Log the Notebook

In [3]:
# Send notebook output and IPython's own log records to a local "nb.log" file.
import sys
import logging

# Opened in "a+" mode so earlier sessions' content is kept. The handle is not
# closed in this cell; the objects patched below keep a reference to it.
nblog = open("nb.log", "a+")
# NOTE(review): `echo` is presumably the attribute the kernel's output streams use
# to copy every write to a second file-like object — confirm for the ipykernel
# version in use.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython application logger at the same file and
# set that logger's level to INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

# Autosave interval of 5 seconds (confirmed by the cell output).
%autosave 5

Autosaving every 5 seconds


In [4]:
# Show the selected torch device, then the GPU status reported by nvidia-smi.
print(device)
!nvidia-smi

cuda
Wed Oct 11 06:12:17 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.05              Driver Version: 535.86.05    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:B1:00.0 Off |                  Off |
|  0%   41C    P8              22W / 450W |      6MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:B7:00.0 Of

In [5]:
# mmser_ds = torch.load("E:/datasets/preprocessed/dataset/mmser_ds.pt")
# Load the serialized dataset object; its `df_` attribute is indexed by column below.
# NOTE(review): torch.load unpickles Python objects — only load files from a trusted source.
mmser_ds = torch.load("/home/mmser_ds.pt")
# NOTE(review): both prints read the same "emotion_id" column, so the "Emotions"
# line shows ids, not names — presumably a different column was intended; confirm.
print("Emotions: ", mmser_ds.df_["emotion_id"].unique())
print("Emotion ID: ", mmser_ds.df_["emotion_id"].unique())

Emotions:  [2. 1. 3. 0.]
Emotion ID:  [2. 1. 3. 0.]


### Build Model

In [6]:
# Alternative starting point (disabled): the stock "openai/whisper-base" weights
# with a new classification head sized to the number of distinct emotion ids.
# model = WhisperForAudioClassification.from_pretrained("openai/whisper-base").to(device)
# model.config.num_labels = mmser_ds.df_["emotion_id"].nunique()
# model.classifier = nn.Linear(model.projector.out_features, model.config.num_labels)
# model

# Active starting point: a locally saved, already fine-tuned checkpoint.
model = WhisperForAudioClassification.from_pretrained(
    "/home/multi_modal_ser/finetune_encoder/finetune/2023-10-10"
)
model = model.to(device)

##### Freeze

In [7]:
# Freeze the encoder; keep the projector and the classifier head trainable.
for frozen_weight in model.encoder.parameters():
    frozen_weight.requires_grad = False
for trainable_part in (model.projector, model.classifier):
    for head_weight in trainable_part.parameters():
        head_weight.requires_grad = True

# Display the module tree as the cell output.
model

WhisperForAudioClassification(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 512)
    (layers): ModuleList(
      (0-5): 6 x WhisperEncoderLayer(
        (self_attn): WhisperAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affi

### Build Metrics

In [8]:
def weighted_acc(y_true, y_pred):
    """Overall accuracy: the share of predictions that equal their labels.

    Both inputs are flattened before the element-wise comparison, and the
    number of matches is divided by ``len(y_true)``.
    """
    flat_true = np.array(y_true).ravel()
    flat_pred = np.array(y_pred).ravel()
    n_hits = np.sum(flat_pred == flat_true)
    return n_hits * 1.0 / len(y_true)
    
def unweighted_acc(y_true, y_pred):
    """Mean of the per-class accuracies, over the classes that occur in ``y_true``.

    Each class contributes equally regardless of how many samples it has: for
    every distinct label, ``weighted_acc`` is evaluated on just the samples
    carrying that label, and the resulting values are averaged.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()
    per_class = []
    for label in np.unique(truth):
        members = truth == label
        per_class.append(weighted_acc(truth[members], guess[members]))
    return np.mean(per_class)

def compute_metrics(eval_preds):
    """Compute the evaluation metrics reported by the Trainer.

    Args:
        eval_preds: Object exposing ``predictions`` (per-class scores; the
            predicted class is the argmax over the last axis) and ``label_ids``.

    Returns:
        dict with keys ``"wa"`` (overall accuracy), ``"ua"`` (mean per-class
        accuracy), ``"f1"`` (weighted-average F1) and ``"accuracy"``.
    """
    # Unpack and take the argmax once; the original repeated both statements.
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)

    f1_ = load_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]
    acc_ = load_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return {"wa": weighted_acc(labels, predictions),
            "ua": unweighted_acc(labels, predictions),
            "f1": f1_,
            "accuracy": acc_}

### Build Trainer

In [9]:
# 70 % train / 20 % validation / remainder test.
train_size = int(len(mmser_ds) * 0.7)
val_size = int(len(mmser_ds) * 0.2)
test_size = len(mmser_ds) - train_size - val_size

# Seed the split so the same samples land in each subset after a kernel restart;
# an unseeded split makes metrics from different runs incomparable.
# NOTE(review): the model loaded above was fine-tuned in an earlier run; unless
# that run used this same split, validation/test samples here may already have
# been seen during that training — confirm.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_set, val_set, test_set = torch.utils.data.random_split(
    mmser_ds, [train_size, val_size, test_size], generator=split_generator
)

In [10]:
# Outputs of this run go into a sub-directory named after today's date (YYYY-MM-DD).
output_dir = os.path.join(
    "/home/multi_modal_ser/finetune_encoder",
    datetime.datetime.now().date().strftime("%Y-%m-%d"),
)

# Start from the library defaults; individual fields are overridden further down.
training_args = TrainingArguments(output_dir=output_dir)
class CustomTrainer(Trainer):
    """Trainer that feeds the batch's ``"audio"`` entry to the model as
    ``input_features`` and optimises an unweighted cross-entropy loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the cross-entropy loss.

        Args:
            model: The model being trained (possibly wrapped by the Trainer).
            inputs: Batch mapping containing ``"labels"`` and ``"audio"``.
                ``"labels"`` is removed from the mapping.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so the override stays callable from
                transformers releases that pass this keyword; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        # Use the Trainer's own device rather than a notebook-level global.
        outputs = model(input_features=inputs["audio"].to(self.args.device))
        logits = outputs.get("logits")
        # Cast the labels to int64 directly on the logits' device. The original
        # `.type(torch.LongTensor)` forced a copy to the CPU first.
        labels = labels.to(device=logits.device, dtype=torch.long)
        # Plain cross-entropy; pass `weight=` here to re-weight the classes.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [11]:
# Log and evaluate every 50 optimisation steps.
training_args.logging_steps = 50
training_args.eval_steps = 50
training_args.evaluation_strategy = "steps"
training_args.logging_strategy = "steps"

# Keep every dataset field in the batch; CustomTrainer reads the "audio" entry itself.
training_args.remove_unused_columns = False

training_args.per_device_train_batch_size = 100
training_args.per_device_eval_batch_size = 100
# training_args.per_device_train_batch_size=16
# training_args.per_device_eval_batch_size=32

# The original line ended with a trailing comma, which stored the tuple (True,)
# instead of a bool.
# NOTE(review): with save_strategy = "no" no checkpoints are written, so there is
# presumably no "best" checkpoint to reload at the end — confirm the intent.
training_args.load_best_model_at_end = True
training_args.save_strategy = "no"

training_args.learning_rate = 1e-4
training_args.num_train_epochs = 200

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

### Train

In [12]:
# Baseline: evaluate the loaded model on the validation split (the trainer's
# eval_dataset) before any further training.
trainer.evaluate()

  metric_f1 = load_metric("f1")


{'eval_loss': 0.8975237011909485,
 'eval_wa': 0.593128390596745,
 'eval_ua': 0.6100681793323267,
 'eval_f1': 0.5919631215280048,
 'eval_accuracy': 0.593128390596745,
 'eval_runtime': 7.1744,
 'eval_samples_per_second': 154.158,
 'eval_steps_per_second': 0.836}

In [13]:
# Train, then write the resulting model to output_dir.
# NOTE(review): save_model is only reached if train() returns; the recorded run
# ended in a KeyboardInterrupt, so this save presumably never executed — confirm.
trainer.train()
trainer.save_model(output_dir)

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Wa,Ua,F1,Accuracy
50,0.9059,0.90209,0.59132,0.599382,0.590285,0.59132
100,0.8932,0.896708,0.601266,0.616664,0.60052,0.601266
150,0.8955,0.894669,0.592224,0.59674,0.593208,0.592224
200,0.8986,0.906027,0.59132,0.616777,0.58675,0.59132
250,0.8955,0.901015,0.592224,0.606838,0.59075,0.592224
300,0.8975,0.897827,0.599458,0.616335,0.598729,0.599458
350,0.8965,0.899199,0.585895,0.592968,0.58657,0.585895
400,0.8904,0.903996,0.583183,0.601893,0.579203,0.583183
450,0.8884,0.895725,0.596745,0.606547,0.597332,0.596745
500,0.8956,0.897038,0.590416,0.594417,0.591466,0.590416


KeyboardInterrupt: 

In [None]:
# Evaluate on the validation split again (this cell has no recorded execution).
trainer.evaluate()

In [None]:
import pickle

# Dump the trainer's log history (trainer.state.log_history) to a local pickle file.
with open("log_hist_1010.pkl", 'wb') as f:
    pickle.dump(trainer.state.log_history, f)