In [1]:
!pip install pandas
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn

[0m

In [2]:
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
# sys.path.append("E:/university/Year 5 Spring/FYT/code/multi_modal_ser")
sys.path.append("/home/multi_modal_ser")
from utils.dataset import MMSERDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
print(device)
!nvidia-smi

cuda
Sat Oct  7 17:07:04 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   38C    P8              18W / 450W |      5MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:81:00.0 Of

In [4]:
# mmser_ds = torch.load("E:/datasets/preprocessed/dataset/mmser_ds.pt")
mmser_ds = torch.load("/home/mmser_ds.pt")

### Audio Classification

In [5]:
import torch
from transformers import AutoFeatureExtractor, WhisperForAudioClassification

model = WhisperForAudioClassification.from_pretrained("openai/whisper-base").to(device)
input_features = mmser_ds[78:79]["audio"].to(device)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-base and are newly initialized: ['model.projector.bias', 'model.classifier.weight', 'model.projector.weight', 'model.classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
with torch.no_grad():
    logits = model(input_features).logits

predicted_class_ids = torch.argmax(logits, axis=1).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

'LABEL_0'

### Build Model

In [7]:
print("Emotions: ", mmser_ds.df_["emotion"].unique())
model.config.num_labels = mmser_ds.df_["emotion"].nunique()
model.classifier = nn.Linear(model.projector.out_features, model.config.num_labels)
# model

Emotions:  ['neu' 'fru' 'ang' 'sad' 'hap' 'sur' 'exc' 'oth' 'fea' 'dis']


### Train Model

In [8]:
train_size = int(len(mmser_ds)*0.7)
val_size = int(len(mmser_ds)*0.2)
test_size = len(mmser_ds)-int(len(mmser_ds)*0.7)-int(len(mmser_ds)*0.2)

train_set, val_set = torch.utils.data.random_split(mmser_ds, [train_size, val_size+test_size])
val_set, test_set = torch.utils.data.random_split(val_set, [val_size, test_size])

In [9]:
from torch import nn
from transformers import Trainer
from transformers import TrainingArguments
import datetime
from sklearn.metrics import accuracy_score
output_dir=datetime.datetime.now().date().strftime(format="%Y-%m-%d")

training_args = TrainingArguments(output_dir)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").to(device)
        outputs = model(input_features=inputs["audio"].to(device))
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss() # weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [10]:
from datasets import load_metric
def compute_metrics(eval_preds):
    metric = load_metric("accuracy", "f1")
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    predictions = np.argmax(logits, axis=-1)
    print(metric.compute(predictions=predictions, references=labels))
    return metric.compute(predictions=predictions, references=labels)

In [12]:
training_args.logging_steps = 10
training_args.eval_steps = 50
training_args.remove_unused_columns=False
training_args.per_device_train_batch_size=16
training_args.per_device_eval_batch_size=32
training_args.evaluation_strategy="steps" 
training_args.logging_strategy="steps"
training_args.load_best_model_at_end=True,
training_args.save_strategy = "no"
training_args.learning_rate=5e-4


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

In [13]:
# https://huggingface.co/blog/fine-tune-whisper

In [14]:
# training_args

In [15]:
trainer.train()
trainer.save_model(output_dir)

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
50,1.6321,1.602101,0.353918
100,1.5801,1.704738,0.343958
150,1.6152,1.577165,0.371846
200,1.5978,1.565608,0.384462


  metric = load_metric("accuracy", "f1")


{'accuracy': 0.35391766268260294}
{'accuracy': 0.34395750332005315}
{'accuracy': 0.37184594953519257}
{'accuracy': 0.3844621513944223}


In [16]:
trainer.evaluate()

{'accuracy': 0.397742363877822}


{'eval_loss': 1.544136643409729,
 'eval_accuracy': 0.397742363877822,
 'eval_runtime': 41.7057,
 'eval_samples_per_second': 36.11,
 'eval_steps_per_second': 0.288,
 'epoch': 3.0}