In [23]:
!pip install pandas
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install scikit-learn

[0m

In [24]:
import os
import sys
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoFeatureExtractor, WhisperForAudioClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, BertForSequenceClassification
import datetime
from sklearn.metrics import accuracy_score
# sys.path.append("E:/university/Year 5 Spring/FYT/code/multi_modal_ser")
sys.path.append("/home/multi_modal_ser")
from utils.dataset import MMSERDataset
from datasets import load_metric
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Log the Notebook

In [25]:
import sys
import logging

# Log file opened in append mode; it is shared by every redirection below.
# NOTE(review): the handle is never closed in the visible cells.
nblog = open("nb.log", "a+")
# Attach the log file as the `echo` attribute of the kernel's stdout/stderr
# objects — presumably ipykernel's OutStream mirrors writes to `echo`; verify
# against the installed ipykernel version.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython logger at the same file and raise
# the logger's verbosity to INFO.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

# Autosave the notebook every 5 seconds.
%autosave 5

Autosaving every 5 seconds


In [26]:
# Show the torch device selected in the import cell, then the GPU status
# table reported by the nvidia-smi command-line tool.
print(device)
!nvidia-smi

cuda
Wed Oct 11 06:38:52 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.05              Driver Version: 535.86.05    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:B1:00.0 Off |                  Off |
|  0%   41C    P8              20W / 450W |    562MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:B7:00.0 Of

In [27]:
# mmser_ds = torch.load("E:/datasets/preprocessed/dataset/mmser_ds.pt")
# Load the serialized dataset object from disk.
# NOTE(review): torch.load relies on pickle for arbitrary objects — only load
# files from a trusted source.
mmser_ds = torch.load("/home/mmser_ds.pt")
# NOTE(review): both prints below read the same `emotion_id` column, so the
# "Emotions" line duplicates the "Emotion ID" line; it was presumably meant to
# show a label-name column — confirm against the dataset's dataframe schema.
print("Emotions: ", mmser_ds.df_["emotion_id"].unique())
print("Emotion ID: ", mmser_ds.df_["emotion_id"].unique())

Emotions:  [2. 1. 3. 0.]
Emotion ID:  [2. 1. 3. 0.]


### Build Model

In [28]:
# raw
# model = WhisperForAudioClassification.from_pretrained("openai/whisper-base").to(device)
# model.config.num_labels = mmser_ds.df_["emotion_id"].nunique()
# model.classifier = nn.Linear(model.projector.out_features, model.config.num_labels)
# model
# finetuned

# "/home/multi_modal_ser/finetune_encoder/finetune/2023-10-10"

In [105]:
import torch

class MMSERConcat(torch.nn.Module):
    """Late-fusion emotion classifier over an audio encoder and a text encoder.

    Both pretrained classification models have their ``classifier`` head
    replaced by a pass-through module, so the ``logits`` field of each output
    carries the features that would otherwise feed the head. The two feature
    vectors are concatenated along dim 1 and mapped to ``num_labels`` scores by
    a single linear layer.
    """

    def __init__(self,
                 audio_model_path,
                 text_model_path,
                 audio_encoding_dim=256,
                 text_encoding_dim=768,
                 num_labels=4):
        """Load both encoders, strip their heads and build the fusion layer.

        Args:
            audio_model_path: Checkpoint path or hub id passed to
                ``WhisperForAudioClassification.from_pretrained``.
            text_model_path: Checkpoint path or hub id passed to
                ``BertForSequenceClassification.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
            audio_encoding_dim: Width of the audio feature vector; must match
                what the audio encoder emits once its head is removed.
            text_encoding_dim: Width of the text feature vector; must match
                what the text encoder emits once its head is removed.
            num_labels: Number of output classes.
        """
        super().__init__()
        self.audio_encoder = WhisperForAudioClassification.from_pretrained(audio_model_path).to(device)
        # Pass-through head: `.logits` now returns the pre-classifier features.
        self.audio_encoder.classifier = nn.Identity()
        self.text_encoder = BertForSequenceClassification.from_pretrained(text_model_path).to(device)
        self.text_encoder.classifier = nn.Identity()
        self.concat_linear = torch.nn.Linear(audio_encoding_dim + text_encoding_dim, num_labels).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(text_model_path)

    def forward(self, audio, text, **kwargs):
        """Encode both modalities and return the fused class scores.

        Args:
            audio: Tensor batch forwarded as the first positional argument of
                the audio encoder (presumably Whisper input features — confirm
                against the dataset).
            text: Batch of raw strings: a single ``str``, a list/tuple of
                strings, or any object exposing ``tolist()``.
            **kwargs: Accepted and ignored so extra batch fields do not break
                the call.

        Returns:
            The output of the fusion linear layer (one row of ``num_labels``
            scores per sample).
        """
        # Follow the module's own parameters rather than a global device so the
        # code also works inside per-device replicas.
        target_device = self.concat_linear.weight.device

        if isinstance(text, str):
            sentences = [text]
        elif hasattr(text, "tolist"):
            sentences = text.tolist()
        else:
            sentences = list(text)

        # Use the tokenizer owned by this module (the original referenced an
        # undefined global `tokenizer`); truncate so over-long transcripts do
        # not exceed the text encoder's maximum sequence length.
        text_inputs = self.tokenizer(
            sentences,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(target_device)

        audio_encoding = self.audio_encoder(audio.to(target_device)).logits
        text_encoding = self.text_encoder(**text_inputs).logits
        fused = torch.cat((audio_encoding, text_encoding), dim=1)
        return self.concat_linear(fused)

# Build the fusion model from the fine-tuned audio checkpoint and the stock
# BERT base checkpoint; all other constructor arguments keep their defaults.
model = MMSERConcat(
    audio_model_path="/home/multi_modal_ser/finetune_encoder/finetune/2023-10-10",
    text_model_path="bert-base-uncased",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze the pretrained encoders

In [106]:
# Disable gradients for every parameter of the Whisper encoder stack and of
# the BERT backbone; parameters outside those two submodules stay trainable.
model.audio_encoder.encoder.requires_grad_(False)
model.text_encoder.bert.requires_grad_(False)

# Display the module tree.
model

MMSERConcat(
  (audio_encoder): WhisperForAudioClassification(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 512)
      (layers): ModuleList(
        (0-5): 6 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=False)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
         

### Build Metrics

In [107]:
def weighted_acc(y_true, y_pred):
    """Return overall accuracy: the fraction of predictions equal to the truth.

    Both inputs are flattened before comparison, and the denominator is the
    number of flattened ground-truth elements, so nested or multi-dimensional
    inputs give the same result as their flat equivalents.

    Args:
        y_true: Ground-truth labels (array-like, any shape).
        y_pred: Predicted labels (array-like, same number of elements).

    Returns:
        Accuracy as a NumPy float in [0, 1].
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    # Divide by the flattened size, not len(): len() only counts the first axis.
    return np.sum(y_pred == y_true) * 1.0 / y_true.size
    
def unweighted_acc(y_true, y_pred):
    """Return the mean of the per-class accuracies.

    For each distinct label found in ``y_true`` the accuracy is computed over
    only the samples carrying that label; the per-class values are then
    averaged with equal weight.

    Args:
        y_true: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like, same number of elements).

    Returns:
        The average per-class accuracy as a NumPy float.
    """
    truth = np.array(y_true).ravel()
    guess = np.array(y_pred).ravel()

    per_class = []
    for label in np.unique(truth):
        selected = truth == label
        per_class.append(weighted_acc(truth[selected], guess[selected]))
    return np.mean(per_class)

def compute_metrics(eval_preds):
    """Compute evaluation metrics for the Trainer.

    Args:
        eval_preds: Object exposing ``predictions`` (class scores, or a tuple
            whose first item is the class scores) and ``label_ids``.

    Returns:
        Dict with keys ``"wa"`` (overall accuracy), ``"ua"`` (mean per-class
        accuracy), ``"f1"`` (support-weighted F1) and ``"accuracy"``.
    """
    # Local import: only `accuracy_score` is imported at the top of the
    # notebook. scikit-learn is used directly instead of re-loading the
    # deprecated `datasets.load_metric` scripts on every evaluation call.
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_preds.predictions, eval_preds.label_ids
    if isinstance(logits, tuple):
        # Some models return several arrays; the class scores come first.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    return {"wa": weighted_acc(labels, predictions),
            "ua": unweighted_acc(labels, predictions),
            "f1": float(f1_score(labels, predictions, average="weighted")),
            "accuracy": float(accuracy_score(labels, predictions))}

### Build Trainer

In [108]:
# Fixed seed so the train/val/test partition is identical after a kernel
# restart; without it every run evaluates on a different subset.
SPLIT_SEED = 42

# 70% train, 20% validation, remainder (about 10%) test.
train_size = int(len(mmser_ds) * 0.7)
val_size = int(len(mmser_ds) * 0.2)
test_size = len(mmser_ds) - train_size - val_size

train_set, val_set, test_set = torch.utils.data.random_split(
    mmser_ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Sanity check: the three subsets partition the whole dataset.
assert len(train_set) + len(val_set) + len(test_set) == len(mmser_ds)

In [109]:
# Checkpoints and the final model go to a folder named after today's date.
run_date = datetime.datetime.now().strftime("%Y-%m-%d")
output_dir = os.path.join("/home/multi_modal_ser/finetune_encoder", run_date)

# Start from the library defaults; individual fields are overridden later.
training_args = TrainingArguments(output_dir)
class CustomTrainer(Trainer):
    """Trainer that applies cross-entropy to the fusion model's class scores."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on one batch and return the cross-entropy loss.

        Args:
            model: The model to call; it receives ``inputs["audio"]`` and
                ``inputs["text"]`` positionally.
            inputs: Batch dict; must contain ``"labels"``, ``"audio"`` and
                ``"text"``. ``"labels"`` is popped from the dict.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted and ignored; present only so the
                override stays callable by Trainer versions that pass it.

        Returns:
            ``loss`` or, when ``return_outputs`` is true,
            ``(loss, {"logits": logits})``.
        """
        labels = inputs.pop("labels")
        outputs = model(inputs["audio"], inputs["text"])
        # The model may return a bare score tensor or a dict-like output with
        # a "logits" entry; accept both.
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs
        # Cast on the tensor itself and follow the logits' device instead of
        # bouncing through a CPU LongTensor.
        labels = labels.long().to(logits.device)

        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss()  # weight=torch.tensor([1.0, 2.0, 3.0], device=logits.device))
        # Class count comes from the logits: the wrapped model has no `config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Hand the outputs back as a dict so the Trainer's prediction step
        # reads the logits by key rather than slicing a bare tensor.
        return (loss, {"logits": logits}) if return_outputs else loss

In [110]:
# Log and evaluate every 50 optimizer steps.
training_args.logging_steps = 50
training_args.eval_steps = 50
training_args.remove_unused_columns = False
training_args.per_device_train_batch_size = 100
training_args.per_device_eval_batch_size = 100
# training_args.per_device_train_batch_size=16
# training_args.per_device_eval_batch_size=32
training_args.evaluation_strategy = "steps"
training_args.logging_strategy = "steps"
# Fixed: a trailing comma previously stored the tuple `(True,)` here.
# NOTE(review): with save_strategy="no" no checkpoints are written, so there
# is presumably nothing for this flag to restore at the end — confirm intent.
training_args.load_best_model_at_end = True
training_args.save_strategy = "no"
training_args.learning_rate = 1e-4
training_args.num_train_epochs = 200
# The model's forward() has no label argument, so name the label key
# explicitly; this lets evaluation route batches through compute_loss and
# collect label ids for compute_metrics.
training_args.label_names = ["labels"]

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
)

### Train

In [111]:
# evaluate the init model
# NOTE(review): the recorded output of this cell is a TypeError
# ("MMSERConcat.forward() missing 1 required positional argument: 'text'")
# raised inside a DataParallel replica, so no baseline metrics were produced.
# Presumably the batch reaching the model carried no "text" entry — verify
# what the data collator emits for string fields before re-running.
trainer.evaluate()

TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: MMSERConcat.forward() missing 1 required positional argument: 'text'


In [None]:
# Run the training loop, then write the resulting model to `output_dir`.
trainer.train()
trainer.save_model(output_dir)

In [None]:
# Evaluate the trained model on the validation set given to the trainer.
trainer.evaluate()

In [None]:
import pickle

# Persist the trainer's accumulated log history so the run's curves can be
# reloaded later without re-training.
with open("log_hist_1010.pkl", "wb") as log_file:
    pickle.dump(trainer.state.log_history, log_file)