### Prepare Repo

In [23]:
# Optional one-time environment setup, kept commented out. Uncomment on a fresh
# machine. Only omegaconf, hydra-core and numpy are version-pinned below.
# !pip install omegaconf==2.1.1
# !pip install hydra-core==1.1.1
# !pip install -U numpy==1.23.5
# !apt-get update && apt-get install -y python3-opencv
# !pip install opencv-python
# !pip install scikit-image
# !pip install transformers
# !pip install datasets
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install wandb
# !pip install scikit-learn

In [1]:
!git clone https://github.com/facebookresearch/av_hubert.git

%cd av_hubert
!git submodule init
!git submodule update
!pip install scipy
!pip install sentencepiece
!pip install python_speech_features
!pip install scikit-video

%cd fairseq
!pip install ./

Cloning into 'av_hubert'...
remote: Enumerating objects: 149, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 149 (delta 18), reused 22 (delta 14), pack-reused 111[K
Receiving objects: 100% (149/149), 4.65 MiB | 8.29 MiB/s, done.
Resolving deltas: 100% (64/64), done.
/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert
Submodule 'fairseq' (https://github.com/pytorch/fairseq) registered for path 'fairseq'
Cloning into '/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert/fairseq'...
Submodule path 'fairseq': checked out 'afc77bdf4bb51453ce76f1572ef2ee6ddcda8eeb'
[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https:/

In [2]:
# NOTE: `utils` bound on the next lines is fairseq's utils module. The module
# imported further down as `avhubert_utils` is a different top-level `utils`.
import fairseq
from fairseq import checkpoint_utils, options, tasks, utils
import cv2
import tempfile
import torch
from transformers import Trainer, TrainingArguments
import sys
# Order matters: the avhubert directory is put on sys.path (and made the working
# directory's parent) before `import utils as avhubert_utils` runs.
# Presumably that import resolves to avhubert/utils.py — TODO confirm.
sys.path.append("/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert/avhubert")
%cd /home/multi_modal_ser/finetune_encoder/audio_video/av_hubert/
import utils as avhubert_utils
from argparse import Namespace
from IPython.display import HTML
import numpy as np
import sys
print(sys.version)
# Use CUDA when torch can see a GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch.nn as nn
import wandb
from torch.utils.data import Dataset, Subset
import os
import datetime

/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert
3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]


In [3]:
# Show which torch device was selected, then the GPU status reported by the driver.
print(device)
!nvidia-smi

cuda
Tue Oct 24 11:51:23 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.05              Driver Version: 535.86.05    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   32C    P8              15W / 450W |      5MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        On  | 00000000:03:00.0 Of

### Download Model

In [4]:
# Optional one-time checkpoint download, kept commented out. Both wget lines write
# to /home/check_pts/avhubert.pt; the first (doubly commented) one is the
# alternative "vsr/base_vox_433h" checkpoint.
# os.makedirs("/home/check_pts/")
# # !wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/vsr/base_vox_433h.pt -O /home/check_pts/avhubert.pt
# !wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/clean-pretrain/base_vox_iter4.pt -O /home/check_pts/avhubert.pt

### Build Model Pipeline

In [5]:
# Let fairseq import the AV-HuBERT user module before the checkpoint is read.
user_dir = "/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert/avhubert"
utils.import_user_module(Namespace(user_dir=user_dir))

# Load the checkpoint; only the first model of the returned ensemble is used.
ckpt_path = "/home/check_pts/avhubert.pt"
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
model = models[0]

# A model exposing a `decoder` is treated as fine-tuned: keep only its wrapped
# encoder (`encoder.w2v_model`). Otherwise the loaded model is used as is.
if not hasattr(model, 'decoder'):
    print("Checkpoint: pre-trained w/o fine-tuning")
else:
    print("Checkpoint: fine-tuned")
    model = model.encoder.w2v_model



Checkpoint: pre-trained w/o fine-tuning


### Load Dataset

In [6]:
# Record the installed torch version alongside the results.
import torch
print(torch.__version__)

2.1.0


In [7]:
# AVHUBERTDataset is not referenced by name below; presumably it is imported so
# that torch.load can unpickle the saved dataset object — TODO confirm.
from avhubert_ds import AVHUBERTDataset
# SECURITY NOTE(review): torch.load unpickles arbitrary Python objects; only load
# this file from a trusted source.
mmser_ds = torch.load("/home/avhubert_ds2.pt")
# Override the video directory stored on the restored dataset with the local path.
mmser_ds.video_path = "/home/face_raw/"

# outputs = model.extract_finetune(mmser_ds[:2])

In [8]:
# Clear the dataset's `cached` flag and run its cache-building method.
# What gets cached is defined in avhubert_ds and is not visible in this notebook.
mmser_ds.cached = False
mmser_ds.__cache__()

100%|██████████| 5531/5531 [03:38<00:00, 25.35it/s]


### Define the model

In [105]:
from avhubert_classifier import AVHUBERTClassifier

In [106]:
# Disabled smoke test: build a classifier and run it on the first four samples.
# classifier = AVHUBERTClassifier(model, 768, 256, mmser_ds.df_["emotion_id"].nunique())
# classifier(**mmser_ds[:4])

### Build Train Test DS

In [107]:
meta_df_ = mmser_ds.df_

# "bigsess" is the session label without its last character; rows are grouped by
# it (presumably e.g. "Ses01F" -> "Ses01" — TODO confirm the label format).
mmser_ds.df_["bigsess"] = mmser_ds.df_["session"].apply(lambda x: x[:-1])
sess_dict = mmser_ds.df_.groupby("bigsess").groups
all_indices = set(mmser_ds.df_.index.tolist())

# Leave-one-session-out folds: "<sess>_test" holds the rows of one session and
# "<sess>_train" holds every other row.
# NOTE(review): DataFrame index labels are passed to Subset as dataset indices;
# this assumes df_ uses a default 0..N-1 index — verify.
sess_ds = {}
for fold_no in range(1, 6):
    sess = f"Ses0{fold_no}"
    # Only used by the disabled "validate on the next session" variant.
    sess_val = f"Ses0{fold_no % 5 + 1}"
    held_out = sess_dict[sess]
    remaining = list(all_indices - set(held_out))
    sess_ds[sess + "_test"] = Subset(mmser_ds, indices=held_out)
    sess_ds[sess + "_train"] = Subset(mmser_ds, indices=remaining)
    

In [108]:
def build_ds(sess_id, train_fraction=0.8, seed=None, datasets=None):
    """Split one fold's training pool into train/val and return it with the test set.

    Args:
        sess_id: Fold key prefix (e.g. "Ses01"). The datasets are looked up under
            ``sess_id + "_train"`` and ``sess_id + "_test"``.
        train_fraction: Share of the training pool kept for training; the rest
            becomes the validation set. Defaults to 0.8, the original ratio.
        seed: Optional integer seed for the random split. ``None`` (default)
            draws from torch's global RNG, so the split changes between runs
            unless that RNG was seeded elsewhere.
        datasets: Optional mapping with the "<sess_id>_train" and
            "<sess_id>_test" entries. Defaults to the notebook-level ``sess_ds``.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        KeyError: If the fold keys are missing from the mapping.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be within [0, 1], got {}".format(train_fraction))

    pool = sess_ds if datasets is None else datasets
    train_pool = pool[sess_id + "_train"]
    test_set = pool[sess_id + "_test"]

    train_size = int(len(train_pool) * train_fraction)
    val_size = len(train_pool) - train_size

    # Only pass a generator when a seed is requested, so the default call keeps
    # using torch's global RNG exactly as before.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_set, val_set = torch.utils.data.random_split(
        train_pool, [train_size, val_size], **split_kwargs
    )

    print("Train Samples:", len(train_set))
    print("Val Samples:", len(val_set))
    print("Test Samples:", len(test_set))

    return train_set, val_set, test_set

### Run Pipeline

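API: <redacted — supply the Weights & Biases key via the `WANDB_API_KEY` environment variable or `wandb login`; the key previously committed here should be revoked>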

In [116]:
# Load the checkpoint again so the classifier built next wraps a freshly loaded model.
user_dir = "/home/multi_modal_ser/finetune_encoder/audio_video/av_hubert/avhubert"
utils.import_user_module(Namespace(user_dir=user_dir))

ckpt_path = "/home/check_pts/avhubert.pt"
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
model = models[0]

# A model exposing a `decoder` is treated as fine-tuned: keep only its wrapped
# encoder (`encoder.w2v_model`). Otherwise the loaded model is used as is.
if not hasattr(model, 'decoder'):
    print("Checkpoint: pre-trained w/o fine-tuning")
else:
    print("Checkpoint: fine-tuned")
    model = model.encoder.w2v_model



Checkpoint: pre-trained w/o fine-tuning


In [117]:
import json

# Use the first session group as the held-out fold for this run.
sess_id = list(sess_dict)[0]
print("="*10, sess_id, "="*10)

# One output class per distinct emotion id in the metadata. The 768 and 256
# arguments are presumably the encoder and projector widths — TODO confirm
# against avhubert_classifier.
num_emotions = mmser_ds.df_["emotion_id"].nunique()
avhubert_classifier = AVHUBERTClassifier(model, 768, 256, num_emotions)

# Make every parameter of the wrapped model trainable.
for weight in avhubert_classifier.parameters():
    weight.requires_grad_(True)

wandb.init()
print(sess_id)
train_set, val_set, test_set = build_ds(sess_id)






VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,█▃▇▂▁▂▂▂▁▂▂▂
eval/f1,█▃▆▁▁▁▁▁▁▁▁▁
eval/loss,▁█▂▅▅▅▅▅▅▅▅▅
eval/runtime,▁▇▂▅▄▁▂▄▃▆▂█
eval/samples_per_second,█▂▇▄▅█▇▅▆▃▇▁
eval/steps_per_second,█▂▇▄▅█▇▅▆▃▇▁
eval/ua,█▂▆▁▁▁▁▁▁▁▁▁
eval/wa,█▃▇▂▁▂▂▂▁▂▂▂
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇█
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▅▅▅▅▆▆▆▆▇▇▇▇█

0,1
eval/accuracy,0.31573
eval/f1,0.15153
eval/loss,1.36241
eval/runtime,18.1037
eval/samples_per_second,49.161
eval/steps_per_second,4.143
eval/ua,0.25
eval/wa,0.31573
train/epoch,7.27
train/global_step,2158.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112358374521136, max=1.0…

Ses01
Train Samples: 3556
Val Samples: 890
Test Samples: 1085


In [129]:
# Trainer output goes to check_pts/AVHUBERT/<session>/<YYYY-MM-DD>.
output_dir = os.path.join("check_pts", "AVHUBERT", sess_id, datetime.date.today().strftime("%Y-%m-%d"))

training_args = TrainingArguments(output_dir, report_to="wandb")
# The dataset yields model kwargs directly, so the Trainer must not drop columns.
training_args.remove_unused_columns = False
training_args.per_device_train_batch_size = 6
training_args.per_device_eval_batch_size = 6
# Log and evaluate every int(1000 / per-device batch size) optimizer steps.
training_args.logging_steps = int(1000/training_args.per_device_train_batch_size)
training_args.eval_steps = int(1000/training_args.per_device_train_batch_size)
training_args.evaluation_strategy = "steps"
training_args.logging_strategy = "steps"
# Fix: the original line ended with a trailing comma, which stored the tuple
# (True,) instead of the boolean True.
training_args.load_best_model_at_end = True
# NOTE(review): with save_strategy "no" no checkpoint is written, so there is
# presumably no best checkpoint to reload at the end of training. Set matching
# save/eval strategies if reloading the best model is actually wanted.
training_args.save_strategy = "no"
# NOTE(review): 5e-3 looks high for fine-tuning a pre-trained encoder — confirm.
training_args.learning_rate = 5e-3
training_args.num_train_epochs = 15
# Early stopping and best-model selection both monitor the evaluation loss.
training_args.metric_for_best_model = 'loss'

In [130]:
from avhubert_trainer import CustomTrainer , compute_metrics
from transformers import EarlyStoppingCallback, TrainerCallback, TrainerState

# Move the classifier to the selected device before handing it to the trainer.
avhubert_classifier = avhubert_classifier.to(device)

# Early stopping with a patience of 3 evaluations on the monitored metric.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = CustomTrainer(
    model=avhubert_classifier,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


##### Gradual Freezing

In [131]:
class FreezingCallback(TrainerCallback):
    """Trainer callback that turns off gradients for the video feature extractor.

    At the start of every epoch whose ``state.epoch`` is at least
    ``freeze_encoder_epochs``, all parameters under
    ``model.encoder.feature_extractor_video`` get ``requires_grad = False``.
    Whenever the trainer saves, every model parameter is made trainable again.
    """

    def __init__(self, freeze_encoder_epochs: int):
        # Epoch value from which the video feature extractor is frozen.
        self.freeze_encoder_epochs = freeze_encoder_epochs

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Print the current epoch and threshold; freeze once the threshold is reached."""
        threshold = self.freeze_encoder_epochs
        print(state.epoch, threshold)
        net = kwargs["model"]
        if not state.epoch >= threshold:
            return
        print("="*10, "Freezing", "="*10)
        for weight in net.encoder.feature_extractor_video.parameters():
            weight.requires_grad = False

    def on_save(self, args, state, control, **kwargs):
        """Re-enable gradients for every parameter of the model on save."""
        for weight in kwargs["model"].parameters():
            weight.requires_grad = True

In [132]:
# Freeze the video feature extractor from epoch 5 onward (threshold passed to the callback).
freezing_callback = FreezingCallback(5)
trainer.add_callback(freezing_callback)

In [None]:
# Start training with the arguments and callbacks configured on the trainer.
trainer.train()

0 5


Step,Training Loss,Validation Loss,Wa,Ua,F1,Accuracy
166,1.226,1.174462,0.467416,0.43288,0.423563,0.467416
332,1.1944,1.160704,0.461798,0.435711,0.453494,0.461798
498,1.2161,1.177264,0.434831,0.402085,0.37317,0.434831
664,1.217,1.146338,0.488764,0.477488,0.47715,0.488764
830,1.1895,1.138939,0.467416,0.469305,0.459201,0.467416
996,1.1814,1.139251,0.48427,0.446065,0.439037,0.48427
1162,1.2062,1.165566,0.462921,0.440203,0.423509,0.462921
1328,1.1887,1.145339,0.470787,0.446933,0.420578,0.470787
1494,1.1955,1.155846,0.468539,0.43138,0.4265,0.468539


1.0 5
2.0 5
3.0 5
4.0 5
5.0 5


In [128]:
# `trainer` is intentionally not deleted here: later cells still call
# trainer.predict() and trainer.evaluate(), so `del trainer` at this point makes
# a top-to-bottom run fail with NameError. To release memory before rebuilding
# the trainer, run `del trainer` manually and re-execute the trainer cell.

In [56]:
# Run the trained model over the validation split and keep the prediction output.
val_preds = trainer.predict(val_set)

In [67]:
import pandas as pd
# Predicted class = index of the largest score along axis 1 of the predictions.
pred_labels = val_preds.predictions.argmax(axis=1)
# Reference labels returned alongside the predictions.
true_labels = val_preds.label_ids

In [71]:
# Spot-check: entries 10..14 of the predicted and the reference labels.
print(pred_labels[10:15])
print(true_labels[10:15])

[0 0 2 3 0]
[0. 0. 0. 0. 0.]


In [74]:
from sklearn.metrics import f1_score

In [76]:
# average=None returns one F1 score per class instead of a single aggregate.
f1_score(true_labels, pred_labels, average=None)

array([0.78546713, 0.80825959, 0.75142315, 0.76785714])

In [102]:
# Collect the "fn" field of every sample in each split, used for the overlap check.
train_ids = [sample["fn"] for sample in train_set]
val_ids = [sample["fn"] for sample in val_set]

In [103]:
# Number of ids present in both splits; 0 means train and validation do not overlap.
len(set(train_ids) & set(val_ids))

0

In [134]:
# Metrics on the trainer's evaluation dataset (the validation split passed at construction).
eval_result = trainer.evaluate()
# Metrics on the held-out test session; only the metrics dict is kept.
test_result = trainer.predict(test_set).metrics

In [135]:
# Display the test metrics for the held-out session.
test_result

{'test_loss': 1.121928095817566,
 'test_wa': 0.46912442396313364,
 'test_ua': 0.491775778649466,
 'test_f1': 0.43331144715120545,
 'test_accuracy': 0.46912442396313364,
 'test_runtime': 21.4022,
 'test_samples_per_second': 50.696,
 'test_steps_per_second': 4.252}

In [None]:
# Save the trained projector and classifier heads for this fold, then write the
# validation and test metrics to JSON files.
FREEZE_PROJ_PATH = "/home/freeze/{}/projector".format(sess_id)
FREEZE_CLAS_PATH = "/home/freeze/{}/classifier".format(sess_id)
os.makedirs(FREEZE_PROJ_PATH, exist_ok=True)
os.makedirs(FREEZE_CLAS_PATH, exist_ok=True)

# Compute the date stamp once so both file names always agree; the original
# called now() twice, which can straddle midnight.
date_stamp = datetime.datetime.now().date().strftime(format="%Y-%m-%d")
FREEZE_PROJ = os.path.join(FREEZE_PROJ_PATH, date_stamp + ".pt")
FREEZE_CLAS = os.path.join(FREEZE_CLAS_PATH, date_stamp + ".pt")

torch.save(avhubert_classifier.projector.state_dict(), FREEZE_PROJ)
torch.save(avhubert_classifier.classifier.state_dict(), FREEZE_CLAS)

# Load the files just written back into the same modules; this doubles as a
# check that the saved state dicts are readable.
avhubert_classifier.projector.load_state_dict(torch.load(FREEZE_PROJ))
avhubert_classifier.classifier.load_state_dict(torch.load(FREEZE_CLAS))

print(eval_result)
print(test_result)

json_test = json.dumps(test_result, indent=4)
json_eval = json.dumps(eval_result, indent=4)

# Write "<session>_eval.json" and "<session>_test.json" into the current working
# directory (which earlier `%cd` commands may have changed).
with open("{}_eval.json".format(sess_id), "w") as outfile:
    outfile.write(json_eval)
with open("{}_test.json".format(sess_id), "w") as outfile:
    outfile.write(json_test)
