In [8]:
#Import Dependencies

import os

import audeer
import audiofile
import audonnx
import numpy as np
import pandas as pd

# Working folders: 'cache' receives the downloaded archive, 'model' the extracted model files.
cache_root, model_root = (audeer.mkdir(folder) for folder in ('cache', 'model'))

In [10]:
#Download and unpack the model only if it is not already on disk
url = 'https://zenodo.org/record/6221127/files/w2v2-L-robust-12.6bc4a7fd-1.1.0.zip'

# The archive is expected in the cache folder under the file name used in the URL.
archive_path = os.path.join(cache_root, os.path.basename(url))
if not os.path.exists(archive_path):
    # Use the path reported by the downloader in case it differs from the guess above.
    archive_path = audeer.download_url(url, cache_root, verbose=True)

# Skip extraction when the ONNX file is already present, so re-running the cell is cheap.
if not os.path.exists(os.path.join(model_root, 'model.onnx')):
    audeer.extract_archive(archive_path, model_root)

# Display the files available in the model folder.
sorted(os.listdir(model_root))

['model.onnx', 'model.yaml']

In [11]:
#Load model and test with a reproducible random sample
model = audonnx.load(model_root)
sampling_rate = 16000

# `sampling_rate` samples of Gaussian noise, i.e. one second of audio.
# The generator is seeded so this smoke test prints the same values on every run.
rng = np.random.default_rng(0)
signal = rng.normal(size=sampling_rate).astype(np.float32)
model(signal, sampling_rate)

{'hidden_states': array([[-0.00721893,  0.00638158, -0.00823902, ...,  0.00665349,
          0.00967707,  0.00256732]], dtype=float32),
 'logits': array([[0.68168986, 0.6574491 , 0.49838078]], dtype=float32)}

In [12]:
# Functions that use the model to extract arousal, valence and dominance from an audio file

def predict_avd(path):
    """Run the global ``model`` on one audio file and return its logits.

    Args:
        path: Location of the audio file; handed to ``audiofile.read``
            together with ``always_2d=True``.

    Returns:
        The first entry of the model's ``"logits"`` output.
    """
    audio, rate = audiofile.read(path, always_2d=True)
    outputs = model(audio, rate)
    logits = outputs["logits"]
    return logits[0]

def extract_avd(input_file,output_file):
    """Predict arousal, valence and dominance for every file listed in a CSV.

    Reads ``input_file``, calls ``predict_avd`` on each row's ``path`` and
    writes one row per file to ``output_file``.

    Args:
        input_file: CSV with at least the columns ``path``, ``emotion`` and
            ``ActorID``.
        output_file: Destination CSV. It is written without the index and has
            the columns ``ActorID``, ``emotion``, ``path``, ``arousal``,
            ``valence``, ``dominance`` (in that order).
    """
    df = pd.read_csv(input_file)
    data = {"ActorID":[],"emotion":[],"path":[],"arousal":[],"valence":[],"dominance":[]}
    for _, row in df.iterrows():
        scores = predict_avd(row["path"])
        # The published model card lists the logits in the order
        # (arousal, dominance, valence): index 1 is dominance, index 2 is valence.
        data["arousal"].append(scores[0])
        data["dominance"].append(scores[1])
        data["valence"].append(scores[2])
        data["path"].append(row["path"])
        data["emotion"].append(row["emotion"])
        data["ActorID"].append(row["ActorID"])
    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file,index=False)


In [13]:
# Score every file listed in data/crema_df.csv and store the predictions in data/crema_avd.csv.
extract_avd(
    input_file="data/crema_df.csv",
    output_file="data/crema_avd.csv",
)

In [32]:
def feature_embeddings(path):
    """Run the global ``model`` on one audio file and return its hidden states.

    Args:
        path: Location of the audio file; handed to ``audiofile.read``
            together with ``always_2d=True``.

    Returns:
        The first entry of the model's ``"hidden_states"`` output.
    """
    audio, rate = audiofile.read(path, always_2d=True)
    outputs = model(audio, rate)
    hidden = outputs["hidden_states"]
    return hidden[0]

In [40]:
from tqdm import tqdm

def extract_transformer_embeddings(input_file, output_file):
    """Compute the model embedding for every file listed in a CSV.

    Reads ``input_file``, calls ``feature_embeddings`` on each row's ``path``
    and writes the results to ``output_file`` while showing a progress bar.

    Args:
        input_file: CSV with at least a ``path`` column.
        output_file: Destination CSV, written without the index. It has the
            columns ``path`` and ``feature_embeddings``; the latter holds each
            embedding as text of the form ``[v0, v1, ...]``.
    """
    df = pd.read_csv(input_file)
    data = {"path": [], "feature_embeddings": []}

    # Wrap the iterable with tqdm for the progress bar
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        embedding = feature_embeddings(row["path"])
        data["path"].append(row["path"])
        # Build the list text explicitly from str() of each value. Storing a
        # list of NumPy scalars lets pandas write repr() of every element,
        # which under NumPy >= 2 is "np.float32(...)" and no longer parses as
        # a plain list literal.
        data["feature_embeddings"].append(
            "[" + ", ".join(str(value) for value in embedding) + "]"
        )

    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file, index=False)
        
    

In [41]:
# Compute embeddings for every file listed in data/crema_df.csv and store them in data/crema_embeddings.csv.
extract_transformer_embeddings(
    input_file="data/crema_df.csv",
    output_file="data/crema_embeddings.csv",
)

Processing rows:   0%|          | 0/7442 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 7442/7442 [5:48:37<00:00,  2.81s/it]  


In [43]:
# Load the stored embeddings and the original metadata table.
# NOTE: after read_csv the feature_embeddings column holds each list as text;
# parse it (e.g. with ast.literal_eval) before using it numerically.
df = pd.read_csv("data/crema_embeddings.csv")
df2 = pd.read_csv("data/crema_df.csv")

# Left join on "path": every embedding row is kept and gains the metadata columns.
df3 = df.merge(df2, how="left",on=["path"])

# Preview the merged frame so the cell displays its result.
df3.head()

Unnamed: 0,path,feature_embeddings,ActorID,emotion
0,./AudioWAV/1001_DFA_ANG_XX.wav,"[-0.007677641, 0.0055411533, -0.00857792, -0.0...",1001,angry
1,./AudioWAV/1001_DFA_DIS_XX.wav,"[-0.0076479497, 0.005411356, -0.009961039, -0....",1001,disgust
2,./AudioWAV/1001_DFA_FEA_XX.wav,"[-0.0073989383, 0.005450367, -0.009070151, -0....",1001,fear
3,./AudioWAV/1001_DFA_HAP_XX.wav,"[-0.007742562, 0.0052845012, -0.00848962, -0.0...",1001,happy
4,./AudioWAV/1001_DFA_NEU_XX.wav,"[-0.0076925997, 0.0050833263, -0.009366785, -0...",1001,neutral
