In [1]:
import os
# Run everything from Kaggle's working directory; the relative 'cache' and
# 'model' directories created later are presumably resolved against it.
os.chdir('/kaggle/working')

In [2]:
!pip install audeer audonnx audiofile

Collecting audeer
  Downloading audeer-2.0.0-py3-none-any.whl.metadata (4.1 kB)
Collecting audonnx
  Downloading audonnx-0.7.0-py3-none-any.whl.metadata (4.2 kB)
Collecting audiofile
  Downloading audiofile-1.4.0-py3-none-any.whl.metadata (4.9 kB)
Collecting audobject>=0.7.2 (from audonnx)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting onnxruntime>=1.8.0 (from audonnx)
  Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting audmath>=1.3.0 (from audiofile)
  Downloading audmath-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting oyaml (from audobject>=0.7.2->audonnx)
  Downloading oyaml-1.0-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting coloredlogs (from onnxruntime>=1.8.0->audonnx)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.8.0->audonnx)
  Downloading humanfriendly-10.0-py2.py3-non

In [3]:
#Import Dependencies

import audeer
import audonnx
import numpy as np
import pandas as pd
import audiofile

# Working directories, created if missing:
#   cache_root receives the downloaded model archive,
#   model_root receives the files extracted from it.
cache_root, model_root = (audeer.mkdir(name) for name in ('cache', 'model'))

In [4]:
# Download the pretrained model archive into cache_root.
# NOTE(review): "if necessary" relies on audeer.download_url skipping a file
# that already exists at the destination — confirm against the audeer docs.
url = 'https://zenodo.org/record/6221127/files/w2v2-L-robust-12.6bc4a7fd-1.1.0.zip'

archive_path = audeer.download_url(url, cache_root, verbose=True)
# Unpack into model_root. As the last expression of the cell, the return value
# (the extracted file names) is what the cell displays.
audeer.extract_archive(archive_path, model_root)

                                                                                                    

['model.onnx', 'model.yaml']

In [5]:
# Load the extracted model and smoke-test it on one second of Gaussian noise.
model = audonnx.load(model_root)
sampling_rate = 16000
# A seeded generator keeps the displayed smoke-test output identical across
# "Restart & Run All"; the unseeded global RNG produced new values every run.
signal = np.random.default_rng(0).normal(size=sampling_rate).astype(np.float32)
# Last expression of the cell: displays the model's output dict
# ('hidden_states' and 'logits').
model(signal, sampling_rate)

{'hidden_states': array([[-0.00722236,  0.00646646, -0.00779154, ...,  0.0067355 ,
          0.00964987,  0.00271548]], dtype=float32),
 'logits': array([[0.6779038, 0.652977 , 0.5041521]], dtype=float32)}

In [6]:
# Helper functions that run the model on audio files to obtain arousal, valence and dominance scores

def predict_avd(path):
    """Return the first row of the model's ``logits`` output for one audio file.

    The file at ``path`` is read with ``always_2d=True`` and the signal is
    passed to the global ``model`` together with the sampling rate reported by
    the reader.

    NOTE(review): no resampling happens here — confirm the files are stored at
    the rate the model expects.
    NOTE(review): the model's published documentation lists the logits as
    arousal, dominance, valence (in that order) — verify callers index the
    returned row accordingly.
    """
    waveform, rate = audiofile.read(path, always_2d=True)
    outputs = model(waveform, rate)
    first_row = outputs["logits"][0]
    return first_row

def extract_avd(input_file,output_file):
    """Predict arousal, valence and dominance for every audio file listed in a CSV.

    Reads ``input_file`` (which must contain the columns ``SpeakerID``,
    ``Emotion`` and ``Path``), calls ``predict_avd`` on each ``Path`` and writes
    one row per input row to ``output_file`` with the columns ``SpeakerID``,
    ``Emotion``, ``Path``, ``arousal``, ``valence``, ``dominance`` (no index
    column). One progress line is printed per processed row.

    Raises:
        KeyError: if a required column is missing. This is raised before any
            prediction is made.
    """
    df = pd.read_csv(input_file)
    # Selecting the required columns up front makes a schema problem fail
    # immediately instead of after the first (slow) model call.
    meta = df[["SpeakerID", "Emotion", "Path"]]
    total = len(meta)
    data = {"SpeakerID":[],"Emotion":[],"Path":[],"arousal":[],"valence":[],"dominance":[]}
    rows = zip(meta["SpeakerID"], meta["Emotion"], meta["Path"])
    # Count positions rather than index labels so the progress message stays
    # correct if the frame is ever subsetted or shuffled (e.g. df.sample(n=3)).
    for position, (speaker_id, emotion, path) in enumerate(rows, start=1):
        ret = predict_avd(path)
        # The model's documentation gives the logits order as
        # arousal, dominance, valence — NOT arousal, valence, dominance —
        # so index 1 is dominance and index 2 is valence.
        data["arousal"].append(ret[0])
        data["dominance"].append(ret[1])
        data["valence"].append(ret[2])
        data["Path"].append(path)
        data["Emotion"].append(emotion)
        data["SpeakerID"].append(speaker_id)
        print("On row {} of {}".format(position, total))
    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file,index=False)


In [7]:
# Score every EmoDB clip listed in the openSMILE CSV and save the result
# in the working directory.
extract_avd(
    input_file='/kaggle/input/cusser-data/data/opensmileDF_emodb.csv',
    output_file="/kaggle/working/emodb_avd.csv",
)

On row 1 of 535
On row 2 of 535
On row 3 of 535
On row 4 of 535
On row 5 of 535
On row 6 of 535
On row 7 of 535
On row 8 of 535
On row 9 of 535
On row 10 of 535
On row 11 of 535
On row 12 of 535
On row 13 of 535
On row 14 of 535
On row 15 of 535
On row 16 of 535
On row 17 of 535
On row 18 of 535
On row 19 of 535
On row 20 of 535
On row 21 of 535
On row 22 of 535
On row 23 of 535
On row 24 of 535
On row 25 of 535
On row 26 of 535
On row 27 of 535
On row 28 of 535
On row 29 of 535
On row 30 of 535
On row 31 of 535
On row 32 of 535
On row 33 of 535
On row 34 of 535
On row 35 of 535
On row 36 of 535
On row 37 of 535
On row 38 of 535
On row 39 of 535
On row 40 of 535
On row 41 of 535
On row 42 of 535
On row 43 of 535
On row 44 of 535
On row 45 of 535
On row 46 of 535
On row 47 of 535
On row 48 of 535
On row 49 of 535
On row 50 of 535
On row 51 of 535
On row 52 of 535
On row 53 of 535
On row 54 of 535
On row 55 of 535
On row 56 of 535
On row 57 of 535
On row 58 of 535
On row 59 of 535
On row

In [8]:
def feature_embeddings(path):
    """Return the first row of the model's ``hidden_states`` output for one audio file.

    The file at ``path`` is read with ``always_2d=True`` and the signal is
    passed to the global ``model`` together with the sampling rate reported by
    the reader.

    NOTE(review): no resampling happens here — confirm the files are stored at
    the rate the model expects.
    """
    waveform, rate = audiofile.read(path, always_2d=True)
    outputs = model(waveform, rate)
    first_row = outputs["hidden_states"][0]
    return first_row

In [9]:
from tqdm import tqdm

def extract_transformer_embeddings(input_file, output_file):
    """Compute an embedding for every audio file listed in a CSV and save them.

    Reads ``input_file`` (which must contain a ``Path`` column), calls
    ``feature_embeddings`` on each path and writes ``output_file`` with the
    columns ``Path`` and ``feature_embeddings`` (no index column). Each
    embedding is stored in a single CSV cell as the text of a list of plain
    Python floats, e.g. ``[0.5, -0.25]``, which can be parsed back with
    ``json.loads`` or ``ast.literal_eval``.

    Raises:
        KeyError: if ``input_file`` has no ``Path`` column.
    """
    df = pd.read_csv(input_file)
    # Only the Path column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    paths = df["Path"].tolist()
    data = {"Path": [], "feature_embeddings": []}

    # Wrap the iterable with tqdm for the progress bar
    for path in tqdm(paths, total=len(paths), desc="Processing rows"):
        ret = feature_embeddings(path)
        data["Path"].append(path)
        # Convert NumPy scalars to built-in floats: list(ret) keeps np.float32
        # objects, whose repr under NumPy >= 2 is "np.float32(...)", which
        # would make the CSV cell impossible to parse back.
        data["feature_embeddings"].append([float(value) for value in ret])

    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file, index=False)
        

In [10]:
# Compute embeddings for every EmoDB clip listed in the openSMILE CSV and save
# them in the working directory.
extract_transformer_embeddings(
    input_file="/kaggle/input/cusser-data/data/opensmileDF_emodb.csv",
    output_file="/kaggle/working/emodb_embeddings.csv",
)

Processing rows: 100%|██████████| 535/535 [09:52<00:00,  1.11s/it]


In [11]:
# Reload the saved embeddings and the original openSMILE table, then attach
# the openSMILE columns to each embedding row by matching on "Path".
# NOTE(review): if "Path" is not unique in both files, a left merge repeats
# rows — confirm uniqueness.
df = pd.read_csv("/kaggle/working/emodb_embeddings.csv")
df2 = pd.read_csv("/kaggle/input/cusser-data/data/opensmileDF_emodb.csv")

df3 = pd.merge(df, df2, how="left", on="Path")

In [12]:
# Persist the merged table. index=False matches the other exports in this
# notebook and keeps the merge-generated row index from coming back as an
# "Unnamed: 0" column when the file is read again.
# NOTE(review): the file name has no ".csv" extension; it is left unchanged in
# case something else reads this exact path.
df3.to_csv('/kaggle/working/merged_emodb_df_embeddings', index=False)