In [1]:
import os
# Make /kaggle/working the current directory; the relative 'cache' and 'model'
# folders created further down are resolved against it.
os.chdir('/kaggle/working')

In [2]:
# %pip installs into the environment of the running kernel. The versions are
# pinned to the ones this notebook was last executed with so that a fresh
# session resolves the same packages.
%pip install audeer==2.0.0 audonnx==0.7.0 audiofile==1.4.0

Collecting audeer
  Downloading audeer-2.0.0-py3-none-any.whl.metadata (4.1 kB)
Collecting audonnx
  Downloading audonnx-0.7.0-py3-none-any.whl.metadata (4.2 kB)
Collecting audiofile
  Downloading audiofile-1.4.0-py3-none-any.whl.metadata (4.9 kB)
Collecting audobject>=0.7.2 (from audonnx)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting onnxruntime>=1.8.0 (from audonnx)
  Downloading onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting audmath>=1.3.0 (from audiofile)
  Downloading audmath-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting oyaml (from audobject>=0.7.2->audonnx)
  Downloading oyaml-1.0-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting coloredlogs (from onnxruntime>=1.8.0->audonnx)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.8.0->audonnx)
  Downloading humanfriendly-10.0-py2.py3-non

In [3]:
#Import Dependencies

import audeer
import audonnx
import numpy as np
import pandas as pd
import audiofile
import time
import datetime as dt

# Working folders: 'cache' receives the downloaded archive, 'model' the
# files extracted from it.
cache_root, model_root = (audeer.mkdir(folder) for folder in ('cache', 'model'))

In [4]:
def calcProcessTime(starttime, cur_iter, max_iter):
    """Estimate progress timing for a loop that runs ``max_iter`` iterations.

    The total duration is extrapolated linearly from the average time per
    completed iteration.

    Args:
        starttime: ``time.time()`` timestamp taken when the loop started.
        cur_iter: Number of iterations completed so far (must be non-zero).
        max_iter: Total number of iterations.

    Returns:
        Tuple ``(elapsed_seconds, remaining_seconds, finish_clock_time)``
        where the first two are truncated to ``int`` and the last is a local
        time string formatted as ``HH:MM:SS``.
    """
    elapsed = time.time() - starttime
    # Average cost of one iteration, scaled to the whole run.
    projected_total = (elapsed / cur_iter) * (max_iter)
    remaining = projected_total - elapsed  # in seconds

    eta = dt.datetime.fromtimestamp(starttime + projected_total)
    return (int(elapsed), int(remaining), eta.strftime("%H:%M:%S"))

In [5]:
# Download the model archive only if it is not cached yet, then unpack it.
url = 'https://zenodo.org/record/6221127/files/w2v2-L-robust-12.6bc4a7fd-1.1.0.zip'

# force_download=False lets audeer reuse an archive that already exists in
# cache_root instead of fetching it again every time this cell is run.
archive_path = audeer.download_url(url, cache_root, force_download=False, verbose=True)
# Last expression of the cell: the list of extracted files is shown as output.
audeer.extract_archive(archive_path, model_root)

                                                                                                    

['model.onnx', 'model.yaml']

In [6]:
# Load the model and smoke-test it on one second of synthetic noise.
model = audonnx.load(model_root)
sampling_rate = 16000
# A seeded generator keeps the displayed outputs identical on every run.
signal = np.random.default_rng(0).normal(size=sampling_rate).astype(np.float32)
model(signal, sampling_rate)

{'hidden_states': array([[-0.00711837,  0.00628876, -0.00727758, ...,  0.00662992,
          0.00973976,  0.00297482]], dtype=float32),
 'logits': array([[0.68658626, 0.658509  , 0.49043864]], dtype=float32)}

In [7]:
# Helper functions that run the model on an audio file to extract arousal, valence and dominance scores

def predict_avd(path):
    """Run the loaded model on one audio file and return its first logits row.

    Args:
        path: Location of the audio file to read.

    Returns:
        ``model(...)["logits"][0]``; for the model loaded in this notebook
        that is a vector of three scores.
        NOTE(review): the model's documentation lists the order as
        arousal, dominance, valence - confirm before labelling the values.
    """
    # always_2d=True makes audiofile return a 2-D signal even for mono files.
    audio, rate = audiofile.read(path, always_2d=True)
    outputs = model(audio, rate)
    first_row = outputs["logits"][0]
    return first_row
start = time.time()  # legacy module-level timestamp, kept only so existing references to `start` still resolve

def extract_avd(input_file,output_file):
    """Predict arousal, valence and dominance for every file listed in a CSV.

    Reads ``input_file``, calls ``predict_avd`` on each row's ``Path``, prints
    the progress and a time estimate after every row, and finally writes the
    collected scores to ``output_file`` as CSV (index not written).

    Args:
        input_file: CSV with at least the columns ``Path``, ``Emotion`` and
            ``ActorID``.
        output_file: Destination CSV with the columns ActorID, Emotion, Path,
            arousal, valence, dominance.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    df = pd.read_csv(input_file)
    # df = df.sample(n=3) # test on small subset
    total = df.shape[0]
    data = {"ActorID":[],"Emotion":[],"Path":[],"arousal":[],"valence":[],"dominance":[]}

    # Time the run from this call. The module-level `start` is captured when
    # the cell is defined, which skews the estimate whenever the function is
    # called later or more than once.
    run_start = time.time()
    for cur_iter, (_, row) in enumerate(df.iterrows(), start=1):
        ret = predict_avd(row["Path"])
        # The model documents its logits order as arousal, dominance, valence,
        # so index 1 is dominance and index 2 is valence.
        data["arousal"].append(ret[0])
        data["dominance"].append(ret[1])
        data["valence"].append(ret[2])
        data["Path"].append(row['Path'])
        data["Emotion"].append(row['Emotion'])
        data["ActorID"].append(row['ActorID'])
        print("On {}%".format(round(cur_iter / total * 100, 2)))
        prstime = calcProcessTime(run_start, cur_iter, total)
        print("time elapsed: %s(s), time left: %s(s), estimated finish time: %s"%prstime)
    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file,index=False)


In [8]:
# Score every file listed in the openSMILE feature table and save the result.
extract_avd(
    '/kaggle/input/opensmile-beasc/opensmileDF_beasc.csv',
    "/kaggle/working/beasc_avd.csv",
)

On 0.08%
time elapsed: 10(s), time left: 12691(s), estimated finish time: 22:00:07
On 0.16%
time elapsed: 19(s), time left: 11986(s), estimated finish time: 21:48:31
On 0.25%
time elapsed: 26(s), time left: 10812(s), estimated finish time: 21:29:03
On 0.33%
time elapsed: 36(s), time left: 11144(s), estimated finish time: 21:34:46
On 0.41%
time elapsed: 43(s), time left: 10623(s), estimated finish time: 21:26:12
On 0.49%
time elapsed: 48(s), time left: 9929(s), estimated finish time: 21:14:43
On 0.57%
time elapsed: 58(s), time left: 10110(s), estimated finish time: 21:17:53
On 0.65%
time elapsed: 67(s), time left: 10319(s), estimated finish time: 21:21:32
On 0.74%
time elapsed: 76(s), time left: 10361(s), estimated finish time: 21:22:23
On 0.82%
time elapsed: 86(s), time left: 10526(s), estimated finish time: 21:25:18
On 0.9%
time elapsed: 95(s), time left: 10580(s), estimated finish time: 21:26:21
On 0.98%
time elapsed: 106(s), time left: 10765(s), estimated finish time: 21:29:37
On 1.

In [9]:
def feature_embeddings(path):
    """Run the loaded model on one audio file and return its embedding.

    Args:
        path: Location of the audio file to read.

    Returns:
        ``model(...)["hidden_states"][0]``, i.e. the first row of the model's
        hidden-state output.
    """
    # always_2d=True makes audiofile return a 2-D signal even for mono files.
    audio, rate = audiofile.read(path, always_2d=True)
    outputs = model(audio, rate)
    embedding = outputs["hidden_states"][0]
    return embedding

In [10]:
from tqdm import tqdm

def extract_transformer_embeddings(input_file, output_file):
    """Compute the model embedding for every file listed in a CSV.

    Reads ``input_file``, calls ``feature_embeddings`` on each row's ``Path``
    while showing a progress bar, and writes one row per file to
    ``output_file`` as CSV (index not written).

    Args:
        input_file: CSV with at least a ``Path`` column.
        output_file: Destination CSV with the columns ``Path`` and
            ``feature_embeddings``; the latter holds the embedding rendered
            as a list of plain floats.

    Raises:
        KeyError: If the ``Path`` column is missing.
    """
    df = pd.read_csv(input_file)
    #df = df.sample(n=3)  # test on small subset
    data = {"Path": [], "feature_embeddings": []}

    # Wrap the iterable with tqdm for the progress bar
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        ret = feature_embeddings(row["Path"])
        data["Path"].append(row['Path'])
        # tolist() yields built-in floats. list(ret) would keep NumPy scalars,
        # whose text form in the CSV depends on the NumPy version (e.g.
        # "np.float32(0.1)") and then cannot be parsed back as a plain list.
        data["feature_embeddings"].append(np.asarray(ret).tolist())

    resdf = pd.DataFrame.from_dict(data)
    resdf.to_csv(output_file, index=False)
        

In [11]:
# Compute the embeddings for every file listed in the openSMILE feature table.
extract_transformer_embeddings(
    "/kaggle/input/opensmile-beasc/opensmileDF_beasc.csv",
    "/kaggle/working/beasc_embeddings.csv",
)

Processing rows: 100%|██████████| 1224/1224 [2:27:37<00:00,  7.24s/it]


In [12]:
# Reload the saved embeddings and the original feature table.
df = pd.read_csv("/kaggle/working/beasc_embeddings.csv")
df2 = pd.read_csv("/kaggle/input/opensmile-beasc/opensmileDF_beasc.csv")

# Left join on "Path": every embedding row is kept and extended with the
# matching columns of the feature table.
df3 = pd.merge(df, df2, how="left", on=["Path"])

In [13]:
# Persist the merged table in CSV format.
# NOTE(review): the target name has no ".csv" extension and the index column
# is written too, unlike the other exports in this notebook - confirm intent.
df3.to_csv(path_or_buf='/kaggle/working/merged_beasc_df_embeddings')