In [1]:
import os

import torch
import librosa
import pandas as pd
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input locations, given as relative paths.
AUDIO_FOLDER = "audio_output_4_final"
METADATA_CSV = "audio_metadata_filtered_4_final.csv"

# Sample rate in Hz requested from librosa.load and reported to the Wav2Vec2 processor.
SR = 16_000

# Run the encoder on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [3]:
# Read the metadata table and keep only the three columns used below:
# the clip file name, its emotion label, and its transcript text.
df = pd.read_csv(METADATA_CSV).loc[:, ['wav_filename', 'emotion', 'text']]

In [4]:
# The processor and the encoder are both loaded from the same pretrained checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = wav2vec_model.to(DEVICE)
# Put the encoder in evaluation mode. eval() returns the module, so leaving it
# as the cell's last expression also displays the model architecture.
wav2vec_model.eval()

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [5]:
def get_audio_embedding(file_path):
    """Return one fixed-length embedding vector for an audio file.

    The file is loaded with librosa at the notebook-level sample rate ``SR``,
    passed through the module-level ``processor`` and ``wav2vec_model``, and
    the model's ``last_hidden_state`` is averaged over dimension 1.

    Args:
        file_path: Path of the audio file to embed.

    Returns:
        A flattened (1-D) NumPy array holding the mean-pooled hidden state.
    """
    waveform, sample_rate = librosa.load(file_path, sr=SR)
    encoded = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    model_input = encoded.input_values.to(DEVICE)
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        hidden_states = wav2vec_model(model_input).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    # Move to host memory before converting; flatten drops the leading axis.
    return pooled.cpu().numpy().flatten()

In [6]:
# Embed every clip listed in the metadata, in row order, so that row i of
# `audio_embeddings` corresponds to the i-th row of `df`.
audio_embeddings = []
for position, wav_filename in enumerate(df['wav_filename'], start=1):
    wav_path = os.path.join(AUDIO_FOLDER, wav_filename)
    audio_embeddings.append(get_audio_embedding(wav_path))
    # Count by position, not by index label: the label returned by iterrows()
    # only equals the row position while `df` has a default 0..n-1 index.
    if position % 20 == 0:
        print(f"Processed {position}/{len(df)} audio files")

# Stack the per-clip vectors into a single (n_clips, n_features) array.
audio_embeddings = np.array(audio_embeddings)
print("Audio embeddings shape:", audio_embeddings.shape)

Processed 20/2933 audio files
Processed 40/2933 audio files
Processed 60/2933 audio files
Processed 80/2933 audio files
Processed 100/2933 audio files
Processed 120/2933 audio files
Processed 140/2933 audio files
Processed 160/2933 audio files
Processed 180/2933 audio files
Processed 200/2933 audio files
Processed 220/2933 audio files
Processed 240/2933 audio files
Processed 260/2933 audio files
Processed 280/2933 audio files
Processed 300/2933 audio files
Processed 320/2933 audio files
Processed 340/2933 audio files
Processed 360/2933 audio files
Processed 380/2933 audio files
Processed 400/2933 audio files
Processed 420/2933 audio files
Processed 440/2933 audio files
Processed 460/2933 audio files
Processed 480/2933 audio files
Processed 500/2933 audio files
Processed 520/2933 audio files
Processed 540/2933 audio files
Processed 560/2933 audio files
Processed 580/2933 audio files
Processed 600/2933 audio files
Processed 620/2933 audio files
Processed 640/2933 audio files
Processed 66

In [7]:
# One row per clip: embedding dimensions as columns, plus label and source file.
audio_df = pd.DataFrame(audio_embeddings)
# Attach the metadata by position (.values), the same way the text-feature cell
# attaches `emotion`. Assigning the Series directly aligns on index labels and
# would produce NaN or shifted labels if `df` ever had a non-default index
# (for example after filtering rows).
audio_df['emotion'] = df['emotion'].values
audio_df['wav_filename'] = df['wav_filename'].values
audio_df.to_csv('audio_features_4.csv', index=False)
print("Saved audio features")

Saved audio features


In [8]:
# Sentence-level text embeddings for the transcript column.
# SentenceTransformer is imported in the notebook's import cell; numpy and
# pandas were already imported there and are not used in this cell, so the
# duplicate imports are removed.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = bert_model.encode(
    df['text'].tolist(),
    batch_size=16,
    show_progress_bar=True,
)

print("Text embeddings shape:", text_embeddings.shape)




Batches: 100%|██████████| 184/184 [00:13<00:00, 13.84it/s]


Text embeddings shape: (2933, 384)


In [9]:
# One row per transcript: embedding dimensions as columns, followed by the
# emotion label attached by position.
text_df = pd.DataFrame(text_embeddings).assign(emotion=df['emotion'].values)
text_df.to_csv('text_features_4.csv', index=False)
print("Saved text features")

Saved text features


In [10]:
# Fuse the two modalities along the feature axis: audio columns first, then
# text columns, one row per clip.
X = np.concatenate([audio_embeddings, text_embeddings], axis=1)
print("Combined feature shape:", X.shape)


Combined feature shape: (2933, 1152)


In [11]:
# One row per clip: fused audio+text feature columns, plus label and source file.
combined_df = pd.DataFrame(X)
# Attach the metadata by position (.values), the same way the text-feature cell
# attaches `emotion`. Assigning the Series directly aligns on index labels and
# would produce NaN or shifted labels if `df` ever had a non-default index
# (for example after filtering rows).
combined_df['emotion'] = df['emotion'].values
combined_df['wav_filename'] = df['wav_filename'].values
combined_df.to_csv('audio_text_features_4.csv', index=False)
print("Saved combined embeddings")

Saved combined embeddings
