In [2]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
import librosa
import soundfile as sf


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the metadata table and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/metadata.csv")
df.head(n=5)


Unnamed: 0,filepath,emotion,gender
0,../data/RADVESS/Actor_16/03-01-05-01-02-01-16.wav,angry,female
1,../data/RADVESS/Actor_16/03-01-05-02-01-01-16.wav,angry,female
2,../data/RADVESS/Actor_16/03-01-04-01-01-02-16.wav,sad,female
3,../data/RADVESS/Actor_16/03-01-04-02-02-02-16.wav,sad,female
4,../data/RADVESS/Actor_16/03-01-03-02-02-02-16.wav,happy,female


In [4]:
# Load the wav2vec 2.0 processor and encoder from the same pretrained checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# eval() returns the module itself, so it can be chained onto the load call.
wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

print("Model loaded.")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.


In [12]:
# embedding extraction function
def extract_embedding(path):
    """Compute a fixed-length wav2vec 2.0 embedding for one audio file.

    The clip is loaded with a 16 kHz target rate, passed through the
    notebook-level ``processor`` and ``wav2vec`` model, and the final hidden
    states are pooled over the time axis into a mean vector and a
    standard-deviation vector, which are concatenated.

    Parameters
    ----------
    path : str or path-like
        Location of the audio file; handed directly to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        1-D array holding the time-averaged hidden state followed by the
        per-feature standard deviation over time.
    """
    # The sample rate returned by librosa is not needed: 16000 is requested explicitly.
    audio, _ = librosa.load(path, sr=16000)
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = wav2vec(**inputs)

    hidden_states = outputs.last_hidden_state  # shape: (1, T, 768)

    # Mean pooling; squeeze(0) removes only the batch axis.
    mean_emb = hidden_states.mean(dim=1).squeeze(0).numpy()

    # Std pooling (variation across time). torch.std applies Bessel's
    # correction by default, which yields NaN when there is a single time
    # frame; a lone frame has no variation, so fall back to zeros there.
    if hidden_states.shape[1] > 1:
        std_emb = hidden_states.std(dim=1).squeeze(0).numpy()
    else:
        std_emb = np.zeros_like(mean_emb)

    # Fusion: mean and std statistics side by side.
    embedding = np.concatenate([mean_emb, std_emb])  # shape: 1536

    return embedding



In [20]:
# Run every clip listed in the metadata through the extractor; features and
# labels are built from the same row order so they stay aligned.
embeddings = np.array([extract_embedding(clip_path) for clip_path in df["filepath"]])
labels = np.array(df["emotion"].tolist())

embeddings.shape, labels.shape


((672, 1536), (672,))

In [28]:
# Positional row numbers 0 .. len(df) - 1, one per metadata row.
original_indices = np.arange(df.shape[0])

In [29]:
# normalize embeddings
from sklearn.preprocessing import StandardScaler
import joblib

# NOTE(review): the scaler is fitted on every sample here, before the
# train/test split performed in a later cell, so test-set statistics feed into
# the scaling. Consider fitting on the training rows only.
scaler = StandardScaler().fit(embeddings)
embeddings = scaler.transform(embeddings)

# Persist the fitted scaler so the same transform can be reapplied later.
joblib.dump(scaler, "../models/transformer_scaler.pkl")


['../models/transformer_scaler.pkl']

In [30]:
# train test split
from sklearn.model_selection import train_test_split

# Features, labels and row positions are split together so every test sample
# can be traced back to its metadata row.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    embeddings,
    labels,
    original_indices,
    random_state=42,
    stratify=labels,
    test_size=0.2,
)

X_train.shape[0], X_test.shape[0]


(537, 135)

In [31]:
# train transformer model - linear SVM
# LinearSVC is already imported in the notebook's first cell.
# random_state pins the solver's internal data shuffling so that rerunning the
# notebook reproduces the same fitted model; 42 matches the seed used for the split.
svm_clf = LinearSVC(C=2.0, random_state=42)  # tuned for strongest separation
svm_clf.fit(X_train, y_train)


In [32]:
# evaluate transformer model
# accuracy_score and f1_score are already imported in the notebook's first cell.
preds = svm_clf.predict(X_test)

# Score the held-out split; F1 is averaged with per-class support weighting.
acc = accuracy_score(y_true=y_test, y_pred=preds)
f1 = f1_score(y_true=y_test, y_pred=preds, average="weighted")

print("Transformer + SVM Accuracy:", acc)
print("F1 Score:", f1)


Transformer + SVM Accuracy: 0.6592592592592592
F1 Score: 0.6588686526263026


In [33]:
# save transformer model
import joblib

# Persist the fitted classifier; kept as the cell's last expression so the
# list of written paths is displayed.
joblib.dump(value=svm_clf, filename="../models/transformer_clf.pkl")


['../models/transformer_clf.pkl']

In [34]:
# Export per-file predictions for the test split.
# idx_test holds row positions (it was built from np.arange(len(df))), so the
# file paths are looked up positionally with .iloc. Label-based .loc would only
# agree while df keeps its default 0..n-1 index.
pred_df = pd.DataFrame({
    "filepath": df["filepath"].iloc[idx_test].to_numpy(),
    "true": y_test,
    "predicted": preds,
})

pred_df.to_csv("../results/transformer_predictions.csv", index=False)
