In [None]:
!pip install transformers datasets librosa soundfile jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from datasets import Dataset

In [None]:
# Root folder that is scanned for the RAVDESS .wav files.
ravdess_dir = "/content/drive/MyDrive/Datasets/RAVDESS"

# Emotion names in RAVDESS code order; the two-digit code ("01".."08") is the
# third dash-separated field of each file name.
_emotion_names = [
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised",
]
emotion_map = {
    f"{code:02d}": name for code, name in enumerate(_emotion_names, start=1)
}

# Contiguous integer class ids (0..7) and the reverse lookup, both following
# the insertion order of emotion_map.
label2id = dict(zip(emotion_map.values(), range(len(emotion_map))))
id2label = dict(enumerate(emotion_map.values()))

emotion_map, label2id


({'01': 'neutral',
  '02': 'calm',
  '03': 'happy',
  '04': 'sad',
  '05': 'angry',
  '06': 'fearful',
  '07': 'disgust',
  '08': 'surprised'},
 {'neutral': 0,
  'calm': 1,
  'happy': 2,
  'sad': 3,
  'angry': 4,
  'fearful': 5,
  'disgust': 6,
  'surprised': 7})

In [None]:
import os

# Parallel accumulators: files[i] is a .wav path, labels[i] its integer class id.
# Re-created on every run of this cell so a re-run does not append duplicates.
files, labels = [], []

def get_emotion(filename):
    """Return the emotion name encoded in a RAVDESS file name.

    Args:
        filename: File name such as "03-01-05-02-02-01-12.wav". A full path is
            also accepted; only its last component is parsed, so dashes in
            directory names cannot shift the field positions.

    Returns:
        The entry of ``emotion_map`` for the third dash-separated field.

    Raises:
        IndexError: If the name has fewer than three dash-separated fields.
        KeyError: If the emotion code is not a key of ``emotion_map``.
    """
    # Example: "03-01-05-02-02-01-12.wav" -> code "05"
    emotion_id = os.path.basename(filename).split("-")[2]
    return emotion_map[emotion_id]

# Walk <ravdess_dir>/<actor>/*.wav and record each path with its class id.
# os.listdir returns entries in arbitrary order, so both levels are sorted to
# give the dataset the same row order on every run (the seeded shuffle later
# on is only reproducible if its input order is).
for actor in sorted(os.listdir(ravdess_dir)):
    actor_path = os.path.join(ravdess_dir, actor)
    # Skip stray files in the dataset root; listing them would raise.
    if not os.path.isdir(actor_path):
        continue
    for f in sorted(os.listdir(actor_path)):
        if f.endswith(".wav"):
            files.append(os.path.join(actor_path, f))
            labels.append(label2id[get_emotion(f)])

len(files), len(labels)


(1440, 1440)

In [None]:
import pandas as pd
from datasets import Dataset

# One row per clip: the audio path and its integer class id.
df = pd.DataFrame(dict(path=files, label=labels))

# Wrap the frame as a Hugging Face Dataset so it can be mapped and split below.
dataset = Dataset.from_pandas(df)
dataset


Dataset({
    features: ['path', 'label'],
    num_rows: 1440
})

In [None]:
from transformers import Wav2Vec2Processor

# Processor of the pretrained base checkpoint; used further down to turn raw
# waveforms into the model's `input_values` / `attention_mask`.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")




In [None]:
import librosa

def preprocess(batch):
    """Turn one dataset example into fixed-length Wav2Vec2 model inputs.

    Args:
        batch: A single example (not a batch) exposing a "path" key that
            points at an audio file.

    Returns:
        The same mapping with "input_values" and "attention_mask" added; the
        audio is resampled to 16 kHz and padded/truncated to 5 seconds.

    NOTE(review): when loading fails the example is not dropped. It is
    replaced by one second of silence while keeping its original label, so it
    still reaches training as a labelled silent clip.
    """
    target_sr = 16000
    clip_seconds = 5
    audio_path = batch["path"]

    try:
        # Decode and resample in one step.
        speech, _ = librosa.load(audio_path, sr=target_sr)
    except Exception:
        print("❌ Skipping corrupted file:", audio_path)
        # Stand-in waveform: 1 second of silence.
        speech = np.zeros(target_sr)

    features = processor(
        speech,
        sampling_rate=target_sr,
        padding="max_length",
        max_length=target_sr * clip_seconds,
        truncation=True,
        return_attention_mask=True,
    )

    # The processor returns one entry per input clip; unwrap the only one.
    batch["input_values"] = features["input_values"][0]
    batch["attention_mask"] = features["attention_mask"][0]
    return batch


In [None]:
# Run `preprocess` on every example and drop the now-unneeded "path" column.
# NOTE(review): this rebinds `dataset`, so the cell is not re-runnable on its
# own: `preprocess` reads batch["path"], which no longer exists afterwards.
dataset = dataset.map(preprocess, remove_columns=["path"])


Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

  speech, sr = librosa.load(batch["path"], sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


❌ Skipping corrupted file: /content/drive/MyDrive/Datasets/RAVDESS/Actor_02/03-01-03-01-01-02-02.wav


In [None]:
# Hold out 20% of the clips for evaluation.
# train_test_split shuffles again with its own random generator, so it needs
# its own seed; seeding only the preceding shuffle() still gives a different
# split (and different reported accuracy) on every run.
dataset = dataset.shuffle(seed=42).train_test_split(test_size=0.2, seed=42)

train_ds = dataset["train"]
test_ds = dataset["test"]


In [None]:
from transformers import Wav2Vec2ForSequenceClassification

# Pretrained encoder plus a freshly initialised classification head with one
# output per emotion; the label maps are stored in the model config so
# predictions can be decoded from the saved model alone.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

# Trade compute for memory during training. Enabled through the model API
# because passing `gradient_checkpointing` via the config is deprecated.
model.gradient_checkpointing_enable()


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/dlProject/NModal/wav2vec2-ser",
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch for 25 epochs; without a limit all
    # of them accumulate in output_dir (on Drive). Keep only the newest two.
    save_total_limit=2,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=25,
    logging_steps=50,
    fp16=True,  # mixed precision
    push_to_hub=False
)

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
import numpy as np
import evaluate

# Accuracy scorer, loaded once and reused by compute_metrics.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing `predictions` (the model logits) and `label_ids`
            (the reference class ids).

    Returns:
        Whatever `accuracy_metric.compute` returns for the arg-max class of
        each row of logits against the reference labels.
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_ids,
        references=pred.label_ids,
    )


In [None]:
from transformers import Trainer

# Wire model, data, and metric together.
# The processor is handed over as `processing_class`; the older `tokenizer`
# argument is deprecated and triggers a warning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=processor,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
# Run the fine-tuning loop configured in `training_args`; as the last
# expression of the cell, the returned TrainOutput is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8297,1.694556,0.434028
2,1.4615,1.464118,0.520833
3,1.1794,1.167464,0.6875
4,0.9483,0.901605,0.784722
5,0.5874,0.65063,0.840278
6,0.3681,0.562905,0.829861
7,0.2867,1.013958,0.708333
8,0.1856,0.557308,0.857639
9,0.1685,0.503378,0.892361
10,0.0693,0.603651,0.857639


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8297,1.694556,0.434028
2,1.4615,1.464118,0.520833
3,1.1794,1.167464,0.6875
4,0.9483,0.901605,0.784722
5,0.5874,0.65063,0.840278
6,0.3681,0.562905,0.829861
7,0.2867,1.013958,0.708333
8,0.1856,0.557308,0.857639
9,0.1685,0.503378,0.892361
10,0.0693,0.603651,0.857639


TrainOutput(global_step=7200, training_loss=0.3161458562480079, metrics={'train_runtime': 5296.5879, 'train_samples_per_second': 5.437, 'train_steps_per_second': 1.359, 'total_flos': 1.307344416768e+18, 'train_loss': 0.3161458562480079, 'epoch': 25.0})

In [None]:
# Score the model on the Trainer's eval_dataset (the held-out `test_ds`) and
# show the resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.5633840560913086, 'eval_accuracy': 0.9166666666666666, 'eval_runtime': 93.6825, 'eval_samples_per_second': 3.074, 'eval_steps_per_second': 0.769, 'epoch': 25.0}


In [None]:
# Persist the fine-tuned weights and the processor side by side so the folder
# can be loaded on its own for inference.
save_dir = "/content/drive/MyDrive/dlProject/NModal/wav2vec2-ser-ravdess"
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)


[]

In [None]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import torch
import librosa
import numpy as np

# Folder written by the save cell.
model_path = "/content/drive/MyDrive/dlProject/NModal/wav2vec2-ser-ravdess"

# Reload both artifacts from disk; note that `model` is rebound to the
# reloaded copy from here on.
model = AutoModelForAudioClassification.from_pretrained(model_path)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)

# Put the model in evaluation mode. eval() returns the model, which is
# displayed as this cell's output.
model.eval()


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
def predict_emotion(file_path, sampling_rate=16000, max_seconds=5):
    """Predict the emotion label of a single audio file.

    The features are built with the same extractor arguments used for the
    training set (`padding="max_length"`, truncation, and an attention mask).
    The earlier version zero-padded the waveform by hand and called the
    extractor with `padding="do_not_pad"`; the padding zeros were then treated
    as part of the signal and no attention mask was requested, so inference
    inputs did not match what the model was fine-tuned on.

    Args:
        file_path: Path to an audio file readable by librosa.
        sampling_rate: Rate in Hz the audio is resampled to. Defaults to the
            16 kHz used during training.
        max_seconds: Clip length in seconds that audio is padded/truncated to.
            Defaults to the 5 seconds used during training.

    Returns:
        The label string from `model.config.id2label` for the top-scoring class.
    """
    # Load audio, resampled to the model's rate.
    speech, _ = librosa.load(file_path, sr=sampling_rate)

    # Let the extractor pad/truncate and build the mask, exactly as the
    # training preprocessing did.
    inputs = feature_extractor(
        speech,
        sampling_rate=sampling_rate,
        padding="max_length",
        max_length=sampling_rate * max_seconds,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Keep inputs on the same device as the model (a no-op on CPU).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Predict without tracking gradients.
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = torch.argmax(logits, dim=-1).item()

    # Convert the class id to its label.
    return model.config.id2label[pred_id]


In [None]:
# Smoke test: classify one recording and print the predicted label.
test_wav = "/content/drive/MyDrive/dlProject/Datasets/ahad.wav"

emotion = predict_emotion(test_wav)
print("Predicted Emotion:", emotion)


Predicted Emotion: calm
