In [1]:
#installs
!pip install fsspec==2023.9.2
!pip install -U datasets
!pip install pandas numpy tensorflow matplotlib torch
!pip install transformers
!pip install tensorflow
!pip install scikit-learn
!pip install librosa
!pip install seaborn




In [2]:
#imports for Machine and Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from scipy.special import rel_entr
from sklearn.model_selection import KFold


import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim


#imports for data science
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor, Wav2Vec2Processor


#other imports
from datasets import load_dataset
import librosa
import random
import os




  from .autonotebook import tqdm as notebook_tqdm


# DATA PROCESSING

In [3]:
# Notebook-wide configuration.
SEED = 42          # seed handed to every random number generator below
BATCH_SIZE = 32    # number of examples per DataLoader batch
N_MELS = 128       # presumably the mel-band count for spectrogram features; unused in the visible cells
MAX_LEN = 300      # presumably a maximum feature length; unused in the visible cells


def set_seed(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator, and
    PyTorch (CPU generator plus all CUDA devices), then switches cuDNN to
    deterministic mode with auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed to apply everywhere. Defaults to ``SEED``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed()


In [4]:
# Load the IEMOCAP train split from the Hugging Face Hub.
# The default download mode reuses the local cache, so re-running the notebook
# does not fetch the whole corpus again. Pass download_mode="force_redownload"
# only when the cached copy is suspected to be corrupted.
ds = load_dataset("AbstractTTS/IEMOCAP", split="train")
ds[0]

Generating train split: 100%|██████████| 10039/10039 [00:01<00:00, 5154.53 examples/s]


{'file': 'Ses01F_impro01_F000.wav',
 'audio': {'path': 'Ses01F_impro01_F000.wav',
  'array': array([-0.0050354 , -0.00497437, -0.0038147 , ..., -0.00265503,
         -0.00317383, -0.00418091]),
  'sampling_rate': 16000},
 'frustrated': 0.0062500000931322575,
 'angry': 0.0062500000931322575,
 'sad': 0.0062500000931322575,
 'disgust': 0.0062500000931322575,
 'excited': 0.0062500000931322575,
 'fear': 0.0062500000931322575,
 'neutral': 0.949999988079071,
 'surprise': 0.0062500000931322575,
 'happy': 0.0062500000931322575,
 'EmoAct': 2.3333330154418945,
 'EmoVal': 2.6666669845581055,
 'EmoDom': 2.0,
 'gender': 'Female',
 'transcription': ' Excuse me.',
 'major_emotion': 'neutral',
 'speaking_rate': 5.139999866485596,
 'pitch_mean': 202.79881286621094,
 'pitch_std': 76.12785339355469,
 'rms': 0.00788376946002245,
 'relative_db': -17.938434600830078}

In [5]:
# The four classes the classifier is trained on; a label's position in this
# list is used elsewhere in the notebook as its integer class id.
TARGET_EMOTIONS = ['happy', 'sad', 'angry', 'neutral']


def normalize_emotion(e):
    """Map a raw emotion name onto one of the target classes.

    'excited' is merged into 'happy'. Names already present in
    ``TARGET_EMOTIONS`` are returned unchanged. Anything else yields ``None``.

    Args:
        e: Raw emotion name.

    Returns:
        The target emotion name, or ``None`` when ``e`` has no target class.
    """
    if e == 'excited':
        merged = 'happy'
    elif e in TARGET_EMOTIONS:
        merged = e
    else:
        merged = None
    return merged


In [6]:
#LABEL ENCODING
def process_label(example):
    """Attach an integer ``'label'`` to a dataset example.

    The example's ``'major_emotion'`` is normalized with ``normalize_emotion``;
    the label is the index of the resulting name in ``TARGET_EMOTIONS``.

    Args:
        example: Mapping with a ``'major_emotion'`` entry. It is modified in place.

    Returns:
        The same ``example`` with ``'label'`` set, or ``None`` when the emotion
        does not map onto a target class.
    """
    emotion = normalize_emotion(example['major_emotion'])
    if emotion is not None:
        example['label'] = TARGET_EMOTIONS.index(emotion)
        return example
    return None

# Keep only the utterances whose majority emotion maps onto a target class.
# input_columns hands the predicate just the 'major_emotion' value (positionally)
# instead of a fully formatted example, so a label check does not have to touch
# the audio column.
processed_ds = ds.filter(
    lambda major_emotion: normalize_emotion(major_emotion) is not None,
    input_columns=["major_emotion"],
)
# Add the integer 'label' column used by the training pipeline.
processed_ds = processed_ds.map(process_label)


Filter: 100%|██████████| 10039/10039 [00:04<00:00, 2045.91 examples/s]
Map: 100%|██████████| 6877/6877 [00:02<00:00, 2562.76 examples/s]


# WAV2VEC2 TRAINING PIPELINE

In [7]:
# Feature extractor matching the pretrained checkpoint used by the model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large")


def collate_fn(batch):
    """Turn a list of dataset examples into a padded batch for the DataLoader.

    Args:
        batch: List of examples, each providing ``item["audio"]["array"]``
            (the waveform) and ``item["label"]`` (integer class id).

    Returns:
        Tuple ``(input_values, labels)``: the feature extractor's padded
        ``input_values`` tensor and a ``torch.long`` tensor of labels.
    """
    waveforms = []
    targets = []
    for item in batch:
        waveforms.append(item["audio"]["array"])
        targets.append(item["label"])

    # The extractor pads every waveform in the batch to a common length;
    # the audio is declared as 16 kHz.
    encoded = feature_extractor(
        waveforms,
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )

    return encoded.input_values, torch.tensor(targets, dtype=torch.long)


In [8]:
class Wav2Vec2SER(nn.Module):
    """Speech-emotion classifier: frozen Wav2Vec2 encoder plus a trainable MLP head.

    The pretrained encoder is used purely as a fixed feature extractor: its
    parameters never receive gradients and it is kept in evaluation mode even
    while the rest of the model is training. Its frame-level hidden states are
    mean-pooled over time and classified by a small fully connected head.

    NOTE(review): the pooling averages over every frame the encoder returns.
    When inputs are zero-padded to a common length inside a batch, the padded
    frames take part in the mean as well; masked pooling may be worth considering.

    Args:
        num_classes: Number of output classes. ``None`` (the default) resolves
            to ``len(TARGET_EMOTIONS)`` at instantiation time, so the default
            follows the current value of that list rather than the value it
            had when this class was defined.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = len(TARGET_EMOTIONS)

        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large")

        # Freeze Wav2Vec2 parameters to prevent backpropagation
        for param in self.wav2vec2.parameters():
            param.requires_grad = False
        self.wav2vec2.eval()

        # Take the encoder width from its config instead of hard-coding it
        # (the original code assumed 1024 for this checkpoint).
        hidden_size = self.wav2vec2.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(256, num_classes)
        )

    def train(self, mode=True):
        """Set the training mode of the head while pinning the encoder to eval mode.

        ``nn.Module.train`` switches every submodule, which would re-enable the
        frozen encoder's stochastic layers (e.g. dropout) and make the features
        fed to the classifier noisy. The encoder is therefore put back into
        evaluation mode after each mode change.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.wav2vec2.eval()
        return self

    def forward(self, x):
        """Classify a batch of inputs.

        Args:
            x: Batch passed straight to the Wav2Vec2 encoder (assumed to be the
                padded ``input_values`` produced by the feature extractor).

        Returns:
            Tuple ``(log_probs, pooled_features)``: class log-probabilities
            (log-softmax over the last dimension) and the time-averaged encoder
            features they were computed from.
        """
        with torch.no_grad():  # Disable gradient computation through Wav2Vec2
            features = self.wav2vec2(x).last_hidden_state
        pooled_features = torch.mean(features, dim=1)
        logits = self.classifier(pooled_features)
        return nn.functional.log_softmax(logits, dim=-1), pooled_features


In [9]:
# Hold out 20% of the labelled data for validation; the split is seeded so it is repeatable
train_val = processed_ds.train_test_split(seed=SEED, test_size=0.2)
train_dataset, val_dataset = train_val['train'], train_val['test']

# Both loaders share the batch size and collate function; only the training loader shuffles
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

In [10]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and move it onto the selected device
model = Wav2Vec2SER()
model = model.to(device)

# The model emits log-probabilities (log_softmax), so NLLLoss is the matching criterion
criterion = nn.NLLLoss()

# Only parameters that still require gradients are handed to the optimizer
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-4)

# Multiply the learning rate by 0.1 every 10 scheduler steps
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=10)


In [11]:
def train_epoch():
    """Run one optimization pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and
    ``device``. The model is put into training mode first.

    Returns:
        The mean of the per-batch losses (sum of batch losses divided by the
        number of batches).
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)  # (log_probs, pooled_features)
        batch_loss = criterion(outputs[0], batch_labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / n_batches

In [12]:
def validate_epoch():
    """Evaluate the model on ``val_loader`` without updating any weights.

    Uses the notebook-level ``model``, ``criterion`` and ``device``. The model
    is put into evaluation mode and gradients are disabled.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the sum of per-batch losses divided by
        the number of batches, and the fraction of correctly predicted examples.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_inputs)  # (log_probs, pooled_features)
            batch_log_probs = outputs[0]
            running_loss += criterion(batch_log_probs, batch_labels).item()

            # The predicted class is the one with the highest log-probability
            predicted = batch_log_probs.argmax(dim=1)
            n_correct += (predicted == batch_labels).sum().item()
            n_seen += batch_labels.size(0)
    mean_loss = running_loss / len(val_loader)
    return mean_loss, n_correct / n_seen

In [13]:
# Number of examples in the training and validation sets, in that order
tuple(len(loader.dataset) for loader in (train_loader, val_loader))

(5501, 1376)

In [None]:
epochs = 10

# NOTE(review): the StepLR scheduler is created with step_size=10, so with
# epochs = 10 the learning rate is only reduced after the final epoch has
# finished — confirm that this is intended.
for epoch in range(1, epochs + 1):
    # One pass over the training data, then one evaluation pass; both helpers
    # are defined earlier in the notebook and hold the per-batch logic.
    avg_train_loss = train_epoch()
    avg_val_loss, val_accuracy = validate_epoch()

    print(f"Epoch [{epoch}/{epochs}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

In [None]:
# Evaluation on validation/test set
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in val_loader:
        # Only the inputs need to be on the model's device; labels are just collected
        log_probs, _ = model(inputs.to(device))
        preds = log_probs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Pass the class ids explicitly so the report rows and the matrix axes always
# line up with TARGET_EMOTIONS, even if a class never occurs in the labels or
# the predictions.
class_ids = list(range(len(TARGET_EMOTIONS)))

print("Classification Report:\n")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=TARGET_EMOTIONS))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=TARGET_EMOTIONS, yticklabels=TARGET_EMOTIONS, cmap='Blues')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()
